natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -6,14 +6,19 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import re
|
8
8
|
import tempfile
|
9
|
+
import time # Import time
|
9
10
|
from pathlib import Path
|
10
11
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
12
|
+
import concurrent.futures # Added import
|
13
|
+
from tqdm.auto import tqdm # Added tqdm import
|
14
|
+
import threading
|
11
15
|
|
12
16
|
import pdfplumber
|
13
17
|
from PIL import Image, ImageDraw
|
14
18
|
|
15
19
|
from natural_pdf.elements.collections import ElementCollection
|
16
20
|
from natural_pdf.elements.region import Region
|
21
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
17
22
|
|
18
23
|
if TYPE_CHECKING:
|
19
24
|
import pdfplumber
|
@@ -46,10 +51,20 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
|
|
46
51
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
47
52
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
48
53
|
|
54
|
+
# --- Classification Imports --- #
|
55
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
56
|
+
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
|
+
# --- End Classification Imports --- #
|
58
|
+
|
59
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
60
|
+
from natural_pdf.elements.base import Element # Import base element
|
61
|
+
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
62
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
63
|
+
|
49
64
|
logger = logging.getLogger(__name__)
|
50
65
|
|
51
66
|
|
52
|
-
class Page:
|
67
|
+
class Page(ClassificationMixin, ExtractionMixin):
|
53
68
|
"""
|
54
69
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
55
70
|
|
@@ -73,14 +88,21 @@ class Page:
|
|
73
88
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
74
89
|
self._exclusions = [] # List to store exclusion functions/regions
|
75
90
|
|
91
|
+
# --- ADDED --- Metadata store for mixins
|
92
|
+
self.metadata: Dict[str, Any] = {}
|
93
|
+
# --- END ADDED ---
|
94
|
+
|
76
95
|
# Region management
|
77
96
|
self._regions = {
|
78
97
|
"detected": [], # Layout detection results
|
79
98
|
"named": {}, # Named regions (name -> region)
|
80
99
|
}
|
81
100
|
|
82
|
-
# Initialize ElementManager
|
83
|
-
self._element_mgr = ElementManager(self, font_attrs)
|
101
|
+
# Initialize ElementManager, passing font_attrs
|
102
|
+
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
103
|
+
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
104
|
+
# --- NEW --- Central registry for analysis results
|
105
|
+
self.analyses: Dict[str, Any] = {}
|
84
106
|
|
85
107
|
# --- Get OCR Manager Instance ---
|
86
108
|
if (
|
@@ -115,6 +137,8 @@ class Page:
|
|
115
137
|
# Initialize the internal variable with a single underscore
|
116
138
|
self._layout_analyzer = None
|
117
139
|
|
140
|
+
self._load_elements()
|
141
|
+
|
118
142
|
@property
|
119
143
|
def pdf(self) -> "PDF":
|
120
144
|
"""Provides public access to the parent PDF object."""
|
@@ -1257,38 +1281,48 @@ class Page:
|
|
1257
1281
|
"""
|
1258
1282
|
image = None
|
1259
1283
|
render_resolution = resolution if resolution is not None else scale * 72
|
1284
|
+
thread_id = threading.current_thread().name
|
1285
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
|
1286
|
+
lock_wait_start = time.monotonic()
|
1260
1287
|
try:
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1288
|
+
# Acquire the global PDF rendering lock
|
1289
|
+
with pdf_render_lock:
|
1290
|
+
lock_acquired_time = time.monotonic()
|
1291
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
|
1292
|
+
if include_highlights:
|
1293
|
+
# Delegate rendering to the central service
|
1294
|
+
image = self._highlighter.render_page(
|
1295
|
+
page_index=self.index,
|
1296
|
+
scale=scale, # Note: scale is used by highlighter internally for drawing
|
1297
|
+
labels=labels,
|
1298
|
+
legend_position=legend_position,
|
1299
|
+
render_ocr=render_ocr,
|
1300
|
+
resolution=render_resolution, # Pass the calculated resolution
|
1301
|
+
**kwargs,
|
1302
|
+
)
|
1303
|
+
else:
|
1304
|
+
# Get the base page image directly from pdfplumber if no highlights needed
|
1305
|
+
# Use the underlying pdfplumber page object
|
1306
|
+
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1307
|
+
# Access the PIL image directly (assuming pdfplumber structure)
|
1308
|
+
image = (
|
1309
|
+
img_object.annotated
|
1310
|
+
if hasattr(img_object, "annotated")
|
1311
|
+
else img_object._repr_png_()
|
1312
|
+
)
|
1313
|
+
if isinstance(image, bytes): # Handle cases where it returns bytes
|
1314
|
+
from io import BytesIO
|
1284
1315
|
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1316
|
+
image = Image.open(BytesIO(image)).convert(
|
1317
|
+
"RGB"
|
1318
|
+
) # Convert to RGB for consistency
|
1288
1319
|
|
1289
1320
|
except Exception as e:
|
1290
1321
|
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
1291
1322
|
return None # Return None on error
|
1323
|
+
finally:
|
1324
|
+
render_end_time = time.monotonic()
|
1325
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
|
1292
1326
|
|
1293
1327
|
if image is None:
|
1294
1328
|
return None
|
@@ -1384,6 +1418,7 @@ class Page:
|
|
1384
1418
|
resolution: Optional[int] = None,
|
1385
1419
|
detect_only: bool = False,
|
1386
1420
|
apply_exclusions: bool = True,
|
1421
|
+
replace: bool = True,
|
1387
1422
|
) -> "Page":
|
1388
1423
|
"""
|
1389
1424
|
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
@@ -1397,13 +1432,21 @@ class Page:
|
|
1397
1432
|
resolution: DPI resolution for rendering page image before OCR.
|
1398
1433
|
apply_exclusions: If True (default), render page image for OCR
|
1399
1434
|
with excluded areas masked (whited out).
|
1435
|
+
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
1436
|
+
replace: If True (default), remove any existing OCR elements before
|
1437
|
+
adding new ones. If False, add new OCR elements to existing ones.
|
1400
1438
|
|
1401
1439
|
Returns:
|
1402
|
-
|
1440
|
+
Self for method chaining.
|
1403
1441
|
"""
|
1404
1442
|
if not hasattr(self._parent, "apply_ocr"):
|
1405
1443
|
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1406
|
-
return
|
1444
|
+
return self # Return self for chaining
|
1445
|
+
|
1446
|
+
# Remove existing OCR elements if replace is True
|
1447
|
+
if replace and hasattr(self, "_element_mgr"):
|
1448
|
+
logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
|
1449
|
+
self._element_mgr.remove_ocr_elements()
|
1407
1450
|
|
1408
1451
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1409
1452
|
try:
|
@@ -1419,18 +1462,13 @@ class Page:
|
|
1419
1462
|
resolution=resolution,
|
1420
1463
|
detect_only=detect_only,
|
1421
1464
|
apply_exclusions=apply_exclusions,
|
1465
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1422
1466
|
)
|
1423
1467
|
except Exception as e:
|
1424
1468
|
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1425
|
-
return
|
1469
|
+
return self # Return self for chaining
|
1426
1470
|
|
1427
|
-
# Return
|
1428
|
-
ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
|
1429
|
-
logger.debug(
|
1430
|
-
f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
|
1431
|
-
)
|
1432
|
-
# Note: The method is typed to return Page for chaining, but the log indicates
|
1433
|
-
# finding elements. Let's stick to returning self for chaining consistency.
|
1471
|
+
# Return self for chaining
|
1434
1472
|
return self
|
1435
1473
|
|
1436
1474
|
def extract_ocr_elements(
|
@@ -1471,11 +1509,13 @@ class Page:
|
|
1471
1509
|
|
1472
1510
|
try:
|
1473
1511
|
# Get base image without highlights using the determined resolution
|
1474
|
-
|
1475
|
-
|
1476
|
-
|
1477
|
-
|
1478
|
-
|
1512
|
+
# Use the global PDF rendering lock
|
1513
|
+
with pdf_render_lock:
|
1514
|
+
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
1515
|
+
if not image:
|
1516
|
+
logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
|
1517
|
+
return []
|
1518
|
+
logger.debug(f" Rendered image size: {image.width}x{image.height}")
|
1479
1519
|
except Exception as e:
|
1480
1520
|
logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
|
1481
1521
|
return []
|
@@ -2027,43 +2067,166 @@ class Page:
|
|
2027
2067
|
def correct_ocr(
|
2028
2068
|
self,
|
2029
2069
|
correction_callback: Callable[[Any], Optional[str]],
|
2070
|
+
max_workers: Optional[int] = None,
|
2071
|
+
progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
|
2030
2072
|
) -> "Page": # Return self for chaining
|
2031
2073
|
"""
|
2032
2074
|
Applies corrections to OCR-generated text elements on this page
|
2033
|
-
using a user-provided callback function.
|
2075
|
+
using a user-provided callback function, potentially in parallel.
|
2034
2076
|
|
2035
2077
|
Finds text elements on this page whose 'source' attribute starts
|
2036
2078
|
with 'ocr' and calls the `correction_callback` for each, passing the
|
2037
|
-
element itself.
|
2038
|
-
|
2039
|
-
The `correction_callback` should contain the logic to:
|
2040
|
-
1. Determine if the element needs correction.
|
2041
|
-
2. Perform the correction (e.g., call an LLM).
|
2042
|
-
3. Return the new text (`str`) or `None`.
|
2043
|
-
|
2044
|
-
If the callback returns a string, the element's `.text` is updated.
|
2045
|
-
Metadata updates (source, confidence, etc.) should happen within the callback.
|
2079
|
+
element itself. Updates the element's text if the callback returns
|
2080
|
+
a new string.
|
2046
2081
|
|
2047
2082
|
Args:
|
2048
2083
|
correction_callback: A function accepting an element and returning
|
2049
2084
|
`Optional[str]` (new text or None).
|
2085
|
+
max_workers: The maximum number of threads to use for parallel execution.
|
2086
|
+
If None or 0 or 1, runs sequentially.
|
2087
|
+
progress_callback: Optional callback function to call after processing each element.
|
2050
2088
|
|
2051
2089
|
Returns:
|
2052
2090
|
Self for method chaining.
|
2053
2091
|
"""
|
2054
2092
|
logger.info(
|
2055
|
-
f"Page {self.number}: Starting OCR correction
|
2093
|
+
f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
|
2094
|
+
)
|
2095
|
+
|
2096
|
+
target_elements_collection = self.find_all(
|
2097
|
+
selector="text[source=ocr]", apply_exclusions=False
|
2056
2098
|
)
|
2099
|
+
target_elements = target_elements_collection.elements # Get the list
|
2057
2100
|
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2101
|
+
if not target_elements:
|
2102
|
+
logger.info(f"Page {self.number}: No OCR elements found to correct.")
|
2103
|
+
return self
|
2104
|
+
|
2105
|
+
processed_count = 0
|
2106
|
+
updated_count = 0
|
2107
|
+
error_count = 0
|
2108
|
+
|
2109
|
+
# Define the task to be run by the worker thread or sequentially
|
2110
|
+
def _process_element_task(element):
|
2111
|
+
try:
|
2112
|
+
current_text = getattr(element, 'text', None)
|
2113
|
+
# Call the user-provided callback
|
2114
|
+
corrected_text = correction_callback(element)
|
2115
|
+
|
2116
|
+
# Validate result type
|
2117
|
+
if corrected_text is not None and not isinstance(corrected_text, str):
|
2118
|
+
logger.warning(f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
|
2119
|
+
return element, None, None # Treat as no correction
|
2120
|
+
|
2121
|
+
return element, corrected_text, None # Return element, result, no error
|
2122
|
+
except Exception as e:
|
2123
|
+
logger.error(
|
2124
|
+
f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
|
2125
|
+
exc_info=False # Keep log concise
|
2126
|
+
)
|
2127
|
+
return element, None, e # Return element, no result, error
|
2128
|
+
finally:
|
2129
|
+
# --- Call progress callback here --- #
|
2130
|
+
if progress_callback:
|
2131
|
+
try:
|
2132
|
+
progress_callback()
|
2133
|
+
except Exception as cb_e:
|
2134
|
+
# Log error in callback itself, but don't stop processing
|
2135
|
+
logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
|
2136
|
+
|
2137
|
+
# Choose execution strategy based on max_workers
|
2138
|
+
if max_workers is not None and max_workers > 1:
|
2139
|
+
# --- Parallel execution --- #
|
2140
|
+
logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
|
2141
|
+
futures = []
|
2142
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
2143
|
+
# Submit all tasks
|
2144
|
+
future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
|
2145
|
+
|
2146
|
+
# Process results as they complete (progress_callback called by worker)
|
2147
|
+
for future in concurrent.futures.as_completed(future_to_element):
|
2148
|
+
processed_count += 1
|
2149
|
+
try:
|
2150
|
+
element, corrected_text, error = future.result()
|
2151
|
+
if error:
|
2152
|
+
error_count += 1
|
2153
|
+
# Error already logged in worker
|
2154
|
+
elif corrected_text is not None:
|
2155
|
+
# Apply correction if text changed
|
2156
|
+
current_text = getattr(element, 'text', None)
|
2157
|
+
if corrected_text != current_text:
|
2158
|
+
element.text = corrected_text
|
2159
|
+
updated_count += 1
|
2160
|
+
except Exception as exc:
|
2161
|
+
# Catch errors from future.result() itself
|
2162
|
+
element = future_to_element[future] # Find original element
|
2163
|
+
logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
|
2164
|
+
error_count += 1
|
2165
|
+
# Note: progress_callback was already called in the worker's finally block
|
2166
|
+
|
2167
|
+
else:
|
2168
|
+
# --- Sequential execution --- #
|
2169
|
+
logger.info(f"Page {self.number}: Running OCR correction sequentially.")
|
2170
|
+
for element in target_elements:
|
2171
|
+
# Call the task function directly (it handles progress_callback)
|
2172
|
+
processed_count += 1
|
2173
|
+
_element, corrected_text, error = _process_element_task(element)
|
2174
|
+
if error:
|
2175
|
+
error_count += 1
|
2176
|
+
elif corrected_text is not None:
|
2177
|
+
# Apply correction if text changed
|
2178
|
+
current_text = getattr(_element, 'text', None)
|
2179
|
+
if corrected_text != current_text:
|
2180
|
+
_element.text = corrected_text
|
2181
|
+
updated_count += 1
|
2061
2182
|
|
2062
|
-
|
2063
|
-
|
2064
|
-
elements=target_elements, # Pass the ElementCollection directly
|
2065
|
-
correction_callback=correction_callback,
|
2066
|
-
caller_info=f"Page({self.number})", # Pass caller info
|
2183
|
+
logger.info(
|
2184
|
+
f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
|
2067
2185
|
)
|
2068
2186
|
|
2069
|
-
return self
|
2187
|
+
return self # Return self for chaining
|
2188
|
+
|
2189
|
+
# --- Classification Mixin Implementation --- #
|
2190
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
2191
|
+
if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
|
2192
|
+
raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
|
2193
|
+
try:
|
2194
|
+
# Use the PDF's manager registry accessor
|
2195
|
+
return self.pdf.get_manager('classification')
|
2196
|
+
except (ValueError, RuntimeError, AttributeError) as e:
|
2197
|
+
# Wrap potential errors from get_manager for clarity
|
2198
|
+
raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
|
2199
|
+
|
2200
|
+
def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
|
2201
|
+
if model_type == 'text':
|
2202
|
+
text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
|
2203
|
+
if not text_content or text_content.isspace():
|
2204
|
+
raise ValueError("Cannot classify page with 'text' model: No text content found.")
|
2205
|
+
return text_content
|
2206
|
+
elif model_type == 'vision':
|
2207
|
+
# Get resolution from manager/kwargs if possible, else default
|
2208
|
+
manager = self._get_classification_manager()
|
2209
|
+
default_resolution = 150
|
2210
|
+
# Access kwargs passed to classify method if needed
|
2211
|
+
resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
|
2212
|
+
|
2213
|
+
# Use to_image, ensuring no highlights interfere
|
2214
|
+
img = self.to_image(
|
2215
|
+
resolution=resolution,
|
2216
|
+
include_highlights=False,
|
2217
|
+
labels=False,
|
2218
|
+
exclusions=None # Don't mask exclusions for classification input image
|
2219
|
+
)
|
2220
|
+
if img is None:
|
2221
|
+
raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
|
2222
|
+
return img
|
2223
|
+
else:
|
2224
|
+
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
2225
|
+
|
2226
|
+
def _get_metadata_storage(self) -> Dict[str, Any]:
|
2227
|
+
# Ensure metadata exists
|
2228
|
+
if not hasattr(self, 'metadata') or self.metadata is None:
|
2229
|
+
self.metadata = {}
|
2230
|
+
return self.metadata
|
2231
|
+
|
2232
|
+
# --- Content Extraction ---
|