natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
natural_pdf/analyzers/layout/gemini.py
CHANGED
@@ -13,6 +13,7 @@ from PIL import Image
|
|
13
13
|
try:
|
14
14
|
from openai import OpenAI
|
15
15
|
from openai.types.chat import ChatCompletion
|
16
|
+
|
16
17
|
# Import OpenAIError for exception handling if needed
|
17
18
|
except ImportError:
|
18
19
|
OpenAI = None
|
@@ -32,7 +33,7 @@ except ImportError:
|
|
32
33
|
class LayoutDetector:
|
33
34
|
def __init__(self):
|
34
35
|
self.logger = logging.getLogger()
|
35
|
-
self.supported_classes = set()
|
36
|
+
self.supported_classes = set() # Will be dynamic based on user request
|
36
37
|
|
37
38
|
def _get_model(self, options):
|
38
39
|
raise NotImplementedError
|
@@ -41,17 +42,20 @@ except ImportError:
|
|
41
42
|
return n.lower().replace("_", "-").replace(" ", "-")
|
42
43
|
|
43
44
|
def validate_classes(self, c):
|
44
|
-
pass
|
45
|
+
pass # Less strict validation needed for LLM
|
45
46
|
|
46
47
|
logging.basicConfig()
|
47
48
|
|
48
49
|
logger = logging.getLogger(__name__)
|
49
50
|
|
51
|
+
|
50
52
|
# Define Pydantic model for the expected output structure
|
51
53
|
# This is used by the openai library's `response_format`
|
52
54
|
class DetectedRegion(BaseModel):
|
53
55
|
label: str = Field(description="The identified class name.")
|
54
|
-
bbox: List[float] = Field(
|
56
|
+
bbox: List[float] = Field(
|
57
|
+
description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
|
58
|
+
)
|
55
59
|
confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
|
56
60
|
|
57
61
|
|
@@ -63,23 +67,27 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
63
67
|
|
64
68
|
def __init__(self):
|
65
69
|
super().__init__()
|
66
|
-
self.supported_classes = set()
|
70
|
+
self.supported_classes = set() # Indicate dynamic nature
|
67
71
|
|
68
72
|
def is_available(self) -> bool:
|
69
73
|
"""Check if openai library is installed and GOOGLE_API_KEY is available."""
|
70
74
|
api_key = os.environ.get("GOOGLE_API_KEY")
|
71
75
|
if not api_key:
|
72
|
-
logger.warning(
|
76
|
+
logger.warning(
|
77
|
+
"GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
|
78
|
+
)
|
73
79
|
return False
|
74
80
|
if OpenAI is None:
|
75
|
-
|
76
|
-
|
81
|
+
logger.warning(
|
82
|
+
"openai package not found. Gemini detector (via OpenAI lib) will not be available."
|
83
|
+
)
|
84
|
+
return False
|
77
85
|
return True
|
78
86
|
|
79
87
|
def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
|
80
88
|
"""Generate cache key based on model name."""
|
81
89
|
if not isinstance(options, GeminiLayoutOptions):
|
82
|
-
options = GeminiLayoutOptions()
|
90
|
+
options = GeminiLayoutOptions() # Use defaults
|
83
91
|
|
84
92
|
model_key = options.model_name
|
85
93
|
# Prompt is built dynamically, so not part of cache key based on options
|
@@ -101,9 +109,7 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
101
109
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
102
110
|
"""Detect layout elements in an image using Gemini via OpenAI library."""
|
103
111
|
if not self.is_available():
|
104
|
-
raise RuntimeError(
|
105
|
-
"OpenAI library not installed or GOOGLE_API_KEY not set."
|
106
|
-
)
|
112
|
+
raise RuntimeError("OpenAI library not installed or GOOGLE_API_KEY not set.")
|
107
113
|
|
108
114
|
# Ensure options are the correct type
|
109
115
|
if not isinstance(options, GeminiLayoutOptions):
|
@@ -124,10 +130,7 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
124
130
|
detections = []
|
125
131
|
try:
|
126
132
|
# --- 1. Initialize OpenAI Client for Gemini ---
|
127
|
-
client = OpenAI(
|
128
|
-
api_key=api_key,
|
129
|
-
base_url=self.GEMINI_BASE_URL
|
130
|
-
)
|
133
|
+
client = OpenAI(api_key=api_key, base_url=self.GEMINI_BASE_URL)
|
131
134
|
|
132
135
|
# --- 2. Prepare Input for OpenAI API ---
|
133
136
|
if not options.classes:
|
@@ -139,11 +142,11 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
139
142
|
# Convert image to base64
|
140
143
|
buffered = io.BytesIO()
|
141
144
|
image.save(buffered, format="PNG")
|
142
|
-
img_base64 = base64.b64encode(buffered.getvalue()).decode(
|
145
|
+
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
143
146
|
image_url = f"data:image/png;base64,{img_base64}"
|
144
147
|
|
145
148
|
# Construct the prompt text
|
146
|
-
class_list_str = ", ".join(f
|
149
|
+
class_list_str = ", ".join(f"`{c}`" for c in options.classes)
|
147
150
|
prompt_text = (
|
148
151
|
f"Analyze the provided image of a document page ({width}x{height}). "
|
149
152
|
f"Identify all regions corresponding to the following types: {class_list_str}. "
|
@@ -165,14 +168,18 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
165
168
|
]
|
166
169
|
|
167
170
|
# --- 3. Call OpenAI API using .parse for structured output ---
|
168
|
-
logger.debug(
|
171
|
+
logger.debug(
|
172
|
+
f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}"
|
173
|
+
)
|
169
174
|
|
170
175
|
# Extract relevant generation parameters from extra_args if provided
|
171
176
|
# Mapping common names: temperature, top_p, max_tokens
|
172
177
|
completion_kwargs = {
|
173
|
-
"temperature": options.extra_args.get("temperature", 0.2),
|
178
|
+
"temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
|
174
179
|
"top_p": options.extra_args.get("top_p"),
|
175
|
-
"max_tokens": options.extra_args.get(
|
180
|
+
"max_tokens": options.extra_args.get(
|
181
|
+
"max_tokens", 4096
|
182
|
+
), # Map from max_output_tokens
|
176
183
|
}
|
177
184
|
# Filter out None values
|
178
185
|
completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
|
@@ -180,13 +187,13 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
180
187
|
completion: ChatCompletion = client.beta.chat.completions.parse(
|
181
188
|
model=model_name,
|
182
189
|
messages=messages,
|
183
|
-
response_format=List[DetectedRegion],
|
184
|
-
**completion_kwargs
|
190
|
+
response_format=List[DetectedRegion], # Pass the Pydantic model list
|
191
|
+
**completion_kwargs,
|
185
192
|
)
|
186
193
|
|
187
194
|
logger.debug(f"Gemini response received via OpenAI lib.")
|
188
195
|
|
189
|
-
# --- 4. Process Parsed Response ---
|
196
|
+
# --- 4. Process Parsed Response ---
|
190
197
|
if not completion.choices:
|
191
198
|
logger.error("Gemini response (via OpenAI lib) contained no choices.")
|
192
199
|
return []
|
@@ -194,16 +201,18 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
194
201
|
# Get the parsed Pydantic objects
|
195
202
|
parsed_results = completion.choices[0].message.parsed
|
196
203
|
if not parsed_results or not isinstance(parsed_results, list):
|
197
|
-
|
198
|
-
|
204
|
+
logger.error(
|
205
|
+
f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
|
206
|
+
)
|
207
|
+
return []
|
199
208
|
|
200
|
-
# --- 5. Convert to Detections & Filter ---
|
201
|
-
normalized_classes_req = {
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
209
|
+
# --- 5. Convert to Detections & Filter ---
|
210
|
+
normalized_classes_req = {self._normalize_class_name(c) for c in options.classes}
|
211
|
+
normalized_classes_excl = (
|
212
|
+
{self._normalize_class_name(c) for c in options.exclude_classes}
|
213
|
+
if options.exclude_classes
|
214
|
+
else set()
|
215
|
+
)
|
207
216
|
|
208
217
|
for item in parsed_results:
|
209
218
|
# The item is already a validated DetectedRegion Pydantic object
|
@@ -215,33 +224,41 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
215
224
|
# Coordinates should already be floats, but ensure tuple format
|
216
225
|
xmin, ymin, xmax, ymax = tuple(bbox_raw)
|
217
226
|
|
218
|
-
# --- Apply Filtering ---
|
227
|
+
# --- Apply Filtering ---
|
219
228
|
normalized_class = self._normalize_class_name(label)
|
220
229
|
|
221
230
|
# Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
|
222
231
|
if normalized_class not in normalized_classes_req:
|
223
|
-
logger.warning(
|
232
|
+
logger.warning(
|
233
|
+
f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
|
234
|
+
)
|
224
235
|
continue
|
225
236
|
|
226
237
|
# Check against excluded classes
|
227
238
|
if normalized_class in normalized_classes_excl:
|
228
|
-
logger.debug(
|
239
|
+
logger.debug(
|
240
|
+
f"Skipping excluded class '{label}' (normalized: {normalized_class})."
|
241
|
+
)
|
229
242
|
continue
|
230
|
-
|
243
|
+
|
231
244
|
# Check against base confidence threshold from options
|
232
245
|
if confidence_score < options.confidence:
|
233
|
-
logger.debug(
|
246
|
+
logger.debug(
|
247
|
+
f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}."
|
248
|
+
)
|
234
249
|
continue
|
235
250
|
|
236
251
|
# Add detection
|
237
|
-
detections.append(
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
252
|
+
detections.append(
|
253
|
+
{
|
254
|
+
"bbox": (xmin, ymin, xmax, ymax),
|
255
|
+
"class": label, # Use original label from LLM
|
256
|
+
"confidence": confidence_score,
|
257
|
+
"normalized_class": normalized_class,
|
258
|
+
"source": "layout",
|
259
|
+
"model": "gemini", # Keep model name generic as gemini
|
260
|
+
}
|
261
|
+
)
|
245
262
|
|
246
263
|
self.logger.info(
|
247
264
|
f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
|
@@ -260,5 +277,4 @@ class GeminiLayoutDetector(LayoutDetector):
|
|
260
277
|
|
261
278
|
def validate_classes(self, classes: List[str]):
|
262
279
|
"""Validation is less critical as we pass requested classes to the LLM."""
|
263
|
-
pass
|
264
|
-
|
280
|
+
pass # Override base validation if needed, but likely not necessary
|
natural_pdf/collections/pdf_collection.py
CHANGED
@@ -279,14 +279,17 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
|
|
279
279
|
"""
|
280
280
|
try:
|
281
281
|
from natural_pdf.utils.packaging import create_correction_task_package
|
282
|
+
|
282
283
|
# Pass the collection itself (self) as the source
|
283
284
|
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
284
285
|
except ImportError:
|
285
|
-
logger.error(
|
286
|
+
logger.error(
|
287
|
+
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
288
|
+
)
|
286
289
|
# Or raise
|
287
290
|
except Exception as e:
|
288
291
|
logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
|
289
|
-
raise
|
292
|
+
raise # Re-raise the exception from the utility function
|
290
293
|
|
291
294
|
# --- Mixin Required Implementation ---
|
292
295
|
def get_indexable_items(self) -> Iterable[Indexable]:
|
natural_pdf/core/element_manager.py
CHANGED
@@ -359,8 +359,10 @@ class ElementManager:
|
|
359
359
|
|
360
360
|
# Handle potential None confidence
|
361
361
|
raw_confidence = result.get("confidence")
|
362
|
-
confidence_value =
|
363
|
-
|
362
|
+
confidence_value = (
|
363
|
+
float(raw_confidence) if raw_confidence is not None else None
|
364
|
+
) # Keep None if it was None
|
365
|
+
ocr_text = result.get("text") # Get text, will be None if detect_only
|
364
366
|
|
365
367
|
# Create the TextElement for the word
|
366
368
|
word_element_data = {
|
@@ -373,7 +375,7 @@ class ElementManager:
|
|
373
375
|
"height": pdf_height,
|
374
376
|
"object_type": "word", # Treat OCR results as whole words
|
375
377
|
"source": "ocr",
|
376
|
-
"confidence": confidence_value,
|
378
|
+
"confidence": confidence_value, # Use the handled confidence
|
377
379
|
"fontname": "OCR", # Use consistent OCR fontname
|
378
380
|
"size": (
|
379
381
|
round(pdf_height) if pdf_height > 0 else 10.0
|
@@ -391,7 +393,7 @@ class ElementManager:
|
|
391
393
|
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
392
394
|
|
393
395
|
# Add the char dict list to the word data before creating TextElement
|
394
|
-
word_element_data["_char_dicts"] = [ocr_char_dict]
|
396
|
+
word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
|
395
397
|
|
396
398
|
word_elem = TextElement(word_element_data, self._page)
|
397
399
|
added_word_elements.append(word_elem)
|
natural_pdf/core/page.py
CHANGED
@@ -1233,7 +1233,7 @@ class Page:
|
|
1233
1233
|
render_ocr: bool = False,
|
1234
1234
|
resolution: Optional[float] = None,
|
1235
1235
|
include_highlights: bool = True,
|
1236
|
-
exclusions: Optional[str] = None,
|
1236
|
+
exclusions: Optional[str] = None, # New parameter
|
1237
1237
|
**kwargs,
|
1238
1238
|
) -> Optional[Image.Image]:
|
1239
1239
|
"""
|
@@ -1262,11 +1262,11 @@ class Page:
|
|
1262
1262
|
# Delegate rendering to the central service
|
1263
1263
|
image = self._highlighter.render_page(
|
1264
1264
|
page_index=self.index,
|
1265
|
-
scale=scale,
|
1265
|
+
scale=scale, # Note: scale is used by highlighter internally for drawing
|
1266
1266
|
labels=labels,
|
1267
1267
|
legend_position=legend_position,
|
1268
1268
|
render_ocr=render_ocr,
|
1269
|
-
resolution=render_resolution,
|
1269
|
+
resolution=render_resolution, # Pass the calculated resolution
|
1270
1270
|
**kwargs,
|
1271
1271
|
)
|
1272
1272
|
else:
|
@@ -1322,16 +1322,21 @@ class Page:
|
|
1322
1322
|
max(0, img_x0),
|
1323
1323
|
max(0, img_top),
|
1324
1324
|
min(image.width, img_x1),
|
1325
|
-
min(image.height, img_bottom)
|
1325
|
+
min(image.height, img_bottom),
|
1326
1326
|
)
|
1327
1327
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1328
|
-
|
1328
|
+
draw.rectangle(img_coords, fill="white")
|
1329
1329
|
else:
|
1330
|
-
|
1330
|
+
logger.warning(
|
1331
|
+
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
1332
|
+
)
|
1331
1333
|
|
1332
|
-
del draw
|
1334
|
+
del draw # Release drawing context
|
1333
1335
|
except Exception as mask_error:
|
1334
|
-
logger.error(
|
1336
|
+
logger.error(
|
1337
|
+
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
1338
|
+
exc_info=True,
|
1339
|
+
)
|
1335
1340
|
# Decide if you want to return None or continue without mask
|
1336
1341
|
# For now, continue without mask
|
1337
1342
|
|
@@ -1398,7 +1403,7 @@ class Page:
|
|
1398
1403
|
"""
|
1399
1404
|
if not hasattr(self._parent, "apply_ocr"):
|
1400
1405
|
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1401
|
-
return []
|
1406
|
+
return [] # Return empty list for consistency
|
1402
1407
|
|
1403
1408
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1404
1409
|
try:
|
@@ -1459,11 +1464,11 @@ class Page:
|
|
1459
1464
|
return []
|
1460
1465
|
|
1461
1466
|
logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
|
1462
|
-
|
1467
|
+
|
1463
1468
|
# Determine rendering resolution
|
1464
|
-
final_resolution = resolution if resolution is not None else 150
|
1469
|
+
final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
|
1465
1470
|
logger.debug(f" Using rendering resolution: {final_resolution} DPI")
|
1466
|
-
|
1471
|
+
|
1467
1472
|
try:
|
1468
1473
|
# Get base image without highlights using the determined resolution
|
1469
1474
|
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
@@ -1477,12 +1482,12 @@ class Page:
|
|
1477
1482
|
|
1478
1483
|
# Prepare arguments for the OCR Manager call
|
1479
1484
|
manager_args = {
|
1480
|
-
|
1481
|
-
|
1482
|
-
|
1483
|
-
|
1484
|
-
|
1485
|
-
|
1485
|
+
"images": image,
|
1486
|
+
"engine": engine,
|
1487
|
+
"languages": languages,
|
1488
|
+
"min_confidence": min_confidence,
|
1489
|
+
"device": device,
|
1490
|
+
"options": options,
|
1486
1491
|
}
|
1487
1492
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
1488
1493
|
|
@@ -1514,7 +1519,7 @@ class Page:
|
|
1514
1519
|
scale_x = self.width / image.width if image.width else 1
|
1515
1520
|
scale_y = self.height / image.height if image.height else 1
|
1516
1521
|
for result in results:
|
1517
|
-
try:
|
1522
|
+
try: # Added try-except around result processing
|
1518
1523
|
x0, top, x1, bottom = [float(c) for c in result["bbox"]]
|
1519
1524
|
elem_data = {
|
1520
1525
|
"text": result["text"],
|
@@ -1525,15 +1530,17 @@ class Page:
|
|
1525
1530
|
"bottom": bottom * scale_y,
|
1526
1531
|
"width": (x1 - x0) * scale_x,
|
1527
1532
|
"height": (bottom - top) * scale_y,
|
1528
|
-
"object_type": "text",
|
1533
|
+
"object_type": "text", # Using text for temporary elements
|
1529
1534
|
"source": "ocr",
|
1530
|
-
"fontname": "OCR-extract",
|
1535
|
+
"fontname": "OCR-extract", # Different name for clarity
|
1531
1536
|
"size": 10.0,
|
1532
1537
|
"page_number": self.number,
|
1533
1538
|
}
|
1534
1539
|
temp_elements.append(TextElement(elem_data, self))
|
1535
1540
|
except (KeyError, ValueError, TypeError) as convert_err:
|
1536
|
-
|
1541
|
+
logger.warning(
|
1542
|
+
f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
|
1543
|
+
)
|
1537
1544
|
|
1538
1545
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1539
1546
|
return temp_elements
|
@@ -2020,7 +2027,7 @@ class Page:
|
|
2020
2027
|
def correct_ocr(
|
2021
2028
|
self,
|
2022
2029
|
correction_callback: Callable[[Any], Optional[str]],
|
2023
|
-
) -> "Page":
|
2030
|
+
) -> "Page": # Return self for chaining
|
2024
2031
|
"""
|
2025
2032
|
Applies corrections to OCR-generated text elements on this page
|
2026
2033
|
using a user-provided callback function.
|
@@ -2044,7 +2051,9 @@ class Page:
|
|
2044
2051
|
Returns:
|
2045
2052
|
Self for method chaining.
|
2046
2053
|
"""
|
2047
|
-
logger.info(
|
2054
|
+
logger.info(
|
2055
|
+
f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
|
2056
|
+
)
|
2048
2057
|
|
2049
2058
|
# Find OCR elements specifically on this page
|
2050
2059
|
# Note: We typically want to correct even if the element falls in an excluded area
|
@@ -2052,9 +2061,9 @@ class Page:
|
|
2052
2061
|
|
2053
2062
|
# Delegate to the utility function
|
2054
2063
|
_apply_ocr_correction_to_elements(
|
2055
|
-
elements=target_elements,
|
2064
|
+
elements=target_elements, # Pass the ElementCollection directly
|
2056
2065
|
correction_callback=correction_callback,
|
2057
|
-
caller_info=f"Page({self.number})",
|
2066
|
+
caller_info=f"Page({self.number})", # Pass caller info
|
2058
2067
|
)
|
2059
2068
|
|
2060
|
-
return self
|
2069
|
+
return self # Return self for chaining
|
natural_pdf/core/pdf.py
CHANGED
@@ -239,13 +239,13 @@ class PDF:
|
|
239
239
|
engine: Optional[str] = None,
|
240
240
|
# --- Common OCR Parameters (Direct Arguments) ---
|
241
241
|
languages: Optional[List[str]] = None,
|
242
|
-
min_confidence: Optional[float] = None,
|
242
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
243
243
|
device: Optional[str] = None,
|
244
|
-
resolution: Optional[int] = None,
|
245
|
-
apply_exclusions: bool = True,
|
244
|
+
resolution: Optional[int] = None, # DPI for rendering before OCR
|
245
|
+
apply_exclusions: bool = True, # New parameter
|
246
246
|
detect_only: bool = False,
|
247
247
|
# --- Engine-Specific Options --- Use 'options=' for this
|
248
|
-
options: Optional[Any] = None,
|
248
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
|
249
249
|
# **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
|
250
250
|
) -> "PDF":
|
251
251
|
"""
|
@@ -314,7 +314,7 @@ class PDF:
|
|
314
314
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
315
|
# --- Determine Rendering Resolution ---
|
316
316
|
# Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
|
317
|
-
final_resolution = resolution
|
317
|
+
final_resolution = resolution # Use direct arg if provided
|
318
318
|
if final_resolution is None:
|
319
319
|
final_resolution = getattr(self, "_config", {}).get("resolution", 150)
|
320
320
|
|
@@ -323,7 +323,9 @@ class PDF:
|
|
323
323
|
# --- Render Images for Batch ---
|
324
324
|
images_pil: List[Image.Image] = []
|
325
325
|
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
326
|
-
logger.info(
|
326
|
+
logger.info(
|
327
|
+
f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
|
328
|
+
)
|
327
329
|
failed_page_num = "unknown" # Keep track of potentially failing page
|
328
330
|
try:
|
329
331
|
for i, page in enumerate(target_pages):
|
@@ -339,7 +341,7 @@ class PDF:
|
|
339
341
|
if img is None:
|
340
342
|
logger.error(f" Failed to render page {page.number} to image.")
|
341
343
|
# Decide how to handle: skip page, raise error? For now, skip.
|
342
|
-
continue
|
344
|
+
continue # Skip this page if rendering failed
|
343
345
|
images_pil.append(img)
|
344
346
|
page_image_map.append((page, img)) # Store pair
|
345
347
|
except Exception as e:
|
@@ -356,7 +358,7 @@ class PDF:
|
|
356
358
|
"images": images_pil,
|
357
359
|
"engine": engine,
|
358
360
|
"languages": languages,
|
359
|
-
"min_confidence": min_confidence,
|
361
|
+
"min_confidence": min_confidence, # Use the renamed parameter
|
360
362
|
"device": device,
|
361
363
|
"options": options,
|
362
364
|
"detect_only": detect_only,
|
@@ -366,7 +368,9 @@ class PDF:
|
|
366
368
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
367
369
|
|
368
370
|
# --- Call OCR Manager for Batch Processing ---
|
369
|
-
logger.info(
|
371
|
+
logger.info(
|
372
|
+
f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
|
373
|
+
)
|
370
374
|
try:
|
371
375
|
# Manager's apply_ocr signature needs to accept common args directly
|
372
376
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
@@ -948,19 +952,22 @@ class PDF:
|
|
948
952
|
"""
|
949
953
|
try:
|
950
954
|
from natural_pdf.utils.packaging import create_correction_task_package
|
955
|
+
|
951
956
|
create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
|
952
957
|
except ImportError:
|
953
|
-
logger.error(
|
958
|
+
logger.error(
|
959
|
+
"Failed to import 'create_correction_task_package'. Packaging utility might be missing."
|
960
|
+
)
|
954
961
|
# Or raise
|
955
962
|
except Exception as e:
|
956
963
|
logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
|
957
|
-
raise
|
964
|
+
raise # Re-raise the exception from the utility function
|
958
965
|
|
959
966
|
def correct_ocr(
|
960
967
|
self,
|
961
968
|
correction_callback: Callable[[Any], Optional[str]],
|
962
969
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
963
|
-
) -> "PDF":
|
970
|
+
) -> "PDF": # Return self for chaining
|
964
971
|
"""
|
965
972
|
Applies corrections to OCR-generated text elements using a callback function,
|
966
973
|
delegating the core work to the `Page.correct_ocr` method.
|
@@ -989,7 +996,9 @@ class PDF:
|
|
989
996
|
if not (0 <= idx < len(self._pages)):
|
990
997
|
raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
|
991
998
|
except (IndexError, TypeError, ValueError) as e:
|
992
|
-
raise ValueError(
|
999
|
+
raise ValueError(
|
1000
|
+
f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
|
1001
|
+
) from e
|
993
1002
|
else:
|
994
1003
|
raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
|
995
1004
|
|
@@ -997,7 +1006,9 @@ class PDF:
|
|
997
1006
|
logger.warning("No pages selected for OCR correction.")
|
998
1007
|
return self
|
999
1008
|
|
1000
|
-
logger.info(
|
1009
|
+
logger.info(
|
1010
|
+
f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
|
1011
|
+
)
|
1001
1012
|
|
1002
1013
|
# Iterate through target pages and call their correct_ocr method
|
1003
1014
|
for page_idx in target_page_indices:
|
@@ -1071,8 +1082,6 @@ class PDF:
|
|
1071
1082
|
"""Context manager exit."""
|
1072
1083
|
self.close()
|
1073
1084
|
|
1074
|
-
|
1075
1085
|
# --- Indexable Protocol Methods --- Needed for search/sync
|
1076
1086
|
def get_id(self) -> str:
|
1077
1087
|
return self.path
|
1078
|
-
|
natural_pdf/elements/base.py
CHANGED
natural_pdf/elements/collections.py
CHANGED
@@ -21,7 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
21
21
|
from natural_pdf.elements.text import TextElement # Needed for isinstance check
|
22
22
|
from natural_pdf.ocr import OCROptions
|
23
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
-
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
24
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
|
25
25
|
|
26
26
|
logger = logging.getLogger(__name__)
|
27
27
|
|
@@ -1151,9 +1151,9 @@ class ElementCollection(Generic[T]):
|
|
1151
1151
|
_apply_ocr_correction_to_elements(
|
1152
1152
|
elements=self._elements,
|
1153
1153
|
correction_callback=correction_callback,
|
1154
|
-
caller_info=f"ElementCollection(len={len(self._elements)})",
|
1154
|
+
caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
|
1155
1155
|
)
|
1156
|
-
return self
|
1156
|
+
return self # Return self for chaining
|
1157
1157
|
|
1158
1158
|
|
1159
1159
|
class PageCollection(Generic[P]):
|
@@ -1217,12 +1217,12 @@ class PageCollection(Generic[P]):
|
|
1217
1217
|
engine: Optional[str] = None,
|
1218
1218
|
# --- Common OCR Parameters (Direct Arguments) ---
|
1219
1219
|
languages: Optional[List[str]] = None,
|
1220
|
-
min_confidence: Optional[float] = None,
|
1220
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
1221
1221
|
device: Optional[str] = None,
|
1222
|
-
resolution: Optional[int] = None,
|
1223
|
-
apply_exclusions: bool = True,
|
1222
|
+
resolution: Optional[int] = None, # DPI for rendering
|
1223
|
+
apply_exclusions: bool = True, # New parameter
|
1224
1224
|
# --- Engine-Specific Options ---
|
1225
|
-
options: Optional[Any] = None,
|
1225
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
1226
1226
|
) -> "PageCollection[P]":
|
1227
1227
|
"""
|
1228
1228
|
Applies OCR to all pages within this collection using batch processing.
|
@@ -1273,10 +1273,10 @@ class PageCollection(Generic[P]):
|
|
1273
1273
|
pages=page_indices,
|
1274
1274
|
engine=engine,
|
1275
1275
|
languages=languages,
|
1276
|
-
min_confidence=min_confidence,
|
1276
|
+
min_confidence=min_confidence, # Pass the renamed parameter
|
1277
1277
|
device=device,
|
1278
1278
|
resolution=resolution,
|
1279
|
-
apply_exclusions=apply_exclusions,
|
1279
|
+
apply_exclusions=apply_exclusions, # Pass down
|
1280
1280
|
options=options,
|
1281
1281
|
)
|
1282
1282
|
# The PDF method modifies the Page objects directly by adding elements.
|
@@ -1351,13 +1351,12 @@ class PageCollection(Generic[P]):
|
|
1351
1351
|
parent_pdf = self.pages[0]._parent
|
1352
1352
|
|
1353
1353
|
page_indices = [p.index for p in self.pages]
|
1354
|
-
logger.info(
|
1354
|
+
logger.info(
|
1355
|
+
f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
|
1356
|
+
)
|
1355
1357
|
|
1356
1358
|
# Delegate the call to the parent PDF object for the relevant pages
|
1357
|
-
parent_pdf.correct_ocr(
|
1358
|
-
correction_callback=correction_callback,
|
1359
|
-
pages=page_indices
|
1360
|
-
)
|
1359
|
+
parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
|
1361
1360
|
|
1362
1361
|
return self
|
1363
1362
|
|