natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. docs/finetuning/index.md +176 -0
  2. docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
  3. natural_pdf/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/gemini.py +63 -47
  5. natural_pdf/collections/pdf_collection.py +5 -2
  6. natural_pdf/core/element_manager.py +6 -4
  7. natural_pdf/core/page.py +36 -27
  8. natural_pdf/core/pdf.py +25 -16
  9. natural_pdf/elements/base.py +1 -3
  10. natural_pdf/elements/collections.py +13 -14
  11. natural_pdf/elements/region.py +7 -6
  12. natural_pdf/exporters/__init__.py +4 -0
  13. natural_pdf/exporters/base.py +61 -0
  14. natural_pdf/exporters/paddleocr.py +345 -0
  15. natural_pdf/ocr/__init__.py +16 -8
  16. natural_pdf/ocr/engine.py +46 -30
  17. natural_pdf/ocr/engine_easyocr.py +81 -40
  18. natural_pdf/ocr/engine_paddle.py +39 -28
  19. natural_pdf/ocr/engine_surya.py +32 -16
  20. natural_pdf/ocr/ocr_factory.py +34 -23
  21. natural_pdf/ocr/ocr_manager.py +15 -11
  22. natural_pdf/ocr/ocr_options.py +5 -0
  23. natural_pdf/ocr/utils.py +46 -31
  24. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  25. natural_pdf/utils/debug.py +4 -2
  26. natural_pdf/utils/identifiers.py +9 -5
  27. natural_pdf/utils/packaging.py +172 -105
  28. natural_pdf/utils/text_extraction.py +44 -64
  29. natural_pdf/utils/visualization.py +1 -1
  30. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
  31. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
  32. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
  33. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
  34. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py CHANGED
@@ -33,6 +33,7 @@ def configure_logging(level=logging.INFO, handler=None):
33
33
 
34
34
  logger.propagate = False
35
35
 
36
+
36
37
  from natural_pdf.core.page import Page
37
38
  from natural_pdf.core.pdf import PDF
38
39
  from natural_pdf.elements.collections import ElementCollection
natural_pdf/analyzers/layout/gemini.py CHANGED
@@ -13,6 +13,7 @@ from PIL import Image
13
13
  try:
14
14
  from openai import OpenAI
15
15
  from openai.types.chat import ChatCompletion
16
+
16
17
  # Import OpenAIError for exception handling if needed
17
18
  except ImportError:
18
19
  OpenAI = None
@@ -32,7 +33,7 @@ except ImportError:
32
33
  class LayoutDetector:
33
34
  def __init__(self):
34
35
  self.logger = logging.getLogger()
35
- self.supported_classes = set() # Will be dynamic based on user request
36
+ self.supported_classes = set() # Will be dynamic based on user request
36
37
 
37
38
  def _get_model(self, options):
38
39
  raise NotImplementedError
@@ -41,17 +42,20 @@ except ImportError:
41
42
  return n.lower().replace("_", "-").replace(" ", "-")
42
43
 
43
44
  def validate_classes(self, c):
44
- pass # Less strict validation needed for LLM
45
+ pass # Less strict validation needed for LLM
45
46
 
46
47
  logging.basicConfig()
47
48
 
48
49
  logger = logging.getLogger(__name__)
49
50
 
51
+
50
52
  # Define Pydantic model for the expected output structure
51
53
  # This is used by the openai library's `response_format`
52
54
  class DetectedRegion(BaseModel):
53
55
  label: str = Field(description="The identified class name.")
54
- bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4)
56
+ bbox: List[float] = Field(
57
+ description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4
58
+ )
55
59
  confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
56
60
 
57
61
 
@@ -63,23 +67,27 @@ class GeminiLayoutDetector(LayoutDetector):
63
67
 
64
68
  def __init__(self):
65
69
  super().__init__()
66
- self.supported_classes = set() # Indicate dynamic nature
70
+ self.supported_classes = set() # Indicate dynamic nature
67
71
 
68
72
  def is_available(self) -> bool:
69
73
  """Check if openai library is installed and GOOGLE_API_KEY is available."""
70
74
  api_key = os.environ.get("GOOGLE_API_KEY")
71
75
  if not api_key:
72
- logger.warning("GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.")
76
+ logger.warning(
77
+ "GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available."
78
+ )
73
79
  return False
74
80
  if OpenAI is None:
75
- logger.warning("openai package not found. Gemini detector (via OpenAI lib) will not be available.")
76
- return False
81
+ logger.warning(
82
+ "openai package not found. Gemini detector (via OpenAI lib) will not be available."
83
+ )
84
+ return False
77
85
  return True
78
86
 
79
87
  def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
80
88
  """Generate cache key based on model name."""
81
89
  if not isinstance(options, GeminiLayoutOptions):
82
- options = GeminiLayoutOptions() # Use defaults
90
+ options = GeminiLayoutOptions() # Use defaults
83
91
 
84
92
  model_key = options.model_name
85
93
  # Prompt is built dynamically, so not part of cache key based on options
@@ -101,9 +109,7 @@ class GeminiLayoutDetector(LayoutDetector):
101
109
  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
102
110
  """Detect layout elements in an image using Gemini via OpenAI library."""
103
111
  if not self.is_available():
104
- raise RuntimeError(
105
- "OpenAI library not installed or GOOGLE_API_KEY not set."
106
- )
112
+ raise RuntimeError("OpenAI library not installed or GOOGLE_API_KEY not set.")
107
113
 
108
114
  # Ensure options are the correct type
109
115
  if not isinstance(options, GeminiLayoutOptions):
@@ -124,10 +130,7 @@ class GeminiLayoutDetector(LayoutDetector):
124
130
  detections = []
125
131
  try:
126
132
  # --- 1. Initialize OpenAI Client for Gemini ---
127
- client = OpenAI(
128
- api_key=api_key,
129
- base_url=self.GEMINI_BASE_URL
130
- )
133
+ client = OpenAI(api_key=api_key, base_url=self.GEMINI_BASE_URL)
131
134
 
132
135
  # --- 2. Prepare Input for OpenAI API ---
133
136
  if not options.classes:
@@ -139,11 +142,11 @@ class GeminiLayoutDetector(LayoutDetector):
139
142
  # Convert image to base64
140
143
  buffered = io.BytesIO()
141
144
  image.save(buffered, format="PNG")
142
- img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
145
+ img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
143
146
  image_url = f"data:image/png;base64,{img_base64}"
144
147
 
145
148
  # Construct the prompt text
146
- class_list_str = ", ".join(f'`{c}`' for c in options.classes)
149
+ class_list_str = ", ".join(f"`{c}`" for c in options.classes)
147
150
  prompt_text = (
148
151
  f"Analyze the provided image of a document page ({width}x{height}). "
149
152
  f"Identify all regions corresponding to the following types: {class_list_str}. "
@@ -165,14 +168,18 @@ class GeminiLayoutDetector(LayoutDetector):
165
168
  ]
166
169
 
167
170
  # --- 3. Call OpenAI API using .parse for structured output ---
168
- logger.debug(f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}")
171
+ logger.debug(
172
+ f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}"
173
+ )
169
174
 
170
175
  # Extract relevant generation parameters from extra_args if provided
171
176
  # Mapping common names: temperature, top_p, max_tokens
172
177
  completion_kwargs = {
173
- "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
178
+ "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
174
179
  "top_p": options.extra_args.get("top_p"),
175
- "max_tokens": options.extra_args.get("max_tokens", 4096), # Map from max_output_tokens
180
+ "max_tokens": options.extra_args.get(
181
+ "max_tokens", 4096
182
+ ), # Map from max_output_tokens
176
183
  }
177
184
  # Filter out None values
178
185
  completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
@@ -180,13 +187,13 @@ class GeminiLayoutDetector(LayoutDetector):
180
187
  completion: ChatCompletion = client.beta.chat.completions.parse(
181
188
  model=model_name,
182
189
  messages=messages,
183
- response_format=List[DetectedRegion], # Pass the Pydantic model list
184
- **completion_kwargs
190
+ response_format=List[DetectedRegion], # Pass the Pydantic model list
191
+ **completion_kwargs,
185
192
  )
186
193
 
187
194
  logger.debug(f"Gemini response received via OpenAI lib.")
188
195
 
189
- # --- 4. Process Parsed Response ---
196
+ # --- 4. Process Parsed Response ---
190
197
  if not completion.choices:
191
198
  logger.error("Gemini response (via OpenAI lib) contained no choices.")
192
199
  return []
@@ -194,16 +201,18 @@ class GeminiLayoutDetector(LayoutDetector):
194
201
  # Get the parsed Pydantic objects
195
202
  parsed_results = completion.choices[0].message.parsed
196
203
  if not parsed_results or not isinstance(parsed_results, list):
197
- logger.error(f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}")
198
- return []
204
+ logger.error(
205
+ f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
206
+ )
207
+ return []
199
208
 
200
- # --- 5. Convert to Detections & Filter ---
201
- normalized_classes_req = {
202
- self._normalize_class_name(c) for c in options.classes
203
- }
204
- normalized_classes_excl = {
205
- self._normalize_class_name(c) for c in options.exclude_classes
206
- } if options.exclude_classes else set()
209
+ # --- 5. Convert to Detections & Filter ---
210
+ normalized_classes_req = {self._normalize_class_name(c) for c in options.classes}
211
+ normalized_classes_excl = (
212
+ {self._normalize_class_name(c) for c in options.exclude_classes}
213
+ if options.exclude_classes
214
+ else set()
215
+ )
207
216
 
208
217
  for item in parsed_results:
209
218
  # The item is already a validated DetectedRegion Pydantic object
@@ -215,33 +224,41 @@ class GeminiLayoutDetector(LayoutDetector):
215
224
  # Coordinates should already be floats, but ensure tuple format
216
225
  xmin, ymin, xmax, ymax = tuple(bbox_raw)
217
226
 
218
- # --- Apply Filtering ---
227
+ # --- Apply Filtering ---
219
228
  normalized_class = self._normalize_class_name(label)
220
229
 
221
230
  # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
222
231
  if normalized_class not in normalized_classes_req:
223
- logger.warning(f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping.")
232
+ logger.warning(
233
+ f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
234
+ )
224
235
  continue
225
236
 
226
237
  # Check against excluded classes
227
238
  if normalized_class in normalized_classes_excl:
228
- logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
239
+ logger.debug(
240
+ f"Skipping excluded class '{label}' (normalized: {normalized_class})."
241
+ )
229
242
  continue
230
-
243
+
231
244
  # Check against base confidence threshold from options
232
245
  if confidence_score < options.confidence:
233
- logger.debug(f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}.")
246
+ logger.debug(
247
+ f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}."
248
+ )
234
249
  continue
235
250
 
236
251
  # Add detection
237
- detections.append({
238
- "bbox": (xmin, ymin, xmax, ymax),
239
- "class": label, # Use original label from LLM
240
- "confidence": confidence_score,
241
- "normalized_class": normalized_class,
242
- "source": "layout",
243
- "model": "gemini", # Keep model name generic as gemini
244
- })
252
+ detections.append(
253
+ {
254
+ "bbox": (xmin, ymin, xmax, ymax),
255
+ "class": label, # Use original label from LLM
256
+ "confidence": confidence_score,
257
+ "normalized_class": normalized_class,
258
+ "source": "layout",
259
+ "model": "gemini", # Keep model name generic as gemini
260
+ }
261
+ )
245
262
 
246
263
  self.logger.info(
247
264
  f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
@@ -260,5 +277,4 @@ class GeminiLayoutDetector(LayoutDetector):
260
277
 
261
278
  def validate_classes(self, classes: List[str]):
262
279
  """Validation is less critical as we pass requested classes to the LLM."""
263
- pass # Override base validation if needed, but likely not necessary
264
-
280
+ pass # Override base validation if needed, but likely not necessary
natural_pdf/collections/pdf_collection.py CHANGED
@@ -279,14 +279,17 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
279
279
  """
280
280
  try:
281
281
  from natural_pdf.utils.packaging import create_correction_task_package
282
+
282
283
  # Pass the collection itself (self) as the source
283
284
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
284
285
  except ImportError:
285
- logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
286
+ logger.error(
287
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
288
+ )
286
289
  # Or raise
287
290
  except Exception as e:
288
291
  logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
289
- raise # Re-raise the exception from the utility function
292
+ raise # Re-raise the exception from the utility function
290
293
 
291
294
  # --- Mixin Required Implementation ---
292
295
  def get_indexable_items(self) -> Iterable[Indexable]:
natural_pdf/core/element_manager.py CHANGED
@@ -359,8 +359,10 @@ class ElementManager:
359
359
 
360
360
  # Handle potential None confidence
361
361
  raw_confidence = result.get("confidence")
362
- confidence_value = float(raw_confidence) if raw_confidence is not None else None # Keep None if it was None
363
- ocr_text = result.get("text") # Get text, will be None if detect_only
362
+ confidence_value = (
363
+ float(raw_confidence) if raw_confidence is not None else None
364
+ ) # Keep None if it was None
365
+ ocr_text = result.get("text") # Get text, will be None if detect_only
364
366
 
365
367
  # Create the TextElement for the word
366
368
  word_element_data = {
@@ -373,7 +375,7 @@ class ElementManager:
373
375
  "height": pdf_height,
374
376
  "object_type": "word", # Treat OCR results as whole words
375
377
  "source": "ocr",
376
- "confidence": confidence_value, # Use the handled confidence
378
+ "confidence": confidence_value, # Use the handled confidence
377
379
  "fontname": "OCR", # Use consistent OCR fontname
378
380
  "size": (
379
381
  round(pdf_height) if pdf_height > 0 else 10.0
@@ -391,7 +393,7 @@ class ElementManager:
391
393
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
392
394
 
393
395
  # Add the char dict list to the word data before creating TextElement
394
- word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
396
+ word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
395
397
 
396
398
  word_elem = TextElement(word_element_data, self._page)
397
399
  added_word_elements.append(word_elem)
natural_pdf/core/page.py CHANGED
@@ -1233,7 +1233,7 @@ class Page:
1233
1233
  render_ocr: bool = False,
1234
1234
  resolution: Optional[float] = None,
1235
1235
  include_highlights: bool = True,
1236
- exclusions: Optional[str] = None, # New parameter
1236
+ exclusions: Optional[str] = None, # New parameter
1237
1237
  **kwargs,
1238
1238
  ) -> Optional[Image.Image]:
1239
1239
  """
@@ -1262,11 +1262,11 @@ class Page:
1262
1262
  # Delegate rendering to the central service
1263
1263
  image = self._highlighter.render_page(
1264
1264
  page_index=self.index,
1265
- scale=scale, # Note: scale is used by highlighter internally for drawing
1265
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1266
1266
  labels=labels,
1267
1267
  legend_position=legend_position,
1268
1268
  render_ocr=render_ocr,
1269
- resolution=render_resolution, # Pass the calculated resolution
1269
+ resolution=render_resolution, # Pass the calculated resolution
1270
1270
  **kwargs,
1271
1271
  )
1272
1272
  else:
@@ -1322,16 +1322,21 @@ class Page:
1322
1322
  max(0, img_x0),
1323
1323
  max(0, img_top),
1324
1324
  min(image.width, img_x1),
1325
- min(image.height, img_bottom)
1325
+ min(image.height, img_bottom),
1326
1326
  )
1327
1327
  if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1328
- draw.rectangle(img_coords, fill="white")
1328
+ draw.rectangle(img_coords, fill="white")
1329
1329
  else:
1330
- logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
1330
+ logger.warning(
1331
+ f"Skipping invalid exclusion rect for masking: {img_coords}"
1332
+ )
1331
1333
 
1332
- del draw # Release drawing context
1334
+ del draw # Release drawing context
1333
1335
  except Exception as mask_error:
1334
- logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
1336
+ logger.error(
1337
+ f"Error applying exclusion mask to page {self.index}: {mask_error}",
1338
+ exc_info=True,
1339
+ )
1335
1340
  # Decide if you want to return None or continue without mask
1336
1341
  # For now, continue without mask
1337
1342
 
@@ -1398,7 +1403,7 @@ class Page:
1398
1403
  """
1399
1404
  if not hasattr(self._parent, "apply_ocr"):
1400
1405
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1401
- return [] # Return empty list for consistency
1406
+ return [] # Return empty list for consistency
1402
1407
 
1403
1408
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1404
1409
  try:
@@ -1459,11 +1464,11 @@ class Page:
1459
1464
  return []
1460
1465
 
1461
1466
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1462
-
1467
+
1463
1468
  # Determine rendering resolution
1464
- final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1469
+ final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1465
1470
  logger.debug(f" Using rendering resolution: {final_resolution} DPI")
1466
-
1471
+
1467
1472
  try:
1468
1473
  # Get base image without highlights using the determined resolution
1469
1474
  image = self.to_image(resolution=final_resolution, include_highlights=False)
@@ -1477,12 +1482,12 @@ class Page:
1477
1482
 
1478
1483
  # Prepare arguments for the OCR Manager call
1479
1484
  manager_args = {
1480
- "images": image,
1481
- "engine": engine,
1482
- "languages": languages,
1483
- "min_confidence": min_confidence,
1484
- "device": device,
1485
- "options": options
1485
+ "images": image,
1486
+ "engine": engine,
1487
+ "languages": languages,
1488
+ "min_confidence": min_confidence,
1489
+ "device": device,
1490
+ "options": options,
1486
1491
  }
1487
1492
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
1488
1493
 
@@ -1514,7 +1519,7 @@ class Page:
1514
1519
  scale_x = self.width / image.width if image.width else 1
1515
1520
  scale_y = self.height / image.height if image.height else 1
1516
1521
  for result in results:
1517
- try: # Added try-except around result processing
1522
+ try: # Added try-except around result processing
1518
1523
  x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1519
1524
  elem_data = {
1520
1525
  "text": result["text"],
@@ -1525,15 +1530,17 @@ class Page:
1525
1530
  "bottom": bottom * scale_y,
1526
1531
  "width": (x1 - x0) * scale_x,
1527
1532
  "height": (bottom - top) * scale_y,
1528
- "object_type": "text", # Using text for temporary elements
1533
+ "object_type": "text", # Using text for temporary elements
1529
1534
  "source": "ocr",
1530
- "fontname": "OCR-extract", # Different name for clarity
1535
+ "fontname": "OCR-extract", # Different name for clarity
1531
1536
  "size": 10.0,
1532
1537
  "page_number": self.number,
1533
1538
  }
1534
1539
  temp_elements.append(TextElement(elem_data, self))
1535
1540
  except (KeyError, ValueError, TypeError) as convert_err:
1536
- logger.warning(f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
1541
+ logger.warning(
1542
+ f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
1543
+ )
1537
1544
 
1538
1545
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1539
1546
  return temp_elements
@@ -2020,7 +2027,7 @@ class Page:
2020
2027
  def correct_ocr(
2021
2028
  self,
2022
2029
  correction_callback: Callable[[Any], Optional[str]],
2023
- ) -> "Page": # Return self for chaining
2030
+ ) -> "Page": # Return self for chaining
2024
2031
  """
2025
2032
  Applies corrections to OCR-generated text elements on this page
2026
2033
  using a user-provided callback function.
@@ -2044,7 +2051,9 @@ class Page:
2044
2051
  Returns:
2045
2052
  Self for method chaining.
2046
2053
  """
2047
- logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
2054
+ logger.info(
2055
+ f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
2056
+ )
2048
2057
 
2049
2058
  # Find OCR elements specifically on this page
2050
2059
  # Note: We typically want to correct even if the element falls in an excluded area
@@ -2052,9 +2061,9 @@ class Page:
2052
2061
 
2053
2062
  # Delegate to the utility function
2054
2063
  _apply_ocr_correction_to_elements(
2055
- elements=target_elements, # Pass the ElementCollection directly
2064
+ elements=target_elements, # Pass the ElementCollection directly
2056
2065
  correction_callback=correction_callback,
2057
- caller_info=f"Page({self.number})", # Pass caller info
2066
+ caller_info=f"Page({self.number})", # Pass caller info
2058
2067
  )
2059
2068
 
2060
- return self # Return self for chaining
2069
+ return self # Return self for chaining
natural_pdf/core/pdf.py CHANGED
@@ -239,13 +239,13 @@ class PDF:
239
239
  engine: Optional[str] = None,
240
240
  # --- Common OCR Parameters (Direct Arguments) ---
241
241
  languages: Optional[List[str]] = None,
242
- min_confidence: Optional[float] = None, # Min confidence threshold
242
+ min_confidence: Optional[float] = None, # Min confidence threshold
243
243
  device: Optional[str] = None,
244
- resolution: Optional[int] = None, # DPI for rendering before OCR
245
- apply_exclusions: bool = True, # New parameter
244
+ resolution: Optional[int] = None, # DPI for rendering before OCR
245
+ apply_exclusions: bool = True, # New parameter
246
246
  detect_only: bool = False,
247
247
  # --- Engine-Specific Options --- Use 'options=' for this
248
- options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
248
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
249
  # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
250
250
  ) -> "PDF":
251
251
  """
@@ -314,7 +314,7 @@ class PDF:
314
314
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
315
  # --- Determine Rendering Resolution ---
316
316
  # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
- final_resolution = resolution # Use direct arg if provided
317
+ final_resolution = resolution # Use direct arg if provided
318
318
  if final_resolution is None:
319
319
  final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
320
 
@@ -323,7 +323,9 @@ class PDF:
323
323
  # --- Render Images for Batch ---
324
324
  images_pil: List[Image.Image] = []
325
325
  page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
326
- logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
326
+ logger.info(
327
+ f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
328
+ )
327
329
  failed_page_num = "unknown" # Keep track of potentially failing page
328
330
  try:
329
331
  for i, page in enumerate(target_pages):
@@ -339,7 +341,7 @@ class PDF:
339
341
  if img is None:
340
342
  logger.error(f" Failed to render page {page.number} to image.")
341
343
  # Decide how to handle: skip page, raise error? For now, skip.
342
- continue # Skip this page if rendering failed
344
+ continue # Skip this page if rendering failed
343
345
  images_pil.append(img)
344
346
  page_image_map.append((page, img)) # Store pair
345
347
  except Exception as e:
@@ -356,7 +358,7 @@ class PDF:
356
358
  "images": images_pil,
357
359
  "engine": engine,
358
360
  "languages": languages,
359
- "min_confidence": min_confidence, # Use the renamed parameter
361
+ "min_confidence": min_confidence, # Use the renamed parameter
360
362
  "device": device,
361
363
  "options": options,
362
364
  "detect_only": detect_only,
@@ -366,7 +368,9 @@ class PDF:
366
368
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
367
369
 
368
370
  # --- Call OCR Manager for Batch Processing ---
369
- logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
371
+ logger.info(
372
+ f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
373
+ )
370
374
  try:
371
375
  # Manager's apply_ocr signature needs to accept common args directly
372
376
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
@@ -948,19 +952,22 @@ class PDF:
948
952
  """
949
953
  try:
950
954
  from natural_pdf.utils.packaging import create_correction_task_package
955
+
951
956
  create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
952
957
  except ImportError:
953
- logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
958
+ logger.error(
959
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
960
+ )
954
961
  # Or raise
955
962
  except Exception as e:
956
963
  logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
957
- raise # Re-raise the exception from the utility function
964
+ raise # Re-raise the exception from the utility function
958
965
 
959
966
  def correct_ocr(
960
967
  self,
961
968
  correction_callback: Callable[[Any], Optional[str]],
962
969
  pages: Optional[Union[Iterable[int], range, slice]] = None,
963
- ) -> "PDF": # Return self for chaining
970
+ ) -> "PDF": # Return self for chaining
964
971
  """
965
972
  Applies corrections to OCR-generated text elements using a callback function,
966
973
  delegating the core work to the `Page.correct_ocr` method.
@@ -989,7 +996,9 @@ class PDF:
989
996
  if not (0 <= idx < len(self._pages)):
990
997
  raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
991
998
  except (IndexError, TypeError, ValueError) as e:
992
- raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
999
+ raise ValueError(
1000
+ f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
1001
+ ) from e
993
1002
  else:
994
1003
  raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
995
1004
 
@@ -997,7 +1006,9 @@ class PDF:
997
1006
  logger.warning("No pages selected for OCR correction.")
998
1007
  return self
999
1008
 
1000
- logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
1009
+ logger.info(
1010
+ f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
1011
+ )
1001
1012
 
1002
1013
  # Iterate through target pages and call their correct_ocr method
1003
1014
  for page_idx in target_page_indices:
@@ -1071,8 +1082,6 @@ class PDF:
1071
1082
  """Context manager exit."""
1072
1083
  self.close()
1073
1084
 
1074
-
1075
1085
  # --- Indexable Protocol Methods --- Needed for search/sync
1076
1086
  def get_id(self) -> str:
1077
1087
  return self.path
1078
-
natural_pdf/elements/base.py CHANGED
@@ -306,9 +306,7 @@ class DirectionalMixin:
306
306
  **kwargs,
307
307
  )
308
308
 
309
- def to_region(
310
- self
311
- ):
309
+ def to_region(self):
312
310
  return self.expand()
313
311
 
314
312
  def expand(
natural_pdf/elements/collections.py CHANGED
@@ -21,7 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
21
21
  from natural_pdf.elements.text import TextElement # Needed for isinstance check
22
22
  from natural_pdf.ocr import OCROptions
23
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
24
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
@@ -1151,9 +1151,9 @@ class ElementCollection(Generic[T]):
1151
1151
  _apply_ocr_correction_to_elements(
1152
1152
  elements=self._elements,
1153
1153
  correction_callback=correction_callback,
1154
- caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1154
+ caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1155
1155
  )
1156
- return self # Return self for chaining
1156
+ return self # Return self for chaining
1157
1157
 
1158
1158
 
1159
1159
  class PageCollection(Generic[P]):
@@ -1217,12 +1217,12 @@ class PageCollection(Generic[P]):
1217
1217
  engine: Optional[str] = None,
1218
1218
  # --- Common OCR Parameters (Direct Arguments) ---
1219
1219
  languages: Optional[List[str]] = None,
1220
- min_confidence: Optional[float] = None, # Min confidence threshold
1220
+ min_confidence: Optional[float] = None, # Min confidence threshold
1221
1221
  device: Optional[str] = None,
1222
- resolution: Optional[int] = None, # DPI for rendering
1223
- apply_exclusions: bool = True, # New parameter
1222
+ resolution: Optional[int] = None, # DPI for rendering
1223
+ apply_exclusions: bool = True, # New parameter
1224
1224
  # --- Engine-Specific Options ---
1225
- options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1225
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1226
1226
  ) -> "PageCollection[P]":
1227
1227
  """
1228
1228
  Applies OCR to all pages within this collection using batch processing.
@@ -1273,10 +1273,10 @@ class PageCollection(Generic[P]):
1273
1273
  pages=page_indices,
1274
1274
  engine=engine,
1275
1275
  languages=languages,
1276
- min_confidence=min_confidence, # Pass the renamed parameter
1276
+ min_confidence=min_confidence, # Pass the renamed parameter
1277
1277
  device=device,
1278
1278
  resolution=resolution,
1279
- apply_exclusions=apply_exclusions, # Pass down
1279
+ apply_exclusions=apply_exclusions, # Pass down
1280
1280
  options=options,
1281
1281
  )
1282
1282
  # The PDF method modifies the Page objects directly by adding elements.
@@ -1351,13 +1351,12 @@ class PageCollection(Generic[P]):
1351
1351
  parent_pdf = self.pages[0]._parent
1352
1352
 
1353
1353
  page_indices = [p.index for p in self.pages]
1354
- logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
1354
+ logger.info(
1355
+ f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
1356
+ )
1355
1357
 
1356
1358
  # Delegate the call to the parent PDF object for the relevant pages
1357
- parent_pdf.correct_ocr(
1358
- correction_callback=correction_callback,
1359
- pages=page_indices
1360
- )
1359
+ parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
1361
1360
 
1362
1361
  return self
1363
1362