natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -306,17 +306,17 @@ class DirectionalMixin:
306
306
  **kwargs,
307
307
  )
308
308
 
309
+ def to_region(self):
310
+ return self.expand()
311
+
309
312
  def expand(
310
313
  self,
311
314
  left: float = 0,
312
315
  right: float = 0,
313
- top_expand: float = 0, # Renamed to avoid conflict
314
- bottom_expand: float = 0, # Renamed to avoid conflict
316
+ top: float = 0,
317
+ bottom: float = 0,
315
318
  width_factor: float = 1.0,
316
319
  height_factor: float = 1.0,
317
- # Keep original parameter names for backward compatibility
318
- top: float = None,
319
- bottom: float = None,
320
320
  ) -> "Region":
321
321
  """
322
322
  Create a new region expanded from this element/region.
@@ -324,12 +324,10 @@ class DirectionalMixin:
324
324
  Args:
325
325
  left: Amount to expand left edge (positive value expands leftwards)
326
326
  right: Amount to expand right edge (positive value expands rightwards)
327
- top_expand: Amount to expand top edge (positive value expands upwards)
328
- bottom_expand: Amount to expand bottom edge (positive value expands downwards)
327
+ top: Amount to expand top edge (positive value expands upwards)
328
+ bottom: Amount to expand bottom edge (positive value expands downwards)
329
329
  width_factor: Factor to multiply width by (applied after absolute expansion)
330
330
  height_factor: Factor to multiply height by (applied after absolute expansion)
331
- top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
332
- bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
333
331
 
334
332
  Returns:
335
333
  New expanded Region object
@@ -340,17 +338,11 @@ class DirectionalMixin:
340
338
  new_top = self.top
341
339
  new_bottom = self.bottom
342
340
 
343
- # Handle the deprecated parameter names for backward compatibility
344
- if top is not None:
345
- top_expand = top
346
- if bottom is not None:
347
- bottom_expand = bottom
348
-
349
341
  # Apply absolute expansions first
350
342
  new_x0 -= left
351
343
  new_x1 += right
352
- new_top -= top_expand # Expand upward (decrease top coordinate)
353
- new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
344
+ new_top -= top # Expand upward (decrease top coordinate)
345
+ new_bottom += bottom # Expand downward (increase bottom coordinate)
354
346
 
355
347
  # Apply percentage factors if provided
356
348
  if width_factor != 1.0 or height_factor != 1.0:
@@ -21,6 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
21
21
  from natural_pdf.elements.text import TextElement # Needed for isinstance check
22
22
  from natural_pdf.ocr import OCROptions
23
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
24
25
 
25
26
  logger = logging.getLogger(__name__)
26
27
 
@@ -1118,6 +1119,42 @@ class ElementCollection(Generic[T]):
1118
1119
  results = self.find_all(selector, regex=regex, case=case, **kwargs)
1119
1120
  return results.first
1120
1121
 
1122
+ def correct_ocr(
1123
+ self,
1124
+ correction_callback: Callable[[Any], Optional[str]],
1125
+ ) -> "ElementCollection":
1126
+ """
1127
+ Applies corrections to OCR-generated text elements within this collection
1128
+ using a user-provided callback function.
1129
+
1130
+ Iterates through elements currently in the collection. If an element's
1131
+ 'source' attribute starts with 'ocr', it calls the `correction_callback`
1132
+ for that element, passing the element itself.
1133
+
1134
+ The `correction_callback` should contain the logic to:
1135
+ 1. Determine if the element needs correction.
1136
+ 2. Perform the correction (e.g., call an LLM).
1137
+ 3. Return the new text (`str`) or `None`.
1138
+
1139
+ If the callback returns a string, the element's `.text` is updated in place.
1140
+ Metadata updates (source, confidence, etc.) should happen within the callback.
1141
+ Elements without a source starting with 'ocr' are skipped.
1142
+
1143
+ Args:
1144
+ correction_callback: A function accepting an element and returning
1145
+ `Optional[str]` (new text or None).
1146
+
1147
+ Returns:
1148
+ Self for method chaining.
1149
+ """
1150
+ # Delegate to the utility function
1151
+ _apply_ocr_correction_to_elements(
1152
+ elements=self._elements,
1153
+ correction_callback=correction_callback,
1154
+ caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1155
+ )
1156
+ return self # Return self for chaining
1157
+
1121
1158
 
1122
1159
  class PageCollection(Generic[P]):
1123
1160
  """
@@ -1178,33 +1215,38 @@ class PageCollection(Generic[P]):
1178
1215
  def apply_ocr(
1179
1216
  self,
1180
1217
  engine: Optional[str] = None,
1181
- options: Optional[OCROptions] = None,
1218
+ # --- Common OCR Parameters (Direct Arguments) ---
1182
1219
  languages: Optional[List[str]] = None,
1183
- min_confidence: Optional[float] = None,
1220
+ min_confidence: Optional[float] = None, # Min confidence threshold
1184
1221
  device: Optional[str] = None,
1222
+ resolution: Optional[int] = None, # DPI for rendering
1223
+ apply_exclusions: bool = True, # New parameter
1224
+ # --- Engine-Specific Options ---
1225
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1185
1226
  ) -> "PageCollection[P]":
1186
1227
  """
1187
1228
  Applies OCR to all pages within this collection using batch processing.
1188
1229
 
1189
- This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
1190
- to the respective Page objects within this collection.
1230
+ This delegates the work to the parent PDF object's `apply_ocr` method.
1191
1231
 
1192
1232
  Args:
1193
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
1194
- Uses manager's default if None. Ignored if 'options' is provided.
1195
- options: An specific Options object (e.g., EasyOCROptions) for
1196
- advanced configuration. Overrides simple arguments.
1197
- languages: List of language codes for simple mode.
1198
- min_confidence: Minimum confidence threshold for simple mode.
1199
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
1233
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
1234
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
1235
+ **Must be codes understood by the specific selected engine.**
1236
+ No mapping is performed.
1237
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
1238
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
1239
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
1240
+ apply_exclusions: If True (default), render page images for OCR with
1241
+ excluded areas masked (whited out). If False, OCR
1242
+ the raw page images without masking exclusions.
1243
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict.
1200
1244
 
1201
1245
  Returns:
1202
1246
  Self for method chaining.
1203
1247
 
1204
1248
  Raises:
1205
- RuntimeError: If pages in the collection lack a parent PDF object
1206
- or if the parent PDF object lacks the required
1207
- `apply_ocr` method.
1249
+ RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
1208
1250
  (Propagates exceptions from PDF.apply_ocr)
1209
1251
  """
1210
1252
  if not self.pages:
@@ -1218,7 +1260,6 @@ class PageCollection(Generic[P]):
1218
1260
 
1219
1261
  parent_pdf = first_page._parent
1220
1262
 
1221
- # Updated check for renamed method
1222
1263
  if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
1223
1264
  raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
1224
1265
 
@@ -1227,15 +1268,16 @@ class PageCollection(Generic[P]):
1227
1268
 
1228
1269
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
1229
1270
 
1230
- # Delegate the batch call to the parent PDF object (using renamed method)
1271
+ # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
1231
1272
  parent_pdf.apply_ocr(
1232
1273
  pages=page_indices,
1233
1274
  engine=engine,
1234
- options=options,
1235
1275
  languages=languages,
1236
- min_confidence=min_confidence,
1276
+ min_confidence=min_confidence, # Pass the renamed parameter
1237
1277
  device=device,
1238
- # Pass any other relevant simple_kwargs here if added
1278
+ resolution=resolution,
1279
+ apply_exclusions=apply_exclusions, # Pass down
1280
+ options=options,
1239
1281
  )
1240
1282
  # The PDF method modifies the Page objects directly by adding elements.
1241
1283
 
@@ -1279,25 +1321,44 @@ class PageCollection(Generic[P]):
1279
1321
 
1280
1322
  return ElementCollection(all_elements)
1281
1323
 
1282
- # def debug_ocr(self, output_path):
1283
- # """
1284
- # Generate an interactive HTML debug report for OCR results.
1285
-
1286
- # This creates a single-file HTML report with:
1287
- # - Side-by-side view of image regions and OCR text
1288
- # - Confidence scores with color coding
1289
- # - Editable correction fields
1290
- # - Filtering and sorting options
1291
- # - Export functionality for corrected text
1292
-
1293
- # Args:
1294
- # output_path: Path to save the HTML report
1295
-
1296
- # Returns:
1297
- # Path to the generated HTML file
1298
- # """
1299
- # from natural_pdf.utils.ocr import debug_ocr_to_html
1300
- # return debug_ocr_to_html(self.pages, output_path)
1324
+ def correct_ocr(
1325
+ self,
1326
+ correction_callback: Callable[[Any], Optional[str]],
1327
+ ) -> "PageCollection[P]":
1328
+ """
1329
+ Applies corrections to OCR-generated text elements across all pages
1330
+ in this collection using a user-provided callback function.
1331
+
1332
+ This method delegates to the parent PDF's `correct_ocr` method,
1333
+ targeting all pages within this collection.
1334
+
1335
+ Args:
1336
+ correction_callback: A function that accepts a single argument (an element
1337
+ object) and returns `Optional[str]` (new text or None).
1338
+
1339
+ Returns:
1340
+ Self for method chaining. No statistics dictionary is returned: the
1341
+ result of the parent PDF's `correct_ocr` call is discarded.
1342
+
1343
+ Raises:
1344
+ NOTE(review): no RuntimeError is raised in this method body. An empty collection
1345
+ only logs a warning and then indexes `self.pages[0]`; the parent is not validated.
1346
+ """
1347
+ if not self.pages:
1348
+ logger.warning("Cannot correct OCR for an empty PageCollection.")
1349
+
1350
+ # Assume all pages share the same parent PDF object
1351
+ parent_pdf = self.pages[0]._parent
1352
+
1353
+ page_indices = [p.index for p in self.pages]
1354
+ logger.info(
1355
+ f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
1356
+ )
1357
+
1358
+ # Delegate the call to the parent PDF object for the relevant pages
1359
+ parent_pdf.correct_ocr(correction_callback=correction_callback, pages=page_indices)
1360
+
1361
+ return self
1301
1362
 
1302
1363
  def get_sections(
1303
1364
  self,
@@ -11,6 +11,8 @@ from natural_pdf.elements.base import DirectionalMixin
11
11
  # Import new utils
12
12
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
13
 
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
+
14
16
  if TYPE_CHECKING:
15
17
  from natural_pdf.core.page import Page
16
18
  from natural_pdf.elements.text import TextElement
@@ -1082,12 +1084,18 @@ class Region(DirectionalMixin):
1082
1084
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1083
1085
  return ElementCollection(filtered_elements)
1084
1086
 
1085
- def apply_ocr(self, **ocr_params) -> List["TextElement"]: # Return type hint updated
1087
+ def apply_ocr(self, **ocr_params) -> "Region":
1086
1088
  """
1087
1089
  Apply OCR to this region and add the created text elements to its page.
1088
1090
 
1089
1091
  Args:
1090
- **ocr_params: OCR parameters to override defaults (passed to OCRManager)
1092
+ **ocr_params: Keyword arguments passed to the OCR Manager.
1093
+ Common parameters like `engine`, `languages`, `min_confidence`,
1094
+ `device`, and `resolution` (for image rendering) should be
1095
+ provided here. **The `languages` list must contain codes
1096
+ understood by the specific engine selected.** No mapping
1097
+ is performed. Engine-specific settings can be passed in
1098
+ an `options` object (e.g., `options=EasyOCROptions(...)`).
1091
1099
 
1092
1100
  Returns:
1093
1101
  This Region (for method chaining); the error paths return an empty list instead.
@@ -1098,20 +1106,20 @@ class Region(DirectionalMixin):
1098
1106
  return []
1099
1107
  ocr_mgr = self.page._parent._ocr_manager
1100
1108
 
1101
- # Get OCR configuration from kwargs or PDF defaults if needed
1102
- # We'll mostly rely on passing ocr_params directly to the manager
1103
- # For rendering, use a reasonable default scale
1104
- ocr_image_scale = self.page._parent._config.get("ocr_image_scale", 2.0)
1105
-
1109
+ # Determine rendering resolution from parameters
1110
+ final_resolution = ocr_params.get("resolution")
1111
+ if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
1112
+ final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
+ elif final_resolution is None:
1114
+ final_resolution = 150
1106
1115
  logger.debug(
1107
- f"Region {self.bbox}: Applying OCR with scale {ocr_image_scale} and params: {ocr_params}"
1116
+ f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
1108
1117
  )
1109
1118
 
1110
- # Render the page region to an image
1119
+ # Render the page region to an image using the determined resolution
1111
1120
  try:
1112
- # Crop the page image to this region's bbox
1113
1121
  region_image = self.to_image(
1114
- scale=ocr_image_scale, include_highlights=False, crop_only=True
1122
+ resolution=final_resolution, include_highlights=False, crop_only=True
1115
1123
  )
1116
1124
  if not region_image:
1117
1125
  logger.error("Failed to render region to image for OCR.")
@@ -1121,12 +1129,21 @@ class Region(DirectionalMixin):
1121
1129
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1122
1130
  return []
1123
1131
 
1132
+ # Prepare args for the OCR Manager
1133
+ manager_args = {
1134
+ "images": region_image,
1135
+ "engine": ocr_params.get("engine"),
1136
+ "languages": ocr_params.get("languages"),
1137
+ "min_confidence": ocr_params.get("min_confidence"),
1138
+ "device": ocr_params.get("device"),
1139
+ "options": ocr_params.get("options"),
1140
+ "detect_only": ocr_params.get("detect_only"),
1141
+ }
1142
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1143
+
1124
1144
  # Run OCR on this region's image using the manager
1125
1145
  try:
1126
- # Pass the single image and any specific options/kwargs
1127
- # The manager handles engine selection based on ocr_params or defaults
1128
- results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
1129
- # apply_ocr returns List[Dict] for single image
1146
+ results = ocr_mgr.apply_ocr(**manager_args)
1130
1147
  if not isinstance(results, list):
1131
1148
  logger.error(
1132
1149
  f"OCRManager returned unexpected type for single region image: {type(results)}"
@@ -1137,25 +1154,19 @@ class Region(DirectionalMixin):
1137
1154
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1138
1155
  return []
1139
1156
 
1140
- # Convert results to TextElements, scaling coordinates relative to the page
1141
- # Calculate scaling factors based on the region image vs the region PDF coords
1157
+ # Convert results to TextElements
1142
1158
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
1143
1159
  scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
1144
1160
  logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
1145
-
1146
1161
  created_elements = []
1147
1162
  for result in results:
1148
1163
  try:
1149
1164
  img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
1150
1165
  pdf_height = (img_bottom - img_top) * scale_y
1151
-
1152
- # Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
1153
1166
  page_x0 = self.x0 + (img_x0 * scale_x)
1154
1167
  page_top = self.top + (img_top * scale_y)
1155
1168
  page_x1 = self.x0 + (img_x1 * scale_x)
1156
1169
  page_bottom = self.top + (img_bottom * scale_y)
1157
-
1158
- # Create element data using PAGE coordinates
1159
1170
  element_data = {
1160
1171
  "text": result["text"],
1161
1172
  "x0": page_x0,
@@ -1164,45 +1175,34 @@ class Region(DirectionalMixin):
1164
1175
  "bottom": page_bottom,
1165
1176
  "width": page_x1 - page_x0,
1166
1177
  "height": page_bottom - page_top,
1167
- "object_type": "word", # Treat as word
1178
+ "object_type": "word",
1168
1179
  "source": "ocr",
1169
1180
  "confidence": float(result.get("confidence", 0.0)),
1170
1181
  "fontname": "OCR",
1171
- "size": round(pdf_height) if pdf_height > 0 else 10.0, # Size based on height
1182
+ "size": round(pdf_height) if pdf_height > 0 else 10.0,
1172
1183
  "page_number": self.page.number,
1173
1184
  "bold": False,
1174
1185
  "italic": False,
1175
1186
  "upright": True,
1176
1187
  "doctop": page_top + self.page._page.initial_doctop,
1177
1188
  }
1178
-
1179
- # Create the representative char dict
1180
1189
  ocr_char_dict = element_data.copy()
1181
1190
  ocr_char_dict["object_type"] = "char"
1182
1191
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1183
-
1184
- # Add char dicts to word data
1185
1192
  element_data["_char_dicts"] = [ocr_char_dict]
1186
-
1187
- # Create the TextElement word
1188
- from natural_pdf.elements.text import TextElement # Local import ok here
1193
+ from natural_pdf.elements.text import TextElement
1189
1194
 
1190
1195
  elem = TextElement(element_data, self.page)
1191
1196
  created_elements.append(elem)
1192
-
1193
- # Add the element to the page's element manager
1194
1197
  self.page._element_mgr.add_element(elem, element_type="words")
1195
- # Add the char dict to the manager's char list
1196
1198
  self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
1197
-
1198
1199
  except Exception as e:
1199
1200
  logger.error(
1200
1201
  f"Failed to convert region OCR result to element: {result}. Error: {e}",
1201
1202
  exc_info=True,
1202
1203
  )
1203
-
1204
1204
  logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
1205
- return created_elements
1205
+ return self
1206
1206
 
1207
1207
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
1208
1208
  """
@@ -1689,3 +1689,43 @@ class Region(DirectionalMixin):
1689
1689
  type_info = f" type='{self.region_type}'" if self.region_type else ""
1690
1690
  source_info = f" source='{self.source}'" if self.source else ""
1691
1691
  return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
1692
+
1693
+ def correct_ocr(
1694
+ self,
1695
+ correction_callback: Callable[[Any], Optional[str]],
1696
+ ) -> "Region": # Return self for chaining
1697
+ """
1698
+ Applies corrections to OCR-generated text elements within this region
1699
+ using a user-provided callback function.
1700
+
1701
+ Finds text elements within this region whose 'source' attribute starts
1702
+ with 'ocr' and calls the `correction_callback` for each, passing the
1703
+ element itself.
1704
+
1705
+ The `correction_callback` should contain the logic to:
1706
+ 1. Determine if the element needs correction.
1707
+ 2. Perform the correction (e.g., call an LLM).
1708
+ 3. Return the new text (`str`) or `None`.
1709
+
1710
+ If the callback returns a string, the element's `.text` is updated.
1711
+ Metadata updates (source, confidence, etc.) should happen within the callback.
1712
+
1713
+ Args:
1714
+ correction_callback: A function accepting an element and returning
1715
+ `Optional[str]` (new text or None).
1716
+
1717
+ Returns:
1718
+ Self for method chaining.
1719
+ """
1720
+ # Find OCR elements specifically within this region
1721
+ # Note: We typically want to correct even if the element falls in an excluded area
1722
+ target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
1723
+
1724
+ # Delegate to the utility function
1725
+ _apply_ocr_correction_to_elements(
1726
+ elements=target_elements, # Pass the ElementCollection directly
1727
+ correction_callback=correction_callback,
1728
+ caller_info=f"Region({self.bbox})", # Pass caller info
1729
+ )
1730
+
1731
+ return self # Return self for chaining
@@ -41,6 +41,11 @@ class TextElement(Element):
41
41
  """Get the text content."""
42
42
  return self._obj.get("text", "")
43
43
 
44
+ @text.setter
45
+ def text(self, value: str):
46
+ """Set the text content."""
47
+ self._obj["text"] = value
48
+
44
49
  @property
45
50
  def source(self) -> str:
46
51
  """Get the source of this text element (pdf or ocr)."""
@@ -0,0 +1,4 @@
1
+ from .base import FinetuneExporter
2
+ from .paddleocr import PaddleOCRRecognitionExporter
3
+
4
+ __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -0,0 +1,61 @@
1
+ import abc
2
+ import logging
3
+ from typing import Union, List, TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from natural_pdf.core.pdf import PDF
7
+ from natural_pdf.collections.pdf_collection import PDFCollection
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class FinetuneExporter(abc.ABC):
13
+ """
14
+ Abstract base class for exporting data suitable for fine-tuning models.
15
+ """
16
+
17
+ @abc.abstractmethod
18
+ def __init__(self, **kwargs):
19
+ """
20
+ Initialize the exporter with format-specific options.
21
+ """
22
+ pass
23
+
24
+ @abc.abstractmethod
25
+ def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
26
+ """
27
+ Exports the data from the source PDF(s) to the specified output directory
28
+ in a format suitable for fine-tuning a specific model type.
29
+
30
+ Args:
31
+ source: The PDF object, PDFCollection, or list of PDF objects to process.
32
+ output_dir: The path to the directory where the exported files will be saved.
33
+ **kwargs: Additional export-time arguments.
34
+ """
35
+ pass
36
+
37
+ def _resolve_source_pdfs(
38
+ self, source: Union["PDF", "PDFCollection", List["PDF"]]
39
+ ) -> List["PDF"]:
40
+ """
41
+ Helper to consistently resolve the input source to a list of PDF objects.
42
+ """
43
+ from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
+ from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
45
+
46
+ pdfs_to_process: List["PDF"] = []
47
+ if isinstance(source, PDF):
48
+ pdfs_to_process = [source]
49
+ elif isinstance(source, PDFCollection):
50
+ pdfs_to_process = source.pdfs
51
+ elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
52
+ pdfs_to_process = source
53
+ else:
54
+ raise TypeError(
55
+ f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
56
+ )
57
+
58
+ if not pdfs_to_process:
59
+ logger.warning("No PDF documents provided in the source.")
60
+
61
+ return pdfs_to_process