natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -306,17 +306,19 @@ class DirectionalMixin:
306
306
  **kwargs,
307
307
  )
308
308
 
309
+ def to_region(
310
+ self
311
+ ):
312
+ return self.expand()
313
+
309
314
  def expand(
310
315
  self,
311
316
  left: float = 0,
312
317
  right: float = 0,
313
- top_expand: float = 0, # Renamed to avoid conflict
314
- bottom_expand: float = 0, # Renamed to avoid conflict
318
+ top: float = 0,
319
+ bottom: float = 0,
315
320
  width_factor: float = 1.0,
316
321
  height_factor: float = 1.0,
317
- # Keep original parameter names for backward compatibility
318
- top: float = None,
319
- bottom: float = None,
320
322
  ) -> "Region":
321
323
  """
322
324
  Create a new region expanded from this element/region.
@@ -324,12 +326,10 @@ class DirectionalMixin:
324
326
  Args:
325
327
  left: Amount to expand left edge (positive value expands leftwards)
326
328
  right: Amount to expand right edge (positive value expands rightwards)
327
- top_expand: Amount to expand top edge (positive value expands upwards)
328
- bottom_expand: Amount to expand bottom edge (positive value expands downwards)
329
+ top: Amount to expand top edge (positive value expands upwards)
330
+ bottom: Amount to expand bottom edge (positive value expands downwards)
329
331
  width_factor: Factor to multiply width by (applied after absolute expansion)
330
332
  height_factor: Factor to multiply height by (applied after absolute expansion)
331
- top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
332
- bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
333
333
 
334
334
  Returns:
335
335
  New expanded Region object
@@ -340,17 +340,11 @@ class DirectionalMixin:
340
340
  new_top = self.top
341
341
  new_bottom = self.bottom
342
342
 
343
- # Handle the deprecated parameter names for backward compatibility
344
- if top is not None:
345
- top_expand = top
346
- if bottom is not None:
347
- bottom_expand = bottom
348
-
349
343
  # Apply absolute expansions first
350
344
  new_x0 -= left
351
345
  new_x1 += right
352
- new_top -= top_expand # Expand upward (decrease top coordinate)
353
- new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
346
+ new_top -= top # Expand upward (decrease top coordinate)
347
+ new_bottom += bottom # Expand downward (increase bottom coordinate)
354
348
 
355
349
  # Apply percentage factors if provided
356
350
  if width_factor != 1.0 or height_factor != 1.0:
@@ -21,6 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
21
21
  from natural_pdf.elements.text import TextElement # Needed for isinstance check
22
22
  from natural_pdf.ocr import OCROptions
23
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
24
25
 
25
26
  logger = logging.getLogger(__name__)
26
27
 
@@ -1118,6 +1119,42 @@ class ElementCollection(Generic[T]):
1118
1119
  results = self.find_all(selector, regex=regex, case=case, **kwargs)
1119
1120
  return results.first
1120
1121
 
1122
+ def correct_ocr(
1123
+ self,
1124
+ correction_callback: Callable[[Any], Optional[str]],
1125
+ ) -> "ElementCollection":
1126
+ """
1127
+ Applies corrections to OCR-generated text elements within this collection
1128
+ using a user-provided callback function.
1129
+
1130
+ Iterates through elements currently in the collection. If an element's
1131
+ 'source' attribute starts with 'ocr', it calls the `correction_callback`
1132
+ for that element, passing the element itself.
1133
+
1134
+ The `correction_callback` should contain the logic to:
1135
+ 1. Determine if the element needs correction.
1136
+ 2. Perform the correction (e.g., call an LLM).
1137
+ 3. Return the new text (`str`) or `None`.
1138
+
1139
+ If the callback returns a string, the element's `.text` is updated in place.
1140
+ Metadata updates (source, confidence, etc.) should happen within the callback.
1141
+ Elements without a source starting with 'ocr' are skipped.
1142
+
1143
+ Args:
1144
+ correction_callback: A function accepting an element and returning
1145
+ `Optional[str]` (new text or None).
1146
+
1147
+ Returns:
1148
+ Self for method chaining.
1149
+ """
1150
+ # Delegate to the utility function
1151
+ _apply_ocr_correction_to_elements(
1152
+ elements=self._elements,
1153
+ correction_callback=correction_callback,
1154
+ caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1155
+ )
1156
+ return self # Return self for chaining
1157
+
1121
1158
 
1122
1159
  class PageCollection(Generic[P]):
1123
1160
  """
@@ -1178,33 +1215,38 @@ class PageCollection(Generic[P]):
1178
1215
  def apply_ocr(
1179
1216
  self,
1180
1217
  engine: Optional[str] = None,
1181
- options: Optional[OCROptions] = None,
1218
+ # --- Common OCR Parameters (Direct Arguments) ---
1182
1219
  languages: Optional[List[str]] = None,
1183
- min_confidence: Optional[float] = None,
1220
+ min_confidence: Optional[float] = None, # Min confidence threshold
1184
1221
  device: Optional[str] = None,
1222
+ resolution: Optional[int] = None, # DPI for rendering
1223
+ apply_exclusions: bool = True, # New parameter
1224
+ # --- Engine-Specific Options ---
1225
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...)
1185
1226
  ) -> "PageCollection[P]":
1186
1227
  """
1187
1228
  Applies OCR to all pages within this collection using batch processing.
1188
1229
 
1189
- This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
1190
- to the respective Page objects within this collection.
1230
+ This delegates the work to the parent PDF object's `apply_ocr` method.
1191
1231
 
1192
1232
  Args:
1193
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
1194
- Uses manager's default if None. Ignored if 'options' is provided.
1195
- options: An specific Options object (e.g., EasyOCROptions) for
1196
- advanced configuration. Overrides simple arguments.
1197
- languages: List of language codes for simple mode.
1198
- min_confidence: Minimum confidence threshold for simple mode.
1199
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
1233
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
1234
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
1235
+ **Must be codes understood by the specific selected engine.**
1236
+ No mapping is performed.
1237
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
1238
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
1239
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
1240
+ apply_exclusions: If True (default), render page images for OCR with
1241
+ excluded areas masked (whited out). If False, OCR
1242
+ the raw page images without masking exclusions.
1243
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict.
1200
1244
 
1201
1245
  Returns:
1202
1246
  Self for method chaining.
1203
1247
 
1204
1248
  Raises:
1205
- RuntimeError: If pages in the collection lack a parent PDF object
1206
- or if the parent PDF object lacks the required
1207
- `apply_ocr` method.
1249
+ RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
1208
1250
  (Propagates exceptions from PDF.apply_ocr)
1209
1251
  """
1210
1252
  if not self.pages:
@@ -1218,7 +1260,6 @@ class PageCollection(Generic[P]):
1218
1260
 
1219
1261
  parent_pdf = first_page._parent
1220
1262
 
1221
- # Updated check for renamed method
1222
1263
  if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
1223
1264
  raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
1224
1265
 
@@ -1227,15 +1268,16 @@ class PageCollection(Generic[P]):
1227
1268
 
1228
1269
  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
1229
1270
 
1230
- # Delegate the batch call to the parent PDF object (using renamed method)
1271
+ # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
1231
1272
  parent_pdf.apply_ocr(
1232
1273
  pages=page_indices,
1233
1274
  engine=engine,
1234
- options=options,
1235
1275
  languages=languages,
1236
- min_confidence=min_confidence,
1276
+ min_confidence=min_confidence, # Pass the renamed parameter
1237
1277
  device=device,
1238
- # Pass any other relevant simple_kwargs here if added
1278
+ resolution=resolution,
1279
+ apply_exclusions=apply_exclusions, # Pass down
1280
+ options=options,
1239
1281
  )
1240
1282
  # The PDF method modifies the Page objects directly by adding elements.
1241
1283
 
@@ -1279,25 +1321,45 @@ class PageCollection(Generic[P]):
1279
1321
 
1280
1322
  return ElementCollection(all_elements)
1281
1323
 
1282
- # def debug_ocr(self, output_path):
1283
- # """
1284
- # Generate an interactive HTML debug report for OCR results.
1285
-
1286
- # This creates a single-file HTML report with:
1287
- # - Side-by-side view of image regions and OCR text
1288
- # - Confidence scores with color coding
1289
- # - Editable correction fields
1290
- # - Filtering and sorting options
1291
- # - Export functionality for corrected text
1292
-
1293
- # Args:
1294
- # output_path: Path to save the HTML report
1295
-
1296
- # Returns:
1297
- # Path to the generated HTML file
1298
- # """
1299
- # from natural_pdf.utils.ocr import debug_ocr_to_html
1300
- # return debug_ocr_to_html(self.pages, output_path)
1324
+ def correct_ocr(
1325
+ self,
1326
+ correction_callback: Callable[[Any], Optional[str]],
1327
+ ) -> "PageCollection[P]":
1328
+ """
1329
+ Applies corrections to OCR-generated text elements across all pages
1330
+ in this collection using a user-provided callback function.
1331
+
1332
+ This method delegates to the parent PDF's `correct_ocr` method,
1333
+ targeting all pages within this collection.
1334
+
1335
+ Args:
1336
+ correction_callback: A function that accepts a single argument (an element
1337
+ object) and returns `Optional[str]` (new text or None).
1338
+
1339
+ Returns:
1340
+ A dictionary containing aggregate statistics for the process across all pages:
1341
+ {'elements_checked': total_checked, 'corrections_applied': total_applied}
1342
+
1343
+ Raises:
1344
+ RuntimeError: If the collection is empty, pages lack a parent PDF reference,
1345
+ or the parent PDF lacks the `correct_ocr` method.
1346
+ """
1347
+ if not self.pages:
1348
+ logger.warning("Cannot correct OCR for an empty PageCollection.")
1349
+
1350
+ # Assume all pages share the same parent PDF object
1351
+ parent_pdf = self.pages[0]._parent
1352
+
1353
+ page_indices = [p.index for p in self.pages]
1354
+ logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
1355
+
1356
+ # Delegate the call to the parent PDF object for the relevant pages
1357
+ parent_pdf.correct_ocr(
1358
+ correction_callback=correction_callback,
1359
+ pages=page_indices
1360
+ )
1361
+
1362
+ return self
1301
1363
 
1302
1364
  def get_sections(
1303
1365
  self,
@@ -11,6 +11,8 @@ from natural_pdf.elements.base import DirectionalMixin
11
11
  # Import new utils
12
12
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
13
 
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
+
14
16
  if TYPE_CHECKING:
15
17
  from natural_pdf.core.page import Page
16
18
  from natural_pdf.elements.text import TextElement
@@ -1082,12 +1084,18 @@ class Region(DirectionalMixin):
1082
1084
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1083
1085
  return ElementCollection(filtered_elements)
1084
1086
 
1085
- def apply_ocr(self, **ocr_params) -> List["TextElement"]: # Return type hint updated
1087
+ def apply_ocr(self, **ocr_params) -> "Region":
1086
1088
  """
1087
1089
  Apply OCR to this region and return the created text elements.
1088
1090
 
1089
1091
  Args:
1090
- **ocr_params: OCR parameters to override defaults (passed to OCRManager)
1092
+ **ocr_params: Keyword arguments passed to the OCR Manager.
1093
+ Common parameters like `engine`, `languages`, `min_confidence`,
1094
+ `device`, and `resolution` (for image rendering) should be
1095
+ provided here. **The `languages` list must contain codes
1096
+ understood by the specific engine selected.** No mapping
1097
+ is performed. Engine-specific settings can be passed in
1098
+ an `options` object (e.g., `options=EasyOCROptions(...)`).
1091
1099
 
1092
1100
  Returns:
1093
1101
  List of created TextElement objects representing OCR words/lines.
@@ -1098,20 +1106,20 @@ class Region(DirectionalMixin):
1098
1106
  return []
1099
1107
  ocr_mgr = self.page._parent._ocr_manager
1100
1108
 
1101
- # Get OCR configuration from kwargs or PDF defaults if needed
1102
- # We'll mostly rely on passing ocr_params directly to the manager
1103
- # For rendering, use a reasonable default scale
1104
- ocr_image_scale = self.page._parent._config.get("ocr_image_scale", 2.0)
1105
-
1109
+ # Determine rendering resolution from parameters
1110
+ final_resolution = ocr_params.get("resolution")
1111
+ if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
1112
+ final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
+ elif final_resolution is None:
1114
+ final_resolution = 150
1106
1115
  logger.debug(
1107
- f"Region {self.bbox}: Applying OCR with scale {ocr_image_scale} and params: {ocr_params}"
1116
+ f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
1108
1117
  )
1109
1118
 
1110
- # Render the page region to an image
1119
+ # Render the page region to an image using the determined resolution
1111
1120
  try:
1112
- # Crop the page image to this region's bbox
1113
1121
  region_image = self.to_image(
1114
- scale=ocr_image_scale, include_highlights=False, crop_only=True
1122
+ resolution=final_resolution, include_highlights=False, crop_only=True
1115
1123
  )
1116
1124
  if not region_image:
1117
1125
  logger.error("Failed to render region to image for OCR.")
@@ -1121,12 +1129,21 @@ class Region(DirectionalMixin):
1121
1129
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1122
1130
  return []
1123
1131
 
1132
+ # Prepare args for the OCR Manager
1133
+ manager_args = {
1134
+ "images": region_image,
1135
+ "engine": ocr_params.get("engine"),
1136
+ "languages": ocr_params.get("languages"),
1137
+ "min_confidence": ocr_params.get("min_confidence"),
1138
+ "device": ocr_params.get("device"),
1139
+ "options": ocr_params.get("options"),
1140
+ "detect_only": ocr_params.get("detect_only"),
1141
+ }
1142
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1143
+
1124
1144
  # Run OCR on this region's image using the manager
1125
1145
  try:
1126
- # Pass the single image and any specific options/kwargs
1127
- # The manager handles engine selection based on ocr_params or defaults
1128
- results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
1129
- # apply_ocr returns List[Dict] for single image
1146
+ results = ocr_mgr.apply_ocr(**manager_args)
1130
1147
  if not isinstance(results, list):
1131
1148
  logger.error(
1132
1149
  f"OCRManager returned unexpected type for single region image: {type(results)}"
@@ -1137,25 +1154,19 @@ class Region(DirectionalMixin):
1137
1154
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1138
1155
  return []
1139
1156
 
1140
- # Convert results to TextElements, scaling coordinates relative to the page
1141
- # Calculate scaling factors based on the region image vs the region PDF coords
1157
+ # Convert results to TextElements
1142
1158
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
1143
1159
  scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
1144
1160
  logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
1145
-
1146
1161
  created_elements = []
1147
1162
  for result in results:
1148
1163
  try:
1149
1164
  img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
1150
1165
  pdf_height = (img_bottom - img_top) * scale_y
1151
-
1152
- # Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
1153
1166
  page_x0 = self.x0 + (img_x0 * scale_x)
1154
1167
  page_top = self.top + (img_top * scale_y)
1155
1168
  page_x1 = self.x0 + (img_x1 * scale_x)
1156
1169
  page_bottom = self.top + (img_bottom * scale_y)
1157
-
1158
- # Create element data using PAGE coordinates
1159
1170
  element_data = {
1160
1171
  "text": result["text"],
1161
1172
  "x0": page_x0,
@@ -1164,45 +1175,33 @@ class Region(DirectionalMixin):
1164
1175
  "bottom": page_bottom,
1165
1176
  "width": page_x1 - page_x0,
1166
1177
  "height": page_bottom - page_top,
1167
- "object_type": "word", # Treat as word
1178
+ "object_type": "word",
1168
1179
  "source": "ocr",
1169
1180
  "confidence": float(result.get("confidence", 0.0)),
1170
1181
  "fontname": "OCR",
1171
- "size": round(pdf_height) if pdf_height > 0 else 10.0, # Size based on height
1182
+ "size": round(pdf_height) if pdf_height > 0 else 10.0,
1172
1183
  "page_number": self.page.number,
1173
1184
  "bold": False,
1174
1185
  "italic": False,
1175
1186
  "upright": True,
1176
1187
  "doctop": page_top + self.page._page.initial_doctop,
1177
1188
  }
1178
-
1179
- # Create the representative char dict
1180
1189
  ocr_char_dict = element_data.copy()
1181
1190
  ocr_char_dict["object_type"] = "char"
1182
1191
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1183
-
1184
- # Add char dicts to word data
1185
1192
  element_data["_char_dicts"] = [ocr_char_dict]
1186
-
1187
- # Create the TextElement word
1188
- from natural_pdf.elements.text import TextElement # Local import ok here
1189
-
1193
+ from natural_pdf.elements.text import TextElement
1190
1194
  elem = TextElement(element_data, self.page)
1191
1195
  created_elements.append(elem)
1192
-
1193
- # Add the element to the page's element manager
1194
1196
  self.page._element_mgr.add_element(elem, element_type="words")
1195
- # Add the char dict to the manager's char list
1196
1197
  self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
1197
-
1198
1198
  except Exception as e:
1199
1199
  logger.error(
1200
1200
  f"Failed to convert region OCR result to element: {result}. Error: {e}",
1201
1201
  exc_info=True,
1202
1202
  )
1203
-
1204
1203
  logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
1205
- return created_elements
1204
+ return self
1206
1205
 
1207
1206
  def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
1208
1207
  """
@@ -1689,3 +1688,43 @@ class Region(DirectionalMixin):
1689
1688
  type_info = f" type='{self.region_type}'" if self.region_type else ""
1690
1689
  source_info = f" source='{self.source}'" if self.source else ""
1691
1690
  return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
1691
+
1692
+ def correct_ocr(
1693
+ self,
1694
+ correction_callback: Callable[[Any], Optional[str]],
1695
+ ) -> "Region": # Return self for chaining
1696
+ """
1697
+ Applies corrections to OCR-generated text elements within this region
1698
+ using a user-provided callback function.
1699
+
1700
+ Finds text elements within this region whose 'source' attribute starts
1701
+ with 'ocr' and calls the `correction_callback` for each, passing the
1702
+ element itself.
1703
+
1704
+ The `correction_callback` should contain the logic to:
1705
+ 1. Determine if the element needs correction.
1706
+ 2. Perform the correction (e.g., call an LLM).
1707
+ 3. Return the new text (`str`) or `None`.
1708
+
1709
+ If the callback returns a string, the element's `.text` is updated.
1710
+ Metadata updates (source, confidence, etc.) should happen within the callback.
1711
+
1712
+ Args:
1713
+ correction_callback: A function accepting an element and returning
1714
+ `Optional[str]` (new text or None).
1715
+
1716
+ Returns:
1717
+ Self for method chaining.
1718
+ """
1719
+ # Find OCR elements specifically within this region
1720
+ # Note: We typically want to correct even if the element falls in an excluded area
1721
+ target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
1722
+
1723
+ # Delegate to the utility function
1724
+ _apply_ocr_correction_to_elements(
1725
+ elements=target_elements, # Pass the ElementCollection directly
1726
+ correction_callback=correction_callback,
1727
+ caller_info=f"Region({self.bbox})", # Pass caller info
1728
+ )
1729
+
1730
+ return self # Return self for chaining
@@ -41,6 +41,11 @@ class TextElement(Element):
41
41
  """Get the text content."""
42
42
  return self._obj.get("text", "")
43
43
 
44
+ @text.setter
45
+ def text(self, value: str):
46
+ """Set the text content."""
47
+ self._obj["text"] = value
48
+
44
49
  @property
45
50
  def source(self) -> str:
46
51
  """Get the source of this text element (pdf or ocr)."""
@@ -8,58 +8,71 @@ import logging
8
8
 
9
9
  # Set up module logger
10
10
  logger = logging.getLogger("natural_pdf.ocr")
11
+
12
+ # Import the base classes that are always available
11
13
  from .engine import OCREngine
12
- from .engine_paddle import PaddleOCREngine
13
- from .engine_surya import SuryaOCREngine
14
+ from .ocr_options import OCROptions, BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
14
15
  from .ocr_manager import OCRManager
15
- from .ocr_options import OCROptions
16
+ from .ocr_factory import OCRFactory
16
17
 
18
+ # Add all public symbols that should be available when importing this module
17
19
  __all__ = [
18
20
  "OCRManager",
19
21
  "OCREngine",
20
22
  "OCROptions",
21
- "EasyOCREngine",
22
- "PaddleOCREngine",
23
- "SuryaOCREngine",
23
+ "BaseOCROptions",
24
+ "EasyOCROptions",
25
+ "PaddleOCROptions",
26
+ "SuryaOCROptions",
27
+ "OCRFactory",
28
+ "get_engine",
29
+ "list_available_engines"
24
30
  ]
25
31
 
26
- DEFAULT_ENGINE = SuryaOCREngine
27
-
28
-
29
32
  def get_engine(engine_name=None, **kwargs):
30
33
  """
31
- Get OCR engine by name.
34
+ Get OCR engine by name with graceful handling of missing dependencies.
32
35
 
33
36
  Args:
34
- engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
35
- If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
37
+ engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
38
+ If None, the best available engine is used
36
39
  **kwargs: Additional arguments to pass to the engine constructor
37
40
 
38
41
  Returns:
39
42
  OCREngine instance
43
+
44
+ Raises:
45
+ ImportError: If the requested engine's dependencies aren't installed
46
+ ValueError: If the engine_name is unknown
40
47
  """
41
- logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
42
-
43
- if engine_name is None or engine_name == "default":
44
- engine = DEFAULT_ENGINE(**kwargs)
45
- logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
46
- return engine
47
-
48
- if engine_name.lower() == "easyocr":
49
- logger.info("Initializing EasyOCR engine")
50
- return EasyOCREngine(**kwargs)
48
+ logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
49
+
50
+ try:
51
+ if engine_name is None or engine_name == "default":
52
+ # Use the factory to get the best available engine
53
+ engine = OCRFactory.get_recommended_engine(**kwargs)
54
+ logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
55
+ return engine
56
+
57
+ # Use the factory to create a specific engine
58
+ normalized_name = engine_name.lower()
59
+ if normalized_name in ["easyocr", "paddle", "surya"]:
60
+ return OCRFactory.create_engine(normalized_name, **kwargs)
61
+ else:
62
+ raise ValueError(f"Unknown OCR engine: {engine_name}")
63
+
64
+ except ImportError as e:
65
+ logger.error(f"OCR engine dependency error: {e}")
66
+ raise
67
+ except Exception as e:
68
+ logger.error(f"Error initializing OCR engine: {e}")
69
+ raise
51
70
 
52
- if engine_name.lower() == "paddleocr":
53
- try:
54
- from .engine_paddle import PaddleOCREngine
55
-
56
- logger.info("Initializing PaddleOCR engine")
57
- return PaddleOCREngine(**kwargs)
58
- except ImportError:
59
- logger.error("PaddleOCR is not installed")
60
- raise ImportError(
61
- "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
62
- )
63
-
64
- logger.error(f"Unknown OCR engine: {engine_name}")
65
- raise ValueError(f"Unknown OCR engine: {engine_name}")
71
+ def list_available_engines():
72
+ """
73
+ List all available OCR engines.
74
+
75
+ Returns:
76
+ Dict[str, bool]: Dictionary mapping engine names to availability status
77
+ """
78
+ return OCRFactory.list_available_engines()