natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/ocr/index.md +34 -47
- docs/tutorials/01-loading-and-extraction.ipynb +60 -46
- docs/tutorials/02-finding-elements.ipynb +42 -42
- docs/tutorials/03-extracting-blocks.ipynb +17 -17
- docs/tutorials/04-table-extraction.ipynb +12 -12
- docs/tutorials/05-excluding-content.ipynb +30 -30
- docs/tutorials/06-document-qa.ipynb +28 -28
- docs/tutorials/07-layout-analysis.ipynb +63 -35
- docs/tutorials/07-working-with-regions.ipynb +55 -51
- docs/tutorials/07-working-with-regions.md +2 -2
- docs/tutorials/08-spatial-navigation.ipynb +60 -60
- docs/tutorials/09-section-extraction.ipynb +113 -113
- docs/tutorials/10-form-field-extraction.ipynb +78 -50
- docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- docs/tutorials/12-ocr-integration.ipynb +149 -131
- docs/tutorials/12-ocr-integration.md +0 -13
- docs/tutorials/13-semantic-search.ipynb +313 -873
- natural_pdf/__init__.py +21 -23
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_manager.py +28 -1
- natural_pdf/analyzers/layout/layout_options.py +11 -0
- natural_pdf/analyzers/layout/yolo.py +6 -2
- natural_pdf/collections/pdf_collection.py +21 -0
- natural_pdf/core/element_manager.py +16 -13
- natural_pdf/core/page.py +165 -36
- natural_pdf/core/pdf.py +146 -41
- natural_pdf/elements/base.py +11 -17
- natural_pdf/elements/collections.py +100 -38
- natural_pdf/elements/region.py +77 -38
- natural_pdf/elements/text.py +5 -0
- natural_pdf/ocr/__init__.py +49 -36
- natural_pdf/ocr/engine.py +146 -51
- natural_pdf/ocr/engine_easyocr.py +141 -161
- natural_pdf/ocr/engine_paddle.py +107 -193
- natural_pdf/ocr/engine_surya.py +75 -148
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +65 -93
- natural_pdf/ocr/ocr_options.py +7 -17
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
- natural_pdf/templates/ocr_debug.html +0 -517
- tests/test_loading.py +0 -50
- tests/test_optional_deps.py +0 -298
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -306,17 +306,19 @@ class DirectionalMixin:
|
|
306
306
|
**kwargs,
|
307
307
|
)
|
308
308
|
|
309
|
+
def to_region(
|
310
|
+
self
|
311
|
+
):
|
312
|
+
return self.expand()
|
313
|
+
|
309
314
|
def expand(
|
310
315
|
self,
|
311
316
|
left: float = 0,
|
312
317
|
right: float = 0,
|
313
|
-
|
314
|
-
|
318
|
+
top: float = 0,
|
319
|
+
bottom: float = 0,
|
315
320
|
width_factor: float = 1.0,
|
316
321
|
height_factor: float = 1.0,
|
317
|
-
# Keep original parameter names for backward compatibility
|
318
|
-
top: float = None,
|
319
|
-
bottom: float = None,
|
320
322
|
) -> "Region":
|
321
323
|
"""
|
322
324
|
Create a new region expanded from this element/region.
|
@@ -324,12 +326,10 @@ class DirectionalMixin:
|
|
324
326
|
Args:
|
325
327
|
left: Amount to expand left edge (positive value expands leftwards)
|
326
328
|
right: Amount to expand right edge (positive value expands rightwards)
|
327
|
-
|
328
|
-
|
329
|
+
top: Amount to expand top edge (positive value expands upwards)
|
330
|
+
bottom: Amount to expand bottom edge (positive value expands downwards)
|
329
331
|
width_factor: Factor to multiply width by (applied after absolute expansion)
|
330
332
|
height_factor: Factor to multiply height by (applied after absolute expansion)
|
331
|
-
top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
|
332
|
-
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
333
333
|
|
334
334
|
Returns:
|
335
335
|
New expanded Region object
|
@@ -340,17 +340,11 @@ class DirectionalMixin:
|
|
340
340
|
new_top = self.top
|
341
341
|
new_bottom = self.bottom
|
342
342
|
|
343
|
-
# Handle the deprecated parameter names for backward compatibility
|
344
|
-
if top is not None:
|
345
|
-
top_expand = top
|
346
|
-
if bottom is not None:
|
347
|
-
bottom_expand = bottom
|
348
|
-
|
349
343
|
# Apply absolute expansions first
|
350
344
|
new_x0 -= left
|
351
345
|
new_x1 += right
|
352
|
-
new_top -=
|
353
|
-
new_bottom +=
|
346
|
+
new_top -= top # Expand upward (decrease top coordinate)
|
347
|
+
new_bottom += bottom # Expand downward (increase bottom coordinate)
|
354
348
|
|
355
349
|
# Apply percentage factors if provided
|
356
350
|
if width_factor != 1.0 or height_factor != 1.0:
|
@@ -21,6 +21,7 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
21
21
|
from natural_pdf.elements.text import TextElement # Needed for isinstance check
|
22
22
|
from natural_pdf.ocr import OCROptions
|
23
23
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
24
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility
|
24
25
|
|
25
26
|
logger = logging.getLogger(__name__)
|
26
27
|
|
@@ -1118,6 +1119,42 @@ class ElementCollection(Generic[T]):
|
|
1118
1119
|
results = self.find_all(selector, regex=regex, case=case, **kwargs)
|
1119
1120
|
return results.first
|
1120
1121
|
|
1122
|
+
def correct_ocr(
|
1123
|
+
self,
|
1124
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1125
|
+
) -> "ElementCollection":
|
1126
|
+
"""
|
1127
|
+
Applies corrections to OCR-generated text elements within this collection
|
1128
|
+
using a user-provided callback function.
|
1129
|
+
|
1130
|
+
Iterates through elements currently in the collection. If an element's
|
1131
|
+
'source' attribute starts with 'ocr', it calls the `correction_callback`
|
1132
|
+
for that element, passing the element itself.
|
1133
|
+
|
1134
|
+
The `correction_callback` should contain the logic to:
|
1135
|
+
1. Determine if the element needs correction.
|
1136
|
+
2. Perform the correction (e.g., call an LLM).
|
1137
|
+
3. Return the new text (`str`) or `None`.
|
1138
|
+
|
1139
|
+
If the callback returns a string, the element's `.text` is updated in place.
|
1140
|
+
Metadata updates (source, confidence, etc.) should happen within the callback.
|
1141
|
+
Elements without a source starting with 'ocr' are skipped.
|
1142
|
+
|
1143
|
+
Args:
|
1144
|
+
correction_callback: A function accepting an element and returning
|
1145
|
+
`Optional[str]` (new text or None).
|
1146
|
+
|
1147
|
+
Returns:
|
1148
|
+
Self for method chaining.
|
1149
|
+
"""
|
1150
|
+
# Delegate to the utility function
|
1151
|
+
_apply_ocr_correction_to_elements(
|
1152
|
+
elements=self._elements,
|
1153
|
+
correction_callback=correction_callback,
|
1154
|
+
caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
|
1155
|
+
)
|
1156
|
+
return self # Return self for chaining
|
1157
|
+
|
1121
1158
|
|
1122
1159
|
class PageCollection(Generic[P]):
|
1123
1160
|
"""
|
@@ -1178,33 +1215,38 @@ class PageCollection(Generic[P]):
|
|
1178
1215
|
def apply_ocr(
|
1179
1216
|
self,
|
1180
1217
|
engine: Optional[str] = None,
|
1181
|
-
|
1218
|
+
# --- Common OCR Parameters (Direct Arguments) ---
|
1182
1219
|
languages: Optional[List[str]] = None,
|
1183
|
-
min_confidence: Optional[float] = None,
|
1220
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
1184
1221
|
device: Optional[str] = None,
|
1222
|
+
resolution: Optional[int] = None, # DPI for rendering
|
1223
|
+
apply_exclusions: bool = True, # New parameter
|
1224
|
+
# --- Engine-Specific Options ---
|
1225
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...)
|
1185
1226
|
) -> "PageCollection[P]":
|
1186
1227
|
"""
|
1187
1228
|
Applies OCR to all pages within this collection using batch processing.
|
1188
1229
|
|
1189
|
-
This delegates the work to the parent PDF object's `apply_ocr` method
|
1190
|
-
to the respective Page objects within this collection.
|
1230
|
+
This delegates the work to the parent PDF object's `apply_ocr` method.
|
1191
1231
|
|
1192
1232
|
Args:
|
1193
|
-
engine: Name of the engine (e.g., 'easyocr', 'paddleocr'
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1233
|
+
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
|
1234
|
+
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
|
1235
|
+
**Must be codes understood by the specific selected engine.**
|
1236
|
+
No mapping is performed.
|
1237
|
+
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
1238
|
+
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
1239
|
+
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
1240
|
+
apply_exclusions: If True (default), render page images for OCR with
|
1241
|
+
excluded areas masked (whited out). If False, OCR
|
1242
|
+
the raw page images without masking exclusions.
|
1243
|
+
options: An engine-specific options object (e.g., EasyOCROptions) or dict.
|
1200
1244
|
|
1201
1245
|
Returns:
|
1202
1246
|
Self for method chaining.
|
1203
1247
|
|
1204
1248
|
Raises:
|
1205
|
-
RuntimeError: If pages
|
1206
|
-
or if the parent PDF object lacks the required
|
1207
|
-
`apply_ocr` method.
|
1249
|
+
RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
|
1208
1250
|
(Propagates exceptions from PDF.apply_ocr)
|
1209
1251
|
"""
|
1210
1252
|
if not self.pages:
|
@@ -1218,7 +1260,6 @@ class PageCollection(Generic[P]):
|
|
1218
1260
|
|
1219
1261
|
parent_pdf = first_page._parent
|
1220
1262
|
|
1221
|
-
# Updated check for renamed method
|
1222
1263
|
if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
|
1223
1264
|
raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
|
1224
1265
|
|
@@ -1227,15 +1268,16 @@ class PageCollection(Generic[P]):
|
|
1227
1268
|
|
1228
1269
|
logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
|
1229
1270
|
|
1230
|
-
# Delegate the batch call to the parent PDF object
|
1271
|
+
# Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
|
1231
1272
|
parent_pdf.apply_ocr(
|
1232
1273
|
pages=page_indices,
|
1233
1274
|
engine=engine,
|
1234
|
-
options=options,
|
1235
1275
|
languages=languages,
|
1236
|
-
min_confidence=min_confidence,
|
1276
|
+
min_confidence=min_confidence, # Pass the renamed parameter
|
1237
1277
|
device=device,
|
1238
|
-
|
1278
|
+
resolution=resolution,
|
1279
|
+
apply_exclusions=apply_exclusions, # Pass down
|
1280
|
+
options=options,
|
1239
1281
|
)
|
1240
1282
|
# The PDF method modifies the Page objects directly by adding elements.
|
1241
1283
|
|
@@ -1279,25 +1321,45 @@ class PageCollection(Generic[P]):
|
|
1279
1321
|
|
1280
1322
|
return ElementCollection(all_elements)
|
1281
1323
|
|
1282
|
-
|
1283
|
-
|
1284
|
-
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1324
|
+
def correct_ocr(
|
1325
|
+
self,
|
1326
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1327
|
+
) -> "PageCollection[P]":
|
1328
|
+
"""
|
1329
|
+
Applies corrections to OCR-generated text elements across all pages
|
1330
|
+
in this collection using a user-provided callback function.
|
1331
|
+
|
1332
|
+
This method delegates to the parent PDF's `correct_ocr` method,
|
1333
|
+
targeting all pages within this collection.
|
1334
|
+
|
1335
|
+
Args:
|
1336
|
+
correction_callback: A function that accepts a single argument (an element
|
1337
|
+
object) and returns `Optional[str]` (new text or None).
|
1338
|
+
|
1339
|
+
Returns:
|
1340
|
+
A dictionary containing aggregate statistics for the process across all pages:
|
1341
|
+
{'elements_checked': total_checked, 'corrections_applied': total_applied}
|
1342
|
+
|
1343
|
+
Raises:
|
1344
|
+
RuntimeError: If the collection is empty, pages lack a parent PDF reference,
|
1345
|
+
or the parent PDF lacks the `correct_ocr` method.
|
1346
|
+
"""
|
1347
|
+
if not self.pages:
|
1348
|
+
logger.warning("Cannot correct OCR for an empty PageCollection.")
|
1349
|
+
|
1350
|
+
# Assume all pages share the same parent PDF object
|
1351
|
+
parent_pdf = self.pages[0]._parent
|
1352
|
+
|
1353
|
+
page_indices = [p.index for p in self.pages]
|
1354
|
+
logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
|
1355
|
+
|
1356
|
+
# Delegate the call to the parent PDF object for the relevant pages
|
1357
|
+
parent_pdf.correct_ocr(
|
1358
|
+
correction_callback=correction_callback,
|
1359
|
+
pages=page_indices
|
1360
|
+
)
|
1361
|
+
|
1362
|
+
return self
|
1301
1363
|
|
1302
1364
|
def get_sections(
|
1303
1365
|
self,
|
natural_pdf/elements/region.py
CHANGED
@@ -11,6 +11,8 @@ from natural_pdf.elements.base import DirectionalMixin
|
|
11
11
|
# Import new utils
|
12
12
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
13
|
|
14
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
15
|
+
|
14
16
|
if TYPE_CHECKING:
|
15
17
|
from natural_pdf.core.page import Page
|
16
18
|
from natural_pdf.elements.text import TextElement
|
@@ -1082,12 +1084,18 @@ class Region(DirectionalMixin):
|
|
1082
1084
|
filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1083
1085
|
return ElementCollection(filtered_elements)
|
1084
1086
|
|
1085
|
-
def apply_ocr(self, **ocr_params) ->
|
1087
|
+
def apply_ocr(self, **ocr_params) -> "Region":
|
1086
1088
|
"""
|
1087
1089
|
Apply OCR to this region and return the created text elements.
|
1088
1090
|
|
1089
1091
|
Args:
|
1090
|
-
**ocr_params:
|
1092
|
+
**ocr_params: Keyword arguments passed to the OCR Manager.
|
1093
|
+
Common parameters like `engine`, `languages`, `min_confidence`,
|
1094
|
+
`device`, and `resolution` (for image rendering) should be
|
1095
|
+
provided here. **The `languages` list must contain codes
|
1096
|
+
understood by the specific engine selected.** No mapping
|
1097
|
+
is performed. Engine-specific settings can be passed in
|
1098
|
+
an `options` object (e.g., `options=EasyOCROptions(...)`).
|
1091
1099
|
|
1092
1100
|
Returns:
|
1093
1101
|
List of created TextElement objects representing OCR words/lines.
|
@@ -1098,20 +1106,20 @@ class Region(DirectionalMixin):
|
|
1098
1106
|
return []
|
1099
1107
|
ocr_mgr = self.page._parent._ocr_manager
|
1100
1108
|
|
1101
|
-
#
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1109
|
+
# Determine rendering resolution from parameters
|
1110
|
+
final_resolution = ocr_params.get("resolution")
|
1111
|
+
if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
|
1112
|
+
final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
|
1113
|
+
elif final_resolution is None:
|
1114
|
+
final_resolution = 150
|
1106
1115
|
logger.debug(
|
1107
|
-
f"Region {self.bbox}: Applying OCR with
|
1116
|
+
f"Region {self.bbox}: Applying OCR with resolution {final_resolution} DPI and params: {ocr_params}"
|
1108
1117
|
)
|
1109
1118
|
|
1110
|
-
# Render the page region to an image
|
1119
|
+
# Render the page region to an image using the determined resolution
|
1111
1120
|
try:
|
1112
|
-
# Crop the page image to this region's bbox
|
1113
1121
|
region_image = self.to_image(
|
1114
|
-
|
1122
|
+
resolution=final_resolution, include_highlights=False, crop_only=True
|
1115
1123
|
)
|
1116
1124
|
if not region_image:
|
1117
1125
|
logger.error("Failed to render region to image for OCR.")
|
@@ -1121,12 +1129,21 @@ class Region(DirectionalMixin):
|
|
1121
1129
|
logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
|
1122
1130
|
return []
|
1123
1131
|
|
1132
|
+
# Prepare args for the OCR Manager
|
1133
|
+
manager_args = {
|
1134
|
+
"images": region_image,
|
1135
|
+
"engine": ocr_params.get("engine"),
|
1136
|
+
"languages": ocr_params.get("languages"),
|
1137
|
+
"min_confidence": ocr_params.get("min_confidence"),
|
1138
|
+
"device": ocr_params.get("device"),
|
1139
|
+
"options": ocr_params.get("options"),
|
1140
|
+
"detect_only": ocr_params.get("detect_only"),
|
1141
|
+
}
|
1142
|
+
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
1143
|
+
|
1124
1144
|
# Run OCR on this region's image using the manager
|
1125
1145
|
try:
|
1126
|
-
|
1127
|
-
# The manager handles engine selection based on ocr_params or defaults
|
1128
|
-
results = ocr_mgr.apply_ocr(images=region_image, **ocr_params)
|
1129
|
-
# apply_ocr returns List[Dict] for single image
|
1146
|
+
results = ocr_mgr.apply_ocr(**manager_args)
|
1130
1147
|
if not isinstance(results, list):
|
1131
1148
|
logger.error(
|
1132
1149
|
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
@@ -1137,25 +1154,19 @@ class Region(DirectionalMixin):
|
|
1137
1154
|
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
1138
1155
|
return []
|
1139
1156
|
|
1140
|
-
# Convert results to TextElements
|
1141
|
-
# Calculate scaling factors based on the region image vs the region PDF coords
|
1157
|
+
# Convert results to TextElements
|
1142
1158
|
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
1143
1159
|
scale_y = self.height / region_image.height if region_image.height > 0 else 1.0
|
1144
1160
|
logger.debug(f"Region OCR scaling factors (PDF/Img): x={scale_x:.2f}, y={scale_y:.2f}")
|
1145
|
-
|
1146
1161
|
created_elements = []
|
1147
1162
|
for result in results:
|
1148
1163
|
try:
|
1149
1164
|
img_x0, img_top, img_x1, img_bottom = map(float, result["bbox"])
|
1150
1165
|
pdf_height = (img_bottom - img_top) * scale_y
|
1151
|
-
|
1152
|
-
# Convert IMAGE coordinates (relative to region crop) to PAGE coordinates
|
1153
1166
|
page_x0 = self.x0 + (img_x0 * scale_x)
|
1154
1167
|
page_top = self.top + (img_top * scale_y)
|
1155
1168
|
page_x1 = self.x0 + (img_x1 * scale_x)
|
1156
1169
|
page_bottom = self.top + (img_bottom * scale_y)
|
1157
|
-
|
1158
|
-
# Create element data using PAGE coordinates
|
1159
1170
|
element_data = {
|
1160
1171
|
"text": result["text"],
|
1161
1172
|
"x0": page_x0,
|
@@ -1164,45 +1175,33 @@ class Region(DirectionalMixin):
|
|
1164
1175
|
"bottom": page_bottom,
|
1165
1176
|
"width": page_x1 - page_x0,
|
1166
1177
|
"height": page_bottom - page_top,
|
1167
|
-
"object_type": "word",
|
1178
|
+
"object_type": "word",
|
1168
1179
|
"source": "ocr",
|
1169
1180
|
"confidence": float(result.get("confidence", 0.0)),
|
1170
1181
|
"fontname": "OCR",
|
1171
|
-
"size": round(pdf_height) if pdf_height > 0 else 10.0,
|
1182
|
+
"size": round(pdf_height) if pdf_height > 0 else 10.0,
|
1172
1183
|
"page_number": self.page.number,
|
1173
1184
|
"bold": False,
|
1174
1185
|
"italic": False,
|
1175
1186
|
"upright": True,
|
1176
1187
|
"doctop": page_top + self.page._page.initial_doctop,
|
1177
1188
|
}
|
1178
|
-
|
1179
|
-
# Create the representative char dict
|
1180
1189
|
ocr_char_dict = element_data.copy()
|
1181
1190
|
ocr_char_dict["object_type"] = "char"
|
1182
1191
|
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
1183
|
-
|
1184
|
-
# Add char dicts to word data
|
1185
1192
|
element_data["_char_dicts"] = [ocr_char_dict]
|
1186
|
-
|
1187
|
-
# Create the TextElement word
|
1188
|
-
from natural_pdf.elements.text import TextElement # Local import ok here
|
1189
|
-
|
1193
|
+
from natural_pdf.elements.text import TextElement
|
1190
1194
|
elem = TextElement(element_data, self.page)
|
1191
1195
|
created_elements.append(elem)
|
1192
|
-
|
1193
|
-
# Add the element to the page's element manager
|
1194
1196
|
self.page._element_mgr.add_element(elem, element_type="words")
|
1195
|
-
# Add the char dict to the manager's char list
|
1196
1197
|
self.page._element_mgr.add_element(ocr_char_dict, element_type="chars")
|
1197
|
-
|
1198
1198
|
except Exception as e:
|
1199
1199
|
logger.error(
|
1200
1200
|
f"Failed to convert region OCR result to element: {result}. Error: {e}",
|
1201
1201
|
exc_info=True,
|
1202
1202
|
)
|
1203
|
-
|
1204
1203
|
logger.info(f"Region {self.bbox}: Added {len(created_elements)} elements from OCR.")
|
1205
|
-
return
|
1204
|
+
return self
|
1206
1205
|
|
1207
1206
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
|
1208
1207
|
"""
|
@@ -1689,3 +1688,43 @@ class Region(DirectionalMixin):
|
|
1689
1688
|
type_info = f" type='{self.region_type}'" if self.region_type else ""
|
1690
1689
|
source_info = f" source='{self.source}'" if self.source else ""
|
1691
1690
|
return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
|
1691
|
+
|
1692
|
+
def correct_ocr(
|
1693
|
+
self,
|
1694
|
+
correction_callback: Callable[[Any], Optional[str]],
|
1695
|
+
) -> "Region": # Return self for chaining
|
1696
|
+
"""
|
1697
|
+
Applies corrections to OCR-generated text elements within this region
|
1698
|
+
using a user-provided callback function.
|
1699
|
+
|
1700
|
+
Finds text elements within this region whose 'source' attribute starts
|
1701
|
+
with 'ocr' and calls the `correction_callback` for each, passing the
|
1702
|
+
element itself.
|
1703
|
+
|
1704
|
+
The `correction_callback` should contain the logic to:
|
1705
|
+
1. Determine if the element needs correction.
|
1706
|
+
2. Perform the correction (e.g., call an LLM).
|
1707
|
+
3. Return the new text (`str`) or `None`.
|
1708
|
+
|
1709
|
+
If the callback returns a string, the element's `.text` is updated.
|
1710
|
+
Metadata updates (source, confidence, etc.) should happen within the callback.
|
1711
|
+
|
1712
|
+
Args:
|
1713
|
+
correction_callback: A function accepting an element and returning
|
1714
|
+
`Optional[str]` (new text or None).
|
1715
|
+
|
1716
|
+
Returns:
|
1717
|
+
Self for method chaining.
|
1718
|
+
"""
|
1719
|
+
# Find OCR elements specifically within this region
|
1720
|
+
# Note: We typically want to correct even if the element falls in an excluded area
|
1721
|
+
target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
|
1722
|
+
|
1723
|
+
# Delegate to the utility function
|
1724
|
+
_apply_ocr_correction_to_elements(
|
1725
|
+
elements=target_elements, # Pass the ElementCollection directly
|
1726
|
+
correction_callback=correction_callback,
|
1727
|
+
caller_info=f"Region({self.bbox})", # Pass caller info
|
1728
|
+
)
|
1729
|
+
|
1730
|
+
return self # Return self for chaining
|
natural_pdf/elements/text.py
CHANGED
@@ -41,6 +41,11 @@ class TextElement(Element):
|
|
41
41
|
"""Get the text content."""
|
42
42
|
return self._obj.get("text", "")
|
43
43
|
|
44
|
+
@text.setter
|
45
|
+
def text(self, value: str):
|
46
|
+
"""Set the text content."""
|
47
|
+
self._obj["text"] = value
|
48
|
+
|
44
49
|
@property
|
45
50
|
def source(self) -> str:
|
46
51
|
"""Get the source of this text element (pdf or ocr)."""
|
natural_pdf/ocr/__init__.py
CHANGED
@@ -8,58 +8,71 @@ import logging
|
|
8
8
|
|
9
9
|
# Set up module logger
|
10
10
|
logger = logging.getLogger("natural_pdf.ocr")
|
11
|
+
|
12
|
+
# Import the base classes that are always available
|
11
13
|
from .engine import OCREngine
|
12
|
-
from .
|
13
|
-
from .engine_surya import SuryaOCREngine
|
14
|
+
from .ocr_options import OCROptions, BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
|
14
15
|
from .ocr_manager import OCRManager
|
15
|
-
from .
|
16
|
+
from .ocr_factory import OCRFactory
|
16
17
|
|
18
|
+
# Add all public symbols that should be available when importing this module
|
17
19
|
__all__ = [
|
18
20
|
"OCRManager",
|
19
21
|
"OCREngine",
|
20
22
|
"OCROptions",
|
21
|
-
"
|
22
|
-
"
|
23
|
-
"
|
23
|
+
"BaseOCROptions",
|
24
|
+
"EasyOCROptions",
|
25
|
+
"PaddleOCROptions",
|
26
|
+
"SuryaOCROptions",
|
27
|
+
"OCRFactory",
|
28
|
+
"get_engine",
|
29
|
+
"list_available_engines"
|
24
30
|
]
|
25
31
|
|
26
|
-
DEFAULT_ENGINE = SuryaOCREngine
|
27
|
-
|
28
|
-
|
29
32
|
def get_engine(engine_name=None, **kwargs):
|
30
33
|
"""
|
31
|
-
Get OCR engine by name.
|
34
|
+
Get OCR engine by name with graceful handling of missing dependencies.
|
32
35
|
|
33
36
|
Args:
|
34
|
-
engine_name: Name of the engine to use ('easyocr', '
|
35
|
-
If None, the
|
37
|
+
engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
|
38
|
+
If None, the best available engine is used
|
36
39
|
**kwargs: Additional arguments to pass to the engine constructor
|
37
40
|
|
38
41
|
Returns:
|
39
42
|
OCREngine instance
|
43
|
+
|
44
|
+
Raises:
|
45
|
+
ImportError: If the requested engine's dependencies aren't installed
|
46
|
+
ValueError: If the engine_name is unknown
|
40
47
|
"""
|
41
|
-
logger.debug(f"Initializing OCR engine: {engine_name or '
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
|
49
|
+
|
50
|
+
try:
|
51
|
+
if engine_name is None or engine_name == "default":
|
52
|
+
# Use the factory to get the best available engine
|
53
|
+
engine = OCRFactory.get_recommended_engine(**kwargs)
|
54
|
+
logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
|
55
|
+
return engine
|
56
|
+
|
57
|
+
# Use the factory to create a specific engine
|
58
|
+
normalized_name = engine_name.lower()
|
59
|
+
if normalized_name in ["easyocr", "paddle", "surya"]:
|
60
|
+
return OCRFactory.create_engine(normalized_name, **kwargs)
|
61
|
+
else:
|
62
|
+
raise ValueError(f"Unknown OCR engine: {engine_name}")
|
63
|
+
|
64
|
+
except ImportError as e:
|
65
|
+
logger.error(f"OCR engine dependency error: {e}")
|
66
|
+
raise
|
67
|
+
except Exception as e:
|
68
|
+
logger.error(f"Error initializing OCR engine: {e}")
|
69
|
+
raise
|
51
70
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
raise ImportError(
|
61
|
-
"PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
|
62
|
-
)
|
63
|
-
|
64
|
-
logger.error(f"Unknown OCR engine: {engine_name}")
|
65
|
-
raise ValueError(f"Unknown OCR engine: {engine_name}")
|
71
|
+
def list_available_engines():
|
72
|
+
"""
|
73
|
+
List all available OCR engines.
|
74
|
+
|
75
|
+
Returns:
|
76
|
+
Dict[str, bool]: Dictionary mapping engine names to availability status
|
77
|
+
"""
|
78
|
+
return OCRFactory.list_available_engines()
|