natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/utils/packaging.py
CHANGED
@@ -28,11 +28,12 @@ from natural_pdf.utils.identifiers import generate_short_path_hash
|
|
28
28
|
|
29
29
|
logger = logging.getLogger(__name__)
|
30
30
|
|
31
|
+
|
31
32
|
def create_correction_task_package(
|
32
|
-
source: Union[
|
33
|
+
source: Union["PDF", "PDFCollection", List["PDF"]],
|
33
34
|
output_zip_path: str,
|
34
35
|
overwrite: bool = False,
|
35
|
-
suggest
|
36
|
+
suggest=None,
|
36
37
|
resolution: int = 150,
|
37
38
|
) -> None:
|
38
39
|
"""
|
@@ -55,27 +56,32 @@ def create_correction_task_package(
|
|
55
56
|
ValueError: If no valid pages with OCR data are found in the source.
|
56
57
|
"""
|
57
58
|
if os.path.exists(output_zip_path) and not overwrite:
|
58
|
-
raise FileExistsError(
|
59
|
+
raise FileExistsError(
|
60
|
+
f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it."
|
61
|
+
)
|
59
62
|
|
60
63
|
# --- Resolve source to a list of PDF objects ---
|
61
|
-
pdfs_to_process: List[
|
62
|
-
if
|
64
|
+
pdfs_to_process: List["PDF"] = []
|
65
|
+
if (
|
66
|
+
hasattr(source, "__class__") and source.__class__.__name__ == "PDF"
|
67
|
+
): # Check type without direct import
|
63
68
|
pdfs_to_process = [source]
|
64
|
-
elif hasattr(source,
|
65
|
-
pdfs_to_process = source.pdfs
|
66
|
-
elif isinstance(source, list) and all(
|
69
|
+
elif hasattr(source, "__class__") and source.__class__.__name__ == "PDFCollection":
|
70
|
+
pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
|
71
|
+
elif isinstance(source, list) and all(
|
72
|
+
hasattr(p, "__class__") and p.__class__.__name__ == "PDF" for p in source
|
73
|
+
):
|
67
74
|
pdfs_to_process = source
|
68
75
|
else:
|
69
|
-
raise TypeError(
|
76
|
+
raise TypeError(
|
77
|
+
f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
|
78
|
+
)
|
70
79
|
|
71
80
|
if not pdfs_to_process:
|
72
81
|
logger.warning("No PDF documents provided in the source.")
|
73
82
|
return
|
74
83
|
|
75
|
-
manifest_data = {
|
76
|
-
"pdfs": [], # Store pdf-level info if needed later?
|
77
|
-
"pages": []
|
78
|
-
}
|
84
|
+
manifest_data = {"pdfs": [], "pages": []} # Store pdf-level info if needed later?
|
79
85
|
total_regions_found = 0
|
80
86
|
|
81
87
|
# Use a temporary directory for staging files before zipping
|
@@ -84,38 +90,52 @@ def create_correction_task_package(
|
|
84
90
|
os.makedirs(images_dir)
|
85
91
|
logger.info(f"Using temporary directory for staging: {temp_dir}")
|
86
92
|
|
87
|
-
# --- Process each PDF ---
|
93
|
+
# --- Process each PDF ---
|
88
94
|
for pdf in pdfs_to_process:
|
89
|
-
if not hasattr(pdf,
|
95
|
+
if not hasattr(pdf, "path") or not hasattr(pdf, "pages"):
|
90
96
|
logger.warning(f"Skipping invalid PDF object: {pdf}")
|
91
97
|
continue
|
92
98
|
|
93
|
-
pdf_path = pdf.path
|
99
|
+
pdf_path = pdf.path # Should be the resolved, absolute path
|
94
100
|
pdf_short_id = generate_short_path_hash(pdf_path)
|
95
101
|
logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")
|
96
102
|
|
97
103
|
pdf_has_ocr_regions = False
|
98
104
|
for page in pdf.pages:
|
99
|
-
if
|
100
|
-
not hasattr(page,
|
101
|
-
|
102
|
-
|
105
|
+
if (
|
106
|
+
not hasattr(page, "index")
|
107
|
+
or not hasattr(page, "number")
|
108
|
+
or not hasattr(page, "width")
|
109
|
+
or not hasattr(page, "height")
|
110
|
+
or not hasattr(page, "find_all")
|
111
|
+
or not hasattr(page, "to_image")
|
112
|
+
):
|
113
|
+
logger.warning(
|
114
|
+
f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}"
|
115
|
+
)
|
103
116
|
continue
|
104
117
|
|
105
118
|
# 1. Extract OCR elements for this page
|
106
119
|
try:
|
107
120
|
# Important: apply_exclusions=False ensures we get *all* OCR data
|
108
121
|
# regardless of user exclusions set on the PDF/page object.
|
109
|
-
ocr_elements = page.find_all(
|
122
|
+
ocr_elements = page.find_all(
|
123
|
+
"text[source=ocr]", apply_exclusions=False
|
124
|
+
).elements
|
110
125
|
except Exception as e:
|
111
|
-
logger.error(
|
112
|
-
|
126
|
+
logger.error(
|
127
|
+
f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}",
|
128
|
+
exc_info=True,
|
129
|
+
)
|
130
|
+
continue # Skip this page if element extraction fails
|
113
131
|
|
114
132
|
if not ocr_elements:
|
115
|
-
logger.debug(
|
116
|
-
|
133
|
+
logger.debug(
|
134
|
+
f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest."
|
135
|
+
)
|
136
|
+
continue # Skip page if no OCR elements
|
117
137
|
|
118
|
-
pdf_has_ocr_regions = True
|
138
|
+
pdf_has_ocr_regions = True # Mark that this PDF is relevant
|
119
139
|
logger.debug(f" Found {len(ocr_elements)} OCR elements on page {page.number}")
|
120
140
|
total_regions_found += len(ocr_elements)
|
121
141
|
|
@@ -128,9 +148,12 @@ def create_correction_task_package(
|
|
128
148
|
raise ValueError("page.to_image returned None")
|
129
149
|
img.save(image_save_path, "PNG")
|
130
150
|
except Exception as e:
|
131
|
-
logger.error(
|
151
|
+
logger.error(
|
152
|
+
f"Failed to render/save image for {pdf_path} page {page.number}: {e}",
|
153
|
+
exc_info=True,
|
154
|
+
)
|
132
155
|
# If image fails, we cannot proceed with this page for the task
|
133
|
-
pdf_has_ocr_regions = False
|
156
|
+
pdf_has_ocr_regions = False # Reset flag for this page
|
134
157
|
continue
|
135
158
|
|
136
159
|
# 3. Prepare region data for manifest
|
@@ -142,72 +165,85 @@ def create_correction_task_package(
|
|
142
165
|
for elem in tqdm(ocr_elements):
|
143
166
|
i += 1
|
144
167
|
# Basic check for necessary attributes
|
145
|
-
if not all(
|
146
|
-
|
168
|
+
if not all(
|
169
|
+
hasattr(elem, attr) for attr in ["x0", "top", "x1", "bottom", "text"]
|
170
|
+
):
|
171
|
+
logger.warning(
|
172
|
+
f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}"
|
173
|
+
)
|
147
174
|
continue
|
148
|
-
region_id = f"r_{page.index}_{i}"
|
175
|
+
region_id = f"r_{page.index}_{i}" # ID unique within page
|
149
176
|
|
150
177
|
# Scale coordinates to match the 300 DPI image
|
151
178
|
scaled_bbox = [
|
152
179
|
elem.x0 * coord_scale_factor,
|
153
180
|
elem.top * coord_scale_factor,
|
154
181
|
elem.x1 * coord_scale_factor,
|
155
|
-
elem.bottom * coord_scale_factor
|
182
|
+
elem.bottom * coord_scale_factor,
|
156
183
|
]
|
157
184
|
|
158
185
|
corrected = elem.text
|
159
186
|
|
160
187
|
if suggest:
|
161
|
-
corrected = suggest(elem.to_region(), getattr(elem,
|
162
|
-
|
163
|
-
page_regions_data.append(
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
188
|
+
corrected = suggest(elem.to_region(), getattr(elem, "confidence", None))
|
189
|
+
|
190
|
+
page_regions_data.append(
|
191
|
+
{
|
192
|
+
"resolution": resolution,
|
193
|
+
"id": region_id,
|
194
|
+
"bbox": scaled_bbox,
|
195
|
+
"ocr_text": elem.text,
|
196
|
+
"confidence": getattr(
|
197
|
+
elem, "confidence", None
|
198
|
+
), # Include confidence if available
|
199
|
+
"corrected_text": corrected,
|
200
|
+
"modified": False,
|
201
|
+
}
|
202
|
+
)
|
172
203
|
|
173
204
|
# 4. Add page data to manifest if it has regions
|
174
205
|
if page_regions_data:
|
175
|
-
manifest_data["pages"].append(
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
206
|
+
manifest_data["pages"].append(
|
207
|
+
{
|
208
|
+
"pdf_source": pdf_path,
|
209
|
+
"pdf_short_id": pdf_short_id,
|
210
|
+
"page_number": page.number,
|
211
|
+
"page_index": page.index,
|
212
|
+
"image_path": f"images/{image_filename}", # Relative path within zip
|
213
|
+
"width": page.width,
|
214
|
+
"height": page.height,
|
215
|
+
"regions": page_regions_data,
|
216
|
+
}
|
217
|
+
)
|
185
218
|
else:
|
186
219
|
# If, after checks, no valid regions remain, ensure flag is correct
|
187
220
|
pdf_has_ocr_regions = False
|
188
221
|
|
189
|
-
|
190
|
-
# --- Final Checks and Zipping ---
|
222
|
+
# --- Final Checks and Zipping ---
|
191
223
|
if not manifest_data["pages"] or total_regions_found == 0:
|
192
|
-
|
193
|
-
|
194
|
-
|
224
|
+
logger.error(
|
225
|
+
"No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package."
|
226
|
+
)
|
227
|
+
# Consider raising ValueError here instead of just returning
|
228
|
+
raise ValueError("No valid pages with OCR data found to create a task package.")
|
195
229
|
|
196
230
|
manifest_path = os.path.join(temp_dir, "manifest.json")
|
197
231
|
try:
|
198
|
-
with open(manifest_path,
|
232
|
+
with open(manifest_path, "w", encoding="utf-8") as f_manifest:
|
199
233
|
json.dump(manifest_data, f_manifest, indent=2)
|
200
234
|
except Exception as e:
|
201
235
|
logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
|
202
|
-
raise
|
236
|
+
raise # Re-raise error, cannot proceed
|
203
237
|
|
204
238
|
# --- Copy SPA files into temp dir ---
|
205
239
|
try:
|
206
240
|
# Find the path to the spa template directory relative to this file
|
207
241
|
# Using __file__ assumes this script is installed alongside the templates
|
208
242
|
utils_dir = os.path.dirname(os.path.abspath(__file__))
|
209
|
-
templates_dir = os.path.join(
|
210
|
-
|
243
|
+
templates_dir = os.path.join(
|
244
|
+
os.path.dirname(utils_dir), "templates"
|
245
|
+
) # Go up one level from utils
|
246
|
+
spa_template_dir = os.path.join(templates_dir, "spa")
|
211
247
|
|
212
248
|
if not os.path.isdir(spa_template_dir):
|
213
249
|
raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")
|
@@ -224,32 +260,34 @@ def create_correction_task_package(
|
|
224
260
|
# --- Create the final zip file ---
|
225
261
|
try:
|
226
262
|
logger.info(f"Creating zip package at: {output_zip_path}")
|
227
|
-
with zipfile.ZipFile(output_zip_path,
|
263
|
+
with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
|
228
264
|
# Add manifest.json
|
229
265
|
zipf.write(manifest_path, arcname="manifest.json")
|
230
266
|
# Add images directory
|
231
267
|
for root, _, files in os.walk(images_dir):
|
232
268
|
for file in files:
|
233
269
|
full_path = os.path.join(root, file)
|
234
|
-
# Create the correct archive name (e.g., images/...)
|
270
|
+
# Create the correct archive name (e.g., images/...)
|
235
271
|
arcname = os.path.relpath(full_path, temp_dir)
|
236
272
|
zipf.write(full_path, arcname=arcname)
|
237
|
-
logger.info(
|
273
|
+
logger.info(
|
274
|
+
f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)"
|
275
|
+
)
|
238
276
|
|
239
277
|
except Exception as e:
|
240
278
|
logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
|
241
279
|
# Attempt to clean up existing zip if creation failed partially
|
242
280
|
if os.path.exists(output_zip_path):
|
243
|
-
try:
|
244
|
-
|
245
|
-
|
281
|
+
try:
|
282
|
+
os.remove(output_zip_path)
|
283
|
+
except:
|
284
|
+
pass
|
285
|
+
raise # Re-raise error
|
246
286
|
|
247
|
-
# Temporary directory is automatically cleaned up by context manager
|
287
|
+
# Temporary directory is automatically cleaned up by context manager
|
248
288
|
|
249
|
-
|
250
|
-
|
251
|
-
manifest_path: str
|
252
|
-
) -> Dict[str, int]:
|
289
|
+
|
290
|
+
def import_ocr_from_manifest(pdf: "PDF", manifest_path: str) -> Dict[str, int]:
|
253
291
|
"""
|
254
292
|
Imports OCR data into a PDF object from a manifest file.
|
255
293
|
|
@@ -275,8 +313,8 @@ def import_ocr_from_manifest(
|
|
275
313
|
ValueError: If the manifest is invalid or contains data for a different PDF.
|
276
314
|
TypeError: If the input pdf object is not a valid PDF instance.
|
277
315
|
"""
|
278
|
-
if not (hasattr(pdf,
|
279
|
-
|
316
|
+
if not (hasattr(pdf, "__class__") and pdf.__class__.__name__ == "PDF"):
|
317
|
+
raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
|
280
318
|
|
281
319
|
if not os.path.exists(manifest_path):
|
282
320
|
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
@@ -284,7 +322,7 @@ def import_ocr_from_manifest(
|
|
284
322
|
logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")
|
285
323
|
|
286
324
|
try:
|
287
|
-
with open(manifest_path,
|
325
|
+
with open(manifest_path, "r", encoding="utf-8") as f:
|
288
326
|
manifest_data = json.load(f)
|
289
327
|
except json.JSONDecodeError as e:
|
290
328
|
logger.error(f"Failed to parse manifest file: {e}")
|
@@ -300,18 +338,21 @@ def import_ocr_from_manifest(
|
|
300
338
|
manifest_pages = manifest_data.get("pages", [])
|
301
339
|
if not manifest_pages:
|
302
340
|
logger.warning("Manifest contains no page data.")
|
303
|
-
return {
|
341
|
+
return {"imported": 0, "skipped": 0}
|
304
342
|
|
305
343
|
# --- Pre-check PDF source consistency ---
|
306
344
|
first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
|
307
345
|
if first_manifest_pdf_path != pdf.path:
|
308
346
|
# Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
|
309
347
|
if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
|
310
|
-
logger.error(
|
348
|
+
logger.error(
|
349
|
+
f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting."
|
350
|
+
)
|
311
351
|
raise ValueError("Manifest source PDF does not match the provided PDF object.")
|
312
352
|
else:
|
313
|
-
logger.warning(
|
314
|
-
|
353
|
+
logger.warning(
|
354
|
+
f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously."
|
355
|
+
)
|
315
356
|
|
316
357
|
pdf_pages_by_index = {page.index: page for page in pdf.pages}
|
317
358
|
|
@@ -320,19 +361,27 @@ def import_ocr_from_manifest(
|
|
320
361
|
manifest_pdf_path = page_data.get("pdf_source")
|
321
362
|
|
322
363
|
# Check consistency for every page? (Maybe overkill if pre-checked)
|
323
|
-
if manifest_pdf_path != pdf.path and os.path.basename(
|
324
|
-
|
325
|
-
|
326
|
-
|
364
|
+
if manifest_pdf_path != pdf.path and os.path.basename(
|
365
|
+
manifest_pdf_path
|
366
|
+
) != os.path.basename(pdf.path):
|
367
|
+
logger.warning(
|
368
|
+
f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')"
|
369
|
+
)
|
370
|
+
skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
|
371
|
+
continue
|
327
372
|
|
328
373
|
if page_index is None:
|
329
|
-
logger.warning(
|
374
|
+
logger.warning(
|
375
|
+
f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}"
|
376
|
+
)
|
330
377
|
skipped_count += len(page_data.get("regions", []))
|
331
378
|
continue
|
332
379
|
|
333
380
|
page = pdf_pages_by_index.get(page_index)
|
334
381
|
if page is None:
|
335
|
-
logger.warning(
|
382
|
+
logger.warning(
|
383
|
+
f"Could not find page with index {page_index} in the target PDF. Skipping."
|
384
|
+
)
|
336
385
|
skipped_count += len(page_data.get("regions", []))
|
337
386
|
continue
|
338
387
|
|
@@ -353,11 +402,13 @@ def import_ocr_from_manifest(
|
|
353
402
|
if text_to_import is None:
|
354
403
|
text_to_import = region_data.get("ocr_text")
|
355
404
|
|
356
|
-
resolution = region_data.get("resolution")
|
357
|
-
confidence = region_data.get("confidence")
|
405
|
+
resolution = region_data.get("resolution") # Mandatory from export
|
406
|
+
confidence = region_data.get("confidence") # Optional
|
358
407
|
|
359
408
|
if not all([manifest_bbox, text_to_import is not None, resolution]):
|
360
|
-
logger.warning(
|
409
|
+
logger.warning(
|
410
|
+
f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution."
|
411
|
+
)
|
361
412
|
skipped_count += 1
|
362
413
|
continue
|
363
414
|
|
@@ -369,11 +420,13 @@ def import_ocr_from_manifest(
|
|
369
420
|
pdf_x1 = manifest_bbox[2] * scale_factor
|
370
421
|
pdf_bottom = manifest_bbox[3] * scale_factor
|
371
422
|
except (ValueError, TypeError, IndexError, ZeroDivisionError):
|
372
|
-
logger.warning(
|
423
|
+
logger.warning(
|
424
|
+
f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping."
|
425
|
+
)
|
373
426
|
skipped_count += 1
|
374
427
|
continue
|
375
428
|
|
376
|
-
# --- Create New Element ---
|
429
|
+
# --- Create New Element ---
|
377
430
|
try:
|
378
431
|
new_element = TextElement(
|
379
432
|
text=text_to_import,
|
@@ -381,31 +434,44 @@ def import_ocr_from_manifest(
|
|
381
434
|
top=pdf_top,
|
382
435
|
x1=pdf_x1,
|
383
436
|
bottom=pdf_bottom,
|
384
|
-
page=page,
|
385
|
-
source=
|
386
|
-
confidence=confidence,
|
437
|
+
page=page, # Reference to the parent Page object
|
438
|
+
source="manifest-import", # Indicate origin
|
439
|
+
confidence=confidence, # Pass confidence if available
|
387
440
|
# Add metadata from manifest if needed? Maybe original_ocr?
|
388
|
-
metadata=
|
441
|
+
metadata=(
|
442
|
+
{"original_ocr": region_data.get("ocr_text")}
|
443
|
+
if region_data.get("ocr_text") != text_to_import
|
444
|
+
else {}
|
445
|
+
),
|
389
446
|
)
|
390
447
|
regions_to_add.append(new_element)
|
391
448
|
imported_count += 1
|
392
449
|
except Exception as e:
|
393
|
-
|
394
|
-
|
450
|
+
logger.error(
|
451
|
+
f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}",
|
452
|
+
exc_info=True,
|
453
|
+
)
|
454
|
+
skipped_count += 1
|
395
455
|
|
396
456
|
# --- Add Elements to Page ---
|
397
457
|
# Add all created elements for this page in one go
|
398
458
|
if regions_to_add:
|
399
459
|
try:
|
400
460
|
# Accessing _elements directly; use manager if a public add method exists
|
401
|
-
if
|
461
|
+
if (
|
462
|
+
hasattr(page, "_elements")
|
463
|
+
and hasattr(page._elements, "elements")
|
464
|
+
and isinstance(page._elements.elements, list)
|
465
|
+
):
|
402
466
|
page._elements.elements.extend(regions_to_add)
|
403
467
|
# TODO: Should potentially invalidate page element cache if exists
|
404
468
|
else:
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
469
|
+
logger.error(
|
470
|
+
f"Could not add elements to page {page.index}, page._elements structure unexpected."
|
471
|
+
)
|
472
|
+
# Decrement count as they weren't actually added
|
473
|
+
imported_count -= len(regions_to_add)
|
474
|
+
skipped_count += len(regions_to_add)
|
409
475
|
|
410
476
|
except Exception as e:
|
411
477
|
logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
|
@@ -413,6 +479,7 @@ def import_ocr_from_manifest(
|
|
413
479
|
imported_count -= len(regions_to_add)
|
414
480
|
skipped_count += len(regions_to_add)
|
415
481
|
|
416
|
-
|
417
|
-
|
418
|
-
|
482
|
+
logger.info(
|
483
|
+
f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest."
|
484
|
+
)
|
485
|
+
return {"imported": imported_count, "skipped": skipped_count}
|