natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -28,11 +28,12 @@ from natural_pdf.utils.identifiers import generate_short_path_hash
28
28
 
29
29
  logger = logging.getLogger(__name__)
30
30
 
31
+
31
32
  def create_correction_task_package(
32
- source: Union['PDF', 'PDFCollection', List['PDF']],
33
+ source: Union["PDF", "PDFCollection", List["PDF"]],
33
34
  output_zip_path: str,
34
35
  overwrite: bool = False,
35
- suggest = None,
36
+ suggest=None,
36
37
  resolution: int = 150,
37
38
  ) -> None:
38
39
  """
@@ -55,27 +56,32 @@ def create_correction_task_package(
55
56
  ValueError: If no valid pages with OCR data are found in the source.
56
57
  """
57
58
  if os.path.exists(output_zip_path) and not overwrite:
58
- raise FileExistsError(f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it.")
59
+ raise FileExistsError(
60
+ f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it."
61
+ )
59
62
 
60
63
  # --- Resolve source to a list of PDF objects ---
61
- pdfs_to_process: List['PDF'] = []
62
- if hasattr(source, '__class__') and source.__class__.__name__ == 'PDF': # Check type without direct import
64
+ pdfs_to_process: List["PDF"] = []
65
+ if (
66
+ hasattr(source, "__class__") and source.__class__.__name__ == "PDF"
67
+ ): # Check type without direct import
63
68
  pdfs_to_process = [source]
64
- elif hasattr(source, '__class__') and source.__class__.__name__ == 'PDFCollection':
65
- pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
66
- elif isinstance(source, list) and all(hasattr(p, '__class__') and p.__class__.__name__ == 'PDF' for p in source):
69
+ elif hasattr(source, "__class__") and source.__class__.__name__ == "PDFCollection":
70
+ pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
71
+ elif isinstance(source, list) and all(
72
+ hasattr(p, "__class__") and p.__class__.__name__ == "PDF" for p in source
73
+ ):
67
74
  pdfs_to_process = source
68
75
  else:
69
- raise TypeError(f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF].")
76
+ raise TypeError(
77
+ f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
78
+ )
70
79
 
71
80
  if not pdfs_to_process:
72
81
  logger.warning("No PDF documents provided in the source.")
73
82
  return
74
83
 
75
- manifest_data = {
76
- "pdfs": [], # Store pdf-level info if needed later?
77
- "pages": []
78
- }
84
+ manifest_data = {"pdfs": [], "pages": []} # Store pdf-level info if needed later?
79
85
  total_regions_found = 0
80
86
 
81
87
  # Use a temporary directory for staging files before zipping
@@ -84,38 +90,52 @@ def create_correction_task_package(
84
90
  os.makedirs(images_dir)
85
91
  logger.info(f"Using temporary directory for staging: {temp_dir}")
86
92
 
87
- # --- Process each PDF ---
93
+ # --- Process each PDF ---
88
94
  for pdf in pdfs_to_process:
89
- if not hasattr(pdf, 'path') or not hasattr(pdf, 'pages'):
95
+ if not hasattr(pdf, "path") or not hasattr(pdf, "pages"):
90
96
  logger.warning(f"Skipping invalid PDF object: {pdf}")
91
97
  continue
92
98
 
93
- pdf_path = pdf.path # Should be the resolved, absolute path
99
+ pdf_path = pdf.path # Should be the resolved, absolute path
94
100
  pdf_short_id = generate_short_path_hash(pdf_path)
95
101
  logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")
96
102
 
97
103
  pdf_has_ocr_regions = False
98
104
  for page in pdf.pages:
99
- if not hasattr(page, 'index') or not hasattr(page, 'number') or \
100
- not hasattr(page, 'width') or not hasattr(page, 'height') or \
101
- not hasattr(page, 'find_all') or not hasattr(page, 'to_image'):
102
- logger.warning(f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}")
105
+ if (
106
+ not hasattr(page, "index")
107
+ or not hasattr(page, "number")
108
+ or not hasattr(page, "width")
109
+ or not hasattr(page, "height")
110
+ or not hasattr(page, "find_all")
111
+ or not hasattr(page, "to_image")
112
+ ):
113
+ logger.warning(
114
+ f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}"
115
+ )
103
116
  continue
104
117
 
105
118
  # 1. Extract OCR elements for this page
106
119
  try:
107
120
  # Important: apply_exclusions=False ensures we get *all* OCR data
108
121
  # regardless of user exclusions set on the PDF/page object.
109
- ocr_elements = page.find_all('text[source=ocr]', apply_exclusions=False).elements
122
+ ocr_elements = page.find_all(
123
+ "text[source=ocr]", apply_exclusions=False
124
+ ).elements
110
125
  except Exception as e:
111
- logger.error(f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}", exc_info=True)
112
- continue # Skip this page if element extraction fails
126
+ logger.error(
127
+ f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}",
128
+ exc_info=True,
129
+ )
130
+ continue # Skip this page if element extraction fails
113
131
 
114
132
  if not ocr_elements:
115
- logger.debug(f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest.")
116
- continue # Skip page if no OCR elements
133
+ logger.debug(
134
+ f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest."
135
+ )
136
+ continue # Skip page if no OCR elements
117
137
 
118
- pdf_has_ocr_regions = True # Mark that this PDF is relevant
138
+ pdf_has_ocr_regions = True # Mark that this PDF is relevant
119
139
  logger.debug(f" Found {len(ocr_elements)} OCR elements on page {page.number}")
120
140
  total_regions_found += len(ocr_elements)
121
141
 
@@ -128,9 +148,12 @@ def create_correction_task_package(
128
148
  raise ValueError("page.to_image returned None")
129
149
  img.save(image_save_path, "PNG")
130
150
  except Exception as e:
131
- logger.error(f"Failed to render/save image for {pdf_path} page {page.number}: {e}", exc_info=True)
151
+ logger.error(
152
+ f"Failed to render/save image for {pdf_path} page {page.number}: {e}",
153
+ exc_info=True,
154
+ )
132
155
  # If image fails, we cannot proceed with this page for the task
133
- pdf_has_ocr_regions = False # Reset flag for this page
156
+ pdf_has_ocr_regions = False # Reset flag for this page
134
157
  continue
135
158
 
136
159
  # 3. Prepare region data for manifest
@@ -142,72 +165,85 @@ def create_correction_task_package(
142
165
  for elem in tqdm(ocr_elements):
143
166
  i += 1
144
167
  # Basic check for necessary attributes
145
- if not all(hasattr(elem, attr) for attr in ['x0', 'top', 'x1', 'bottom', 'text']):
146
- logger.warning(f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}")
168
+ if not all(
169
+ hasattr(elem, attr) for attr in ["x0", "top", "x1", "bottom", "text"]
170
+ ):
171
+ logger.warning(
172
+ f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}"
173
+ )
147
174
  continue
148
- region_id = f"r_{page.index}_{i}" # ID unique within page
175
+ region_id = f"r_{page.index}_{i}" # ID unique within page
149
176
 
150
177
  # Scale coordinates to match the 300 DPI image
151
178
  scaled_bbox = [
152
179
  elem.x0 * coord_scale_factor,
153
180
  elem.top * coord_scale_factor,
154
181
  elem.x1 * coord_scale_factor,
155
- elem.bottom * coord_scale_factor
182
+ elem.bottom * coord_scale_factor,
156
183
  ]
157
184
 
158
185
  corrected = elem.text
159
186
 
160
187
  if suggest:
161
- corrected = suggest(elem.to_region(), getattr(elem, 'confidence', None))
162
-
163
- page_regions_data.append({
164
- "resolution": resolution,
165
- "id": region_id,
166
- "bbox": scaled_bbox,
167
- "ocr_text": elem.text,
168
- "confidence": getattr(elem, 'confidence', None), # Include confidence if available
169
- "corrected_text": corrected,
170
- "modified": False
171
- })
188
+ corrected = suggest(elem.to_region(), getattr(elem, "confidence", None))
189
+
190
+ page_regions_data.append(
191
+ {
192
+ "resolution": resolution,
193
+ "id": region_id,
194
+ "bbox": scaled_bbox,
195
+ "ocr_text": elem.text,
196
+ "confidence": getattr(
197
+ elem, "confidence", None
198
+ ), # Include confidence if available
199
+ "corrected_text": corrected,
200
+ "modified": False,
201
+ }
202
+ )
172
203
 
173
204
  # 4. Add page data to manifest if it has regions
174
205
  if page_regions_data:
175
- manifest_data["pages"].append({
176
- "pdf_source": pdf_path,
177
- "pdf_short_id": pdf_short_id,
178
- "page_number": page.number,
179
- "page_index": page.index,
180
- "image_path": f"images/{image_filename}", # Relative path within zip
181
- "width": page.width,
182
- "height": page.height,
183
- "regions": page_regions_data
184
- })
206
+ manifest_data["pages"].append(
207
+ {
208
+ "pdf_source": pdf_path,
209
+ "pdf_short_id": pdf_short_id,
210
+ "page_number": page.number,
211
+ "page_index": page.index,
212
+ "image_path": f"images/{image_filename}", # Relative path within zip
213
+ "width": page.width,
214
+ "height": page.height,
215
+ "regions": page_regions_data,
216
+ }
217
+ )
185
218
  else:
186
219
  # If, after checks, no valid regions remain, ensure flag is correct
187
220
  pdf_has_ocr_regions = False
188
221
 
189
-
190
- # --- Final Checks and Zipping ---
222
+ # --- Final Checks and Zipping ---
191
223
  if not manifest_data["pages"] or total_regions_found == 0:
192
- logger.error("No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package.")
193
- # Consider raising ValueError here instead of just returning
194
- raise ValueError("No valid pages with OCR data found to create a task package.")
224
+ logger.error(
225
+ "No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package."
226
+ )
227
+ # Consider raising ValueError here instead of just returning
228
+ raise ValueError("No valid pages with OCR data found to create a task package.")
195
229
 
196
230
  manifest_path = os.path.join(temp_dir, "manifest.json")
197
231
  try:
198
- with open(manifest_path, 'w', encoding='utf-8') as f_manifest:
232
+ with open(manifest_path, "w", encoding="utf-8") as f_manifest:
199
233
  json.dump(manifest_data, f_manifest, indent=2)
200
234
  except Exception as e:
201
235
  logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
202
- raise # Re-raise error, cannot proceed
236
+ raise # Re-raise error, cannot proceed
203
237
 
204
238
  # --- Copy SPA files into temp dir ---
205
239
  try:
206
240
  # Find the path to the spa template directory relative to this file
207
241
  # Using __file__ assumes this script is installed alongside the templates
208
242
  utils_dir = os.path.dirname(os.path.abspath(__file__))
209
- templates_dir = os.path.join(os.path.dirname(utils_dir), 'templates') # Go up one level from utils
210
- spa_template_dir = os.path.join(templates_dir, 'spa')
243
+ templates_dir = os.path.join(
244
+ os.path.dirname(utils_dir), "templates"
245
+ ) # Go up one level from utils
246
+ spa_template_dir = os.path.join(templates_dir, "spa")
211
247
 
212
248
  if not os.path.isdir(spa_template_dir):
213
249
  raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")
@@ -224,32 +260,34 @@ def create_correction_task_package(
224
260
  # --- Create the final zip file ---
225
261
  try:
226
262
  logger.info(f"Creating zip package at: {output_zip_path}")
227
- with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
263
+ with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
228
264
  # Add manifest.json
229
265
  zipf.write(manifest_path, arcname="manifest.json")
230
266
  # Add images directory
231
267
  for root, _, files in os.walk(images_dir):
232
268
  for file in files:
233
269
  full_path = os.path.join(root, file)
234
- # Create the correct archive name (e.g., images/...)
270
+ # Create the correct archive name (e.g., images/...)
235
271
  arcname = os.path.relpath(full_path, temp_dir)
236
272
  zipf.write(full_path, arcname=arcname)
237
- logger.info(f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)")
273
+ logger.info(
274
+ f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)"
275
+ )
238
276
 
239
277
  except Exception as e:
240
278
  logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
241
279
  # Attempt to clean up existing zip if creation failed partially
242
280
  if os.path.exists(output_zip_path):
243
- try: os.remove(output_zip_path)
244
- except: pass
245
- raise # Re-raise error
281
+ try:
282
+ os.remove(output_zip_path)
283
+ except:
284
+ pass
285
+ raise # Re-raise error
246
286
 
247
- # Temporary directory is automatically cleaned up by context manager
287
+ # Temporary directory is automatically cleaned up by context manager
248
288
 
249
- def import_ocr_from_manifest(
250
- pdf: 'PDF',
251
- manifest_path: str
252
- ) -> Dict[str, int]:
289
+
290
+ def import_ocr_from_manifest(pdf: "PDF", manifest_path: str) -> Dict[str, int]:
253
291
  """
254
292
  Imports OCR data into a PDF object from a manifest file.
255
293
 
@@ -275,8 +313,8 @@ def import_ocr_from_manifest(
275
313
  ValueError: If the manifest is invalid or contains data for a different PDF.
276
314
  TypeError: If the input pdf object is not a valid PDF instance.
277
315
  """
278
- if not (hasattr(pdf, '__class__') and pdf.__class__.__name__ == 'PDF'):
279
- raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
316
+ if not (hasattr(pdf, "__class__") and pdf.__class__.__name__ == "PDF"):
317
+ raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
280
318
 
281
319
  if not os.path.exists(manifest_path):
282
320
  raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
@@ -284,7 +322,7 @@ def import_ocr_from_manifest(
284
322
  logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")
285
323
 
286
324
  try:
287
- with open(manifest_path, 'r', encoding='utf-8') as f:
325
+ with open(manifest_path, "r", encoding="utf-8") as f:
288
326
  manifest_data = json.load(f)
289
327
  except json.JSONDecodeError as e:
290
328
  logger.error(f"Failed to parse manifest file: {e}")
@@ -300,18 +338,21 @@ def import_ocr_from_manifest(
300
338
  manifest_pages = manifest_data.get("pages", [])
301
339
  if not manifest_pages:
302
340
  logger.warning("Manifest contains no page data.")
303
- return {'imported': 0, 'skipped': 0}
341
+ return {"imported": 0, "skipped": 0}
304
342
 
305
343
  # --- Pre-check PDF source consistency ---
306
344
  first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
307
345
  if first_manifest_pdf_path != pdf.path:
308
346
  # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
309
347
  if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
310
- logger.error(f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting.")
348
+ logger.error(
349
+ f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting."
350
+ )
311
351
  raise ValueError("Manifest source PDF does not match the provided PDF object.")
312
352
  else:
313
- logger.warning(f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously.")
314
-
353
+ logger.warning(
354
+ f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously."
355
+ )
315
356
 
316
357
  pdf_pages_by_index = {page.index: page for page in pdf.pages}
317
358
 
@@ -320,19 +361,27 @@ def import_ocr_from_manifest(
320
361
  manifest_pdf_path = page_data.get("pdf_source")
321
362
 
322
363
  # Check consistency for every page? (Maybe overkill if pre-checked)
323
- if manifest_pdf_path != pdf.path and os.path.basename(manifest_pdf_path) != os.path.basename(pdf.path):
324
- logger.warning(f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')")
325
- skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
326
- continue
364
+ if manifest_pdf_path != pdf.path and os.path.basename(
365
+ manifest_pdf_path
366
+ ) != os.path.basename(pdf.path):
367
+ logger.warning(
368
+ f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')"
369
+ )
370
+ skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
371
+ continue
327
372
 
328
373
  if page_index is None:
329
- logger.warning(f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}")
374
+ logger.warning(
375
+ f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}"
376
+ )
330
377
  skipped_count += len(page_data.get("regions", []))
331
378
  continue
332
379
 
333
380
  page = pdf_pages_by_index.get(page_index)
334
381
  if page is None:
335
- logger.warning(f"Could not find page with index {page_index} in the target PDF. Skipping.")
382
+ logger.warning(
383
+ f"Could not find page with index {page_index} in the target PDF. Skipping."
384
+ )
336
385
  skipped_count += len(page_data.get("regions", []))
337
386
  continue
338
387
 
@@ -353,11 +402,13 @@ def import_ocr_from_manifest(
353
402
  if text_to_import is None:
354
403
  text_to_import = region_data.get("ocr_text")
355
404
 
356
- resolution = region_data.get("resolution") # Mandatory from export
357
- confidence = region_data.get("confidence") # Optional
405
+ resolution = region_data.get("resolution") # Mandatory from export
406
+ confidence = region_data.get("confidence") # Optional
358
407
 
359
408
  if not all([manifest_bbox, text_to_import is not None, resolution]):
360
- logger.warning(f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution.")
409
+ logger.warning(
410
+ f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution."
411
+ )
361
412
  skipped_count += 1
362
413
  continue
363
414
 
@@ -369,11 +420,13 @@ def import_ocr_from_manifest(
369
420
  pdf_x1 = manifest_bbox[2] * scale_factor
370
421
  pdf_bottom = manifest_bbox[3] * scale_factor
371
422
  except (ValueError, TypeError, IndexError, ZeroDivisionError):
372
- logger.warning(f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping.")
423
+ logger.warning(
424
+ f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping."
425
+ )
373
426
  skipped_count += 1
374
427
  continue
375
428
 
376
- # --- Create New Element ---
429
+ # --- Create New Element ---
377
430
  try:
378
431
  new_element = TextElement(
379
432
  text=text_to_import,
@@ -381,31 +434,44 @@ def import_ocr_from_manifest(
381
434
  top=pdf_top,
382
435
  x1=pdf_x1,
383
436
  bottom=pdf_bottom,
384
- page=page, # Reference to the parent Page object
385
- source='manifest-import', # Indicate origin
386
- confidence=confidence, # Pass confidence if available
437
+ page=page, # Reference to the parent Page object
438
+ source="manifest-import", # Indicate origin
439
+ confidence=confidence, # Pass confidence if available
387
440
  # Add metadata from manifest if needed? Maybe original_ocr?
388
- metadata={'original_ocr': region_data.get("ocr_text")} if region_data.get("ocr_text") != text_to_import else {}
441
+ metadata=(
442
+ {"original_ocr": region_data.get("ocr_text")}
443
+ if region_data.get("ocr_text") != text_to_import
444
+ else {}
445
+ ),
389
446
  )
390
447
  regions_to_add.append(new_element)
391
448
  imported_count += 1
392
449
  except Exception as e:
393
- logger.error(f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}", exc_info=True)
394
- skipped_count += 1
450
+ logger.error(
451
+ f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}",
452
+ exc_info=True,
453
+ )
454
+ skipped_count += 1
395
455
 
396
456
  # --- Add Elements to Page ---
397
457
  # Add all created elements for this page in one go
398
458
  if regions_to_add:
399
459
  try:
400
460
  # Accessing _elements directly; use manager if a public add method exists
401
- if hasattr(page, '_elements') and hasattr(page._elements, 'elements') and isinstance(page._elements.elements, list):
461
+ if (
462
+ hasattr(page, "_elements")
463
+ and hasattr(page._elements, "elements")
464
+ and isinstance(page._elements.elements, list)
465
+ ):
402
466
  page._elements.elements.extend(regions_to_add)
403
467
  # TODO: Should potentially invalidate page element cache if exists
404
468
  else:
405
- logger.error(f"Could not add elements to page {page.index}, page._elements structure unexpected.")
406
- # Decrement count as they weren't actually added
407
- imported_count -= len(regions_to_add)
408
- skipped_count += len(regions_to_add)
469
+ logger.error(
470
+ f"Could not add elements to page {page.index}, page._elements structure unexpected."
471
+ )
472
+ # Decrement count as they weren't actually added
473
+ imported_count -= len(regions_to_add)
474
+ skipped_count += len(regions_to_add)
409
475
 
410
476
  except Exception as e:
411
477
  logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
@@ -413,6 +479,7 @@ def import_ocr_from_manifest(
413
479
  imported_count -= len(regions_to_add)
414
480
  skipped_count += len(regions_to_add)
415
481
 
416
-
417
- logger.info(f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest.")
418
- return {'imported': imported_count, 'skipped': skipped_count}
482
+ logger.info(
483
+ f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest."
484
+ )
485
+ return {"imported": imported_count, "skipped": skipped_count}