natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
  3. natural_pdf/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/gemini.py +63 -47
  5. natural_pdf/collections/pdf_collection.py +5 -2
  6. natural_pdf/core/element_manager.py +6 -4
  7. natural_pdf/core/page.py +36 -27
  8. natural_pdf/core/pdf.py +25 -16
  9. natural_pdf/elements/base.py +1 -3
  10. natural_pdf/elements/collections.py +13 -14
  11. natural_pdf/elements/region.py +7 -6
  12. natural_pdf/exporters/__init__.py +4 -0
  13. natural_pdf/exporters/base.py +61 -0
  14. natural_pdf/exporters/paddleocr.py +345 -0
  15. natural_pdf/ocr/__init__.py +16 -8
  16. natural_pdf/ocr/engine.py +46 -30
  17. natural_pdf/ocr/engine_easyocr.py +81 -40
  18. natural_pdf/ocr/engine_paddle.py +39 -28
  19. natural_pdf/ocr/engine_surya.py +32 -16
  20. natural_pdf/ocr/ocr_factory.py +34 -23
  21. natural_pdf/ocr/ocr_manager.py +15 -11
  22. natural_pdf/ocr/ocr_options.py +5 -0
  23. natural_pdf/ocr/utils.py +46 -31
  24. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  25. natural_pdf/utils/debug.py +4 -2
  26. natural_pdf/utils/identifiers.py +9 -5
  27. natural_pdf/utils/packaging.py +172 -105
  28. natural_pdf/utils/text_extraction.py +44 -64
  29. natural_pdf/utils/visualization.py +1 -1
  30. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
  31. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
  32. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
  33. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
  34. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
@@ -28,11 +28,12 @@ from natural_pdf.utils.identifiers import generate_short_path_hash
28
28
 
29
29
  logger = logging.getLogger(__name__)
30
30
 
31
+
31
32
  def create_correction_task_package(
32
- source: Union['PDF', 'PDFCollection', List['PDF']],
33
+ source: Union["PDF", "PDFCollection", List["PDF"]],
33
34
  output_zip_path: str,
34
35
  overwrite: bool = False,
35
- suggest = None,
36
+ suggest=None,
36
37
  resolution: int = 150,
37
38
  ) -> None:
38
39
  """
@@ -55,27 +56,32 @@ def create_correction_task_package(
55
56
  ValueError: If no valid pages with OCR data are found in the source.
56
57
  """
57
58
  if os.path.exists(output_zip_path) and not overwrite:
58
- raise FileExistsError(f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it.")
59
+ raise FileExistsError(
60
+ f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it."
61
+ )
59
62
 
60
63
  # --- Resolve source to a list of PDF objects ---
61
- pdfs_to_process: List['PDF'] = []
62
- if hasattr(source, '__class__') and source.__class__.__name__ == 'PDF': # Check type without direct import
64
+ pdfs_to_process: List["PDF"] = []
65
+ if (
66
+ hasattr(source, "__class__") and source.__class__.__name__ == "PDF"
67
+ ): # Check type without direct import
63
68
  pdfs_to_process = [source]
64
- elif hasattr(source, '__class__') and source.__class__.__name__ == 'PDFCollection':
65
- pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
66
- elif isinstance(source, list) and all(hasattr(p, '__class__') and p.__class__.__name__ == 'PDF' for p in source):
69
+ elif hasattr(source, "__class__") and source.__class__.__name__ == "PDFCollection":
70
+ pdfs_to_process = source.pdfs # Assuming PDFCollection has a .pdfs property
71
+ elif isinstance(source, list) and all(
72
+ hasattr(p, "__class__") and p.__class__.__name__ == "PDF" for p in source
73
+ ):
67
74
  pdfs_to_process = source
68
75
  else:
69
- raise TypeError(f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF].")
76
+ raise TypeError(
77
+ f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
78
+ )
70
79
 
71
80
  if not pdfs_to_process:
72
81
  logger.warning("No PDF documents provided in the source.")
73
82
  return
74
83
 
75
- manifest_data = {
76
- "pdfs": [], # Store pdf-level info if needed later?
77
- "pages": []
78
- }
84
+ manifest_data = {"pdfs": [], "pages": []} # Store pdf-level info if needed later?
79
85
  total_regions_found = 0
80
86
 
81
87
  # Use a temporary directory for staging files before zipping
@@ -84,38 +90,52 @@ def create_correction_task_package(
84
90
  os.makedirs(images_dir)
85
91
  logger.info(f"Using temporary directory for staging: {temp_dir}")
86
92
 
87
- # --- Process each PDF ---
93
+ # --- Process each PDF ---
88
94
  for pdf in pdfs_to_process:
89
- if not hasattr(pdf, 'path') or not hasattr(pdf, 'pages'):
95
+ if not hasattr(pdf, "path") or not hasattr(pdf, "pages"):
90
96
  logger.warning(f"Skipping invalid PDF object: {pdf}")
91
97
  continue
92
98
 
93
- pdf_path = pdf.path # Should be the resolved, absolute path
99
+ pdf_path = pdf.path # Should be the resolved, absolute path
94
100
  pdf_short_id = generate_short_path_hash(pdf_path)
95
101
  logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")
96
102
 
97
103
  pdf_has_ocr_regions = False
98
104
  for page in pdf.pages:
99
- if not hasattr(page, 'index') or not hasattr(page, 'number') or \
100
- not hasattr(page, 'width') or not hasattr(page, 'height') or \
101
- not hasattr(page, 'find_all') or not hasattr(page, 'to_image'):
102
- logger.warning(f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}")
105
+ if (
106
+ not hasattr(page, "index")
107
+ or not hasattr(page, "number")
108
+ or not hasattr(page, "width")
109
+ or not hasattr(page, "height")
110
+ or not hasattr(page, "find_all")
111
+ or not hasattr(page, "to_image")
112
+ ):
113
+ logger.warning(
114
+ f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}"
115
+ )
103
116
  continue
104
117
 
105
118
  # 1. Extract OCR elements for this page
106
119
  try:
107
120
  # Important: apply_exclusions=False ensures we get *all* OCR data
108
121
  # regardless of user exclusions set on the PDF/page object.
109
- ocr_elements = page.find_all('text[source=ocr]', apply_exclusions=False).elements
122
+ ocr_elements = page.find_all(
123
+ "text[source=ocr]", apply_exclusions=False
124
+ ).elements
110
125
  except Exception as e:
111
- logger.error(f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}", exc_info=True)
112
- continue # Skip this page if element extraction fails
126
+ logger.error(
127
+ f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}",
128
+ exc_info=True,
129
+ )
130
+ continue # Skip this page if element extraction fails
113
131
 
114
132
  if not ocr_elements:
115
- logger.debug(f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest.")
116
- continue # Skip page if no OCR elements
133
+ logger.debug(
134
+ f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest."
135
+ )
136
+ continue # Skip page if no OCR elements
117
137
 
118
- pdf_has_ocr_regions = True # Mark that this PDF is relevant
138
+ pdf_has_ocr_regions = True # Mark that this PDF is relevant
119
139
  logger.debug(f" Found {len(ocr_elements)} OCR elements on page {page.number}")
120
140
  total_regions_found += len(ocr_elements)
121
141
 
@@ -128,9 +148,12 @@ def create_correction_task_package(
128
148
  raise ValueError("page.to_image returned None")
129
149
  img.save(image_save_path, "PNG")
130
150
  except Exception as e:
131
- logger.error(f"Failed to render/save image for {pdf_path} page {page.number}: {e}", exc_info=True)
151
+ logger.error(
152
+ f"Failed to render/save image for {pdf_path} page {page.number}: {e}",
153
+ exc_info=True,
154
+ )
132
155
  # If image fails, we cannot proceed with this page for the task
133
- pdf_has_ocr_regions = False # Reset flag for this page
156
+ pdf_has_ocr_regions = False # Reset flag for this page
134
157
  continue
135
158
 
136
159
  # 3. Prepare region data for manifest
@@ -142,72 +165,85 @@ def create_correction_task_package(
142
165
  for elem in tqdm(ocr_elements):
143
166
  i += 1
144
167
  # Basic check for necessary attributes
145
- if not all(hasattr(elem, attr) for attr in ['x0', 'top', 'x1', 'bottom', 'text']):
146
- logger.warning(f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}")
168
+ if not all(
169
+ hasattr(elem, attr) for attr in ["x0", "top", "x1", "bottom", "text"]
170
+ ):
171
+ logger.warning(
172
+ f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}"
173
+ )
147
174
  continue
148
- region_id = f"r_{page.index}_{i}" # ID unique within page
175
+ region_id = f"r_{page.index}_{i}" # ID unique within page
149
176
 
150
177
  # Scale coordinates to match the 300 DPI image
151
178
  scaled_bbox = [
152
179
  elem.x0 * coord_scale_factor,
153
180
  elem.top * coord_scale_factor,
154
181
  elem.x1 * coord_scale_factor,
155
- elem.bottom * coord_scale_factor
182
+ elem.bottom * coord_scale_factor,
156
183
  ]
157
184
 
158
185
  corrected = elem.text
159
186
 
160
187
  if suggest:
161
- corrected = suggest(elem.to_region(), getattr(elem, 'confidence', None))
162
-
163
- page_regions_data.append({
164
- "resolution": resolution,
165
- "id": region_id,
166
- "bbox": scaled_bbox,
167
- "ocr_text": elem.text,
168
- "confidence": getattr(elem, 'confidence', None), # Include confidence if available
169
- "corrected_text": corrected,
170
- "modified": False
171
- })
188
+ corrected = suggest(elem.to_region(), getattr(elem, "confidence", None))
189
+
190
+ page_regions_data.append(
191
+ {
192
+ "resolution": resolution,
193
+ "id": region_id,
194
+ "bbox": scaled_bbox,
195
+ "ocr_text": elem.text,
196
+ "confidence": getattr(
197
+ elem, "confidence", None
198
+ ), # Include confidence if available
199
+ "corrected_text": corrected,
200
+ "modified": False,
201
+ }
202
+ )
172
203
 
173
204
  # 4. Add page data to manifest if it has regions
174
205
  if page_regions_data:
175
- manifest_data["pages"].append({
176
- "pdf_source": pdf_path,
177
- "pdf_short_id": pdf_short_id,
178
- "page_number": page.number,
179
- "page_index": page.index,
180
- "image_path": f"images/{image_filename}", # Relative path within zip
181
- "width": page.width,
182
- "height": page.height,
183
- "regions": page_regions_data
184
- })
206
+ manifest_data["pages"].append(
207
+ {
208
+ "pdf_source": pdf_path,
209
+ "pdf_short_id": pdf_short_id,
210
+ "page_number": page.number,
211
+ "page_index": page.index,
212
+ "image_path": f"images/{image_filename}", # Relative path within zip
213
+ "width": page.width,
214
+ "height": page.height,
215
+ "regions": page_regions_data,
216
+ }
217
+ )
185
218
  else:
186
219
  # If, after checks, no valid regions remain, ensure flag is correct
187
220
  pdf_has_ocr_regions = False
188
221
 
189
-
190
- # --- Final Checks and Zipping ---
222
+ # --- Final Checks and Zipping ---
191
223
  if not manifest_data["pages"] or total_regions_found == 0:
192
- logger.error("No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package.")
193
- # Consider raising ValueError here instead of just returning
194
- raise ValueError("No valid pages with OCR data found to create a task package.")
224
+ logger.error(
225
+ "No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package."
226
+ )
227
+ # Consider raising ValueError here instead of just returning
228
+ raise ValueError("No valid pages with OCR data found to create a task package.")
195
229
 
196
230
  manifest_path = os.path.join(temp_dir, "manifest.json")
197
231
  try:
198
- with open(manifest_path, 'w', encoding='utf-8') as f_manifest:
232
+ with open(manifest_path, "w", encoding="utf-8") as f_manifest:
199
233
  json.dump(manifest_data, f_manifest, indent=2)
200
234
  except Exception as e:
201
235
  logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
202
- raise # Re-raise error, cannot proceed
236
+ raise # Re-raise error, cannot proceed
203
237
 
204
238
  # --- Copy SPA files into temp dir ---
205
239
  try:
206
240
  # Find the path to the spa template directory relative to this file
207
241
  # Using __file__ assumes this script is installed alongside the templates
208
242
  utils_dir = os.path.dirname(os.path.abspath(__file__))
209
- templates_dir = os.path.join(os.path.dirname(utils_dir), 'templates') # Go up one level from utils
210
- spa_template_dir = os.path.join(templates_dir, 'spa')
243
+ templates_dir = os.path.join(
244
+ os.path.dirname(utils_dir), "templates"
245
+ ) # Go up one level from utils
246
+ spa_template_dir = os.path.join(templates_dir, "spa")
211
247
 
212
248
  if not os.path.isdir(spa_template_dir):
213
249
  raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")
@@ -224,32 +260,34 @@ def create_correction_task_package(
224
260
  # --- Create the final zip file ---
225
261
  try:
226
262
  logger.info(f"Creating zip package at: {output_zip_path}")
227
- with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
263
+ with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
228
264
  # Add manifest.json
229
265
  zipf.write(manifest_path, arcname="manifest.json")
230
266
  # Add images directory
231
267
  for root, _, files in os.walk(images_dir):
232
268
  for file in files:
233
269
  full_path = os.path.join(root, file)
234
- # Create the correct archive name (e.g., images/...)
270
+ # Create the correct archive name (e.g., images/...)
235
271
  arcname = os.path.relpath(full_path, temp_dir)
236
272
  zipf.write(full_path, arcname=arcname)
237
- logger.info(f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)")
273
+ logger.info(
274
+ f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)"
275
+ )
238
276
 
239
277
  except Exception as e:
240
278
  logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
241
279
  # Attempt to clean up existing zip if creation failed partially
242
280
  if os.path.exists(output_zip_path):
243
- try: os.remove(output_zip_path)
244
- except: pass
245
- raise # Re-raise error
281
+ try:
282
+ os.remove(output_zip_path)
283
+ except:
284
+ pass
285
+ raise # Re-raise error
246
286
 
247
- # Temporary directory is automatically cleaned up by context manager
287
+ # Temporary directory is automatically cleaned up by context manager
248
288
 
249
- def import_ocr_from_manifest(
250
- pdf: 'PDF',
251
- manifest_path: str
252
- ) -> Dict[str, int]:
289
+
290
+ def import_ocr_from_manifest(pdf: "PDF", manifest_path: str) -> Dict[str, int]:
253
291
  """
254
292
  Imports OCR data into a PDF object from a manifest file.
255
293
 
@@ -275,8 +313,8 @@ def import_ocr_from_manifest(
275
313
  ValueError: If the manifest is invalid or contains data for a different PDF.
276
314
  TypeError: If the input pdf object is not a valid PDF instance.
277
315
  """
278
- if not (hasattr(pdf, '__class__') and pdf.__class__.__name__ == 'PDF'):
279
- raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
316
+ if not (hasattr(pdf, "__class__") and pdf.__class__.__name__ == "PDF"):
317
+ raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
280
318
 
281
319
  if not os.path.exists(manifest_path):
282
320
  raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
@@ -284,7 +322,7 @@ def import_ocr_from_manifest(
284
322
  logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")
285
323
 
286
324
  try:
287
- with open(manifest_path, 'r', encoding='utf-8') as f:
325
+ with open(manifest_path, "r", encoding="utf-8") as f:
288
326
  manifest_data = json.load(f)
289
327
  except json.JSONDecodeError as e:
290
328
  logger.error(f"Failed to parse manifest file: {e}")
@@ -300,18 +338,21 @@ def import_ocr_from_manifest(
300
338
  manifest_pages = manifest_data.get("pages", [])
301
339
  if not manifest_pages:
302
340
  logger.warning("Manifest contains no page data.")
303
- return {'imported': 0, 'skipped': 0}
341
+ return {"imported": 0, "skipped": 0}
304
342
 
305
343
  # --- Pre-check PDF source consistency ---
306
344
  first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
307
345
  if first_manifest_pdf_path != pdf.path:
308
346
  # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
309
347
  if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
310
- logger.error(f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting.")
348
+ logger.error(
349
+ f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting."
350
+ )
311
351
  raise ValueError("Manifest source PDF does not match the provided PDF object.")
312
352
  else:
313
- logger.warning(f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously.")
314
-
353
+ logger.warning(
354
+ f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously."
355
+ )
315
356
 
316
357
  pdf_pages_by_index = {page.index: page for page in pdf.pages}
317
358
 
@@ -320,19 +361,27 @@ def import_ocr_from_manifest(
320
361
  manifest_pdf_path = page_data.get("pdf_source")
321
362
 
322
363
  # Check consistency for every page? (Maybe overkill if pre-checked)
323
- if manifest_pdf_path != pdf.path and os.path.basename(manifest_pdf_path) != os.path.basename(pdf.path):
324
- logger.warning(f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')")
325
- skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
326
- continue
364
+ if manifest_pdf_path != pdf.path and os.path.basename(
365
+ manifest_pdf_path
366
+ ) != os.path.basename(pdf.path):
367
+ logger.warning(
368
+ f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')"
369
+ )
370
+ skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
371
+ continue
327
372
 
328
373
  if page_index is None:
329
- logger.warning(f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}")
374
+ logger.warning(
375
+ f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}"
376
+ )
330
377
  skipped_count += len(page_data.get("regions", []))
331
378
  continue
332
379
 
333
380
  page = pdf_pages_by_index.get(page_index)
334
381
  if page is None:
335
- logger.warning(f"Could not find page with index {page_index} in the target PDF. Skipping.")
382
+ logger.warning(
383
+ f"Could not find page with index {page_index} in the target PDF. Skipping."
384
+ )
336
385
  skipped_count += len(page_data.get("regions", []))
337
386
  continue
338
387
 
@@ -353,11 +402,13 @@ def import_ocr_from_manifest(
353
402
  if text_to_import is None:
354
403
  text_to_import = region_data.get("ocr_text")
355
404
 
356
- resolution = region_data.get("resolution") # Mandatory from export
357
- confidence = region_data.get("confidence") # Optional
405
+ resolution = region_data.get("resolution") # Mandatory from export
406
+ confidence = region_data.get("confidence") # Optional
358
407
 
359
408
  if not all([manifest_bbox, text_to_import is not None, resolution]):
360
- logger.warning(f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution.")
409
+ logger.warning(
410
+ f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution."
411
+ )
361
412
  skipped_count += 1
362
413
  continue
363
414
 
@@ -369,11 +420,13 @@ def import_ocr_from_manifest(
369
420
  pdf_x1 = manifest_bbox[2] * scale_factor
370
421
  pdf_bottom = manifest_bbox[3] * scale_factor
371
422
  except (ValueError, TypeError, IndexError, ZeroDivisionError):
372
- logger.warning(f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping.")
423
+ logger.warning(
424
+ f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping."
425
+ )
373
426
  skipped_count += 1
374
427
  continue
375
428
 
376
- # --- Create New Element ---
429
+ # --- Create New Element ---
377
430
  try:
378
431
  new_element = TextElement(
379
432
  text=text_to_import,
@@ -381,31 +434,44 @@ def import_ocr_from_manifest(
381
434
  top=pdf_top,
382
435
  x1=pdf_x1,
383
436
  bottom=pdf_bottom,
384
- page=page, # Reference to the parent Page object
385
- source='manifest-import', # Indicate origin
386
- confidence=confidence, # Pass confidence if available
437
+ page=page, # Reference to the parent Page object
438
+ source="manifest-import", # Indicate origin
439
+ confidence=confidence, # Pass confidence if available
387
440
  # Add metadata from manifest if needed? Maybe original_ocr?
388
- metadata={'original_ocr': region_data.get("ocr_text")} if region_data.get("ocr_text") != text_to_import else {}
441
+ metadata=(
442
+ {"original_ocr": region_data.get("ocr_text")}
443
+ if region_data.get("ocr_text") != text_to_import
444
+ else {}
445
+ ),
389
446
  )
390
447
  regions_to_add.append(new_element)
391
448
  imported_count += 1
392
449
  except Exception as e:
393
- logger.error(f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}", exc_info=True)
394
- skipped_count += 1
450
+ logger.error(
451
+ f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}",
452
+ exc_info=True,
453
+ )
454
+ skipped_count += 1
395
455
 
396
456
  # --- Add Elements to Page ---
397
457
  # Add all created elements for this page in one go
398
458
  if regions_to_add:
399
459
  try:
400
460
  # Accessing _elements directly; use manager if a public add method exists
401
- if hasattr(page, '_elements') and hasattr(page._elements, 'elements') and isinstance(page._elements.elements, list):
461
+ if (
462
+ hasattr(page, "_elements")
463
+ and hasattr(page._elements, "elements")
464
+ and isinstance(page._elements.elements, list)
465
+ ):
402
466
  page._elements.elements.extend(regions_to_add)
403
467
  # TODO: Should potentially invalidate page element cache if exists
404
468
  else:
405
- logger.error(f"Could not add elements to page {page.index}, page._elements structure unexpected.")
406
- # Decrement count as they weren't actually added
407
- imported_count -= len(regions_to_add)
408
- skipped_count += len(regions_to_add)
469
+ logger.error(
470
+ f"Could not add elements to page {page.index}, page._elements structure unexpected."
471
+ )
472
+ # Decrement count as they weren't actually added
473
+ imported_count -= len(regions_to_add)
474
+ skipped_count += len(regions_to_add)
409
475
 
410
476
  except Exception as e:
411
477
  logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
@@ -413,6 +479,7 @@ def import_ocr_from_manifest(
413
479
  imported_count -= len(regions_to_add)
414
480
  skipped_count += len(regions_to_add)
415
481
 
416
-
417
- logger.info(f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest.")
418
- return {'imported': imported_count, 'skipped': skipped_count}
482
+ logger.info(
483
+ f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest."
484
+ )
485
+ return {"imported": imported_count, "skipped": skipped_count}
@@ -116,80 +116,60 @@ def filter_chars_spatially(
116
116
 
117
117
  def generate_text_layout(
118
118
  char_dicts: List[Dict[str, Any]],
119
- layout_context_bbox: Tuple[float, float, float, float],
120
- user_kwargs: Dict[str, Any],
119
+ layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
120
+ user_kwargs: Optional[Dict[str, Any]] = None,
121
121
  ) -> str:
122
122
  """
123
- Takes a list of filtered character dictionaries and generates
124
- text output using pdfplumber's layout engine.
123
+ Generates a string representation of text from character dictionaries,
124
+ attempting to reconstruct layout using pdfplumber's utilities.
125
125
 
126
126
  Args:
127
- char_dicts: The final list of character dictionaries to include.
128
- layout_context_bbox: The bounding box (x0, top, x1, bottom) to use for
129
- calculating default layout width/height/shifts.
130
- user_kwargs: Dictionary of user-provided keyword arguments.
127
+ char_dicts: List of character dictionary objects.
128
+ layout_context_bbox: Optional bounding box for layout context.
129
+ user_kwargs: User-provided kwargs, potentially overriding defaults.
131
130
 
132
131
  Returns:
133
- The formatted text string.
132
+ String representation of the text.
134
133
  """
135
- if not char_dicts:
136
- logger.debug("generate_text_layout: No characters provided.")
134
+ # --- Filter out invalid char dicts early ---
135
+ initial_count = len(char_dicts)
136
+ valid_char_dicts = [c for c in char_dicts if isinstance(c.get("text"), str)]
137
+ filtered_count = initial_count - len(valid_char_dicts)
138
+ if filtered_count > 0:
139
+ logger.debug(
140
+ f"generate_text_layout: Filtered out {filtered_count} char dicts with non-string/None text."
141
+ )
142
+
143
+ if not valid_char_dicts: # Return empty if no valid chars remain
144
+ logger.debug("generate_text_layout: No valid character dicts found after filtering.")
137
145
  return ""
138
146
 
139
- # Prepare layout kwargs, prioritizing user input
140
- layout_kwargs = {}
141
- allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
142
- for key, value in user_kwargs.items():
143
- if key in allowed_keys:
144
- layout_kwargs[key] = value
145
-
146
- # Default to layout=True unless explicitly False
147
- use_layout = layout_kwargs.get("layout", True) # Default to layout if called
148
- layout_kwargs["layout"] = use_layout
149
-
150
- if use_layout:
151
- ctx_x0, ctx_top, ctx_x1, ctx_bottom = layout_context_bbox
152
- ctx_width = ctx_x1 - ctx_x0
153
- ctx_height = ctx_bottom - ctx_top
154
-
155
- # Set layout defaults based on context_bbox if not overridden by user
156
- if "layout_bbox" not in layout_kwargs:
157
- layout_kwargs["layout_bbox"] = layout_context_bbox
158
- # Only set default layout_width if neither width specifier is present
159
- if "layout_width_chars" not in layout_kwargs and "layout_width" not in layout_kwargs:
160
- layout_kwargs["layout_width"] = ctx_width
161
- if "layout_height" not in layout_kwargs:
162
- layout_kwargs["layout_height"] = ctx_height
163
- # Adjust shift based on context's top-left corner
164
- if "x_shift" not in layout_kwargs:
165
- layout_kwargs["x_shift"] = ctx_x0
166
- if "y_shift" not in layout_kwargs:
167
- layout_kwargs["y_shift"] = ctx_top
147
+ # Prepare layout arguments
148
+ layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
149
+ use_layout = layout_kwargs.pop("layout", True) # Extract layout flag, default True
168
150
 
169
- logger.debug(
170
- f"generate_text_layout: Calling chars_to_textmap with {len(char_dicts)} chars and kwargs: {layout_kwargs}"
171
- )
172
- try:
173
- # Sort final list by reading order before passing to textmap
174
- # TODO: Make sorting key dynamic based on layout_kwargs directions?
175
- char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
176
- textmap = chars_to_textmap(char_dicts, **layout_kwargs)
177
- result = textmap.as_string
178
- except Exception as e:
179
- logger.error(
180
- f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=True
181
- )
182
- logger.warning(
183
- "generate_text_layout: Falling back to simple character join due to layout error."
184
- )
185
- # Ensure chars are sorted before fallback join
186
- fallback_chars = sorted(char_dicts, key=lambda c: (c.get("top", 0), c.get("x0", 0)))
187
- result = "".join(c.get("text", "") for c in fallback_chars)
188
- else:
151
+ if not use_layout:
189
152
  # Simple join if layout=False
190
- logger.debug("generate_text_layout: Using simple join (layout=False).")
191
- # Sort by document order for simple join as well
192
- char_dicts.sort(key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0)))
193
- result = "".join(c.get("text", "") for c in char_dicts)
153
+ logger.debug("generate_text_layout: Using simple join (layout=False requested).")
154
+ # Sort before joining if layout is off
155
+ valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
156
+ result = "".join(c.get("text", "") for c in valid_char_dicts) # Use valid chars
157
+ return result
158
+
159
+ try:
160
+ # Sort chars primarily by top, then x0 before layout analysis
161
+ # This helps pdfplumber group lines correctly
162
+ valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
163
+ textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
164
+ result = textmap.as_string
165
+ except Exception as e:
166
+ # Fallback to simple join on error
167
+ logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
168
+ logger.warning(
169
+ "generate_text_layout: Falling back to simple character join due to layout error."
170
+ )
171
+ # Fallback already has sorted characters if layout was attempted
172
+ # Need to use the valid_char_dicts here too
173
+ result = "".join(c.get("text", "") for c in valid_char_dicts)
194
174
 
195
175
  return result
@@ -192,7 +192,7 @@ def merge_images_with_legend(
192
192
  if not legend:
193
193
  return image # Return original image if legend is None or empty
194
194
 
195
- bg_color = (255, 255, 255, 255) # Always use white for the merged background
195
+ bg_color = (255, 255, 255, 255) # Always use white for the merged background
196
196
 
197
197
  if position == "right":
198
198
  # Create a new image with extra width for the legend