natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,34 @@
1
+ """
2
+ OCR debug utilities for natural-pdf.
3
+ """
4
+
5
+ import base64
6
+ import io
7
+ import json
8
+ import os
9
+ import importlib.util
10
+ import importlib.resources
11
+ import webbrowser
12
+ from typing import Dict, List, Any, Optional, Union, Tuple
13
+
14
+ from PIL import Image
15
+
16
# Resolve the Page type for annotations; fall back to Any when natural_pdf
# itself is not importable, so this module still loads in isolation.
try:
    from natural_pdf.core.page import Page
except ImportError:
    Page = Any  # Placeholder


def _get_page_image_base64(page: Page) -> str:
    """Render *page* to PNG and return it as a ``data:image/png`` URI string.

    The page is rendered clean (no highlights) at a fixed 2.0 scale so the
    background image matches the scale assumed by the HTML/JS debug view.

    Args:
        page: The page object to render; must provide ``to_image``.

    Returns:
        A ``data:image/png;base64,...`` string suitable for an <img> src.

    Raises:
        ValueError: If the page fails to render (``to_image`` returns None).
    """
    rendered = page.to_image(scale=2.0, include_highlights=False)
    if rendered is None:
        raise ValueError(f"Failed to render image for page {page.number}")

    # Serialize the PIL image to PNG in memory, then base64-encode it.
    png_buffer = io.BytesIO()
    rendered.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"
@@ -0,0 +1,33 @@
1
+ """
2
+ Utilities for generating consistent identifiers.
3
+ """
4
+
5
+ import hashlib
6
+ import base64
7
+ import os
8
+
9
+
10
def generate_short_path_hash(path_str: str, length: int = 8) -> str:
    """
    Generates a short, filesystem-safe hash ID from a path string.

    Args:
        path_str: The absolute path string.
        length: The desired length of the short ID (default: 8).

    Returns:
        A short hash string using URL-safe Base64 encoding.

    Raises:
        ValueError: If ``length`` is not between 1 and the full encoded length.
    """
    # Hash the absolute form of the path so equivalent paths map to one ID.
    digest = hashlib.sha256(os.path.abspath(path_str).encode("utf-8")).digest()
    # URL-safe Base64 keeps the ID filesystem-friendly; strip '=' padding.
    encoded = base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=")
    # Validate the requested length against the available encoded characters.
    if not 1 <= length <= len(encoded):
        raise ValueError(
            f"Invalid length specified: {length}. Must be between 1 and {len(encoded)}."
        )
    return encoded[:length]
@@ -0,0 +1,485 @@
1
+ """
2
+ Utilities for packaging data for external processes, like correction tasks.
3
+ """
4
+
5
+ import os
6
+ import base64
7
+ import io
8
+ import json
9
+ import zipfile
10
+ import tempfile
11
+ import logging
12
+ import shutil
13
+ from typing import Any, List, Union, Iterable, TYPE_CHECKING, Dict
14
+ from tqdm import tqdm
15
+ from natural_pdf.elements.text import TextElement
16
+
17
+ # Import the specific PDF/Page types if possible, otherwise use Any
18
+ if TYPE_CHECKING:
19
+ from natural_pdf.core.pdf import PDF
20
+ from natural_pdf.core.page import Page
21
+ from natural_pdf.collections.pdf_collection import PDFCollection
22
+ else:
23
+ PDF = Any
24
+ Page = Any
25
+ PDFCollection = Any
26
+
27
+ from natural_pdf.utils.identifiers import generate_short_path_hash
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
def create_correction_task_package(
    source: Union["PDF", "PDFCollection", List["PDF"]],
    output_zip_path: str,
    overwrite: bool = False,
    suggest=None,
    resolution: int = 150,
) -> None:
    """
    Creates a zip package containing data for an OCR correction task.

    The package includes:
    - manifest.json: Metadata about pages and OCR regions (bboxes scaled to
      the rendered image resolution).
    - images/ directory: Rendered full-page images.
    - The SPA shell (index.html, css/, js/) used to review/correct the text.

    Args:
        source: The PDF object, PDFCollection, or list of PDF objects to process.
        output_zip_path: The full path where the output zip file should be saved.
        overwrite: If True, overwrite the output zip file if it already exists.
        suggest: Optional callable ``(region, confidence) -> str`` returning an
            OCR suggestion used to pre-fill each region's corrected text.
        resolution: DPI at which page images are rendered (default: 150).

    Raises:
        FileNotFoundError: If the output directory cannot be created.
        FileExistsError: If the output zip file exists and overwrite is False.
        TypeError: If the source type is invalid.
        ValueError: If no valid pages with OCR data are found in the source.
        RuntimeError: If the SPA template files cannot be packaged.
    """
    if os.path.exists(output_zip_path) and not overwrite:
        raise FileExistsError(
            f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it."
        )

    # --- Resolve source to a list of PDF objects ---
    # Class-name checks are used instead of isinstance to avoid importing
    # PDF/PDFCollection at runtime (circular-import risk).
    pdfs_to_process: List["PDF"] = []
    if hasattr(source, "__class__") and source.__class__.__name__ == "PDF":
        pdfs_to_process = [source]
    elif hasattr(source, "__class__") and source.__class__.__name__ == "PDFCollection":
        pdfs_to_process = source.pdfs  # Assuming PDFCollection has a .pdfs property
    elif isinstance(source, list) and all(
        hasattr(p, "__class__") and p.__class__.__name__ == "PDF" for p in source
    ):
        pdfs_to_process = source
    else:
        raise TypeError(
            f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
        )

    if not pdfs_to_process:
        logger.warning("No PDF documents provided in the source.")
        return

    manifest_data = {"pdfs": [], "pages": []}  # Store pdf-level info if needed later?
    total_regions_found = 0

    # Use a temporary directory for staging files before zipping
    with tempfile.TemporaryDirectory() as temp_dir:
        images_dir = os.path.join(temp_dir, "images")
        os.makedirs(images_dir)
        logger.info(f"Using temporary directory for staging: {temp_dir}")

        # --- Process each PDF ---
        for pdf in pdfs_to_process:
            if not hasattr(pdf, "path") or not hasattr(pdf, "pages"):
                logger.warning(f"Skipping invalid PDF object: {pdf}")
                continue

            pdf_path = pdf.path  # Should be the resolved, absolute path
            pdf_short_id = generate_short_path_hash(pdf_path)
            logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")

            for page in pdf.pages:
                # Duck-type check that this looks like a usable Page object.
                required_attrs = ("index", "number", "width", "height", "find_all", "to_image")
                if not all(hasattr(page, attr) for attr in required_attrs):
                    logger.warning(
                        f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}"
                    )
                    continue

                # 1. Extract OCR elements for this page
                try:
                    # Important: apply_exclusions=False ensures we get *all* OCR data
                    # regardless of user exclusions set on the PDF/page object.
                    ocr_elements = page.find_all(
                        "text[source=ocr]", apply_exclusions=False
                    ).elements
                except Exception as e:
                    logger.error(
                        f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}",
                        exc_info=True,
                    )
                    continue  # Skip this page if element extraction fails

                if not ocr_elements:
                    logger.debug(
                        f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest."
                    )
                    continue  # Skip page if no OCR elements

                logger.debug(f" Found {len(ocr_elements)} OCR elements on page {page.number}")
                total_regions_found += len(ocr_elements)

                # 2. Render and save page image
                image_filename = f"{pdf_short_id}_page_{page.index}.png"
                image_save_path = os.path.join(images_dir, image_filename)
                try:
                    img = page.to_image(resolution=resolution, include_highlights=False)
                    if img is None:
                        raise ValueError("page.to_image returned None")
                    img.save(image_save_path, "PNG")
                except Exception as e:
                    logger.error(
                        f"Failed to render/save image for {pdf_path} page {page.number}: {e}",
                        exc_info=True,
                    )
                    # If image fails, we cannot proceed with this page for the task
                    continue

                # 3. Prepare region data for manifest
                page_regions_data = []
                # Scaling factor from PDF coordinates (72 DPI) to image pixels
                # at the configured render resolution.
                coord_scale_factor = resolution / 72.0

                # NOTE: enumerate replaces the previous manual `i = -1; i += 1` counter.
                for i, elem in enumerate(tqdm(ocr_elements)):
                    # Basic check for necessary attributes
                    if not all(
                        hasattr(elem, attr) for attr in ["x0", "top", "x1", "bottom", "text"]
                    ):
                        logger.warning(
                            f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}"
                        )
                        continue
                    region_id = f"r_{page.index}_{i}"  # ID unique within page

                    # Scale coordinates from PDF points to pixels at the render resolution
                    scaled_bbox = [
                        elem.x0 * coord_scale_factor,
                        elem.top * coord_scale_factor,
                        elem.x1 * coord_scale_factor,
                        elem.bottom * coord_scale_factor,
                    ]

                    corrected = elem.text
                    if suggest:
                        # Let the caller pre-fill a suggested correction.
                        corrected = suggest(elem.to_region(), getattr(elem, "confidence", None))

                    page_regions_data.append(
                        {
                            "resolution": resolution,
                            "id": region_id,
                            "bbox": scaled_bbox,
                            "ocr_text": elem.text,
                            "confidence": getattr(
                                elem, "confidence", None
                            ),  # Include confidence if available
                            "corrected_text": corrected,
                            "modified": False,
                        }
                    )

                # 4. Add page data to manifest if it has regions
                if page_regions_data:
                    manifest_data["pages"].append(
                        {
                            "pdf_source": pdf_path,
                            "pdf_short_id": pdf_short_id,
                            "page_number": page.number,
                            "page_index": page.index,
                            "image_path": f"images/{image_filename}",  # Relative path within zip
                            "width": page.width,
                            "height": page.height,
                            "regions": page_regions_data,
                        }
                    )

        # --- Final Checks and Zipping ---
        if not manifest_data["pages"] or total_regions_found == 0:
            logger.error(
                "No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package."
            )
            raise ValueError("No valid pages with OCR data found to create a task package.")

        manifest_path = os.path.join(temp_dir, "manifest.json")
        try:
            with open(manifest_path, "w", encoding="utf-8") as f_manifest:
                json.dump(manifest_data, f_manifest, indent=2)
        except Exception as e:
            logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
            raise  # Re-raise error, cannot proceed

        # --- Copy SPA files into temp dir ---
        try:
            # Find the path to the spa template directory relative to this file.
            # Using __file__ assumes this script is installed alongside the templates.
            utils_dir = os.path.dirname(os.path.abspath(__file__))
            templates_dir = os.path.join(
                os.path.dirname(utils_dir), "templates"
            )  # Go up one level from utils
            spa_template_dir = os.path.join(templates_dir, "spa")

            if not os.path.isdir(spa_template_dir):
                raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")

            logger.info(f"Copying SPA shell from: {spa_template_dir}")
            # dirs_exist_ok=True merges with subdirs like images/ already present (Python 3.8+)
            shutil.copytree(spa_template_dir, temp_dir, dirs_exist_ok=True)

        except Exception as e:
            logger.error(f"Failed to copy SPA template files: {e}", exc_info=True)
            raise RuntimeError("Could not package SPA files.") from e

        # --- Create the final zip file ---
        try:
            logger.info(f"Creating zip package at: {output_zip_path}")
            with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
                # Walk the whole staging dir so manifest.json, images/ AND the
                # copied SPA shell all land in the archive. (Previously only
                # the manifest and images were zipped, silently omitting the
                # SPA files that were staged above.)
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        full_path = os.path.join(root, file)
                        # Archive name relative to temp_dir (e.g. images/...)
                        arcname = os.path.relpath(full_path, temp_dir)
                        zipf.write(full_path, arcname=arcname)
            logger.info(
                f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)"
            )

        except Exception as e:
            logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
            # Attempt to clean up existing zip if creation failed partially
            if os.path.exists(output_zip_path):
                try:
                    os.remove(output_zip_path)
                except OSError:  # was a bare except; narrow to filesystem errors
                    pass
            raise  # Re-raise error

        # Temporary directory is automatically cleaned up by context manager
+
289
+
290
+ def import_ocr_from_manifest(pdf: "PDF", manifest_path: str) -> Dict[str, int]:
291
+ """
292
+ Imports OCR data into a PDF object from a manifest file.
293
+
294
+ Reads a manifest.json file (typically generated by create_correction_task_package
295
+ and potentially modified externally) and populates the corresponding pages
296
+ of the PDF object with new TextElement objects based on the manifest data.
297
+ It uses the 'corrected_text' field and bounding box from the manifest.
298
+
299
+ This function assumes you want to replace or provide the primary OCR data
300
+ from the manifest, rather than correcting existing elements.
301
+ Existing OCR elements on the pages are NOT automatically cleared.
302
+
303
+ Args:
304
+ pdf: The natural_pdf.core.pdf.PDF object to populate with OCR data.
305
+ manifest_path: Path to the manifest.json file.
306
+
307
+ Returns:
308
+ A dictionary containing counts of imported and skipped regions:
309
+ {'imported': count, 'skipped': count}
310
+
311
+ Raises:
312
+ FileNotFoundError: If the manifest_path does not exist.
313
+ ValueError: If the manifest is invalid or contains data for a different PDF.
314
+ TypeError: If the input pdf object is not a valid PDF instance.
315
+ """
316
+ if not (hasattr(pdf, "__class__") and pdf.__class__.__name__ == "PDF"):
317
+ raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")
318
+
319
+ if not os.path.exists(manifest_path):
320
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
321
+
322
+ logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")
323
+
324
+ try:
325
+ with open(manifest_path, "r", encoding="utf-8") as f:
326
+ manifest_data = json.load(f)
327
+ except json.JSONDecodeError as e:
328
+ logger.error(f"Failed to parse manifest file: {e}")
329
+ raise ValueError(f"Invalid JSON in manifest file: {manifest_path}") from e
330
+ except Exception as e:
331
+ logger.error(f"Failed to read manifest file: {e}")
332
+ raise
333
+
334
+ imported_count = 0
335
+ skipped_count = 0
336
+ processed_pages = 0
337
+
338
+ manifest_pages = manifest_data.get("pages", [])
339
+ if not manifest_pages:
340
+ logger.warning("Manifest contains no page data.")
341
+ return {"imported": 0, "skipped": 0}
342
+
343
+ # --- Pre-check PDF source consistency ---
344
+ first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
345
+ if first_manifest_pdf_path != pdf.path:
346
+ # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
347
+ if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
348
+ logger.error(
349
+ f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting."
350
+ )
351
+ raise ValueError("Manifest source PDF does not match the provided PDF object.")
352
+ else:
353
+ logger.warning(
354
+ f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously."
355
+ )
356
+
357
+ pdf_pages_by_index = {page.index: page for page in pdf.pages}
358
+
359
+ for page_data in tqdm(manifest_pages, desc="Importing OCR Data"):
360
+ page_index = page_data.get("page_index")
361
+ manifest_pdf_path = page_data.get("pdf_source")
362
+
363
+ # Check consistency for every page? (Maybe overkill if pre-checked)
364
+ if manifest_pdf_path != pdf.path and os.path.basename(
365
+ manifest_pdf_path
366
+ ) != os.path.basename(pdf.path):
367
+ logger.warning(
368
+ f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')"
369
+ )
370
+ skipped_count += len(page_data.get("regions", [])) # Count all regions as skipped
371
+ continue
372
+
373
+ if page_index is None:
374
+ logger.warning(
375
+ f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}"
376
+ )
377
+ skipped_count += len(page_data.get("regions", []))
378
+ continue
379
+
380
+ page = pdf_pages_by_index.get(page_index)
381
+ if page is None:
382
+ logger.warning(
383
+ f"Could not find page with index {page_index} in the target PDF. Skipping."
384
+ )
385
+ skipped_count += len(page_data.get("regions", []))
386
+ continue
387
+
388
+ processed_pages += 1
389
+ # We are adding elements, no need to fetch existing ones unless we want to prevent duplicates (not implemented here)
390
+
391
+ regions_to_add = []
392
+ for region_data in page_data.get("regions", []):
393
+ # We import all regions, not just modified ones
394
+ # if not region_data.get("modified", False):
395
+ # continue # Only process modified regions
396
+
397
+ region_id = region_data.get("id", "unknown")
398
+ manifest_bbox = region_data.get("bbox")
399
+ # Use corrected_text as the primary text source for the new element
400
+ text_to_import = region_data.get("corrected_text")
401
+ # Fallback to ocr_text if corrected_text is missing (though unlikely from the SPA)
402
+ if text_to_import is None:
403
+ text_to_import = region_data.get("ocr_text")
404
+
405
+ resolution = region_data.get("resolution") # Mandatory from export
406
+ confidence = region_data.get("confidence") # Optional
407
+
408
+ if not all([manifest_bbox, text_to_import is not None, resolution]):
409
+ logger.warning(
410
+ f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution."
411
+ )
412
+ skipped_count += 1
413
+ continue
414
+
415
+ # Convert manifest bbox (image pixels) back to PDF coordinates (points @ 72 DPI)
416
+ try:
417
+ scale_factor = 72.0 / float(resolution)
418
+ pdf_x0 = manifest_bbox[0] * scale_factor
419
+ pdf_top = manifest_bbox[1] * scale_factor
420
+ pdf_x1 = manifest_bbox[2] * scale_factor
421
+ pdf_bottom = manifest_bbox[3] * scale_factor
422
+ except (ValueError, TypeError, IndexError, ZeroDivisionError):
423
+ logger.warning(
424
+ f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping."
425
+ )
426
+ skipped_count += 1
427
+ continue
428
+
429
+ # --- Create New Element ---
430
+ try:
431
+ new_element = TextElement(
432
+ text=text_to_import,
433
+ x0=pdf_x0,
434
+ top=pdf_top,
435
+ x1=pdf_x1,
436
+ bottom=pdf_bottom,
437
+ page=page, # Reference to the parent Page object
438
+ source="manifest-import", # Indicate origin
439
+ confidence=confidence, # Pass confidence if available
440
+ # Add metadata from manifest if needed? Maybe original_ocr?
441
+ metadata=(
442
+ {"original_ocr": region_data.get("ocr_text")}
443
+ if region_data.get("ocr_text") != text_to_import
444
+ else {}
445
+ ),
446
+ )
447
+ regions_to_add.append(new_element)
448
+ imported_count += 1
449
+ except Exception as e:
450
+ logger.error(
451
+ f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}",
452
+ exc_info=True,
453
+ )
454
+ skipped_count += 1
455
+
456
+ # --- Add Elements to Page ---
457
+ # Add all created elements for this page in one go
458
+ if regions_to_add:
459
+ try:
460
+ # Accessing _elements directly; use manager if a public add method exists
461
+ if (
462
+ hasattr(page, "_elements")
463
+ and hasattr(page._elements, "elements")
464
+ and isinstance(page._elements.elements, list)
465
+ ):
466
+ page._elements.elements.extend(regions_to_add)
467
+ # TODO: Should potentially invalidate page element cache if exists
468
+ else:
469
+ logger.error(
470
+ f"Could not add elements to page {page.index}, page._elements structure unexpected."
471
+ )
472
+ # Decrement count as they weren't actually added
473
+ imported_count -= len(regions_to_add)
474
+ skipped_count += len(regions_to_add)
475
+
476
+ except Exception as e:
477
+ logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
478
+ # Decrement count as they weren't actually added
479
+ imported_count -= len(regions_to_add)
480
+ skipped_count += len(regions_to_add)
481
+
482
+ logger.info(
483
+ f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest."
484
+ )
485
+ return {"imported": imported_count, "skipped": skipped_count}
def generate_text_layout(
    char_dicts: List[Dict[str, Any]],
    layout_context_bbox: Optional[Tuple[float, float, float, float]] = None,
    user_kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generates a string representation of text from character dictionaries,
    attempting to reconstruct layout using pdfplumber's utilities.

    Args:
        char_dicts: List of character dictionary objects.
        layout_context_bbox: Optional bounding box for layout context.
        user_kwargs: User-provided kwargs, potentially overriding defaults.

    Returns:
        String representation of the text. Empty string when no valid
        character dicts remain after filtering.
    """
    # --- Filter out invalid char dicts early ---
    # Only entries whose "text" value is an actual str survive; None or
    # non-string values would break the join/textmap steps below.
    initial_count = len(char_dicts)
    valid_char_dicts = [c for c in char_dicts if isinstance(c.get("text"), str)]
    filtered_count = initial_count - len(valid_char_dicts)
    if filtered_count > 0:
        logger.debug(
            f"generate_text_layout: Filtered out {filtered_count} char dicts with non-string/None text."
        )

    if not valid_char_dicts:  # Return empty if no valid chars remain
        logger.debug("generate_text_layout: No valid character dicts found after filtering.")
        return ""

    # Prepare layout arguments
    # NOTE(review): _get_layout_kwargs is defined elsewhere in this module;
    # presumably it merges user_kwargs with bbox-derived defaults — confirm there.
    layout_kwargs = _get_layout_kwargs(layout_context_bbox, user_kwargs)
    use_layout = layout_kwargs.pop("layout", True)  # Extract layout flag, default True

    if not use_layout:
        # Simple join if layout=False
        logger.debug("generate_text_layout: Using simple join (layout=False requested).")
        # Sort before joining if layout is off
        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
        result = "".join(c.get("text", "") for c in valid_char_dicts)  # Use valid chars
        return result

    try:
        # Sort chars primarily by top, then x0 before layout analysis
        # This helps pdfplumber group lines correctly
        valid_char_dicts.sort(key=lambda c: (c.get("top", 0), c.get("x0", 0)))
        textmap = chars_to_textmap(valid_char_dicts, **layout_kwargs)
        result = textmap.as_string
    except Exception as e:
        # Fallback to simple join on error
        logger.error(f"generate_text_layout: Error calling chars_to_textmap: {e}", exc_info=False)
        logger.warning(
            "generate_text_layout: Falling back to simple character join due to layout error."
        )
        # Fallback already has sorted characters if layout was attempted
        # Need to use the valid_char_dicts here too
        result = "".join(c.get("text", "") for c in valid_char_dicts)

    return result