natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,32 @@
1
+ """
2
+ OCR debug utilities for natural-pdf.
3
+ """
4
+ import base64
5
+ import io
6
+ import json
7
+ import os
8
+ import importlib.util
9
+ import importlib.resources
10
+ import webbrowser
11
+ from typing import Dict, List, Any, Optional, Union, Tuple
12
+
13
+ from PIL import Image
14
+
15
+ # Assuming Page type hint is available or define a placeholder
16
+ try:
17
+ from natural_pdf.core.page import Page
18
+ except ImportError:
19
+ Page = Any # Placeholder
20
+
21
+ def _get_page_image_base64(page: Page) -> str:
22
+ """Generate a base64 encoded image of the page."""
23
+ # Create a clean image of the page without highlights for the base background
24
+ # Use a fixed scale consistent with the HTML/JS rendering logic
25
+ img = page.to_image(scale=2.0, include_highlights=False)
26
+ if img is None:
27
+ raise ValueError(f"Failed to render image for page {page.number}")
28
+
29
+ # Convert to base64
30
+ buffered = io.BytesIO()
31
+ img.save(buffered, format="PNG")
32
+ return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}"
@@ -0,0 +1,29 @@
1
+ """
2
+ Utilities for generating consistent identifiers.
3
+ """
4
+ import hashlib
5
+ import base64
6
+ import os
7
+
8
def generate_short_path_hash(path_str: str, length: int = 8) -> str:
    """
    Derive a short, filesystem-safe identifier from a path string.

    The path is normalized to its absolute form first, so equivalent paths
    always hash to the same ID. The SHA-256 digest is rendered as URL-safe
    Base64 with padding stripped, then truncated.

    Args:
        path_str: The absolute path string.
        length: The desired length of the short ID (default: 8).

    Returns:
        A short hash string using URL-safe Base64 encoding.

    Raises:
        ValueError: If ``length`` is not between 1 and the encoded digest length.
    """
    # Normalize so relative/absolute spellings of the same path agree.
    absolute = os.path.abspath(path_str)
    # SHA-256 gives good collision resistance even after truncation.
    digest = hashlib.sha256(absolute.encode('utf-8')).digest()
    encoded = base64.urlsafe_b64encode(digest).decode('ascii').rstrip('=')
    if not 0 < length <= len(encoded):
        raise ValueError(f"Invalid length specified: {length}. Must be between 1 and {len(encoded)}.")
    return encoded[:length]
@@ -0,0 +1,418 @@
1
+ """
2
+ Utilities for packaging data for external processes, like correction tasks.
3
+ """
4
+
5
+ import os
6
+ import base64
7
+ import io
8
+ import json
9
+ import zipfile
10
+ import tempfile
11
+ import logging
12
+ import shutil
13
+ from typing import Any, List, Union, Iterable, TYPE_CHECKING, Dict
14
+ from tqdm import tqdm
15
+ from natural_pdf.elements.text import TextElement
16
+
17
+ # Import the specific PDF/Page types if possible, otherwise use Any
18
+ if TYPE_CHECKING:
19
+ from natural_pdf.core.pdf import PDF
20
+ from natural_pdf.core.page import Page
21
+ from natural_pdf.collections.pdf_collection import PDFCollection
22
+ else:
23
+ PDF = Any
24
+ Page = Any
25
+ PDFCollection = Any
26
+
27
+ from natural_pdf.utils.identifiers import generate_short_path_hash
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
def create_correction_task_package(
    source: Union['PDF', 'PDFCollection', List['PDF']],
    output_zip_path: str,
    overwrite: bool = False,
    suggest = None,
    resolution: int = 150,
) -> None:
    """
    Creates a zip package containing data for an OCR correction task.

    The package includes:
    - manifest.json: Metadata about pages and OCR regions (bounding boxes are
      scaled to the rendered image resolution).
    - images/ directory: Rendered full-page images.
    - The SPA shell (index.html, css/, js/) copied from the package templates.

    Args:
        source: The PDF object, PDFCollection, or list of PDF objects to process.
        output_zip_path: The full path where the output zip file should be saved.
        overwrite: If True, overwrite the output zip file if it already exists.
        suggest: Optional callable invoked as ``suggest(region, confidence)``
            returning a suggested corrected text for each OCR region.
        resolution: DPI at which page images are rendered (PDF points are 72 DPI).

    Raises:
        FileExistsError: If the output zip file exists and overwrite is False.
        TypeError: If the source type is invalid.
        ValueError: If no valid pages with OCR data are found in the source.
        RuntimeError: If the SPA template files cannot be packaged.
    """
    if os.path.exists(output_zip_path) and not overwrite:
        raise FileExistsError(f"Output file already exists: {output_zip_path}. Set overwrite=True to replace it.")

    # --- Resolve source to a list of PDF objects ---
    # Class-name checks avoid importing PDF/PDFCollection at runtime (circular imports).
    pdfs_to_process: List['PDF'] = []
    if hasattr(source, '__class__') and source.__class__.__name__ == 'PDF':
        pdfs_to_process = [source]
    elif hasattr(source, '__class__') and source.__class__.__name__ == 'PDFCollection':
        pdfs_to_process = source.pdfs  # Assuming PDFCollection has a .pdfs property
    elif isinstance(source, list) and all(hasattr(p, '__class__') and p.__class__.__name__ == 'PDF' for p in source):
        pdfs_to_process = source
    else:
        raise TypeError(f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF].")

    if not pdfs_to_process:
        logger.warning("No PDF documents provided in the source.")
        return

    manifest_data = {
        "pdfs": [],  # Store pdf-level info if needed later?
        "pages": []
    }
    total_regions_found = 0

    # Stage everything in a temporary directory, then zip it at the end.
    with tempfile.TemporaryDirectory() as temp_dir:
        images_dir = os.path.join(temp_dir, "images")
        os.makedirs(images_dir)
        logger.info(f"Using temporary directory for staging: {temp_dir}")

        # --- Process each PDF ---
        for pdf in pdfs_to_process:
            if not hasattr(pdf, 'path') or not hasattr(pdf, 'pages'):
                logger.warning(f"Skipping invalid PDF object: {pdf}")
                continue

            pdf_path = pdf.path  # Should be the resolved, absolute path
            pdf_short_id = generate_short_path_hash(pdf_path)
            logger.debug(f"Processing PDF: {pdf_path} (ID: {pdf_short_id})")

            for page in pdf.pages:
                required_attrs = ('index', 'number', 'width', 'height', 'find_all', 'to_image')
                if not all(hasattr(page, attr) for attr in required_attrs):
                    logger.warning(f"Skipping invalid Page object in {pdf_path}: page index {getattr(page, 'index', 'unknown')}")
                    continue

                # 1. Extract OCR elements for this page.
                # apply_exclusions=False ensures we get *all* OCR data,
                # regardless of user exclusions set on the PDF/page object.
                try:
                    ocr_elements = page.find_all('text[source=ocr]', apply_exclusions=False).elements
                except Exception as e:
                    logger.error(f"Failed to extract OCR elements for {pdf_path} page {page.number}: {e}", exc_info=True)
                    continue  # Skip this page if element extraction fails

                if not ocr_elements:
                    logger.debug(f"No OCR elements found for {pdf_path} page {page.number}. Skipping page in manifest.")
                    continue

                logger.debug(f"  Found {len(ocr_elements)} OCR elements on page {page.number}")
                total_regions_found += len(ocr_elements)

                # 2. Render and save the page image.
                image_filename = f"{pdf_short_id}_page_{page.index}.png"
                image_save_path = os.path.join(images_dir, image_filename)
                try:
                    img = page.to_image(resolution=resolution, include_highlights=False)
                    if img is None:
                        raise ValueError("page.to_image returned None")
                    img.save(image_save_path, "PNG")
                except Exception as e:
                    logger.error(f"Failed to render/save image for {pdf_path} page {page.number}: {e}", exc_info=True)
                    # Without an image this page cannot be part of the task.
                    continue

                # 3. Prepare region data for the manifest.
                page_regions_data = []
                # Scale from PDF coordinates (72 DPI) to rendered image pixels.
                coord_scale_factor = resolution / 72.0

                for i, elem in enumerate(tqdm(ocr_elements)):
                    # Basic check for necessary attributes
                    if not all(hasattr(elem, attr) for attr in ['x0', 'top', 'x1', 'bottom', 'text']):
                        logger.warning(f"Skipping invalid OCR element {i} on {pdf_path} page {page.number}")
                        continue
                    region_id = f"r_{page.index}_{i}"  # ID unique within page

                    # Scale coordinates to match the rendered image resolution.
                    scaled_bbox = [
                        elem.x0 * coord_scale_factor,
                        elem.top * coord_scale_factor,
                        elem.x1 * coord_scale_factor,
                        elem.bottom * coord_scale_factor
                    ]

                    corrected = elem.text
                    if suggest:
                        # Let the caller pre-fill a corrected transcription.
                        corrected = suggest(elem.to_region(), getattr(elem, 'confidence', None))

                    page_regions_data.append({
                        "resolution": resolution,
                        "id": region_id,
                        "bbox": scaled_bbox,
                        "ocr_text": elem.text,
                        "confidence": getattr(elem, 'confidence', None),  # Include confidence if available
                        "corrected_text": corrected,
                        "modified": False
                    })

                # 4. Add page data to manifest if any valid regions remain.
                if page_regions_data:
                    manifest_data["pages"].append({
                        "pdf_source": pdf_path,
                        "pdf_short_id": pdf_short_id,
                        "page_number": page.number,
                        "page_index": page.index,
                        "image_path": f"images/{image_filename}",  # Relative path within zip
                        "width": page.width,
                        "height": page.height,
                        "regions": page_regions_data
                    })

        # --- Final Checks ---
        if not manifest_data["pages"] or total_regions_found == 0:
            logger.error("No pages with valid OCR regions and successfully rendered images found in the source PDFs. Cannot create task package.")
            raise ValueError("No valid pages with OCR data found to create a task package.")

        manifest_path = os.path.join(temp_dir, "manifest.json")
        try:
            with open(manifest_path, 'w', encoding='utf-8') as f_manifest:
                json.dump(manifest_data, f_manifest, indent=2)
        except Exception as e:
            logger.error(f"Failed to write manifest.json: {e}", exc_info=True)
            raise  # Re-raise error, cannot proceed

        # --- Copy SPA files into temp dir ---
        try:
            # Locate the spa template directory relative to this file.
            # Using __file__ assumes this script is installed alongside the templates.
            utils_dir = os.path.dirname(os.path.abspath(__file__))
            templates_dir = os.path.join(os.path.dirname(utils_dir), 'templates')  # Go up one level from utils
            spa_template_dir = os.path.join(templates_dir, 'spa')

            if not os.path.isdir(spa_template_dir):
                raise FileNotFoundError(f"SPA template directory not found at {spa_template_dir}")

            logger.info(f"Copying SPA shell from: {spa_template_dir}")
            # dirs_exist_ok=True merges into temp_dir (Python 3.8+), alongside
            # the already-created images/ directory and manifest.json.
            shutil.copytree(spa_template_dir, temp_dir, dirs_exist_ok=True)
        except Exception as e:
            logger.error(f"Failed to copy SPA template files: {e}", exc_info=True)
            raise RuntimeError("Could not package SPA files.") from e

        # --- Create the final zip file ---
        try:
            logger.info(f"Creating zip package at: {output_zip_path}")
            with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                # BUGFIX: walk the whole staging directory (previously only
                # images/ was walked, so the copied SPA shell — index.html,
                # css/, js/ — never made it into the package). This also picks
                # up manifest.json, so no separate write is needed.
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        full_path = os.path.join(root, file)
                        # Archive names are relative to the staging root
                        # (e.g. "manifest.json", "images/...", "js/app.js").
                        arcname = os.path.relpath(full_path, temp_dir)
                        zipf.write(full_path, arcname=arcname)
            logger.info(f"Successfully created correction task package: {output_zip_path} ({total_regions_found} regions total)")
        except Exception as e:
            logger.error(f"Failed to create zip file {output_zip_path}: {e}", exc_info=True)
            # Best-effort cleanup of a partially written zip.
            if os.path.exists(output_zip_path):
                try:
                    os.remove(output_zip_path)
                except OSError:
                    pass
            raise  # Re-raise error

        # Temporary directory is automatically cleaned up by context manager
248
+
249
def import_ocr_from_manifest(
    pdf: 'PDF',
    manifest_path: str
) -> Dict[str, int]:
    """
    Imports OCR data into a PDF object from a manifest file.

    Reads a manifest.json file (typically generated by create_correction_task_package
    and potentially modified externally) and populates the corresponding pages
    of the PDF object with new TextElement objects based on the manifest data.
    It uses the 'corrected_text' field and bounding box from the manifest.

    This function assumes you want to replace or provide the primary OCR data
    from the manifest, rather than correcting existing elements.
    Existing OCR elements on the pages are NOT automatically cleared.

    Args:
        pdf: The natural_pdf.core.pdf.PDF object to populate with OCR data.
        manifest_path: Path to the manifest.json file.

    Returns:
        A dictionary containing counts of imported and skipped regions:
        {'imported': count, 'skipped': count}

    Raises:
        FileNotFoundError: If the manifest_path does not exist.
        ValueError: If the manifest is invalid or contains data for a different PDF.
        TypeError: If the input pdf object is not a valid PDF instance.
    """
    # Class-name check avoids importing PDF at runtime (circular imports).
    if not (hasattr(pdf, '__class__') and pdf.__class__.__name__ == 'PDF'):
        raise TypeError(f"Input must be a natural_pdf PDF object, got {type(pdf)}")

    if not os.path.exists(manifest_path):
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    logger.info(f"Importing OCR data into PDF '{pdf.path}' from manifest '{manifest_path}'")

    try:
        with open(manifest_path, 'r', encoding='utf-8') as f:
            manifest_data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse manifest file: {e}")
        raise ValueError(f"Invalid JSON in manifest file: {manifest_path}") from e
    except Exception as e:
        logger.error(f"Failed to read manifest file: {e}")
        raise

    imported_count = 0   # Regions successfully turned into TextElements and added
    skipped_count = 0    # Regions dropped for any reason (mismatch, bad data, errors)
    processed_pages = 0  # Manifest pages matched to a page in the target PDF

    manifest_pages = manifest_data.get("pages", [])
    if not manifest_pages:
        logger.warning("Manifest contains no page data.")
        return {'imported': 0, 'skipped': 0}

    # --- Pre-check PDF source consistency ---
    # NOTE(review): assumes manifest_pages[0] has a "pdf_source" key; a manifest
    # missing it would pass None to os.path.basename below — confirm upstream.
    first_manifest_pdf_path = manifest_pages[0].get("pdf_source")
    if first_manifest_pdf_path != pdf.path:
        # Allow matching based on just the filename if paths differ (e.g., absolute vs relative)
        if os.path.basename(first_manifest_pdf_path) != os.path.basename(pdf.path):
            logger.error(f"Manifest PDF source ('{first_manifest_pdf_path}') does not match target PDF path ('{pdf.path}'). Aborting.")
            raise ValueError("Manifest source PDF does not match the provided PDF object.")
        else:
            logger.warning(f"Manifest PDF source path ('{first_manifest_pdf_path}') differs from target PDF path ('{pdf.path}'), but filenames match. Proceeding cautiously.")

    # Index pages once for O(1) lookup by manifest page_index.
    pdf_pages_by_index = {page.index: page for page in pdf.pages}

    for page_data in tqdm(manifest_pages, desc="Importing OCR Data"):
        page_index = page_data.get("page_index")
        manifest_pdf_path = page_data.get("pdf_source")

        # Check consistency for every page? (Maybe overkill if pre-checked)
        if manifest_pdf_path != pdf.path and os.path.basename(manifest_pdf_path) != os.path.basename(pdf.path):
            logger.warning(f"Skipping page index {page_index} due to PDF source mismatch ('{manifest_pdf_path}' vs '{pdf.path}')")
            skipped_count += len(page_data.get("regions", []))  # Count all regions as skipped
            continue

        if page_index is None:
            logger.warning(f"Skipping page entry with missing 'page_index': {page_data.get('page_number')}")
            skipped_count += len(page_data.get("regions", []))
            continue

        page = pdf_pages_by_index.get(page_index)
        if page is None:
            logger.warning(f"Could not find page with index {page_index} in the target PDF. Skipping.")
            skipped_count += len(page_data.get("regions", []))
            continue

        processed_pages += 1
        # We are adding elements, no need to fetch existing ones unless we want to prevent duplicates (not implemented here)

        regions_to_add = []
        for region_data in page_data.get("regions", []):
            # We import all regions, not just modified ones
            # if not region_data.get("modified", False):
            #     continue # Only process modified regions

            region_id = region_data.get("id", "unknown")
            manifest_bbox = region_data.get("bbox")
            # Use corrected_text as the primary text source for the new element
            text_to_import = region_data.get("corrected_text")
            # Fallback to ocr_text if corrected_text is missing (though unlikely from the SPA)
            if text_to_import is None:
                text_to_import = region_data.get("ocr_text")

            resolution = region_data.get("resolution")  # Mandatory from export
            confidence = region_data.get("confidence")  # Optional

            if not all([manifest_bbox, text_to_import is not None, resolution]):
                logger.warning(f"Skipping incomplete/invalid region data on page {page_index}, region id '{region_id}': Missing bbox, text, or resolution.")
                skipped_count += 1
                continue

            # Convert manifest bbox (image pixels at `resolution` DPI) back to
            # PDF coordinates (points @ 72 DPI).
            try:
                scale_factor = 72.0 / float(resolution)
                pdf_x0 = manifest_bbox[0] * scale_factor
                pdf_top = manifest_bbox[1] * scale_factor
                pdf_x1 = manifest_bbox[2] * scale_factor
                pdf_bottom = manifest_bbox[3] * scale_factor
            except (ValueError, TypeError, IndexError, ZeroDivisionError):
                logger.warning(f"Invalid bbox or resolution for region '{region_id}' on page {page_index}. Skipping.")
                skipped_count += 1
                continue

            # --- Create New Element ---
            try:
                new_element = TextElement(
                    text=text_to_import,
                    x0=pdf_x0,
                    top=pdf_top,
                    x1=pdf_x1,
                    bottom=pdf_bottom,
                    page=page,  # Reference to the parent Page object
                    source='manifest-import',  # Indicate origin
                    confidence=confidence,  # Pass confidence if available
                    # Keep the original OCR text only when it was actually changed.
                    metadata={'original_ocr': region_data.get("ocr_text")} if region_data.get("ocr_text") != text_to_import else {}
                )
                regions_to_add.append(new_element)
                imported_count += 1
            except Exception as e:
                logger.error(f"Error creating TextElement for region '{region_id}' on page {page_index}: {e}", exc_info=True)
                skipped_count += 1

        # --- Add Elements to Page ---
        # Add all created elements for this page in one go
        if regions_to_add:
            try:
                # Accessing _elements directly; use manager if a public add method exists
                if hasattr(page, '_elements') and hasattr(page._elements, 'elements') and isinstance(page._elements.elements, list):
                    page._elements.elements.extend(regions_to_add)
                    # TODO: Should potentially invalidate page element cache if exists
                else:
                    logger.error(f"Could not add elements to page {page.index}, page._elements structure unexpected.")
                    # Decrement count as they weren't actually added
                    imported_count -= len(regions_to_add)
                    skipped_count += len(regions_to_add)
            except Exception as e:
                logger.error(f"Error adding elements to page {page.index}: {e}", exc_info=True)
                # Decrement count as they weren't actually added
                imported_count -= len(regions_to_add)
                skipped_count += len(regions_to_add)

    logger.info(f"Import process finished. Imported: {imported_count}, Skipped: {skipped_count}. Processed {processed_pages} pages from manifest.")
    return {'imported': imported_count, 'skipped': skipped_count}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -16,12 +16,7 @@ Requires-Dist: Pillow
16
16
  Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
- Requires-Dist: torch
20
- Requires-Dist: torchvision
21
- Requires-Dist: transformers
22
- Requires-Dist: huggingface_hub
23
- Requires-Dist: ocrmypdf
24
- Requires-Dist: pikepdf
19
+ Requires-Dist: tqdm
25
20
  Provides-Extra: interactive
26
21
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
27
22
  Provides-Extra: haystack
@@ -29,16 +24,27 @@ Requires-Dist: haystack-ai; extra == "haystack"
29
24
  Requires-Dist: chroma-haystack; extra == "haystack"
30
25
  Requires-Dist: sentence-transformers; extra == "haystack"
31
26
  Requires-Dist: protobuf<4; extra == "haystack"
27
+ Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
32
28
  Provides-Extra: easyocr
33
29
  Requires-Dist: easyocr; extra == "easyocr"
30
+ Requires-Dist: natural-pdf[core-ml]; extra == "easyocr"
34
31
  Provides-Extra: paddle
35
32
  Requires-Dist: paddlepaddle; extra == "paddle"
36
33
  Requires-Dist: paddleocr; extra == "paddle"
37
34
  Provides-Extra: layout-yolo
38
35
  Requires-Dist: doclayout_yolo; extra == "layout-yolo"
36
+ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
39
37
  Provides-Extra: surya
40
38
  Requires-Dist: surya-ocr; extra == "surya"
39
+ Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
40
  Provides-Extra: qa
41
+ Requires-Dist: natural-pdf[core-ml]; extra == "qa"
42
+ Provides-Extra: docling
43
+ Requires-Dist: docling; extra == "docling"
44
+ Requires-Dist: natural-pdf[core-ml]; extra == "docling"
45
+ Provides-Extra: llm
46
+ Requires-Dist: openai>=1.0; extra == "llm"
47
+ Requires-Dist: pydantic; extra == "llm"
42
48
  Provides-Extra: test
43
49
  Requires-Dist: pytest; extra == "test"
44
50
  Provides-Extra: dev
@@ -50,18 +56,30 @@ Requires-Dist: nox; extra == "dev"
50
56
  Requires-Dist: nox-uv; extra == "dev"
51
57
  Requires-Dist: build; extra == "dev"
52
58
  Requires-Dist: uv; extra == "dev"
59
+ Requires-Dist: pipdeptree; extra == "dev"
60
+ Requires-Dist: nbformat; extra == "dev"
61
+ Requires-Dist: jupytext; extra == "dev"
62
+ Requires-Dist: nbclient; extra == "dev"
53
63
  Provides-Extra: all
54
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
55
- Requires-Dist: easyocr; extra == "all"
56
- Requires-Dist: paddlepaddle; extra == "all"
57
- Requires-Dist: paddleocr; extra == "all"
58
- Requires-Dist: doclayout_yolo; extra == "all"
59
- Requires-Dist: surya-ocr; extra == "all"
60
- Requires-Dist: haystack-ai; extra == "all"
61
- Requires-Dist: chroma-haystack; extra == "all"
62
- Requires-Dist: sentence-transformers; extra == "all"
63
- Requires-Dist: protobuf<4; extra == "all"
64
- Requires-Dist: pytest; extra == "all"
64
+ Requires-Dist: natural-pdf[interactive]; extra == "all"
65
+ Requires-Dist: natural-pdf[haystack]; extra == "all"
66
+ Requires-Dist: natural-pdf[easyocr]; extra == "all"
67
+ Requires-Dist: natural-pdf[paddle]; extra == "all"
68
+ Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
69
+ Requires-Dist: natural-pdf[surya]; extra == "all"
70
+ Requires-Dist: natural-pdf[qa]; extra == "all"
71
+ Requires-Dist: natural-pdf[ocr-export]; extra == "all"
72
+ Requires-Dist: natural-pdf[docling]; extra == "all"
73
+ Requires-Dist: natural-pdf[llm]; extra == "all"
74
+ Requires-Dist: natural-pdf[test]; extra == "all"
75
+ Provides-Extra: core-ml
76
+ Requires-Dist: torch; extra == "core-ml"
77
+ Requires-Dist: torchvision; extra == "core-ml"
78
+ Requires-Dist: transformers; extra == "core-ml"
79
+ Requires-Dist: huggingface_hub; extra == "core-ml"
80
+ Provides-Extra: ocr-export
81
+ Requires-Dist: ocrmypdf; extra == "ocr-export"
82
+ Requires-Dist: pikepdf; extra == "ocr-export"
65
83
  Dynamic: license-file
66
84
 
67
85
  # Natural PDF
@@ -89,6 +107,10 @@ pip install natural-pdf[easyocr]
89
107
  pip install natural-pdf[surya]
90
108
  pip install natural-pdf[paddle]
91
109
 
110
+ # Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
111
+ pip install natural-pdf[llm]
112
+ # (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
113
+
92
114
  # Example: Install with interactive viewer support
93
115
  pip install natural-pdf[interactive]
94
116
 
@@ -141,7 +163,7 @@ Natural PDF offers a range of features for working with PDFs:
141
163
  * **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
142
164
  * **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
143
165
  * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
144
- * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
166
+ * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using various engines (e.g., YOLO, Paddle, LLM via API).
145
167
  * **Document QA:** Ask natural language questions about your document's content.
146
168
  * **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
147
169
  * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.