natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -6,14 +6,19 @@ import logging
6
6
  import os
7
7
  import re
8
8
  import tempfile
9
+ import time # Import time
9
10
  from pathlib import Path
10
11
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
12
+ import concurrent.futures # Added import
13
+ from tqdm.auto import tqdm # Added tqdm import
14
+ import threading
11
15
 
12
16
  import pdfplumber
13
17
  from PIL import Image, ImageDraw
14
18
 
15
19
  from natural_pdf.elements.collections import ElementCollection
16
20
  from natural_pdf.elements.region import Region
21
+ from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
17
22
 
18
23
  if TYPE_CHECKING:
19
24
  import pdfplumber
@@ -46,10 +51,20 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
46
51
  from natural_pdf.qa import DocumentQA, get_qa_engine
47
52
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
48
53
 
54
+ # --- Classification Imports --- #
55
+ from natural_pdf.classification.mixin import ClassificationMixin
56
+ from natural_pdf.classification.manager import ClassificationManager # For type hint
57
+ # --- End Classification Imports --- #
58
+
59
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
60
+ from natural_pdf.elements.base import Element # Import base element
61
+ from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
62
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
63
+
49
64
  logger = logging.getLogger(__name__)
50
65
 
51
66
 
52
- class Page:
67
+ class Page(ClassificationMixin, ExtractionMixin):
53
68
  """
54
69
  Enhanced Page wrapper built on top of pdfplumber.Page.
55
70
 
@@ -73,14 +88,21 @@ class Page:
73
88
  self._text_styles = None # Lazy-loaded text style analyzer results
74
89
  self._exclusions = [] # List to store exclusion functions/regions
75
90
 
91
+ # --- ADDED --- Metadata store for mixins
92
+ self.metadata: Dict[str, Any] = {}
93
+ # --- END ADDED ---
94
+
76
95
  # Region management
77
96
  self._regions = {
78
97
  "detected": [], # Layout detection results
79
98
  "named": {}, # Named regions (name -> region)
80
99
  }
81
100
 
82
- # Initialize ElementManager
83
- self._element_mgr = ElementManager(self, font_attrs)
101
+ # Initialize ElementManager, passing font_attrs
102
+ self._element_mgr = ElementManager(self, font_attrs=font_attrs)
103
+ # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
104
+ # --- NEW --- Central registry for analysis results
105
+ self.analyses: Dict[str, Any] = {}
84
106
 
85
107
  # --- Get OCR Manager Instance ---
86
108
  if (
@@ -115,6 +137,8 @@ class Page:
115
137
  # Initialize the internal variable with a single underscore
116
138
  self._layout_analyzer = None
117
139
 
140
+ self._load_elements()
141
+
118
142
  @property
119
143
  def pdf(self) -> "PDF":
120
144
  """Provides public access to the parent PDF object."""
@@ -1257,38 +1281,48 @@ class Page:
1257
1281
  """
1258
1282
  image = None
1259
1283
  render_resolution = resolution if resolution is not None else scale * 72
1284
+ thread_id = threading.current_thread().name
1285
+ logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
1286
+ lock_wait_start = time.monotonic()
1260
1287
  try:
1261
- if include_highlights:
1262
- # Delegate rendering to the central service
1263
- image = self._highlighter.render_page(
1264
- page_index=self.index,
1265
- scale=scale, # Note: scale is used by highlighter internally for drawing
1266
- labels=labels,
1267
- legend_position=legend_position,
1268
- render_ocr=render_ocr,
1269
- resolution=render_resolution, # Pass the calculated resolution
1270
- **kwargs,
1271
- )
1272
- else:
1273
- # Get the base page image directly from pdfplumber if no highlights needed
1274
- # Use the underlying pdfplumber page object
1275
- img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1276
- # Access the PIL image directly (assuming pdfplumber structure)
1277
- image = (
1278
- img_object.annotated
1279
- if hasattr(img_object, "annotated")
1280
- else img_object._repr_png_()
1281
- )
1282
- if isinstance(image, bytes): # Handle cases where it returns bytes
1283
- from io import BytesIO
1288
+ # Acquire the global PDF rendering lock
1289
+ with pdf_render_lock:
1290
+ lock_acquired_time = time.monotonic()
1291
+ logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
1292
+ if include_highlights:
1293
+ # Delegate rendering to the central service
1294
+ image = self._highlighter.render_page(
1295
+ page_index=self.index,
1296
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1297
+ labels=labels,
1298
+ legend_position=legend_position,
1299
+ render_ocr=render_ocr,
1300
+ resolution=render_resolution, # Pass the calculated resolution
1301
+ **kwargs,
1302
+ )
1303
+ else:
1304
+ # Get the base page image directly from pdfplumber if no highlights needed
1305
+ # Use the underlying pdfplumber page object
1306
+ img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1307
+ # Access the PIL image directly (assuming pdfplumber structure)
1308
+ image = (
1309
+ img_object.annotated
1310
+ if hasattr(img_object, "annotated")
1311
+ else img_object._repr_png_()
1312
+ )
1313
+ if isinstance(image, bytes): # Handle cases where it returns bytes
1314
+ from io import BytesIO
1284
1315
 
1285
- image = Image.open(BytesIO(image)).convert(
1286
- "RGB"
1287
- ) # Convert to RGB for consistency
1316
+ image = Image.open(BytesIO(image)).convert(
1317
+ "RGB"
1318
+ ) # Convert to RGB for consistency
1288
1319
 
1289
1320
  except Exception as e:
1290
1321
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1291
1322
  return None # Return None on error
1323
+ finally:
1324
+ render_end_time = time.monotonic()
1325
+ logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
1292
1326
 
1293
1327
  if image is None:
1294
1328
  return None
@@ -1384,6 +1418,7 @@ class Page:
1384
1418
  resolution: Optional[int] = None,
1385
1419
  detect_only: bool = False,
1386
1420
  apply_exclusions: bool = True,
1421
+ replace: bool = True,
1387
1422
  ) -> "Page":
1388
1423
  """
1389
1424
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
@@ -1397,13 +1432,21 @@ class Page:
1397
1432
  resolution: DPI resolution for rendering page image before OCR.
1398
1433
  apply_exclusions: If True (default), render page image for OCR
1399
1434
  with excluded areas masked (whited out).
1435
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
1436
+ replace: If True (default), remove any existing OCR elements before
1437
+ adding new ones. If False, add new OCR elements to existing ones.
1400
1438
 
1401
1439
  Returns:
1402
- List of created TextElements derived from OCR results for this page.
1440
+ Self for method chaining.
1403
1441
  """
1404
1442
  if not hasattr(self._parent, "apply_ocr"):
1405
1443
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1406
- return [] # Return empty list for consistency
1444
+ return self # Return self for chaining
1445
+
1446
+ # Remove existing OCR elements if replace is True
1447
+ if replace and hasattr(self, "_element_mgr"):
1448
+ logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
1449
+ self._element_mgr.remove_ocr_elements()
1407
1450
 
1408
1451
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1409
1452
  try:
@@ -1419,18 +1462,13 @@ class Page:
1419
1462
  resolution=resolution,
1420
1463
  detect_only=detect_only,
1421
1464
  apply_exclusions=apply_exclusions,
1465
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1422
1466
  )
1423
1467
  except Exception as e:
1424
1468
  logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1425
- return []
1469
+ return self # Return self for chaining
1426
1470
 
1427
- # Return the OCR elements specifically added to this page
1428
- ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1429
- logger.debug(
1430
- f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1431
- )
1432
- # Note: The method is typed to return Page for chaining, but the log indicates
1433
- # finding elements. Let's stick to returning self for chaining consistency.
1471
+ # Return self for chaining
1434
1472
  return self
1435
1473
 
1436
1474
  def extract_ocr_elements(
@@ -1471,11 +1509,13 @@ class Page:
1471
1509
 
1472
1510
  try:
1473
1511
  # Get base image without highlights using the determined resolution
1474
- image = self.to_image(resolution=final_resolution, include_highlights=False)
1475
- if not image:
1476
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1477
- return []
1478
- logger.debug(f" Rendered image size: {image.width}x{image.height}")
1512
+ # Use the global PDF rendering lock
1513
+ with pdf_render_lock:
1514
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1515
+ if not image:
1516
+ logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1517
+ return []
1518
+ logger.debug(f" Rendered image size: {image.width}x{image.height}")
1479
1519
  except Exception as e:
1480
1520
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1481
1521
  return []
@@ -2027,43 +2067,166 @@ class Page:
2027
2067
  def correct_ocr(
2028
2068
  self,
2029
2069
  correction_callback: Callable[[Any], Optional[str]],
2070
+ max_workers: Optional[int] = None,
2071
+ progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2030
2072
  ) -> "Page": # Return self for chaining
2031
2073
  """
2032
2074
  Applies corrections to OCR-generated text elements on this page
2033
- using a user-provided callback function.
2075
+ using a user-provided callback function, potentially in parallel.
2034
2076
 
2035
2077
  Finds text elements on this page whose 'source' attribute starts
2036
2078
  with 'ocr' and calls the `correction_callback` for each, passing the
2037
- element itself.
2038
-
2039
- The `correction_callback` should contain the logic to:
2040
- 1. Determine if the element needs correction.
2041
- 2. Perform the correction (e.g., call an LLM).
2042
- 3. Return the new text (`str`) or `None`.
2043
-
2044
- If the callback returns a string, the element's `.text` is updated.
2045
- Metadata updates (source, confidence, etc.) should happen within the callback.
2079
+ element itself. Updates the element's text if the callback returns
2080
+ a new string.
2046
2081
 
2047
2082
  Args:
2048
2083
  correction_callback: A function accepting an element and returning
2049
2084
  `Optional[str]` (new text or None).
2085
+ max_workers: The maximum number of threads to use for parallel execution.
2086
+ If None or 0 or 1, runs sequentially.
2087
+ progress_callback: Optional callback function to call after processing each element.
2050
2088
 
2051
2089
  Returns:
2052
2090
  Self for method chaining.
2053
2091
  """
2054
2092
  logger.info(
2055
- f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
2093
+ f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2094
+ )
2095
+
2096
+ target_elements_collection = self.find_all(
2097
+ selector="text[source=ocr]", apply_exclusions=False
2056
2098
  )
2099
+ target_elements = target_elements_collection.elements # Get the list
2057
2100
 
2058
- # Find OCR elements specifically on this page
2059
- # Note: We typically want to correct even if the element falls in an excluded area
2060
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2101
+ if not target_elements:
2102
+ logger.info(f"Page {self.number}: No OCR elements found to correct.")
2103
+ return self
2104
+
2105
+ processed_count = 0
2106
+ updated_count = 0
2107
+ error_count = 0
2108
+
2109
+ # Define the task to be run by the worker thread or sequentially
2110
+ def _process_element_task(element):
2111
+ try:
2112
+ current_text = getattr(element, 'text', None)
2113
+ # Call the user-provided callback
2114
+ corrected_text = correction_callback(element)
2115
+
2116
+ # Validate result type
2117
+ if corrected_text is not None and not isinstance(corrected_text, str):
2118
+ logger.warning(f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
2119
+ return element, None, None # Treat as no correction
2120
+
2121
+ return element, corrected_text, None # Return element, result, no error
2122
+ except Exception as e:
2123
+ logger.error(
2124
+ f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2125
+ exc_info=False # Keep log concise
2126
+ )
2127
+ return element, None, e # Return element, no result, error
2128
+ finally:
2129
+ # --- Call progress callback here --- #
2130
+ if progress_callback:
2131
+ try:
2132
+ progress_callback()
2133
+ except Exception as cb_e:
2134
+ # Log error in callback itself, but don't stop processing
2135
+ logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
2136
+
2137
+ # Choose execution strategy based on max_workers
2138
+ if max_workers is not None and max_workers > 1:
2139
+ # --- Parallel execution --- #
2140
+ logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
2141
+ futures = []
2142
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2143
+ # Submit all tasks
2144
+ future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
2145
+
2146
+ # Process results as they complete (progress_callback called by worker)
2147
+ for future in concurrent.futures.as_completed(future_to_element):
2148
+ processed_count += 1
2149
+ try:
2150
+ element, corrected_text, error = future.result()
2151
+ if error:
2152
+ error_count += 1
2153
+ # Error already logged in worker
2154
+ elif corrected_text is not None:
2155
+ # Apply correction if text changed
2156
+ current_text = getattr(element, 'text', None)
2157
+ if corrected_text != current_text:
2158
+ element.text = corrected_text
2159
+ updated_count += 1
2160
+ except Exception as exc:
2161
+ # Catch errors from future.result() itself
2162
+ element = future_to_element[future] # Find original element
2163
+ logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
2164
+ error_count += 1
2165
+ # Note: progress_callback was already called in the worker's finally block
2166
+
2167
+ else:
2168
+ # --- Sequential execution --- #
2169
+ logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2170
+ for element in target_elements:
2171
+ # Call the task function directly (it handles progress_callback)
2172
+ processed_count += 1
2173
+ _element, corrected_text, error = _process_element_task(element)
2174
+ if error:
2175
+ error_count += 1
2176
+ elif corrected_text is not None:
2177
+ # Apply correction if text changed
2178
+ current_text = getattr(_element, 'text', None)
2179
+ if corrected_text != current_text:
2180
+ _element.text = corrected_text
2181
+ updated_count += 1
2061
2182
 
2062
- # Delegate to the utility function
2063
- _apply_ocr_correction_to_elements(
2064
- elements=target_elements, # Pass the ElementCollection directly
2065
- correction_callback=correction_callback,
2066
- caller_info=f"Page({self.number})", # Pass caller info
2183
+ logger.info(
2184
+ f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2067
2185
  )
2068
2186
 
2069
- return self # Return self for chaining
2187
+ return self # Return self for chaining
2188
+
2189
+ # --- Classification Mixin Implementation --- #
2190
+ def _get_classification_manager(self) -> "ClassificationManager":
2191
+ if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
2192
+ raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
2193
+ try:
2194
+ # Use the PDF's manager registry accessor
2195
+ return self.pdf.get_manager('classification')
2196
+ except (ValueError, RuntimeError, AttributeError) as e:
2197
+ # Wrap potential errors from get_manager for clarity
2198
+ raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
2199
+
2200
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
2201
+ if model_type == 'text':
2202
+ text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
2203
+ if not text_content or text_content.isspace():
2204
+ raise ValueError("Cannot classify page with 'text' model: No text content found.")
2205
+ return text_content
2206
+ elif model_type == 'vision':
2207
+ # Get resolution from manager/kwargs if possible, else default
2208
+ manager = self._get_classification_manager()
2209
+ default_resolution = 150
2210
+ # Access kwargs passed to classify method if needed
2211
+ resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
2212
+
2213
+ # Use to_image, ensuring no highlights interfere
2214
+ img = self.to_image(
2215
+ resolution=resolution,
2216
+ include_highlights=False,
2217
+ labels=False,
2218
+ exclusions=None # Don't mask exclusions for classification input image
2219
+ )
2220
+ if img is None:
2221
+ raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
2222
+ return img
2223
+ else:
2224
+ raise ValueError(f"Unsupported model_type for classification: {model_type}")
2225
+
2226
+ def _get_metadata_storage(self) -> Dict[str, Any]:
2227
+ # Ensure metadata exists
2228
+ if not hasattr(self, 'metadata') or self.metadata is None:
2229
+ self.metadata = {}
2230
+ return self.metadata
2231
+
2232
+ # --- Content Extraction ---