natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -143,11 +143,13 @@ class EasyOCREngine(OCREngine):
143
143
  standardized_regions = []
144
144
 
145
145
  if detect_only:
146
+ results = raw_results[0]
146
147
  # In detect_only mode, raw_results is already a list of bounding boxes
147
148
  # Each bbox is in [x_min, x_max, y_min, y_max] format
148
- if isinstance(raw_results, list):
149
- for detection in raw_results:
149
+ if isinstance(results, list):
150
+ for detection in results:
150
151
  try:
152
+ # This block expects 'detection' to be a list/tuple of 4 numbers
151
153
  if isinstance(detection, (list, tuple)) and len(detection) == 4:
152
154
  x_min, x_max, y_min, y_max = detection
153
155
  # Convert to standardized (x0, y0, x1, y1) format
@@ -161,6 +163,7 @@ class EasyOCREngine(OCREngine):
161
163
  f"Invalid number format in EasyOCR detect bbox: {detection}"
162
164
  ) from e
163
165
  else:
166
+ # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
164
167
  raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
165
168
  except ValueError as e:
166
169
  # Re-raise any value errors from standardization or format checks
@@ -172,7 +175,7 @@ class EasyOCREngine(OCREngine):
172
175
  ) from e
173
176
  else:
174
177
  raise ValueError(
175
- f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
178
+ f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
176
179
  )
177
180
 
178
181
  return standardized_regions
@@ -2,6 +2,8 @@
2
2
  import copy # For deep copying options
3
3
  import logging
4
4
  from typing import Any, Dict, List, Optional, Type, Union
5
+ import threading # Import threading for lock
6
+ import time # Import time for timing
5
7
 
6
8
  from PIL import Image
7
9
 
@@ -30,30 +32,68 @@ class OCRManager:
30
32
  def __init__(self):
31
33
  """Initializes the OCR Manager."""
32
34
  self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
35
+ self._engine_locks: Dict[str, threading.Lock] = {} # Lock per engine type for initialization
36
+ self._engine_inference_locks: Dict[str, threading.Lock] = {} # Lock per engine type for inference
33
37
  logger.info("OCRManager initialized.")
34
38
 
35
39
  def _get_engine_instance(self, engine_name: str) -> OCREngine:
36
- """Retrieves or creates an instance of the specified OCR engine."""
40
+ """Retrieves or creates an instance of the specified OCR engine, ensuring thread-safe initialization."""
37
41
  engine_name = engine_name.lower()
38
42
  if engine_name not in self.ENGINE_REGISTRY:
39
43
  raise ValueError(
40
44
  f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
41
45
  )
42
46
 
43
- if engine_name not in self._engine_instances:
44
- logger.info(f"Creating instance of engine: {engine_name}")
45
- engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
46
- engine_instance = engine_class() # Instantiate first
47
- if not engine_instance.is_available():
48
- # Check availability before storing
49
- # Construct helpful error message with install hint
50
- install_hint = f"pip install 'natural-pdf[{engine_name}]'"
51
- raise RuntimeError(
52
- f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
53
- )
54
- self._engine_instances[engine_name] = engine_instance # Store if available
47
+ # Quick check if instance already exists (avoid lock contention)
48
+ if engine_name in self._engine_instances:
49
+ return self._engine_instances[engine_name]
55
50
 
56
- return self._engine_instances[engine_name]
51
+ # Get or create the lock for this engine type
52
+ if engine_name not in self._engine_locks:
53
+ self._engine_locks[engine_name] = threading.Lock()
54
+
55
+ engine_init_lock = self._engine_locks[engine_name]
56
+
57
+ # Acquire lock to safely check and potentially initialize the engine
58
+ with engine_init_lock:
59
+ # Double-check if another thread initialized it while we waited for the lock
60
+ if engine_name in self._engine_instances:
61
+ return self._engine_instances[engine_name]
62
+
63
+ # If still not initialized, create it now under the lock
64
+ logger.info(f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}")
65
+ engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
66
+ start_time = time.monotonic() # Optional: time initialization
67
+ try:
68
+ engine_instance = engine_class() # Instantiate first
69
+ if not engine_instance.is_available():
70
+ # Check availability before storing
71
+ install_hint = f"pip install 'natural-pdf[{engine_name}]'"
72
+ raise RuntimeError(
73
+ f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
74
+ )
75
+ # Store the shared instance
76
+ self._engine_instances[engine_name] = engine_instance
77
+ end_time = time.monotonic()
78
+ logger.info(f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s).")
79
+ return engine_instance
80
+ except Exception as e:
81
+ # Ensure we don't leave a partial state if init fails
82
+ logger.error(f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}", exc_info=True)
83
+ # Remove potentially partial entry if exists
84
+ if engine_name in self._engine_instances: del self._engine_instances[engine_name]
85
+ raise # Re-raise the exception after logging
86
+
87
+ def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
88
+ """Gets or creates the inference lock for a given engine type."""
89
+ engine_name = engine_name.lower()
90
+ # Assume engine_name is valid as it's checked before this would be called
91
+ if engine_name not in self._engine_inference_locks:
92
+ # Create lock if it doesn't exist (basic thread safety for dict access)
93
+ # A more robust approach might lock around this check/creation too,
94
+ # but contention here is less critical than for engine init or inference itself.
95
+ self._engine_inference_locks[engine_name] = threading.Lock()
96
+ return self._engine_inference_locks[engine_name]
57
97
 
58
98
  def apply_ocr(
59
99
  self,
@@ -127,21 +167,41 @@ class OCRManager:
127
167
  try:
128
168
  engine_instance = self._get_engine_instance(selected_engine_name)
129
169
  processing_mode = "batch" if is_batch else "single image"
130
- logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
170
+ # Log thread name for clarity during parallel calls
171
+ thread_id = threading.current_thread().name
172
+ logger.info(f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'...")
131
173
  logger.debug(
132
174
  f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
133
175
  )
134
176
 
135
- # Call the engine's process_image, passing common args and options object
136
- # **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
137
- results = engine_instance.process_image(
138
- images=images,
139
- languages=languages,
140
- min_confidence=min_confidence,
141
- device=device,
142
- detect_only=detect_only,
143
- options=final_options,
144
- )
177
+ # Log image dimensions before processing
178
+ if is_batch:
179
+ image_dims = [f"{img.width}x{img.height}" for img in images if hasattr(img, 'width') and hasattr(img, 'height')]
180
+ logger.debug(f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}")
181
+ elif hasattr(images, 'width') and hasattr(images, 'height'):
182
+ logger.debug(f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}")
183
+ else:
184
+ logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
185
+
186
+ # Acquire lock specifically for the inference call
187
+ inference_lock = self._get_engine_inference_lock(selected_engine_name)
188
+ logger.debug(f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}...")
189
+ inference_wait_start = time.monotonic()
190
+ with inference_lock:
191
+ inference_acquired_time = time.monotonic()
192
+ logger.debug(f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image...")
193
+ inference_start_time = time.monotonic()
194
+
195
+ results = engine_instance.process_image(
196
+ images=images,
197
+ languages=languages,
198
+ min_confidence=min_confidence,
199
+ device=device,
200
+ detect_only=detect_only,
201
+ options=final_options,
202
+ )
203
+ inference_end_time = time.monotonic()
204
+ logger.debug(f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock.")
145
205
 
146
206
  # Log result summary based on mode
147
207
  if is_batch:
@@ -67,9 +67,9 @@ class EasyOCROptions(BaseOCROptions):
67
67
  class PaddleOCROptions(BaseOCROptions):
68
68
  """Specific options for the PaddleOCR engine."""
69
69
 
70
- use_angle_cls: bool = True
70
+ # General
71
71
  use_gpu: Optional[bool] = None
72
- gpu_mem: int = 500
72
+ gpu_mem: int = 8000 # Default from Paddle documentation
73
73
  ir_optim: bool = True
74
74
  use_tensorrt: bool = False
75
75
  min_subgraph_size: int = 15
@@ -77,19 +77,42 @@ class PaddleOCROptions(BaseOCROptions):
77
77
  enable_mkldnn: bool = False
78
78
  cpu_threads: int = 10
79
79
  use_fp16: bool = False
80
+ show_log: bool = False
81
+ use_onnx: bool = False
82
+ use_zero_copy_run: bool = False
83
+
84
+ # Detection
85
+ det: bool = True
86
+ det_algorithm: str = "DB"
80
87
  det_model_dir: Optional[str] = None
88
+ det_limit_side_len: int = 960 # Corresponds to det_max_side_len
89
+ # DB specific
90
+ det_db_thresh: float = 0.3
91
+ det_db_box_thresh: float = 0.5
92
+ det_db_unclip_ratio: float = 2.0
93
+ # EAST specific
94
+ det_east_score_thresh: float = 0.8
95
+ det_east_cover_thresh: float = 0.1
96
+ det_east_nms_thresh: float = 0.2
97
+
98
+ # Recognition
99
+ rec: bool = True
100
+ rec_algorithm: str = "CRNN"
81
101
  rec_model_dir: Optional[str] = None
82
- cls_model_dir: Optional[str] = None
83
- det_limit_side_len: int = 960
84
- rec_batch_num: int = 6
102
+ rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
103
+ rec_batch_num: int = 30 # Default from Paddle documentation
85
104
  max_text_length: int = 25
105
+ rec_char_dict_path: Optional[str] = None # Path to char dictionary file
86
106
  use_space_char: bool = True
87
107
  drop_score: float = 0.5
88
- show_log: bool = False
89
- use_onnx: bool = False
90
- det: bool = True
91
- rec: bool = True
92
- cls: Optional[bool] = None
108
+
109
+ # Classification
110
+ cls: Optional[bool] = None # Often inferred from use_angle_cls
111
+ use_angle_cls: bool = False # Default from Paddle documentation
112
+ cls_model_dir: Optional[str] = None
113
+ cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
114
+ label_list: List[str] = field(default_factory=lambda: ['0', '180']) # Default from Paddle doc
115
+ cls_batch_num: int = 30
93
116
 
94
117
  def __post_init__(self):
95
118
  pass
natural_pdf/ocr/utils.py CHANGED
@@ -8,6 +8,9 @@ from tqdm.auto import tqdm
8
8
  if TYPE_CHECKING:
9
9
  from natural_pdf.elements.base import Element
10
10
 
11
+ # Import the global PDF render lock from dedicated locks module
12
+ from natural_pdf.utils.locks import pdf_render_lock
13
+
11
14
  logger = logging.getLogger(__name__)
12
15
 
13
16
 
@@ -72,7 +75,7 @@ def direct_ocr_llm(
72
75
  client,
73
76
  model="",
74
77
  resolution=150,
75
- prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
78
+ prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc. If you cannot see any text, return an empty string.",
76
79
  padding=2,
77
80
  ) -> str:
78
81
  """Convenience method to directly OCR a region of the page."""
@@ -83,7 +86,15 @@ def direct_ocr_llm(
83
86
  region = element
84
87
 
85
88
  buffered = io.BytesIO()
86
- region_img = region.to_image(resolution=resolution, include_highlights=False)
89
+ # Use the global PDF render lock when rendering images
90
+ with pdf_render_lock:
91
+ region_img = region.to_image(resolution=resolution, include_highlights=False)
92
+
93
+ # Handle cases where image creation might fail (e.g., zero-dim region)
94
+ if region_img is None:
95
+ logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
96
+ return "" # Return empty string if image creation failed
97
+
87
98
  region_img.save(buffered, format="PNG")
88
99
  base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
89
100
 
@@ -107,7 +118,7 @@ def direct_ocr_llm(
107
118
  ],
108
119
  )
109
120
 
110
- corrected = response.choices[0].message.content
121
+ corrected = response.choices[0].message.content.strip()
111
122
  logger.debug(f"Corrected {region.extract_text()} to {corrected}")
112
123
 
113
124
  return corrected
@@ -58,10 +58,6 @@ class DocumentQA:
58
58
  import torch
59
59
  from transformers import pipeline
60
60
 
61
- # Determine device
62
- if device is None:
63
- device = "cuda" if torch.cuda.is_available() else "cpu"
64
-
65
61
  logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
66
62
 
67
63
  # Initialize the pipeline