natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -52,4 +52,36 @@ __version__ = "0.1.1"
52
52
  if HAS_QA:
53
53
  __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
54
54
  else:
55
- __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
55
+ __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
56
+
57
+ # Core classes
58
+ from .core.pdf import PDF
59
+ from .collections.pdf_collection import PDFCollection
60
+ from .elements.region import Region
61
+
62
+ # Search options (if extras installed)
63
+ try:
64
+ from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
65
+ except ImportError:
66
+ # Define dummy classes if extras not installed, so imports don't break
67
+ # but using them will raise the ImportError from check_haystack_availability
68
+ class TextSearchOptions:
69
+ def __init__(self, *args, **kwargs): pass
70
+ class MultiModalSearchOptions:
71
+ def __init__(self, *args, **kwargs): pass
72
+ class BaseSearchOptions:
73
+ def __init__(self, *args, **kwargs): pass
74
+
75
+ # Expose logging setup? (Optional)
76
+ # from . import logging_config
77
+ # logging_config.setup_logging()
78
+
79
+ # Explicitly define what gets imported with 'from natural_pdf import *'
80
+ __all__ = [
81
+ 'PDF',
82
+ 'PDFCollection',
83
+ 'Region',
84
+ 'TextSearchOptions', # Include search options
85
+ 'MultiModalSearchOptions',
86
+ 'BaseSearchOptions'
87
+ ]
@@ -1,10 +1,11 @@
1
1
  import logging
2
2
  from typing import List, Dict, Any, Optional, Union
3
3
  from PIL import Image
4
+ import copy
4
5
 
5
6
  from natural_pdf.elements.region import Region
6
7
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
7
- from natural_pdf.analyzers.layout.layout_options import LayoutOptions
8
+ from natural_pdf.analyzers.layout.layout_options import LayoutOptions, TATRLayoutOptions, BaseLayoutOptions
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -36,20 +37,25 @@ class LayoutAnalyzer:
36
37
  classes: Optional[List[str]] = None,
37
38
  exclude_classes: Optional[List[str]] = None,
38
39
  device: Optional[str] = None,
39
- existing: str = "replace"
40
+ existing: str = "replace",
41
+ **kwargs
40
42
  ) -> List[Region]:
41
43
  """
42
44
  Analyze the page layout using the configured LayoutManager.
43
45
 
46
+ This method constructs the final options object, including internal context,
47
+ and passes it to the LayoutManager.
48
+
44
49
  Args:
45
- engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
46
- options: Specific LayoutOptions object for advanced configuration.
50
+ engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None and no options object given.
51
+ options: Specific LayoutOptions object for advanced configuration. If provided, simple args (confidence, etc.) are ignored.
47
52
  confidence: Minimum confidence threshold (simple mode).
48
53
  classes: Specific classes to detect (simple mode).
49
54
  exclude_classes: Classes to exclude (simple mode).
50
55
  device: Device for inference (simple mode).
51
56
  existing: How to handle existing detected regions: 'replace' (default) or 'append'.
52
-
57
+ **kwargs: Additional engine-specific arguments (added to options.extra_args or used by constructor if options=None).
58
+
53
59
  Returns:
54
60
  List of created Region objects.
55
61
  """
@@ -57,72 +63,139 @@ class LayoutAnalyzer:
57
63
  logger.error(f"Page {self._page.number}: LayoutManager not available. Cannot analyze layout.")
58
64
  return []
59
65
 
60
- logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")
66
+ logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options provided: {options is not None})...")
61
67
 
62
- # --- Render Page Image ---
63
- logger.debug(f" Rendering page {self._page.number} to image for layout analysis...")
68
+ # --- Render Page Image (Standard Resolution) ---
69
+ logger.debug(f" Rendering page {self._page.number} to image for initial layout detection...")
64
70
  try:
65
- # Use a resolution suitable for layout analysis, potentially configurable
66
- layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5) # ~108 DPI default
71
+ layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5)
67
72
  layout_resolution = layout_scale * 72
68
- # Render without existing highlights to avoid interference
69
- page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
70
- logger.debug(f" Rendered image size: {page_image.width}x{page_image.height}")
73
+ std_res_page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
74
+ if not std_res_page_image:
75
+ raise ValueError("Initial page rendering returned None")
76
+ logger.debug(f" Initial rendered image size: {std_res_page_image.width}x{std_res_page_image.height}")
71
77
  except Exception as e:
72
- logger.error(f" Failed to render page {self._page.number} to image: {e}", exc_info=True)
78
+ logger.error(f" Failed to render initial page image: {e}", exc_info=True)
73
79
  return []
80
+
81
+ # --- Calculate Scaling Factors (Standard Res Image <-> PDF) ---
82
+ if std_res_page_image.width == 0 or std_res_page_image.height == 0:
83
+ logger.error(f"Page {self._page.number}: Invalid initial rendered image dimensions. Cannot scale results.")
84
+ return []
85
+ img_scale_x = self._page.width / std_res_page_image.width
86
+ img_scale_y = self._page.height / std_res_page_image.height
87
+ logger.debug(f" StdRes Image -> PDF Scaling: x={img_scale_x:.4f}, y={img_scale_y:.4f}")
74
88
 
75
- # --- Prepare Arguments for Layout Manager ---
76
- manager_args = {'image': page_image, 'options': options, 'engine': engine}
77
- if confidence is not None: manager_args['confidence'] = confidence
78
- if classes is not None: manager_args['classes'] = classes
79
- if exclude_classes is not None: manager_args['exclude_classes'] = exclude_classes
80
- if device is not None: manager_args['device'] = device
81
-
82
- # --- Call Layout Manager ---
83
- logger.debug(f" Calling Layout Manager...")
89
+ # --- Construct Final Options Object ---
90
+ final_options: BaseLayoutOptions
91
+
92
+ if options is not None:
93
+ # User provided a complete options object, use it directly
94
+ logger.debug("Using user-provided options object.")
95
+ final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
96
+ if kwargs:
97
+ logger.warning(f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided.")
98
+ # Infer engine from options type if engine arg wasn't provided
99
+ if engine is None:
100
+ for name, registry_entry in self._layout_manager.ENGINE_REGISTRY.items():
101
+ if isinstance(final_options, registry_entry['options_class']):
102
+ engine = name
103
+ logger.debug(f"Inferred engine '{engine}' from options type.")
104
+ break
105
+ if engine is None:
106
+ logger.warning("Could not infer engine from provided options object.")
107
+ else:
108
+ # Construct options from simple args (engine, confidence, classes, etc.)
109
+ logger.debug("Constructing options from simple arguments.")
110
+ selected_engine = engine or self._layout_manager.get_available_engines()[0] # Use provided or first available
111
+ engine_lower = selected_engine.lower()
112
+ registry = self._layout_manager.ENGINE_REGISTRY
113
+
114
+ if engine_lower not in registry:
115
+ raise ValueError(f"Unknown or unavailable engine: '{selected_engine}'. Available: {list(registry.keys())}")
116
+
117
+ options_class = registry[engine_lower]['options_class']
118
+
119
+ # Get base defaults
120
+ base_defaults = BaseLayoutOptions()
121
+
122
+ # Prepare args for constructor, prioritizing explicit args over defaults
123
+ constructor_args = {
124
+ 'confidence': confidence if confidence is not None else base_defaults.confidence,
125
+ 'classes': classes, # Pass None if not provided
126
+ 'exclude_classes': exclude_classes, # Pass None if not provided
127
+ 'device': device if device is not None else base_defaults.device,
128
+ 'extra_args': kwargs # Pass other kwargs here
129
+ }
130
+ # Remove None values unless they are valid defaults (like classes=None)
131
+ # We can pass all to the dataclass constructor; it handles defaults
132
+
133
+ try:
134
+ final_options = options_class(**constructor_args)
135
+ logger.debug(f"Constructed options: {final_options}")
136
+ except TypeError as e:
137
+ logger.error(f"Failed to construct options object {options_class.__name__} with args {constructor_args}: {e}")
138
+ # Filter kwargs to only include fields defined in the specific options class? Complex.
139
+ # Re-raise for now, indicates programming error or invalid kwarg.
140
+ raise e
141
+
142
+ # --- Add Internal Context to extra_args (ALWAYS) ---
143
+ if not hasattr(final_options, 'extra_args') or final_options.extra_args is None:
144
+ final_options.extra_args = {}
145
+ final_options.extra_args['_page_ref'] = self._page
146
+ final_options.extra_args['_img_scale_x'] = img_scale_x
147
+ final_options.extra_args['_img_scale_y'] = img_scale_y
148
+ logger.debug(f"Added internal context to final_options.extra_args: {final_options.extra_args}")
149
+
150
+ # --- Call Layout Manager with the Final Options ---
151
+ logger.debug(f"Calling Layout Manager with final options object.")
84
152
  try:
85
- detections = self._layout_manager.analyze_layout(**manager_args)
153
+ # Pass only image and the constructed options object
154
+ detections = self._layout_manager.analyze_layout(
155
+ image=std_res_page_image,
156
+ options=final_options
157
+ # No engine, confidence, classes etc. passed here directly
158
+ )
86
159
  logger.info(f" Layout Manager returned {len(detections)} detections.")
87
160
  except Exception as e:
88
161
  logger.error(f" Layout analysis failed: {e}", exc_info=True)
89
162
  return []
90
163
 
91
- # --- Process Detections (Convert to Regions, Scale Coords) ---
92
- # Calculate scale factor to convert from image back to PDF coordinates
93
- if page_image.width == 0 or page_image.height == 0:
94
- logger.error(f"Page {self._page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
95
- return []
96
- scale_x = self._page.width / page_image.width
97
- scale_y = self._page.height / page_image.height
98
- logger.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")
99
-
164
+ # --- Process Detections (Convert to Regions, Scale Coords from Image to PDF) ---
100
165
  layout_regions = []
101
166
  docling_id_to_region = {} # For hierarchy if using Docling
102
167
 
103
168
  for detection in detections:
104
169
  try:
170
+ # bbox is relative to std_res_page_image
105
171
  x_min, y_min, x_max, y_max = detection['bbox']
106
172
 
107
173
  # Convert coordinates from image to PDF space
108
- pdf_x0 = x_min * scale_x
109
- pdf_y0 = y_min * scale_y
110
- pdf_x1 = x_max * scale_x
111
- pdf_y1 = y_max * scale_y
112
-
113
- # Create a Region object
174
+ pdf_x0 = x_min * img_scale_x
175
+ pdf_y0 = y_min * img_scale_y
176
+ pdf_x1 = x_max * img_scale_x
177
+ pdf_y1 = y_max * img_scale_y
178
+
179
+ # Ensure PDF coords are valid
180
+ pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
181
+ pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
182
+ pdf_x0 = max(0, pdf_x0)
183
+ pdf_y0 = max(0, pdf_y0)
184
+ pdf_x1 = min(self._page.width, pdf_x1)
185
+ pdf_y1 = min(self._page.height, pdf_y1)
186
+
187
+ # Create a Region object with PDF coordinates
114
188
  region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
115
- region.region_type = detection.get('class', 'unknown') # Original class name
116
- region.normalized_type = detection.get('normalized_class', 'unknown') # Hyphenated name
189
+ region.region_type = detection.get('class', 'unknown')
190
+ region.normalized_type = detection.get('normalized_class', 'unknown')
117
191
  region.confidence = detection.get('confidence', 0.0)
118
- region.model = detection.get('model', engine or 'unknown') # Store model name
192
+ region.model = detection.get('model', engine or 'unknown')
119
193
  region.source = 'detected'
120
-
194
+
121
195
  # Add extra info if available
122
196
  if 'text' in detection: region.text_content = detection['text']
123
197
  if 'docling_id' in detection: region.docling_id = detection['docling_id']
124
198
  if 'parent_id' in detection: region.parent_id = detection['parent_id']
125
- # Add other fields like polygon, position, row/col index if needed
126
199
 
127
200
  layout_regions.append(region)
128
201
 
@@ -163,4 +236,20 @@ class LayoutAnalyzer:
163
236
  self._page.detected_layout_regions = self._page._regions['detected']
164
237
  logger.info(f"Layout analysis complete for page {self._page.number}.")
165
238
 
239
+ # --- Auto-create cells if requested by TATR options ---
240
+ if isinstance(final_options, TATRLayoutOptions) and final_options.create_cells:
241
+ logger.info(f" Option create_cells=True detected for TATR. Attempting cell creation...")
242
+ created_cell_count = 0
243
+ for region in layout_regions:
244
+ # Only attempt on regions identified as tables by the TATR model
245
+ if region.model == 'tatr' and region.region_type == 'table':
246
+ try:
247
+ # create_cells now modifies the page elements directly and returns self
248
+ region.create_cells()
249
+ # We could potentially count cells created here if needed,
250
+ # but the method logs its own count.
251
+ except Exception as cell_error:
252
+ logger.warning(f" Error calling create_cells for table region {region.bbox}: {cell_error}")
253
+ logger.info(f" Finished cell creation process triggered by options.")
254
+
166
255
  return layout_regions
@@ -120,9 +120,10 @@ class LayoutManager:
120
120
 
121
121
  # --- Determine Options and Engine ---
122
122
  if options is not None:
123
- # Advanced Mode
124
- logger.debug(f"LayoutManager: Using advanced mode with options object: {type(options).__name__}")
125
- final_options = copy.deepcopy(options) # Use copy
123
+ # Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
124
+ # Use this object directly, do not deep copy or reconstruct.
125
+ logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
126
+ final_options = options # Use the provided object directly
126
127
  found_engine = False
127
128
  for name, registry_entry in self.ENGINE_REGISTRY.items():
128
129
  if isinstance(options, registry_entry['options_class']):
@@ -131,12 +132,14 @@ class LayoutManager:
131
132
  break
132
133
  if not found_engine:
133
134
  raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
135
+ # Ignore simple kwargs if options object is present
134
136
  if kwargs:
135
- logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
137
+ logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored.")
136
138
  else:
137
- # Simple Mode
139
+ # Simple Mode: No options object provided initially.
140
+ # Determine engine from kwargs or default, then construct options.
138
141
  selected_engine_name = default_engine.lower()
139
- logger.debug(f"LayoutManager: Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
142
+ logger.debug(f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}")
140
143
 
141
144
  if selected_engine_name not in self.ENGINE_REGISTRY:
142
145
  raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
@@ -34,7 +34,7 @@ class TATRLayoutOptions(BaseLayoutOptions):
34
34
  max_detection_size: int = 800
35
35
  max_structure_size: int = 1000
36
36
  # Whether to create cell regions (can be slow)
37
- create_cells: bool = False # Keep the flag for cell creation control
37
+ create_cells: bool = True
38
38
 
39
39
  # --- Paddle Specific Options ---
40
40
  @dataclass
@@ -51,10 +51,8 @@ class PaddleLayoutOptions(BaseLayoutOptions):
51
51
  @dataclass
52
52
  class SuryaLayoutOptions(BaseLayoutOptions):
53
53
  """Options specific to Surya layout detection."""
54
- # Surya doesn't seem to have many config options based on the example,
55
- # but we can add placeholders if needed. Device is handled by BaseLayoutOptions.
56
54
  model_name: str = "default" # Placeholder if different models become available
57
- verbose: bool = False # Verbose logging for the detector class
55
+ recognize_table_structure: bool = True # Automatically run table structure recognition?
58
56
 
59
57
  # --- Docling Specific Options ---
60
58
  @dataclass