natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -52,4 +52,36 @@ __version__ = "0.1.1"
|
|
52
52
|
if HAS_QA:
|
53
53
|
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
|
54
54
|
else:
|
55
|
-
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
|
55
|
+
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
|
56
|
+
|
57
|
+
# Core classes
|
58
|
+
from .core.pdf import PDF
|
59
|
+
from .collections.pdf_collection import PDFCollection
|
60
|
+
from .elements.region import Region
|
61
|
+
|
62
|
+
# Search options (if extras installed)
|
63
|
+
try:
|
64
|
+
from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
|
65
|
+
except ImportError:
|
66
|
+
# Define dummy classes if extras not installed, so imports don't break
|
67
|
+
# but using them will raise the ImportError from check_haystack_availability
|
68
|
+
class TextSearchOptions:
|
69
|
+
def __init__(self, *args, **kwargs): pass
|
70
|
+
class MultiModalSearchOptions:
|
71
|
+
def __init__(self, *args, **kwargs): pass
|
72
|
+
class BaseSearchOptions:
|
73
|
+
def __init__(self, *args, **kwargs): pass
|
74
|
+
|
75
|
+
# Expose logging setup? (Optional)
|
76
|
+
# from . import logging_config
|
77
|
+
# logging_config.setup_logging()
|
78
|
+
|
79
|
+
# Explicitly define what gets imported with 'from natural_pdf import *'
|
80
|
+
__all__ = [
|
81
|
+
'PDF',
|
82
|
+
'PDFCollection',
|
83
|
+
'Region',
|
84
|
+
'TextSearchOptions', # Include search options
|
85
|
+
'MultiModalSearchOptions',
|
86
|
+
'BaseSearchOptions'
|
87
|
+
]
|
@@ -1,10 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import List, Dict, Any, Optional, Union
|
3
3
|
from PIL import Image
|
4
|
+
import copy
|
4
5
|
|
5
6
|
from natural_pdf.elements.region import Region
|
6
7
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
7
|
-
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
8
|
+
from natural_pdf.analyzers.layout.layout_options import LayoutOptions, TATRLayoutOptions, BaseLayoutOptions
|
8
9
|
|
9
10
|
logger = logging.getLogger(__name__)
|
10
11
|
|
@@ -36,20 +37,25 @@ class LayoutAnalyzer:
|
|
36
37
|
classes: Optional[List[str]] = None,
|
37
38
|
exclude_classes: Optional[List[str]] = None,
|
38
39
|
device: Optional[str] = None,
|
39
|
-
existing: str = "replace"
|
40
|
+
existing: str = "replace",
|
41
|
+
**kwargs
|
40
42
|
) -> List[Region]:
|
41
43
|
"""
|
42
44
|
Analyze the page layout using the configured LayoutManager.
|
43
45
|
|
46
|
+
This method constructs the final options object, including internal context,
|
47
|
+
and passes it to the LayoutManager.
|
48
|
+
|
44
49
|
Args:
|
45
|
-
engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
|
46
|
-
options: Specific LayoutOptions object for advanced configuration.
|
50
|
+
engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None and no options object given.
|
51
|
+
options: Specific LayoutOptions object for advanced configuration. If provided, simple args (confidence, etc.) are ignored.
|
47
52
|
confidence: Minimum confidence threshold (simple mode).
|
48
53
|
classes: Specific classes to detect (simple mode).
|
49
54
|
exclude_classes: Classes to exclude (simple mode).
|
50
55
|
device: Device for inference (simple mode).
|
51
56
|
existing: How to handle existing detected regions: 'replace' (default) or 'append'.
|
52
|
-
|
57
|
+
**kwargs: Additional engine-specific arguments (added to options.extra_args or used by constructor if options=None).
|
58
|
+
|
53
59
|
Returns:
|
54
60
|
List of created Region objects.
|
55
61
|
"""
|
@@ -57,72 +63,139 @@ class LayoutAnalyzer:
|
|
57
63
|
logger.error(f"Page {self._page.number}: LayoutManager not available. Cannot analyze layout.")
|
58
64
|
return []
|
59
65
|
|
60
|
-
logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")
|
66
|
+
logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options provided: {options is not None})...")
|
61
67
|
|
62
|
-
# --- Render Page Image ---
|
63
|
-
logger.debug(f" Rendering page {self._page.number} to image for layout
|
68
|
+
# --- Render Page Image (Standard Resolution) ---
|
69
|
+
logger.debug(f" Rendering page {self._page.number} to image for initial layout detection...")
|
64
70
|
try:
|
65
|
-
|
66
|
-
layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5) # ~108 DPI default
|
71
|
+
layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5)
|
67
72
|
layout_resolution = layout_scale * 72
|
68
|
-
|
69
|
-
|
70
|
-
|
73
|
+
std_res_page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
|
74
|
+
if not std_res_page_image:
|
75
|
+
raise ValueError("Initial page rendering returned None")
|
76
|
+
logger.debug(f" Initial rendered image size: {std_res_page_image.width}x{std_res_page_image.height}")
|
71
77
|
except Exception as e:
|
72
|
-
logger.error(f" Failed to render page
|
78
|
+
logger.error(f" Failed to render initial page image: {e}", exc_info=True)
|
73
79
|
return []
|
80
|
+
|
81
|
+
# --- Calculate Scaling Factors (Standard Res Image <-> PDF) ---
|
82
|
+
if std_res_page_image.width == 0 or std_res_page_image.height == 0:
|
83
|
+
logger.error(f"Page {self._page.number}: Invalid initial rendered image dimensions. Cannot scale results.")
|
84
|
+
return []
|
85
|
+
img_scale_x = self._page.width / std_res_page_image.width
|
86
|
+
img_scale_y = self._page.height / std_res_page_image.height
|
87
|
+
logger.debug(f" StdRes Image -> PDF Scaling: x={img_scale_x:.4f}, y={img_scale_y:.4f}")
|
74
88
|
|
75
|
-
# ---
|
76
|
-
|
77
|
-
|
78
|
-
if
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
89
|
+
# --- Construct Final Options Object ---
|
90
|
+
final_options: BaseLayoutOptions
|
91
|
+
|
92
|
+
if options is not None:
|
93
|
+
# User provided a complete options object, use it directly
|
94
|
+
logger.debug("Using user-provided options object.")
|
95
|
+
final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
|
96
|
+
if kwargs:
|
97
|
+
logger.warning(f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided.")
|
98
|
+
# Infer engine from options type if engine arg wasn't provided
|
99
|
+
if engine is None:
|
100
|
+
for name, registry_entry in self._layout_manager.ENGINE_REGISTRY.items():
|
101
|
+
if isinstance(final_options, registry_entry['options_class']):
|
102
|
+
engine = name
|
103
|
+
logger.debug(f"Inferred engine '{engine}' from options type.")
|
104
|
+
break
|
105
|
+
if engine is None:
|
106
|
+
logger.warning("Could not infer engine from provided options object.")
|
107
|
+
else:
|
108
|
+
# Construct options from simple args (engine, confidence, classes, etc.)
|
109
|
+
logger.debug("Constructing options from simple arguments.")
|
110
|
+
selected_engine = engine or self._layout_manager.get_available_engines()[0] # Use provided or first available
|
111
|
+
engine_lower = selected_engine.lower()
|
112
|
+
registry = self._layout_manager.ENGINE_REGISTRY
|
113
|
+
|
114
|
+
if engine_lower not in registry:
|
115
|
+
raise ValueError(f"Unknown or unavailable engine: '{selected_engine}'. Available: {list(registry.keys())}")
|
116
|
+
|
117
|
+
options_class = registry[engine_lower]['options_class']
|
118
|
+
|
119
|
+
# Get base defaults
|
120
|
+
base_defaults = BaseLayoutOptions()
|
121
|
+
|
122
|
+
# Prepare args for constructor, prioritizing explicit args over defaults
|
123
|
+
constructor_args = {
|
124
|
+
'confidence': confidence if confidence is not None else base_defaults.confidence,
|
125
|
+
'classes': classes, # Pass None if not provided
|
126
|
+
'exclude_classes': exclude_classes, # Pass None if not provided
|
127
|
+
'device': device if device is not None else base_defaults.device,
|
128
|
+
'extra_args': kwargs # Pass other kwargs here
|
129
|
+
}
|
130
|
+
# Remove None values unless they are valid defaults (like classes=None)
|
131
|
+
# We can pass all to the dataclass constructor; it handles defaults
|
132
|
+
|
133
|
+
try:
|
134
|
+
final_options = options_class(**constructor_args)
|
135
|
+
logger.debug(f"Constructed options: {final_options}")
|
136
|
+
except TypeError as e:
|
137
|
+
logger.error(f"Failed to construct options object {options_class.__name__} with args {constructor_args}: {e}")
|
138
|
+
# Filter kwargs to only include fields defined in the specific options class? Complex.
|
139
|
+
# Re-raise for now, indicates programming error or invalid kwarg.
|
140
|
+
raise e
|
141
|
+
|
142
|
+
# --- Add Internal Context to extra_args (ALWAYS) ---
|
143
|
+
if not hasattr(final_options, 'extra_args') or final_options.extra_args is None:
|
144
|
+
final_options.extra_args = {}
|
145
|
+
final_options.extra_args['_page_ref'] = self._page
|
146
|
+
final_options.extra_args['_img_scale_x'] = img_scale_x
|
147
|
+
final_options.extra_args['_img_scale_y'] = img_scale_y
|
148
|
+
logger.debug(f"Added internal context to final_options.extra_args: {final_options.extra_args}")
|
149
|
+
|
150
|
+
# --- Call Layout Manager with the Final Options ---
|
151
|
+
logger.debug(f"Calling Layout Manager with final options object.")
|
84
152
|
try:
|
85
|
-
|
153
|
+
# Pass only image and the constructed options object
|
154
|
+
detections = self._layout_manager.analyze_layout(
|
155
|
+
image=std_res_page_image,
|
156
|
+
options=final_options
|
157
|
+
# No engine, confidence, classes etc. passed here directly
|
158
|
+
)
|
86
159
|
logger.info(f" Layout Manager returned {len(detections)} detections.")
|
87
160
|
except Exception as e:
|
88
161
|
logger.error(f" Layout analysis failed: {e}", exc_info=True)
|
89
162
|
return []
|
90
163
|
|
91
|
-
# --- Process Detections (Convert to Regions, Scale Coords) ---
|
92
|
-
# Calculate scale factor to convert from image back to PDF coordinates
|
93
|
-
if page_image.width == 0 or page_image.height == 0:
|
94
|
-
logger.error(f"Page {self._page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
|
95
|
-
return []
|
96
|
-
scale_x = self._page.width / page_image.width
|
97
|
-
scale_y = self._page.height / page_image.height
|
98
|
-
logger.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")
|
99
|
-
|
164
|
+
# --- Process Detections (Convert to Regions, Scale Coords from Image to PDF) ---
|
100
165
|
layout_regions = []
|
101
166
|
docling_id_to_region = {} # For hierarchy if using Docling
|
102
167
|
|
103
168
|
for detection in detections:
|
104
169
|
try:
|
170
|
+
# bbox is relative to std_res_page_image
|
105
171
|
x_min, y_min, x_max, y_max = detection['bbox']
|
106
172
|
|
107
173
|
# Convert coordinates from image to PDF space
|
108
|
-
pdf_x0 = x_min *
|
109
|
-
pdf_y0 = y_min *
|
110
|
-
pdf_x1 = x_max *
|
111
|
-
pdf_y1 = y_max *
|
112
|
-
|
113
|
-
#
|
174
|
+
pdf_x0 = x_min * img_scale_x
|
175
|
+
pdf_y0 = y_min * img_scale_y
|
176
|
+
pdf_x1 = x_max * img_scale_x
|
177
|
+
pdf_y1 = y_max * img_scale_y
|
178
|
+
|
179
|
+
# Ensure PDF coords are valid
|
180
|
+
pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
|
181
|
+
pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
|
182
|
+
pdf_x0 = max(0, pdf_x0)
|
183
|
+
pdf_y0 = max(0, pdf_y0)
|
184
|
+
pdf_x1 = min(self._page.width, pdf_x1)
|
185
|
+
pdf_y1 = min(self._page.height, pdf_y1)
|
186
|
+
|
187
|
+
# Create a Region object with PDF coordinates
|
114
188
|
region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
|
115
|
-
region.region_type = detection.get('class', 'unknown')
|
116
|
-
region.normalized_type = detection.get('normalized_class', 'unknown')
|
189
|
+
region.region_type = detection.get('class', 'unknown')
|
190
|
+
region.normalized_type = detection.get('normalized_class', 'unknown')
|
117
191
|
region.confidence = detection.get('confidence', 0.0)
|
118
|
-
region.model = detection.get('model', engine or 'unknown')
|
192
|
+
region.model = detection.get('model', engine or 'unknown')
|
119
193
|
region.source = 'detected'
|
120
|
-
|
194
|
+
|
121
195
|
# Add extra info if available
|
122
196
|
if 'text' in detection: region.text_content = detection['text']
|
123
197
|
if 'docling_id' in detection: region.docling_id = detection['docling_id']
|
124
198
|
if 'parent_id' in detection: region.parent_id = detection['parent_id']
|
125
|
-
# Add other fields like polygon, position, row/col index if needed
|
126
199
|
|
127
200
|
layout_regions.append(region)
|
128
201
|
|
@@ -163,4 +236,20 @@ class LayoutAnalyzer:
|
|
163
236
|
self._page.detected_layout_regions = self._page._regions['detected']
|
164
237
|
logger.info(f"Layout analysis complete for page {self._page.number}.")
|
165
238
|
|
239
|
+
# --- Auto-create cells if requested by TATR options ---
|
240
|
+
if isinstance(final_options, TATRLayoutOptions) and final_options.create_cells:
|
241
|
+
logger.info(f" Option create_cells=True detected for TATR. Attempting cell creation...")
|
242
|
+
created_cell_count = 0
|
243
|
+
for region in layout_regions:
|
244
|
+
# Only attempt on regions identified as tables by the TATR model
|
245
|
+
if region.model == 'tatr' and region.region_type == 'table':
|
246
|
+
try:
|
247
|
+
# create_cells now modifies the page elements directly and returns self
|
248
|
+
region.create_cells()
|
249
|
+
# We could potentially count cells created here if needed,
|
250
|
+
# but the method logs its own count.
|
251
|
+
except Exception as cell_error:
|
252
|
+
logger.warning(f" Error calling create_cells for table region {region.bbox}: {cell_error}")
|
253
|
+
logger.info(f" Finished cell creation process triggered by options.")
|
254
|
+
|
166
255
|
return layout_regions
|
@@ -120,9 +120,10 @@ class LayoutManager:
|
|
120
120
|
|
121
121
|
# --- Determine Options and Engine ---
|
122
122
|
if options is not None:
|
123
|
-
# Advanced Mode
|
124
|
-
|
125
|
-
|
123
|
+
# Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
|
124
|
+
# Use this object directly, do not deep copy or reconstruct.
|
125
|
+
logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
|
126
|
+
final_options = options # Use the provided object directly
|
126
127
|
found_engine = False
|
127
128
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
128
129
|
if isinstance(options, registry_entry['options_class']):
|
@@ -131,12 +132,14 @@ class LayoutManager:
|
|
131
132
|
break
|
132
133
|
if not found_engine:
|
133
134
|
raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
|
135
|
+
# Ignore simple kwargs if options object is present
|
134
136
|
if kwargs:
|
135
|
-
logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
|
137
|
+
logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored.")
|
136
138
|
else:
|
137
|
-
# Simple Mode
|
139
|
+
# Simple Mode: No options object provided initially.
|
140
|
+
# Determine engine from kwargs or default, then construct options.
|
138
141
|
selected_engine_name = default_engine.lower()
|
139
|
-
logger.debug(f"LayoutManager: Using simple mode
|
142
|
+
logger.debug(f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}")
|
140
143
|
|
141
144
|
if selected_engine_name not in self.ENGINE_REGISTRY:
|
142
145
|
raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
|
@@ -34,7 +34,7 @@ class TATRLayoutOptions(BaseLayoutOptions):
|
|
34
34
|
max_detection_size: int = 800
|
35
35
|
max_structure_size: int = 1000
|
36
36
|
# Whether to create cell regions (can be slow)
|
37
|
-
create_cells: bool =
|
37
|
+
create_cells: bool = True
|
38
38
|
|
39
39
|
# --- Paddle Specific Options ---
|
40
40
|
@dataclass
|
@@ -51,10 +51,8 @@ class PaddleLayoutOptions(BaseLayoutOptions):
|
|
51
51
|
@dataclass
|
52
52
|
class SuryaLayoutOptions(BaseLayoutOptions):
|
53
53
|
"""Options specific to Surya layout detection."""
|
54
|
-
# Surya doesn't seem to have many config options based on the example,
|
55
|
-
# but we can add placeholders if needed. Device is handled by BaseLayoutOptions.
|
56
54
|
model_name: str = "default" # Placeholder if different models become available
|
57
|
-
|
55
|
+
recognize_table_structure: bool = True # Automatically run table structure recognition?
|
58
56
|
|
59
57
|
# --- Docling Specific Options ---
|
60
58
|
@dataclass
|