natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -7,6 +7,8 @@ from PIL import Image
|
|
7
7
|
import base64
|
8
8
|
import io
|
9
9
|
import json
|
10
|
+
import re
|
11
|
+
import hashlib
|
10
12
|
|
11
13
|
from natural_pdf.elements.collections import ElementCollection
|
12
14
|
from natural_pdf.elements.region import Region
|
@@ -96,6 +98,11 @@ class Page:
|
|
96
98
|
"""Get page number (1-based)."""
|
97
99
|
return self._page.page_number
|
98
100
|
|
101
|
+
@property
|
102
|
+
def page_number(self) -> int:
|
103
|
+
"""Get page number (1-based)."""
|
104
|
+
return self._page.page_number
|
105
|
+
|
99
106
|
@property
|
100
107
|
def index(self) -> int:
|
101
108
|
"""Get page index (0-based)."""
|
@@ -127,7 +134,7 @@ class Page:
|
|
127
134
|
self._exclusions = []
|
128
135
|
return self
|
129
136
|
|
130
|
-
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
|
137
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any], label: Optional[str] = None) -> 'Page':
|
131
138
|
"""
|
132
139
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
133
140
|
Ensures non-callable items are stored as Region objects if possible.
|
@@ -135,6 +142,7 @@ class Page:
|
|
135
142
|
Args:
|
136
143
|
exclusion_func_or_region: Either a callable function returning a Region,
|
137
144
|
a Region object, or another object with a valid .bbox attribute.
|
145
|
+
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
138
146
|
|
139
147
|
Returns:
|
140
148
|
Self for method chaining
|
@@ -142,28 +150,36 @@ class Page:
|
|
142
150
|
Raises:
|
143
151
|
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
144
152
|
"""
|
153
|
+
exclusion_data = None # Initialize exclusion data
|
154
|
+
|
145
155
|
if callable(exclusion_func_or_region):
|
146
|
-
# Store callable functions
|
147
|
-
|
148
|
-
logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
|
156
|
+
# Store callable functions along with their label
|
157
|
+
exclusion_data = (exclusion_func_or_region, label)
|
158
|
+
logger.debug(f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}")
|
149
159
|
elif isinstance(exclusion_func_or_region, Region):
|
150
|
-
# Store Region objects directly
|
151
|
-
|
152
|
-
|
160
|
+
# Store Region objects directly, assigning the label
|
161
|
+
exclusion_func_or_region.label = label # Assign label
|
162
|
+
exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
|
163
|
+
logger.debug(f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}")
|
153
164
|
elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
|
154
165
|
# Convert objects with a valid bbox to a Region before storing
|
155
166
|
try:
|
156
167
|
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
157
|
-
|
158
|
-
self
|
159
|
-
|
168
|
+
# Pass the label to the Region constructor
|
169
|
+
region_to_add = Region(self, bbox_coords, label=label)
|
170
|
+
exclusion_data = (region_to_add, label) # Store as tuple
|
171
|
+
logger.debug(f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
|
160
172
|
except (ValueError, TypeError, Exception) as e:
|
161
173
|
# Raise an error if conversion fails
|
162
174
|
raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
|
163
175
|
else:
|
164
176
|
# Reject invalid types
|
165
177
|
raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
|
166
|
-
|
178
|
+
|
179
|
+
# Append the stored data (tuple of object/callable and label)
|
180
|
+
if exclusion_data:
|
181
|
+
self._exclusions.append(exclusion_data)
|
182
|
+
|
167
183
|
return self
|
168
184
|
|
169
185
|
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
|
@@ -222,75 +238,66 @@ class Page:
|
|
222
238
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
223
239
|
"""
|
224
240
|
Get all exclusion regions for this page.
|
225
|
-
Assumes self._exclusions contains
|
241
|
+
Assumes self._exclusions contains tuples of (callable/Region, label).
|
226
242
|
|
227
243
|
Args:
|
228
244
|
include_callable: Whether to evaluate callable exclusion functions
|
229
245
|
debug: Enable verbose debug logging for exclusion evaluation
|
230
246
|
|
231
247
|
Returns:
|
232
|
-
List of Region objects to exclude
|
248
|
+
List of Region objects to exclude, with labels assigned.
|
233
249
|
"""
|
234
250
|
regions = []
|
235
251
|
|
236
|
-
# Track exclusion results for debugging
|
237
252
|
if debug:
|
238
253
|
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
239
|
-
|
240
|
-
for i,
|
241
|
-
#
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
# Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
|
246
|
-
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
247
|
-
exclusion_func, label = exclusion
|
248
|
-
if label:
|
249
|
-
exclusion_label = label
|
250
|
-
exclusion = exclusion_func # Use the function part
|
251
|
-
|
254
|
+
|
255
|
+
for i, exclusion_data in enumerate(self._exclusions):
|
256
|
+
# Unpack the exclusion object/callable and its label
|
257
|
+
exclusion_item, label = exclusion_data
|
258
|
+
exclusion_label = label if label else f"exclusion {i}"
|
259
|
+
|
252
260
|
# Process callable exclusion functions
|
253
|
-
if callable(
|
254
|
-
# It's a function, call it with this page
|
261
|
+
if callable(exclusion_item) and include_callable:
|
255
262
|
try:
|
256
263
|
if debug:
|
257
|
-
print(f" - Evaluating callable {exclusion_label}...")
|
258
|
-
|
259
|
-
# Temporarily clear exclusions
|
260
|
-
# This might be overly cautious depending on use case, but safer.
|
264
|
+
print(f" - Evaluating callable '{exclusion_label}'...")
|
265
|
+
|
266
|
+
# Temporarily clear exclusions (consider if really needed)
|
261
267
|
temp_original_exclusions = self._exclusions
|
262
|
-
self._exclusions = []
|
263
|
-
|
268
|
+
self._exclusions = []
|
269
|
+
|
264
270
|
# Call the function - Expects it to return a Region or None
|
265
|
-
region_result =
|
266
|
-
|
271
|
+
region_result = exclusion_item(self)
|
272
|
+
|
267
273
|
# Restore exclusions
|
268
274
|
self._exclusions = temp_original_exclusions
|
269
|
-
|
275
|
+
|
270
276
|
if isinstance(region_result, Region):
|
277
|
+
# Assign the label to the returned region
|
278
|
+
region_result.label = label
|
271
279
|
regions.append(region_result)
|
272
280
|
if debug:
|
273
|
-
print(f" ✓ Added region from callable: {region_result}")
|
281
|
+
print(f" ✓ Added region from callable '{label}': {region_result}")
|
274
282
|
elif region_result:
|
275
|
-
|
276
|
-
logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
|
283
|
+
logger.warning(f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping.")
|
277
284
|
if debug:
|
278
285
|
print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
|
279
286
|
else:
|
280
287
|
if debug:
|
281
|
-
print(f" ✗ Callable returned None, no region added")
|
282
|
-
|
288
|
+
print(f" ✗ Callable '{exclusion_label}' returned None, no region added")
|
289
|
+
|
283
290
|
except Exception as e:
|
284
|
-
error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
|
291
|
+
error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
|
285
292
|
print(error_msg)
|
286
293
|
import traceback
|
287
294
|
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
288
|
-
|
289
|
-
# Process direct Region objects (
|
290
|
-
elif isinstance(
|
291
|
-
regions.append(
|
295
|
+
|
296
|
+
# Process direct Region objects (label was assigned in add_exclusion)
|
297
|
+
elif isinstance(exclusion_item, Region):
|
298
|
+
regions.append(exclusion_item) # Label is already on the Region object
|
292
299
|
if debug:
|
293
|
-
print(f" - Added direct region: {
|
300
|
+
print(f" - Added direct region '{label}': {exclusion_item}")
|
294
301
|
# No else needed, add_exclusion should prevent invalid types
|
295
302
|
|
296
303
|
if debug:
|
@@ -1485,25 +1492,46 @@ class Page:
|
|
1485
1492
|
RuntimeError: If required dependencies (ipywidgets) are missing.
|
1486
1493
|
ValueError: If image rendering or data preparation fails within from_page.
|
1487
1494
|
"""
|
1488
|
-
#
|
1489
|
-
|
1495
|
+
# Dynamically import here if needed, or ensure it's globally available
|
1496
|
+
try:
|
1497
|
+
from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
|
1498
|
+
except ImportError:
|
1499
|
+
logger.error("Interactive viewer requires optional dependencies. Install with `pip install natural-pdf[widgets]`")
|
1500
|
+
raise
|
1501
|
+
|
1502
|
+
# Pass self (the Page object) to the factory method
|
1503
|
+
return SimpleInteractiveViewerWidget.from_page(self)
|
1504
|
+
|
1505
|
+
# --- Indexable Protocol Methods ---
|
1506
|
+
def get_id(self) -> str:
|
1507
|
+
"""Returns a unique identifier for the page (required by Indexable protocol)."""
|
1508
|
+
# Ensure path is safe for use in IDs (replace problematic chars)
|
1509
|
+
safe_path = re.sub(r'[^a-zA-Z0-9_-]', '_', str(self.pdf.path))
|
1510
|
+
return f"pdf_{safe_path}_page_{self.page_number}"
|
1511
|
+
|
1512
|
+
def get_metadata(self) -> Dict[str, Any]:
|
1513
|
+
"""Returns metadata associated with the page (required by Indexable protocol)."""
|
1514
|
+
# Add content hash here for sync
|
1515
|
+
metadata = {
|
1516
|
+
"pdf_path": str(self.pdf.path),
|
1517
|
+
"page_number": self.page_number,
|
1518
|
+
"width": self.width,
|
1519
|
+
"height": self.height,
|
1520
|
+
"content_hash": self.get_content_hash() # Include the hash
|
1521
|
+
}
|
1522
|
+
return metadata
|
1490
1523
|
|
1491
|
-
|
1524
|
+
def get_content(self) -> 'Page':
|
1525
|
+
"""
|
1526
|
+
Returns the primary content object (self) for indexing (required by Indexable protocol).
|
1527
|
+
SearchService implementations decide how to process this (e.g., call extract_text).
|
1528
|
+
"""
|
1529
|
+
return self # Return the Page object itself
|
1492
1530
|
|
1493
|
-
|
1494
|
-
|
1495
|
-
|
1496
|
-
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
-
logger.info("Interactive viewer widget created successfully.")
|
1502
|
-
return viewer_widget
|
1503
|
-
except ImportError as e:
|
1504
|
-
logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
|
1505
|
-
raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
|
1506
|
-
except Exception as e:
|
1507
|
-
logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
|
1508
|
-
# Re-raise the exception to make it visible to the user
|
1509
|
-
raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
|
1531
|
+
def get_content_hash(self) -> str:
|
1532
|
+
"""Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
|
1533
|
+
# Hash the extracted text (without exclusions for consistency)
|
1534
|
+
# Consider if exclusions should be part of the hash? For now, hash raw text.
|
1535
|
+
# Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
|
1536
|
+
text_content = self.extract_text(use_exclusions=False, preserve_whitespace=False) # Normalize whitespace?
|
1537
|
+
return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
|