natural-pdf 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/core/highlighting_service.py +48 -17
- natural_pdf/core/page.py +92 -27
- natural_pdf/core/pdf.py +11 -0
- natural_pdf/elements/base.py +99 -14
- natural_pdf/elements/collections.py +56 -0
- natural_pdf/elements/region.py +56 -131
- natural_pdf/qa/document_qa.py +4 -3
- natural_pdf/selectors/parser.py +215 -1
- natural_pdf/utils/visualization.py +2 -2
- natural_pdf-0.1.2.dist-info/METADATA +124 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/RECORD +19 -19
- natural_pdf-0.1.0.dist-info/METADATA +0 -295
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.2.dist-info}/top_level.txt +0 -0
natural_pdf/core/highlighting_service.py
CHANGED
@@ -383,7 +383,7 @@ class HighlightingService:
|
|
383
383
|
def add(
|
384
384
|
self,
|
385
385
|
page_index: int,
|
386
|
-
bbox: Tuple[float, float, float, float],
|
386
|
+
bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
|
387
387
|
color: Optional[Union[Tuple, str]] = None,
|
388
388
|
label: Optional[str] = None,
|
389
389
|
use_color_cycling: bool = False,
|
@@ -392,9 +392,32 @@ class HighlightingService:
|
|
392
392
|
existing: str = 'append'
|
393
393
|
):
|
394
394
|
"""Adds a rectangular highlight."""
|
395
|
+
|
396
|
+
processed_bbox: Tuple[float, float, float, float]
|
397
|
+
# Check if bbox is an object with expected attributes (likely a Region)
|
398
|
+
# Assuming Region object has x0, top, x1, bottom attributes based on error context
|
399
|
+
if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
|
400
|
+
hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
|
401
|
+
try:
|
402
|
+
# Ensure attributes are numeric before creating tuple
|
403
|
+
processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
|
404
|
+
except (ValueError, TypeError):
|
405
|
+
logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
|
406
|
+
return
|
407
|
+
elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
|
408
|
+
try:
|
409
|
+
# Ensure elements are numeric and convert to tuple
|
410
|
+
processed_bbox = tuple(float(v) for v in bbox)
|
411
|
+
except (ValueError, TypeError):
|
412
|
+
logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
|
413
|
+
return
|
414
|
+
else:
|
415
|
+
logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
|
416
|
+
return # Don't proceed if bbox is invalid
|
417
|
+
|
395
418
|
self._add_internal(
|
396
419
|
page_index=page_index,
|
397
|
-
bbox=
|
420
|
+
bbox=processed_bbox, # Use the processed tuple
|
398
421
|
polygon=None,
|
399
422
|
color_input=color,
|
400
423
|
label=label,
|
@@ -526,6 +549,7 @@ class HighlightingService:
|
|
526
549
|
) -> Optional[Image.Image]:
|
527
550
|
"""
|
528
551
|
Renders a specific page with its highlights.
|
552
|
+
Legend is now generated based only on highlights present on this page.
|
529
553
|
|
530
554
|
Args:
|
531
555
|
page_index: The 0-based index of the page to render.
|
@@ -545,23 +569,19 @@ class HighlightingService:
|
|
545
569
|
return None
|
546
570
|
|
547
571
|
page = self._pdf[page_index]
|
548
|
-
highlights_on_page = self.get_highlights_for_page(page_index)
|
572
|
+
highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
|
549
573
|
|
550
574
|
# --- Get Base Image ---
|
551
575
|
try:
|
552
576
|
render_resolution = resolution if resolution is not None else scale * 72
|
553
|
-
# Use the underlying pdfplumber page object for base rendering
|
554
577
|
img_object = page._page.to_image(resolution=render_resolution, **kwargs)
|
555
|
-
|
556
|
-
base_image = img_object.annotated # .annotated usually holds the PIL Image
|
578
|
+
base_image = img_object.annotated
|
557
579
|
if not isinstance(base_image, Image.Image):
|
558
|
-
# Fallback for different pdfplumber versions/outputs
|
559
580
|
png_data = img_object._repr_png_()
|
560
581
|
if png_data:
|
561
582
|
base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
|
562
583
|
else:
|
563
584
|
raise ValueError("Could not extract base PIL image from pdfplumber.")
|
564
|
-
# Convert to RGBA for compositing
|
565
585
|
base_image = base_image.convert('RGBA')
|
566
586
|
logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
|
567
587
|
except Exception as e:
|
@@ -569,6 +589,7 @@ class HighlightingService:
|
|
569
589
|
return None
|
570
590
|
|
571
591
|
# --- Render Highlights ---
|
592
|
+
rendered_image: Image.Image
|
572
593
|
if highlights_on_page:
|
573
594
|
renderer = HighlightRenderer(
|
574
595
|
page=page,
|
@@ -579,21 +600,31 @@ class HighlightingService:
|
|
579
600
|
)
|
580
601
|
rendered_image = renderer.render()
|
581
602
|
else:
|
582
|
-
# If no highlights, still need to potentially render OCR if requested
|
583
603
|
if render_ocr:
|
604
|
+
# Still render OCR even if no highlights
|
584
605
|
renderer = HighlightRenderer(page, base_image, [], scale, True)
|
585
|
-
rendered_image = renderer.render()
|
606
|
+
rendered_image = renderer.render()
|
586
607
|
else:
|
587
608
|
rendered_image = base_image # No highlights, no OCR requested
|
588
609
|
|
589
|
-
# --- Add Legend ---
|
610
|
+
# --- Add Legend (Based ONLY on this page's highlights) ---
|
590
611
|
if labels:
|
591
|
-
label_colors
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
612
|
+
# CHANGE: Create label_colors map only from highlights_on_page
|
613
|
+
labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
|
614
|
+
for hl in highlights_on_page:
|
615
|
+
if hl.label and hl.label not in labels_colors_on_page:
|
616
|
+
labels_colors_on_page[hl.label] = hl.color
|
617
|
+
|
618
|
+
if labels_colors_on_page: # Only add legend if there are labels on this page
|
619
|
+
legend = create_legend(labels_colors_on_page)
|
620
|
+
if legend: # Ensure create_legend didn't return None
|
621
|
+
rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
|
622
|
+
logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
|
623
|
+
else:
|
624
|
+
logger.debug(f"Legend creation returned None for page {page_index}.")
|
625
|
+
else:
|
626
|
+
logger.debug(f"No labels found on page {page_index}, skipping legend.")
|
627
|
+
|
597
628
|
return rendered_image
|
598
629
|
|
599
630
|
def render_preview(
|
natural_pdf/core/page.py
CHANGED
@@ -9,6 +9,7 @@ import io
|
|
9
9
|
import json
|
10
10
|
|
11
11
|
from natural_pdf.elements.collections import ElementCollection
|
12
|
+
from natural_pdf.elements.region import Region
|
12
13
|
|
13
14
|
if TYPE_CHECKING:
|
14
15
|
import pdfplumber
|
@@ -17,7 +18,6 @@ if TYPE_CHECKING:
|
|
17
18
|
from natural_pdf.core.highlighting_service import HighlightingService
|
18
19
|
from natural_pdf.elements.base import Element
|
19
20
|
|
20
|
-
from natural_pdf.elements.region import Region
|
21
21
|
from natural_pdf.elements.text import TextElement
|
22
22
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
23
23
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
@@ -120,18 +120,50 @@ class Page:
|
|
120
120
|
raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
|
121
121
|
return self._parent.highlighter
|
122
122
|
|
123
|
-
def
|
123
|
+
def clear_exclusions(self) -> 'Page':
|
124
|
+
"""
|
125
|
+
Clear all exclusions from the page.
|
126
|
+
"""
|
127
|
+
self._exclusions = []
|
128
|
+
return self
|
129
|
+
|
130
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
|
124
131
|
"""
|
125
132
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
133
|
+
Ensures non-callable items are stored as Region objects if possible.
|
126
134
|
|
127
135
|
Args:
|
128
|
-
exclusion_func_or_region: Either a
|
129
|
-
|
136
|
+
exclusion_func_or_region: Either a callable function returning a Region,
|
137
|
+
a Region object, or another object with a valid .bbox attribute.
|
130
138
|
|
131
139
|
Returns:
|
132
140
|
Self for method chaining
|
133
|
-
|
134
|
-
|
141
|
+
|
142
|
+
Raises:
|
143
|
+
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
144
|
+
"""
|
145
|
+
if callable(exclusion_func_or_region):
|
146
|
+
# Store callable functions directly
|
147
|
+
self._exclusions.append(exclusion_func_or_region)
|
148
|
+
logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
|
149
|
+
elif isinstance(exclusion_func_or_region, Region):
|
150
|
+
# Store Region objects directly
|
151
|
+
self._exclusions.append(exclusion_func_or_region)
|
152
|
+
logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
|
153
|
+
elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
|
154
|
+
# Convert objects with a valid bbox to a Region before storing
|
155
|
+
try:
|
156
|
+
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
157
|
+
region_to_add = Region(self, bbox_coords)
|
158
|
+
self._exclusions.append(region_to_add)
|
159
|
+
logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
|
160
|
+
except (ValueError, TypeError, Exception) as e:
|
161
|
+
# Raise an error if conversion fails
|
162
|
+
raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
|
163
|
+
else:
|
164
|
+
# Reject invalid types
|
165
|
+
raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
|
166
|
+
|
135
167
|
return self
|
136
168
|
|
137
169
|
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
|
@@ -190,6 +222,7 @@ class Page:
|
|
190
222
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
191
223
|
"""
|
192
224
|
Get all exclusion regions for this page.
|
225
|
+
Assumes self._exclusions contains only callables or Region objects.
|
193
226
|
|
194
227
|
Args:
|
195
228
|
include_callable: Whether to evaluate callable exclusion functions
|
@@ -207,15 +240,14 @@ class Page:
|
|
207
240
|
for i, exclusion in enumerate(self._exclusions):
|
208
241
|
# Get exclusion label if it's a tuple from PDF level
|
209
242
|
exclusion_label = f"exclusion {i}"
|
210
|
-
original_exclusion = exclusion
|
211
|
-
|
212
|
-
# Check if it's a tuple from PDF.add_exclusion
|
243
|
+
original_exclusion = exclusion # Keep track for debugging
|
244
|
+
|
245
|
+
# Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
|
213
246
|
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
214
|
-
# This is likely from PDF.add_exclusion with (func, label)
|
215
247
|
exclusion_func, label = exclusion
|
216
248
|
if label:
|
217
249
|
exclusion_label = label
|
218
|
-
exclusion = exclusion_func
|
250
|
+
exclusion = exclusion_func # Use the function part
|
219
251
|
|
220
252
|
# Process callable exclusion functions
|
221
253
|
if callable(exclusion) and include_callable:
|
@@ -224,40 +256,45 @@ class Page:
|
|
224
256
|
if debug:
|
225
257
|
print(f" - Evaluating callable {exclusion_label}...")
|
226
258
|
|
227
|
-
#
|
228
|
-
|
229
|
-
self._exclusions
|
259
|
+
# Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
|
260
|
+
# This might be overly cautious depending on use case, but safer.
|
261
|
+
temp_original_exclusions = self._exclusions
|
262
|
+
self._exclusions = []
|
230
263
|
|
231
|
-
# Call the function
|
232
|
-
|
264
|
+
# Call the function - Expects it to return a Region or None
|
265
|
+
region_result = exclusion(self)
|
233
266
|
|
234
267
|
# Restore exclusions
|
235
|
-
self._exclusions =
|
268
|
+
self._exclusions = temp_original_exclusions
|
236
269
|
|
237
|
-
if
|
238
|
-
regions.append(
|
270
|
+
if isinstance(region_result, Region):
|
271
|
+
regions.append(region_result)
|
239
272
|
if debug:
|
240
|
-
print(f" ✓ Added region: {
|
273
|
+
print(f" ✓ Added region from callable: {region_result}")
|
274
|
+
elif region_result:
|
275
|
+
# Log warning if callable returned something other than Region/None
|
276
|
+
logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
|
277
|
+
if debug:
|
278
|
+
print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
|
241
279
|
else:
|
242
280
|
if debug:
|
243
|
-
print(f" ✗
|
281
|
+
print(f" ✗ Callable returned None, no region added")
|
244
282
|
|
245
283
|
except Exception as e:
|
246
|
-
error_msg = f"Error
|
284
|
+
error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
|
247
285
|
print(error_msg)
|
248
|
-
# Print more detailed traceback for debugging
|
249
286
|
import traceback
|
250
287
|
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
251
288
|
|
252
|
-
# Process direct Region objects
|
253
|
-
elif
|
254
|
-
# It's already a Region object
|
289
|
+
# Process direct Region objects (already validated by add_exclusion)
|
290
|
+
elif isinstance(exclusion, Region):
|
255
291
|
regions.append(exclusion)
|
256
292
|
if debug:
|
257
293
|
print(f" - Added direct region: {exclusion}")
|
294
|
+
# No else needed, add_exclusion should prevent invalid types
|
258
295
|
|
259
296
|
if debug:
|
260
|
-
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
|
297
|
+
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
|
261
298
|
|
262
299
|
return regions
|
263
300
|
|
@@ -1178,6 +1215,34 @@ class Page:
|
|
1178
1215
|
|
1179
1216
|
return ElementCollection(detected_regions)
|
1180
1217
|
|
1218
|
+
def clear_detected_layout_regions(self) -> 'Page':
|
1219
|
+
"""
|
1220
|
+
Removes all regions from this page that were added by layout analysis
|
1221
|
+
(i.e., regions where `source` attribute is 'detected').
|
1222
|
+
|
1223
|
+
This clears the regions both from the page's internal `_regions['detected']` list
|
1224
|
+
and from the ElementManager's internal list of regions.
|
1225
|
+
|
1226
|
+
Returns:
|
1227
|
+
Self for method chaining.
|
1228
|
+
"""
|
1229
|
+
if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
|
1230
|
+
logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
|
1231
|
+
self._regions['detected'] = [] # Ensure page's list is also clear
|
1232
|
+
return self
|
1233
|
+
|
1234
|
+
# Filter ElementManager's list to keep only non-detected regions
|
1235
|
+
original_count = len(self._element_mgr.regions)
|
1236
|
+
self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
|
1237
|
+
new_count = len(self._element_mgr.regions)
|
1238
|
+
removed_count = original_count - new_count
|
1239
|
+
|
1240
|
+
# Clear the page's specific list of detected regions
|
1241
|
+
self._regions['detected'] = []
|
1242
|
+
|
1243
|
+
logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
|
1244
|
+
return self
|
1245
|
+
|
1181
1246
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
|
1182
1247
|
"""
|
1183
1248
|
Get a section between two elements on this page.
|
natural_pdf/core/pdf.py
CHANGED
@@ -125,6 +125,17 @@ class PDF:
|
|
125
125
|
from natural_pdf.elements.collections import PageCollection
|
126
126
|
return PageCollection(self._pages)
|
127
127
|
|
128
|
+
def clear_exclusions(self) -> 'PDF':
|
129
|
+
"""
|
130
|
+
Clear all exclusion functions from the PDF.
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
Self for method chaining
|
134
|
+
"""
|
135
|
+
|
136
|
+
self._exclusions = []
|
137
|
+
return self
|
138
|
+
|
128
139
|
def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
|
129
140
|
"""
|
130
141
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
natural_pdf/elements/base.py
CHANGED
@@ -7,7 +7,8 @@ from PIL import Image
|
|
7
7
|
if TYPE_CHECKING:
|
8
8
|
from natural_pdf.core.page import Page
|
9
9
|
from natural_pdf.elements.region import Region
|
10
|
-
from natural_pdf.elements.base import Element
|
10
|
+
from natural_pdf.elements.base import Element
|
11
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
12
|
|
12
13
|
|
13
14
|
class DirectionalMixin:
|
@@ -17,7 +18,7 @@ class DirectionalMixin:
|
|
17
18
|
|
18
19
|
def _direction(self, direction: str, size: Optional[float] = None,
|
19
20
|
cross_size: str = "full", include_element: bool = False,
|
20
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
21
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
21
22
|
"""
|
22
23
|
Protected helper method to create a region in a specified direction relative to this element/region.
|
23
24
|
|
@@ -154,7 +155,7 @@ class DirectionalMixin:
|
|
154
155
|
return result
|
155
156
|
|
156
157
|
def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
|
157
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
158
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
158
159
|
"""
|
159
160
|
Select region above this element/region.
|
160
161
|
|
@@ -180,7 +181,7 @@ class DirectionalMixin:
|
|
180
181
|
)
|
181
182
|
|
182
183
|
def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
|
183
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
184
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
184
185
|
"""
|
185
186
|
Select region below this element/region.
|
186
187
|
|
@@ -206,7 +207,7 @@ class DirectionalMixin:
|
|
206
207
|
)
|
207
208
|
|
208
209
|
def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
|
209
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
210
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
210
211
|
"""
|
211
212
|
Select region to the left of this element/region.
|
212
213
|
|
@@ -232,7 +233,7 @@ class DirectionalMixin:
|
|
232
233
|
)
|
233
234
|
|
234
235
|
def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
|
235
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
236
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
236
237
|
"""
|
237
238
|
Select region to the right of this element/region.
|
238
239
|
|
@@ -257,6 +258,86 @@ class DirectionalMixin:
|
|
257
258
|
**kwargs
|
258
259
|
)
|
259
260
|
|
261
|
+
def expand(self,
|
262
|
+
left: float = 0,
|
263
|
+
right: float = 0,
|
264
|
+
top_expand: float = 0, # Renamed to avoid conflict
|
265
|
+
bottom_expand: float = 0, # Renamed to avoid conflict
|
266
|
+
width_factor: float = 1.0,
|
267
|
+
height_factor: float = 1.0,
|
268
|
+
# Keep original parameter names for backward compatibility
|
269
|
+
top: float = None,
|
270
|
+
bottom: float = None) -> 'Region':
|
271
|
+
"""
|
272
|
+
Create a new region expanded from this element/region.
|
273
|
+
|
274
|
+
Args:
|
275
|
+
left: Amount to expand left edge (positive value expands leftwards)
|
276
|
+
right: Amount to expand right edge (positive value expands rightwards)
|
277
|
+
top_expand: Amount to expand top edge (positive value expands upwards)
|
278
|
+
bottom_expand: Amount to expand bottom edge (positive value expands downwards)
|
279
|
+
width_factor: Factor to multiply width by (applied after absolute expansion)
|
280
|
+
height_factor: Factor to multiply height by (applied after absolute expansion)
|
281
|
+
top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
|
282
|
+
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
283
|
+
|
284
|
+
Returns:
|
285
|
+
New expanded Region object
|
286
|
+
"""
|
287
|
+
# Start with current coordinates
|
288
|
+
new_x0 = self.x0
|
289
|
+
new_x1 = self.x1
|
290
|
+
new_top = self.top
|
291
|
+
new_bottom = self.bottom
|
292
|
+
|
293
|
+
# Handle the deprecated parameter names for backward compatibility
|
294
|
+
if top is not None:
|
295
|
+
top_expand = top
|
296
|
+
if bottom is not None:
|
297
|
+
bottom_expand = bottom
|
298
|
+
|
299
|
+
# Apply absolute expansions first
|
300
|
+
new_x0 -= left
|
301
|
+
new_x1 += right
|
302
|
+
new_top -= top_expand # Expand upward (decrease top coordinate)
|
303
|
+
new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
|
304
|
+
|
305
|
+
# Apply percentage factors if provided
|
306
|
+
if width_factor != 1.0 or height_factor != 1.0:
|
307
|
+
# Calculate center point *after* absolute expansion
|
308
|
+
center_x = (new_x0 + new_x1) / 2
|
309
|
+
center_y = (new_top + new_bottom) / 2
|
310
|
+
|
311
|
+
# Calculate current width and height *after* absolute expansion
|
312
|
+
current_width = new_x1 - new_x0
|
313
|
+
current_height = new_bottom - new_top
|
314
|
+
|
315
|
+
# Calculate new width and height
|
316
|
+
new_width = current_width * width_factor
|
317
|
+
new_height = current_height * height_factor
|
318
|
+
|
319
|
+
# Adjust coordinates based on the new dimensions, keeping the center
|
320
|
+
new_x0 = center_x - new_width / 2
|
321
|
+
new_x1 = center_x + new_width / 2
|
322
|
+
new_top = center_y - new_height / 2
|
323
|
+
new_bottom = center_y + new_height / 2
|
324
|
+
|
325
|
+
# Clamp coordinates to page boundaries
|
326
|
+
new_x0 = max(0, new_x0)
|
327
|
+
new_top = max(0, new_top)
|
328
|
+
new_x1 = min(self.page.width, new_x1)
|
329
|
+
new_bottom = min(self.page.height, new_bottom)
|
330
|
+
|
331
|
+
# Ensure coordinates are valid (x0 <= x1, top <= bottom)
|
332
|
+
if new_x0 > new_x1: new_x0 = new_x1 = (new_x0 + new_x1) / 2
|
333
|
+
if new_top > new_bottom: new_top = new_bottom = (new_top + new_bottom) / 2
|
334
|
+
|
335
|
+
# Create new region with expanded bbox
|
336
|
+
from natural_pdf.elements.region import Region
|
337
|
+
new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
|
338
|
+
|
339
|
+
return new_region
|
340
|
+
|
260
341
|
|
261
342
|
class Element(DirectionalMixin):
|
262
343
|
"""
|
@@ -415,7 +496,8 @@ class Element(DirectionalMixin):
|
|
415
496
|
candidates = candidates[:limit] if limit else candidates
|
416
497
|
|
417
498
|
# Find matching elements
|
418
|
-
|
499
|
+
from natural_pdf.elements.collections import ElementCollection
|
500
|
+
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
419
501
|
return matches[0] if matches else None
|
420
502
|
elif idx + 1 < len(all_elements):
|
421
503
|
# No selector, just return the next element
|
@@ -449,16 +531,17 @@ class Element(DirectionalMixin):
|
|
449
531
|
|
450
532
|
# Search for previous matching element
|
451
533
|
if selector:
|
452
|
-
#
|
534
|
+
# Select elements before this one
|
453
535
|
candidates = all_elements[:idx]
|
454
|
-
# Reverse to
|
536
|
+
# Reverse to search backwards from the current element
|
455
537
|
candidates = candidates[::-1]
|
456
538
|
# Limit search range for performance
|
457
539
|
candidates = candidates[:limit] if limit else candidates
|
458
540
|
|
459
|
-
# Find matching elements
|
460
|
-
|
461
|
-
|
541
|
+
# Find matching elements using ElementCollection
|
542
|
+
from natural_pdf.elements.collections import ElementCollection
|
543
|
+
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
544
|
+
return matches[0] if matches else None # find_all returns a collection
|
462
545
|
elif idx > 0:
|
463
546
|
# No selector, just return the previous element
|
464
547
|
return all_elements[idx - 1]
|
@@ -737,8 +820,9 @@ class Element(DirectionalMixin):
|
|
737
820
|
Returns:
|
738
821
|
First matching element or None
|
739
822
|
"""
|
740
|
-
# Create a temporary region from this element's bounds
|
741
823
|
from natural_pdf.elements.region import Region
|
824
|
+
|
825
|
+
# Create a temporary region from this element's bounds
|
742
826
|
temp_region = Region(self.page, self.bbox)
|
743
827
|
return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
|
744
828
|
|
@@ -755,7 +839,8 @@ class Element(DirectionalMixin):
|
|
755
839
|
Returns:
|
756
840
|
ElementCollection with matching elements
|
757
841
|
"""
|
758
|
-
# Create a temporary region from this element's bounds
|
759
842
|
from natural_pdf.elements.region import Region
|
843
|
+
|
844
|
+
# Create a temporary region from this element's bounds
|
760
845
|
temp_region = Region(self.page, self.bbox)
|
761
846
|
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
natural_pdf/elements/collections.py
CHANGED
@@ -2,6 +2,7 @@ import logging
|
|
2
2
|
|
3
3
|
from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
|
4
4
|
from natural_pdf.ocr import OCROptions
|
5
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
5
6
|
|
6
7
|
logger = logging.getLogger(__name__)
|
7
8
|
|
@@ -882,6 +883,61 @@ class ElementCollection(Generic[T]):
|
|
882
883
|
logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
|
883
884
|
return None
|
884
885
|
|
886
|
+
def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
|
887
|
+
"""
|
888
|
+
Filter elements within this collection matching the selector.
|
889
|
+
|
890
|
+
Args:
|
891
|
+
selector: CSS-like selector string.
|
892
|
+
regex: Whether to use regex for text search in :contains (default: False).
|
893
|
+
case: Whether to do case-sensitive text search (default: True).
|
894
|
+
**kwargs: Additional filter parameters passed to the selector function.
|
895
|
+
|
896
|
+
Returns:
|
897
|
+
A new ElementCollection containing only the matching elements from this collection.
|
898
|
+
"""
|
899
|
+
if not self._elements:
|
900
|
+
return ElementCollection([])
|
901
|
+
|
902
|
+
try:
|
903
|
+
selector_obj = parse_selector(selector)
|
904
|
+
except Exception as e:
|
905
|
+
logger.error(f"Error parsing selector '{selector}': {e}")
|
906
|
+
return ElementCollection([]) # Return empty on parse error
|
907
|
+
|
908
|
+
# Pass regex and case flags to selector function generator
|
909
|
+
kwargs['regex'] = regex
|
910
|
+
kwargs['case'] = case
|
911
|
+
|
912
|
+
try:
|
913
|
+
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
914
|
+
except Exception as e:
|
915
|
+
logger.error(f"Error creating filter function for selector '{selector}': {e}")
|
916
|
+
return ElementCollection([]) # Return empty on filter creation error
|
917
|
+
|
918
|
+
matching_elements = [element for element in self._elements if filter_func(element)]
|
919
|
+
|
920
|
+
# Note: Unlike Page.find_all, this doesn't re-sort.
|
921
|
+
# Sorting should be done explicitly on the collection if needed.
|
922
|
+
|
923
|
+
return ElementCollection(matching_elements)
|
924
|
+
|
925
|
+
def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
|
926
|
+
"""
|
927
|
+
Find the first element within this collection matching the selector.
|
928
|
+
|
929
|
+
Args:
|
930
|
+
selector: CSS-like selector string.
|
931
|
+
regex: Whether to use regex for text search in :contains (default: False).
|
932
|
+
case: Whether to do case-sensitive text search (default: True).
|
933
|
+
**kwargs: Additional filter parameters passed to the selector function.
|
934
|
+
|
935
|
+
Returns:
|
936
|
+
The first matching element or None.
|
937
|
+
"""
|
938
|
+
results = self.find_all(selector, regex=regex, case=case, **kwargs)
|
939
|
+
return results.first
|
940
|
+
|
885
941
|
class PageCollection(Generic[P]):
|
886
942
|
"""
|
887
943
|
A collection of PDF pages with cross-page operations.
|