natural-pdf 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -1
- natural_pdf/core/highlighting_service.py +48 -17
- natural_pdf/core/page.py +92 -27
- natural_pdf/core/pdf.py +11 -0
- natural_pdf/elements/base.py +99 -14
- natural_pdf/elements/collections.py +56 -0
- natural_pdf/elements/region.py +4 -106
- natural_pdf/qa/document_qa.py +4 -3
- natural_pdf/selectors/parser.py +215 -1
- natural_pdf/utils/visualization.py +2 -2
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/METADATA +10 -10
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/RECORD +15 -15
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.0.dist-info → natural_pdf-0.1.1.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
natural_pdf/core/highlighting_service.py
CHANGED
@@ -383,7 +383,7 @@ class HighlightingService:
|
|
383
383
|
def add(
|
384
384
|
self,
|
385
385
|
page_index: int,
|
386
|
-
bbox: Tuple[float, float, float, float],
|
386
|
+
bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
|
387
387
|
color: Optional[Union[Tuple, str]] = None,
|
388
388
|
label: Optional[str] = None,
|
389
389
|
use_color_cycling: bool = False,
|
@@ -392,9 +392,32 @@ class HighlightingService:
|
|
392
392
|
existing: str = 'append'
|
393
393
|
):
|
394
394
|
"""Adds a rectangular highlight."""
|
395
|
+
|
396
|
+
processed_bbox: Tuple[float, float, float, float]
|
397
|
+
# Check if bbox is an object with expected attributes (likely a Region)
|
398
|
+
# Assuming Region object has x0, top, x1, bottom attributes based on error context
|
399
|
+
if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
|
400
|
+
hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
|
401
|
+
try:
|
402
|
+
# Ensure attributes are numeric before creating tuple
|
403
|
+
processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
|
404
|
+
except (ValueError, TypeError):
|
405
|
+
logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
|
406
|
+
return
|
407
|
+
elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
|
408
|
+
try:
|
409
|
+
# Ensure elements are numeric and convert to tuple
|
410
|
+
processed_bbox = tuple(float(v) for v in bbox)
|
411
|
+
except (ValueError, TypeError):
|
412
|
+
logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
|
413
|
+
return
|
414
|
+
else:
|
415
|
+
logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
|
416
|
+
return # Don't proceed if bbox is invalid
|
417
|
+
|
395
418
|
self._add_internal(
|
396
419
|
page_index=page_index,
|
397
|
-
bbox=
|
420
|
+
bbox=processed_bbox, # Use the processed tuple
|
398
421
|
polygon=None,
|
399
422
|
color_input=color,
|
400
423
|
label=label,
|
@@ -526,6 +549,7 @@ class HighlightingService:
|
|
526
549
|
) -> Optional[Image.Image]:
|
527
550
|
"""
|
528
551
|
Renders a specific page with its highlights.
|
552
|
+
Legend is now generated based only on highlights present on this page.
|
529
553
|
|
530
554
|
Args:
|
531
555
|
page_index: The 0-based index of the page to render.
|
@@ -545,23 +569,19 @@ class HighlightingService:
|
|
545
569
|
return None
|
546
570
|
|
547
571
|
page = self._pdf[page_index]
|
548
|
-
highlights_on_page = self.get_highlights_for_page(page_index)
|
572
|
+
highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
|
549
573
|
|
550
574
|
# --- Get Base Image ---
|
551
575
|
try:
|
552
576
|
render_resolution = resolution if resolution is not None else scale * 72
|
553
|
-
# Use the underlying pdfplumber page object for base rendering
|
554
577
|
img_object = page._page.to_image(resolution=render_resolution, **kwargs)
|
555
|
-
|
556
|
-
base_image = img_object.annotated # .annotated usually holds the PIL Image
|
578
|
+
base_image = img_object.annotated
|
557
579
|
if not isinstance(base_image, Image.Image):
|
558
|
-
# Fallback for different pdfplumber versions/outputs
|
559
580
|
png_data = img_object._repr_png_()
|
560
581
|
if png_data:
|
561
582
|
base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
|
562
583
|
else:
|
563
584
|
raise ValueError("Could not extract base PIL image from pdfplumber.")
|
564
|
-
# Convert to RGBA for compositing
|
565
585
|
base_image = base_image.convert('RGBA')
|
566
586
|
logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
|
567
587
|
except Exception as e:
|
@@ -569,6 +589,7 @@ class HighlightingService:
|
|
569
589
|
return None
|
570
590
|
|
571
591
|
# --- Render Highlights ---
|
592
|
+
rendered_image: Image.Image
|
572
593
|
if highlights_on_page:
|
573
594
|
renderer = HighlightRenderer(
|
574
595
|
page=page,
|
@@ -579,21 +600,31 @@ class HighlightingService:
|
|
579
600
|
)
|
580
601
|
rendered_image = renderer.render()
|
581
602
|
else:
|
582
|
-
# If no highlights, still need to potentially render OCR if requested
|
583
603
|
if render_ocr:
|
604
|
+
# Still render OCR even if no highlights
|
584
605
|
renderer = HighlightRenderer(page, base_image, [], scale, True)
|
585
|
-
rendered_image = renderer.render()
|
606
|
+
rendered_image = renderer.render()
|
586
607
|
else:
|
587
608
|
rendered_image = base_image # No highlights, no OCR requested
|
588
609
|
|
589
|
-
# --- Add Legend ---
|
610
|
+
# --- Add Legend (Based ONLY on this page's highlights) ---
|
590
611
|
if labels:
|
591
|
-
label_colors
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
612
|
+
# CHANGE: Create label_colors map only from highlights_on_page
|
613
|
+
labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
|
614
|
+
for hl in highlights_on_page:
|
615
|
+
if hl.label and hl.label not in labels_colors_on_page:
|
616
|
+
labels_colors_on_page[hl.label] = hl.color
|
617
|
+
|
618
|
+
if labels_colors_on_page: # Only add legend if there are labels on this page
|
619
|
+
legend = create_legend(labels_colors_on_page)
|
620
|
+
if legend: # Ensure create_legend didn't return None
|
621
|
+
rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
|
622
|
+
logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
|
623
|
+
else:
|
624
|
+
logger.debug(f"Legend creation returned None for page {page_index}.")
|
625
|
+
else:
|
626
|
+
logger.debug(f"No labels found on page {page_index}, skipping legend.")
|
627
|
+
|
597
628
|
return rendered_image
|
598
629
|
|
599
630
|
def render_preview(
|
natural_pdf/core/page.py
CHANGED
@@ -9,6 +9,7 @@ import io
|
|
9
9
|
import json
|
10
10
|
|
11
11
|
from natural_pdf.elements.collections import ElementCollection
|
12
|
+
from natural_pdf.elements.region import Region
|
12
13
|
|
13
14
|
if TYPE_CHECKING:
|
14
15
|
import pdfplumber
|
@@ -17,7 +18,6 @@ if TYPE_CHECKING:
|
|
17
18
|
from natural_pdf.core.highlighting_service import HighlightingService
|
18
19
|
from natural_pdf.elements.base import Element
|
19
20
|
|
20
|
-
from natural_pdf.elements.region import Region
|
21
21
|
from natural_pdf.elements.text import TextElement
|
22
22
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
23
23
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
@@ -120,18 +120,50 @@ class Page:
|
|
120
120
|
raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
|
121
121
|
return self._parent.highlighter
|
122
122
|
|
123
|
-
def
|
123
|
+
def clear_exclusions(self) -> 'Page':
|
124
|
+
"""
|
125
|
+
Clear all exclusions from the page.
|
126
|
+
"""
|
127
|
+
self._exclusions = []
|
128
|
+
return self
|
129
|
+
|
130
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region, Any]) -> 'Page':
|
124
131
|
"""
|
125
132
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
133
|
+
Ensures non-callable items are stored as Region objects if possible.
|
126
134
|
|
127
135
|
Args:
|
128
|
-
exclusion_func_or_region: Either a
|
129
|
-
|
136
|
+
exclusion_func_or_region: Either a callable function returning a Region,
|
137
|
+
a Region object, or another object with a valid .bbox attribute.
|
130
138
|
|
131
139
|
Returns:
|
132
140
|
Self for method chaining
|
133
|
-
|
134
|
-
|
141
|
+
|
142
|
+
Raises:
|
143
|
+
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
144
|
+
"""
|
145
|
+
if callable(exclusion_func_or_region):
|
146
|
+
# Store callable functions directly
|
147
|
+
self._exclusions.append(exclusion_func_or_region)
|
148
|
+
logger.debug(f"Page {self.index}: Added callable exclusion: {exclusion_func_or_region}")
|
149
|
+
elif isinstance(exclusion_func_or_region, Region):
|
150
|
+
# Store Region objects directly
|
151
|
+
self._exclusions.append(exclusion_func_or_region)
|
152
|
+
logger.debug(f"Page {self.index}: Added Region exclusion: {exclusion_func_or_region}")
|
153
|
+
elif hasattr(exclusion_func_or_region, 'bbox') and isinstance(getattr(exclusion_func_or_region, 'bbox', None), (tuple, list)) and len(exclusion_func_or_region.bbox) == 4:
|
154
|
+
# Convert objects with a valid bbox to a Region before storing
|
155
|
+
try:
|
156
|
+
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
157
|
+
region_to_add = Region(self, bbox_coords)
|
158
|
+
self._exclusions.append(region_to_add)
|
159
|
+
logger.debug(f"Page {self.index}: Added exclusion converted to Region from {type(exclusion_func_or_region)}: {region_to_add}")
|
160
|
+
except (ValueError, TypeError, Exception) as e:
|
161
|
+
# Raise an error if conversion fails
|
162
|
+
raise TypeError(f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}") from e
|
163
|
+
else:
|
164
|
+
# Reject invalid types
|
165
|
+
raise TypeError(f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute.")
|
166
|
+
|
135
167
|
return self
|
136
168
|
|
137
169
|
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
|
@@ -190,6 +222,7 @@ class Page:
|
|
190
222
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
191
223
|
"""
|
192
224
|
Get all exclusion regions for this page.
|
225
|
+
Assumes self._exclusions contains only callables or Region objects.
|
193
226
|
|
194
227
|
Args:
|
195
228
|
include_callable: Whether to evaluate callable exclusion functions
|
@@ -207,15 +240,14 @@ class Page:
|
|
207
240
|
for i, exclusion in enumerate(self._exclusions):
|
208
241
|
# Get exclusion label if it's a tuple from PDF level
|
209
242
|
exclusion_label = f"exclusion {i}"
|
210
|
-
original_exclusion = exclusion
|
211
|
-
|
212
|
-
# Check if it's a tuple from PDF.add_exclusion
|
243
|
+
original_exclusion = exclusion # Keep track for debugging
|
244
|
+
|
245
|
+
# Check if it's a tuple from PDF.add_exclusion (should still be handled if PDF adds labels)
|
213
246
|
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
214
|
-
# This is likely from PDF.add_exclusion with (func, label)
|
215
247
|
exclusion_func, label = exclusion
|
216
248
|
if label:
|
217
249
|
exclusion_label = label
|
218
|
-
exclusion = exclusion_func
|
250
|
+
exclusion = exclusion_func # Use the function part
|
219
251
|
|
220
252
|
# Process callable exclusion functions
|
221
253
|
if callable(exclusion) and include_callable:
|
@@ -224,40 +256,45 @@ class Page:
|
|
224
256
|
if debug:
|
225
257
|
print(f" - Evaluating callable {exclusion_label}...")
|
226
258
|
|
227
|
-
#
|
228
|
-
|
229
|
-
self._exclusions
|
259
|
+
# Temporarily clear exclusions to avoid potential recursion if the callable uses exclusions itself
|
260
|
+
# This might be overly cautious depending on use case, but safer.
|
261
|
+
temp_original_exclusions = self._exclusions
|
262
|
+
self._exclusions = []
|
230
263
|
|
231
|
-
# Call the function
|
232
|
-
|
264
|
+
# Call the function - Expects it to return a Region or None
|
265
|
+
region_result = exclusion(self)
|
233
266
|
|
234
267
|
# Restore exclusions
|
235
|
-
self._exclusions =
|
268
|
+
self._exclusions = temp_original_exclusions
|
236
269
|
|
237
|
-
if
|
238
|
-
regions.append(
|
270
|
+
if isinstance(region_result, Region):
|
271
|
+
regions.append(region_result)
|
239
272
|
if debug:
|
240
|
-
print(f" ✓ Added region: {
|
273
|
+
print(f" ✓ Added region from callable: {region_result}")
|
274
|
+
elif region_result:
|
275
|
+
# Log warning if callable returned something other than Region/None
|
276
|
+
logger.warning(f"Callable exclusion {exclusion_label} returned non-Region object: {type(region_result)}. Skipping.")
|
277
|
+
if debug:
|
278
|
+
print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
|
241
279
|
else:
|
242
280
|
if debug:
|
243
|
-
print(f" ✗
|
281
|
+
print(f" ✗ Callable returned None, no region added")
|
244
282
|
|
245
283
|
except Exception as e:
|
246
|
-
error_msg = f"Error
|
284
|
+
error_msg = f"Error evaluating callable exclusion {exclusion_label} for page {self.index}: {e}"
|
247
285
|
print(error_msg)
|
248
|
-
# Print more detailed traceback for debugging
|
249
286
|
import traceback
|
250
287
|
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
251
288
|
|
252
|
-
# Process direct Region objects
|
253
|
-
elif
|
254
|
-
# It's already a Region object
|
289
|
+
# Process direct Region objects (already validated by add_exclusion)
|
290
|
+
elif isinstance(exclusion, Region):
|
255
291
|
regions.append(exclusion)
|
256
292
|
if debug:
|
257
293
|
print(f" - Added direct region: {exclusion}")
|
294
|
+
# No else needed, add_exclusion should prevent invalid types
|
258
295
|
|
259
296
|
if debug:
|
260
|
-
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
|
297
|
+
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
|
261
298
|
|
262
299
|
return regions
|
263
300
|
|
@@ -1178,6 +1215,34 @@ class Page:
|
|
1178
1215
|
|
1179
1216
|
return ElementCollection(detected_regions)
|
1180
1217
|
|
1218
|
+
def clear_detected_layout_regions(self) -> 'Page':
|
1219
|
+
"""
|
1220
|
+
Removes all regions from this page that were added by layout analysis
|
1221
|
+
(i.e., regions where `source` attribute is 'detected').
|
1222
|
+
|
1223
|
+
This clears the regions both from the page's internal `_regions['detected']` list
|
1224
|
+
and from the ElementManager's internal list of regions.
|
1225
|
+
|
1226
|
+
Returns:
|
1227
|
+
Self for method chaining.
|
1228
|
+
"""
|
1229
|
+
if not hasattr(self._element_mgr, 'regions') or not hasattr(self._element_mgr, '_elements') or 'regions' not in self._element_mgr._elements:
|
1230
|
+
logger.debug(f"Page {self.index}: No regions found in ElementManager, nothing to clear.")
|
1231
|
+
self._regions['detected'] = [] # Ensure page's list is also clear
|
1232
|
+
return self
|
1233
|
+
|
1234
|
+
# Filter ElementManager's list to keep only non-detected regions
|
1235
|
+
original_count = len(self._element_mgr.regions)
|
1236
|
+
self._element_mgr._elements['regions'] = [r for r in self._element_mgr.regions if getattr(r, 'source', None) != 'detected']
|
1237
|
+
new_count = len(self._element_mgr.regions)
|
1238
|
+
removed_count = original_count - new_count
|
1239
|
+
|
1240
|
+
# Clear the page's specific list of detected regions
|
1241
|
+
self._regions['detected'] = []
|
1242
|
+
|
1243
|
+
logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
|
1244
|
+
return self
|
1245
|
+
|
1181
1246
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]: # Return Optional
|
1182
1247
|
"""
|
1183
1248
|
Get a section between two elements on this page.
|
natural_pdf/core/pdf.py
CHANGED
@@ -125,6 +125,17 @@ class PDF:
|
|
125
125
|
from natural_pdf.elements.collections import PageCollection
|
126
126
|
return PageCollection(self._pages)
|
127
127
|
|
128
|
+
def clear_exclusions(self) -> 'PDF':
|
129
|
+
"""
|
130
|
+
Clear all exclusion functions from the PDF.
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
Self for method chaining
|
134
|
+
"""
|
135
|
+
|
136
|
+
self._exclusions = []
|
137
|
+
return self
|
138
|
+
|
128
139
|
def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
|
129
140
|
"""
|
130
141
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
natural_pdf/elements/base.py
CHANGED
@@ -7,7 +7,8 @@ from PIL import Image
|
|
7
7
|
if TYPE_CHECKING:
|
8
8
|
from natural_pdf.core.page import Page
|
9
9
|
from natural_pdf.elements.region import Region
|
10
|
-
from natural_pdf.elements.base import Element
|
10
|
+
from natural_pdf.elements.base import Element
|
11
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
12
|
|
12
13
|
|
13
14
|
class DirectionalMixin:
|
@@ -17,7 +18,7 @@ class DirectionalMixin:
|
|
17
18
|
|
18
19
|
def _direction(self, direction: str, size: Optional[float] = None,
|
19
20
|
cross_size: str = "full", include_element: bool = False,
|
20
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
21
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
21
22
|
"""
|
22
23
|
Protected helper method to create a region in a specified direction relative to this element/region.
|
23
24
|
|
@@ -154,7 +155,7 @@ class DirectionalMixin:
|
|
154
155
|
return result
|
155
156
|
|
156
157
|
def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
|
157
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
158
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
158
159
|
"""
|
159
160
|
Select region above this element/region.
|
160
161
|
|
@@ -180,7 +181,7 @@ class DirectionalMixin:
|
|
180
181
|
)
|
181
182
|
|
182
183
|
def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
|
183
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
184
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
184
185
|
"""
|
185
186
|
Select region below this element/region.
|
186
187
|
|
@@ -206,7 +207,7 @@ class DirectionalMixin:
|
|
206
207
|
)
|
207
208
|
|
208
209
|
def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
|
209
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
210
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
210
211
|
"""
|
211
212
|
Select region to the left of this element/region.
|
212
213
|
|
@@ -232,7 +233,7 @@ class DirectionalMixin:
|
|
232
233
|
)
|
233
234
|
|
234
235
|
def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
|
235
|
-
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) ->
|
236
|
+
until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> 'Region':
|
236
237
|
"""
|
237
238
|
Select region to the right of this element/region.
|
238
239
|
|
@@ -257,6 +258,86 @@ class DirectionalMixin:
|
|
257
258
|
**kwargs
|
258
259
|
)
|
259
260
|
|
261
|
+
def expand(self,
|
262
|
+
left: float = 0,
|
263
|
+
right: float = 0,
|
264
|
+
top_expand: float = 0, # Renamed to avoid conflict
|
265
|
+
bottom_expand: float = 0, # Renamed to avoid conflict
|
266
|
+
width_factor: float = 1.0,
|
267
|
+
height_factor: float = 1.0,
|
268
|
+
# Keep original parameter names for backward compatibility
|
269
|
+
top: float = None,
|
270
|
+
bottom: float = None) -> 'Region':
|
271
|
+
"""
|
272
|
+
Create a new region expanded from this element/region.
|
273
|
+
|
274
|
+
Args:
|
275
|
+
left: Amount to expand left edge (positive value expands leftwards)
|
276
|
+
right: Amount to expand right edge (positive value expands rightwards)
|
277
|
+
top_expand: Amount to expand top edge (positive value expands upwards)
|
278
|
+
bottom_expand: Amount to expand bottom edge (positive value expands downwards)
|
279
|
+
width_factor: Factor to multiply width by (applied after absolute expansion)
|
280
|
+
height_factor: Factor to multiply height by (applied after absolute expansion)
|
281
|
+
top: (DEPRECATED, use top_expand) Amount to expand top edge (upward)
|
282
|
+
bottom: (DEPRECATED, use bottom_expand) Amount to expand bottom edge (downward)
|
283
|
+
|
284
|
+
Returns:
|
285
|
+
New expanded Region object
|
286
|
+
"""
|
287
|
+
# Start with current coordinates
|
288
|
+
new_x0 = self.x0
|
289
|
+
new_x1 = self.x1
|
290
|
+
new_top = self.top
|
291
|
+
new_bottom = self.bottom
|
292
|
+
|
293
|
+
# Handle the deprecated parameter names for backward compatibility
|
294
|
+
if top is not None:
|
295
|
+
top_expand = top
|
296
|
+
if bottom is not None:
|
297
|
+
bottom_expand = bottom
|
298
|
+
|
299
|
+
# Apply absolute expansions first
|
300
|
+
new_x0 -= left
|
301
|
+
new_x1 += right
|
302
|
+
new_top -= top_expand # Expand upward (decrease top coordinate)
|
303
|
+
new_bottom += bottom_expand # Expand downward (increase bottom coordinate)
|
304
|
+
|
305
|
+
# Apply percentage factors if provided
|
306
|
+
if width_factor != 1.0 or height_factor != 1.0:
|
307
|
+
# Calculate center point *after* absolute expansion
|
308
|
+
center_x = (new_x0 + new_x1) / 2
|
309
|
+
center_y = (new_top + new_bottom) / 2
|
310
|
+
|
311
|
+
# Calculate current width and height *after* absolute expansion
|
312
|
+
current_width = new_x1 - new_x0
|
313
|
+
current_height = new_bottom - new_top
|
314
|
+
|
315
|
+
# Calculate new width and height
|
316
|
+
new_width = current_width * width_factor
|
317
|
+
new_height = current_height * height_factor
|
318
|
+
|
319
|
+
# Adjust coordinates based on the new dimensions, keeping the center
|
320
|
+
new_x0 = center_x - new_width / 2
|
321
|
+
new_x1 = center_x + new_width / 2
|
322
|
+
new_top = center_y - new_height / 2
|
323
|
+
new_bottom = center_y + new_height / 2
|
324
|
+
|
325
|
+
# Clamp coordinates to page boundaries
|
326
|
+
new_x0 = max(0, new_x0)
|
327
|
+
new_top = max(0, new_top)
|
328
|
+
new_x1 = min(self.page.width, new_x1)
|
329
|
+
new_bottom = min(self.page.height, new_bottom)
|
330
|
+
|
331
|
+
# Ensure coordinates are valid (x0 <= x1, top <= bottom)
|
332
|
+
if new_x0 > new_x1: new_x0 = new_x1 = (new_x0 + new_x1) / 2
|
333
|
+
if new_top > new_bottom: new_top = new_bottom = (new_top + new_bottom) / 2
|
334
|
+
|
335
|
+
# Create new region with expanded bbox
|
336
|
+
from natural_pdf.elements.region import Region
|
337
|
+
new_region = Region(self.page, (new_x0, new_top, new_x1, new_bottom))
|
338
|
+
|
339
|
+
return new_region
|
340
|
+
|
260
341
|
|
261
342
|
class Element(DirectionalMixin):
|
262
343
|
"""
|
@@ -415,7 +496,8 @@ class Element(DirectionalMixin):
|
|
415
496
|
candidates = candidates[:limit] if limit else candidates
|
416
497
|
|
417
498
|
# Find matching elements
|
418
|
-
|
499
|
+
from natural_pdf.elements.collections import ElementCollection
|
500
|
+
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
419
501
|
return matches[0] if matches else None
|
420
502
|
elif idx + 1 < len(all_elements):
|
421
503
|
# No selector, just return the next element
|
@@ -449,16 +531,17 @@ class Element(DirectionalMixin):
|
|
449
531
|
|
450
532
|
# Search for previous matching element
|
451
533
|
if selector:
|
452
|
-
#
|
534
|
+
# Select elements before this one
|
453
535
|
candidates = all_elements[:idx]
|
454
|
-
# Reverse to
|
536
|
+
# Reverse to search backwards from the current element
|
455
537
|
candidates = candidates[::-1]
|
456
538
|
# Limit search range for performance
|
457
539
|
candidates = candidates[:limit] if limit else candidates
|
458
540
|
|
459
|
-
# Find matching elements
|
460
|
-
|
461
|
-
|
541
|
+
# Find matching elements using ElementCollection
|
542
|
+
from natural_pdf.elements.collections import ElementCollection
|
543
|
+
matches = ElementCollection(candidates).find_all(selector, **kwargs)
|
544
|
+
return matches[0] if matches else None # find_all returns a collection
|
462
545
|
elif idx > 0:
|
463
546
|
# No selector, just return the previous element
|
464
547
|
return all_elements[idx - 1]
|
@@ -737,8 +820,9 @@ class Element(DirectionalMixin):
|
|
737
820
|
Returns:
|
738
821
|
First matching element or None
|
739
822
|
"""
|
740
|
-
# Create a temporary region from this element's bounds
|
741
823
|
from natural_pdf.elements.region import Region
|
824
|
+
|
825
|
+
# Create a temporary region from this element's bounds
|
742
826
|
temp_region = Region(self.page, self.bbox)
|
743
827
|
return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
|
744
828
|
|
@@ -755,7 +839,8 @@ class Element(DirectionalMixin):
|
|
755
839
|
Returns:
|
756
840
|
ElementCollection with matching elements
|
757
841
|
"""
|
758
|
-
# Create a temporary region from this element's bounds
|
759
842
|
from natural_pdf.elements.region import Region
|
843
|
+
|
844
|
+
# Create a temporary region from this element's bounds
|
760
845
|
temp_region = Region(self.page, self.bbox)
|
761
846
|
return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
|
natural_pdf/elements/collections.py
CHANGED
@@ -2,6 +2,7 @@ import logging
|
|
2
2
|
|
3
3
|
from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
|
4
4
|
from natural_pdf.ocr import OCROptions
|
5
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
5
6
|
|
6
7
|
logger = logging.getLogger(__name__)
|
7
8
|
|
@@ -882,6 +883,61 @@ class ElementCollection(Generic[T]):
|
|
882
883
|
logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
|
883
884
|
return None
|
884
885
|
|
886
|
+
def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
|
887
|
+
"""
|
888
|
+
Filter elements within this collection matching the selector.
|
889
|
+
|
890
|
+
Args:
|
891
|
+
selector: CSS-like selector string.
|
892
|
+
regex: Whether to use regex for text search in :contains (default: False).
|
893
|
+
case: Whether to do case-sensitive text search (default: True).
|
894
|
+
**kwargs: Additional filter parameters passed to the selector function.
|
895
|
+
|
896
|
+
Returns:
|
897
|
+
A new ElementCollection containing only the matching elements from this collection.
|
898
|
+
"""
|
899
|
+
if not self._elements:
|
900
|
+
return ElementCollection([])
|
901
|
+
|
902
|
+
try:
|
903
|
+
selector_obj = parse_selector(selector)
|
904
|
+
except Exception as e:
|
905
|
+
logger.error(f"Error parsing selector '{selector}': {e}")
|
906
|
+
return ElementCollection([]) # Return empty on parse error
|
907
|
+
|
908
|
+
# Pass regex and case flags to selector function generator
|
909
|
+
kwargs['regex'] = regex
|
910
|
+
kwargs['case'] = case
|
911
|
+
|
912
|
+
try:
|
913
|
+
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
914
|
+
except Exception as e:
|
915
|
+
logger.error(f"Error creating filter function for selector '{selector}': {e}")
|
916
|
+
return ElementCollection([]) # Return empty on filter creation error
|
917
|
+
|
918
|
+
matching_elements = [element for element in self._elements if filter_func(element)]
|
919
|
+
|
920
|
+
# Note: Unlike Page.find_all, this doesn't re-sort.
|
921
|
+
# Sorting should be done explicitly on the collection if needed.
|
922
|
+
|
923
|
+
return ElementCollection(matching_elements)
|
924
|
+
|
925
|
+
def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
|
926
|
+
"""
|
927
|
+
Find the first element within this collection matching the selector.
|
928
|
+
|
929
|
+
Args:
|
930
|
+
selector: CSS-like selector string.
|
931
|
+
regex: Whether to use regex for text search in :contains (default: False).
|
932
|
+
case: Whether to do case-sensitive text search (default: True).
|
933
|
+
**kwargs: Additional filter parameters passed to the selector function.
|
934
|
+
|
935
|
+
Returns:
|
936
|
+
The first matching element or None.
|
937
|
+
"""
|
938
|
+
results = self.find_all(selector, regex=regex, case=case, **kwargs)
|
939
|
+
return results.first
|
940
|
+
|
885
941
|
class PageCollection(Generic[P]):
|
886
942
|
"""
|
887
943
|
A collection of PDF pages with cross-page operations.
|
natural_pdf/elements/region.py
CHANGED
@@ -761,8 +761,6 @@ class Region(DirectionalMixin):
|
|
761
761
|
exclusion_regions = self._page._get_exclusion_regions(include_callable=True)
|
762
762
|
|
763
763
|
if debug:
|
764
|
-
import logging
|
765
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
766
764
|
logger.debug(f"Region {self.bbox} with {len(exclusion_regions)} exclusion regions")
|
767
765
|
|
768
766
|
# IMPROVEMENT 1: Check if the region intersects with any exclusion zone
|
@@ -777,16 +775,12 @@ class Region(DirectionalMixin):
|
|
777
775
|
if overlap:
|
778
776
|
has_intersection = True
|
779
777
|
if debug:
|
780
|
-
import logging
|
781
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
782
778
|
logger.debug(f" Region intersects with exclusion {i}: {exclusion.bbox}")
|
783
779
|
break
|
784
780
|
|
785
781
|
# If no intersection, process without exclusions
|
786
782
|
if not has_intersection:
|
787
783
|
if debug:
|
788
|
-
import logging
|
789
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
790
784
|
logger.debug(f" No intersection with any exclusion, ignoring exclusions")
|
791
785
|
apply_exclusions = False
|
792
786
|
exclusion_regions = []
|
@@ -809,8 +803,6 @@ class Region(DirectionalMixin):
|
|
809
803
|
abs(exclusion.x1 - self.page.width) < 5)
|
810
804
|
|
811
805
|
if debug:
|
812
|
-
import logging
|
813
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
814
806
|
logger.debug(f" Exclusion {i}: {exclusion.bbox}, full width: {full_width}")
|
815
807
|
|
816
808
|
if full_width:
|
@@ -827,8 +819,6 @@ class Region(DirectionalMixin):
|
|
827
819
|
bottom_bound = self.bottom
|
828
820
|
|
829
821
|
if debug:
|
830
|
-
import logging
|
831
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
832
822
|
logger.debug(f" Using cropping approach, initial bounds: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
833
823
|
|
834
824
|
# Process only header/footer exclusions for cropping
|
@@ -838,8 +828,6 @@ class Region(DirectionalMixin):
|
|
838
828
|
# Move top bound to exclude the header
|
839
829
|
top_bound = max(top_bound, exclusion.bottom)
|
840
830
|
if debug:
|
841
|
-
import logging
|
842
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
843
831
|
logger.debug(f" Adjusted top bound to {top_bound} due to header exclusion")
|
844
832
|
|
845
833
|
# If exclusion is at the bottom of our region
|
@@ -847,14 +835,10 @@ class Region(DirectionalMixin):
|
|
847
835
|
# Move bottom bound to exclude the footer
|
848
836
|
bottom_bound = min(bottom_bound, exclusion.top)
|
849
837
|
if debug:
|
850
|
-
import logging
|
851
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
852
838
|
logger.debug(f" Adjusted bottom bound to {bottom_bound} due to footer exclusion")
|
853
839
|
|
854
840
|
|
855
841
|
if debug:
|
856
|
-
import logging
|
857
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
858
842
|
logger.debug(f" Final bounds after exclusion adjustment: ({self.x0}, {top_bound}, {self.x1}, {bottom_bound})")
|
859
843
|
|
860
844
|
# If we still have a valid region after exclusions
|
@@ -865,8 +849,6 @@ class Region(DirectionalMixin):
|
|
865
849
|
result = cropped.extract_text(keep_blank_chars=keep_blank_chars, **kwargs)
|
866
850
|
|
867
851
|
if debug:
|
868
|
-
import logging
|
869
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
870
852
|
logger.debug(f" Successfully extracted text using crop, got {len(result)} characters")
|
871
853
|
|
872
854
|
# Skip the complex filtering approach
|
@@ -874,16 +856,12 @@ class Region(DirectionalMixin):
|
|
874
856
|
else:
|
875
857
|
# This would only happen if the region is entirely inside an exclusion zone
|
876
858
|
# or if both top and bottom of the region are excluded leaving no valid area
|
877
|
-
import logging
|
878
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
879
859
|
logger.debug(f"Region {self.bbox} completely covered by exclusions, returning empty string")
|
880
860
|
return ""
|
881
861
|
# We have exclusions, but not all are headers/footers,
|
882
862
|
# or we have a non-rectangular region
|
883
863
|
else:
|
884
864
|
if debug:
|
885
|
-
import logging
|
886
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
887
865
|
logger.debug(f" Mixed exclusion types or non-rectangular region, switching to filtering")
|
888
866
|
|
889
867
|
# Don't use crop for mixed exclusion types
|
@@ -902,16 +880,13 @@ class Region(DirectionalMixin):
|
|
902
880
|
return result
|
903
881
|
|
904
882
|
# For all other cases (complex exclusions, polygons), we use element filtering
|
905
|
-
import warnings
|
906
|
-
import logging
|
907
|
-
logger = logging.getLogger("natural_pdf.elements.region")
|
908
|
-
|
909
883
|
if debug:
|
910
884
|
logger.debug(f"Using element filtering approach for region {self.bbox}")
|
911
885
|
|
912
|
-
# Get
|
913
|
-
|
914
|
-
|
886
|
+
# Get only word elements in this region first (instead of ALL elements)
|
887
|
+
# This prevents duplication from joining both char and word text
|
888
|
+
all_elements = [e for e in self.page.words if self._is_element_in_region(e)]
|
889
|
+
|
915
890
|
if apply_exclusions and exclusion_regions:
|
916
891
|
if debug:
|
917
892
|
logger.debug(f"Filtering with {len(exclusion_regions)} exclusion zones")
|
@@ -1325,83 +1300,6 @@ class Region(DirectionalMixin):
|
|
1325
1300
|
|
1326
1301
|
return elements
|
1327
1302
|
|
1328
|
-
def expand(self,
           left: float = 0,
           right: float = 0,
           top_expand: float = 0,  # Renamed to avoid conflict
           bottom_expand: float = 0,  # Renamed to avoid conflict
           width_factor: float = 1.0,
           height_factor: float = 1.0,
           # Keep original parameter names for backward compatibility
           top: float = None,
           bottom: float = None) -> 'Region':
    """
    Build a new, larger (or smaller) region derived from this one.

    Absolute offsets are applied to each edge first; the scale factors are
    then applied to the resulting box around its center point.

    Args:
        left: Amount to move the left edge outward
        right: Amount to move the right edge outward
        top_expand: Amount to move the top edge upward
        bottom_expand: Amount to move the bottom edge downward
        width_factor: Multiplier applied to the width after the offsets
        height_factor: Multiplier applied to the height after the offsets
        top: (DEPRECATED, use top_expand) Overrides top_expand when given
        bottom: (DEPRECATED, use bottom_expand) Overrides bottom_expand when given

    Returns:
        New expanded Region
    """
    # The deprecated names win whenever the caller supplied them.
    grow_up = top_expand if top is None else top
    grow_down = bottom_expand if bottom is None else bottom

    # Absolute expansion of each edge (top decreases, bottom increases).
    x0 = self.x0 - left
    x1 = self.x1 + right
    y0 = self.top - grow_up
    y1 = self.bottom + grow_down

    # Proportional scaling, keeping the center of the box fixed.
    if not (width_factor == 1.0 and height_factor == 1.0):
        width = x1 - x0
        height = y1 - y0
        width_delta = width * width_factor - width
        height_delta = height * height_factor - height
        x0 -= width_delta / 2
        x1 += width_delta / 2
        y0 -= height_delta / 2
        y1 += height_delta / 2

    expanded = Region(self.page, (x0, y0, x1, y1))

    # Carry the multi-page bookkeeping over to the new region.
    if self._spans_pages:
        expanded._spans_pages = True
        for attr in ('_multi_page_elements', '_page_range',
                     'start_element', 'end_element'):
            setattr(expanded, attr, getattr(self, attr))

    return expanded
|
1404
|
-
|
1405
1303
|
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both'):
|
1406
1304
|
"""
|
1407
1305
|
Get a section between two elements within this region.
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -5,6 +5,7 @@ from PIL import Image, ImageDraw
|
|
5
5
|
import os
|
6
6
|
import tempfile
|
7
7
|
import json
|
8
|
+
from natural_pdf.elements.collections import ElementCollection
|
8
9
|
|
9
10
|
logger = logging.getLogger("natural_pdf.qa.document_qa")
|
10
11
|
|
@@ -304,8 +305,8 @@ class DocumentQA:
|
|
304
305
|
# Remove from matched texts to avoid duplicates
|
305
306
|
if element.text in matched_texts:
|
306
307
|
matched_texts.remove(element.text)
|
307
|
-
|
308
|
-
result["source_elements"] = source_elements
|
308
|
+
|
309
|
+
result["source_elements"] = ElementCollection(source_elements)
|
309
310
|
|
310
311
|
return result
|
311
312
|
|
@@ -386,7 +387,7 @@ class DocumentQA:
|
|
386
387
|
if element.text in matched_texts:
|
387
388
|
matched_texts.remove(element.text)
|
388
389
|
|
389
|
-
result["source_elements"] = source_elements
|
390
|
+
result["source_elements"] = ElementCollection(source_elements)
|
390
391
|
|
391
392
|
return result
|
392
393
|
|
natural_pdf/selectors/parser.py
CHANGED
@@ -351,4 +351,218 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
|
351
351
|
return abs(value1 - value2) <= tolerance
|
352
352
|
|
353
353
|
# Default to exact match for other types
|
354
|
-
return value1 == value2
|
354
|
+
return value1 == value2
|
355
|
+
|
356
|
+
|
357
|
+
def _is_child_at(el, index):
    """Return True when ``el`` equals the child at ``index`` of its parent.

    Elements without a truthy ``parent``, parents without ``children`` and
    empty child sequences all yield False instead of raising.
    """
    parent = getattr(el, 'parent', None)
    if not parent:
        return False
    children = getattr(parent, 'children', None)
    if children is None:
        return False
    try:
        return bool(children[index] == el)
    except IndexError:
        # An empty children sequence has neither a first nor a last child.
        return False


# Predicates for argument-less pseudo-classes, keyed by pseudo-class name.
# Each one takes an element and returns a truthy value when it matches.
PSEUDO_CLASS_FUNCTIONS = {
    'bold': lambda el: bool(getattr(el, 'bold', False)),
    'italic': lambda el: bool(getattr(el, 'italic', False)),
    'first-child': lambda el: _is_child_at(el, 0),  # Example placeholder
    'last-child': lambda el: _is_child_at(el, -1),  # Example placeholder
    # Negated variants: only elements that actually carry the attribute match.
    'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
    'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
}
|
366
|
+
|
367
|
+
|
368
|
+
def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
    """
    Convert a parsed selector to a filter function.

    Args:
        selector: Parsed selector dictionary. It is read with the keys
            'type', 'attributes' (name -> {'op': ..., 'value': ...}) and
            'pseudo_classes' (list of {'name': ..., 'args': ...}).
        **kwargs: Additional filter parameters including:
            - regex: Whether to use regex for text search
            - case: Whether to do case-sensitive text search

    Returns:
        Function that takes an element and returns True if it matches
    """
    # Imported locally so the function does not rely on module-level imports.
    import operator
    import re

    # Ordering operators only ever match when both operands are numbers.
    numeric_comparisons = {
        '>=': operator.ge,
        '<=': operator.le,
        '>': operator.gt,
        '<': operator.lt,
    }

    # Pseudo-classes that simply test a truthy attribute on the element.
    flag_attributes = {
        'bold': 'bold',
        'italic': 'italic',
        'horizontal': 'is_horizontal',
        'vertical': 'is_vertical',
    }

    def type_matches(element):
        """Check the element against the selector's type."""
        wanted = selector['type']
        if wanted == 'any':
            return True
        if wanted == 'text':
            # 'text' is an umbrella for text, char and word elements.
            return getattr(element, 'type', None) in ('text', 'char', 'word')
        if wanted == 'region':
            # Any object exposing region_type qualifies; a specific region
            # type, if requested, is checked with the attributes.
            return hasattr(element, 'region_type')
        # Direct match against a normalized region type (e.g. section-header).
        if hasattr(element, 'normalized_type') and element.normalized_type == wanted:
            return True
        # Otherwise require an exact match with the element's own type.
        return hasattr(element, 'type') and element.type == wanted

    def attribute_value(element, name):
        """Look up the element value an attribute test compares against."""
        if selector['type'] == 'region':
            if name == 'type':
                normalized = getattr(element, 'normalized_type', None)
                if normalized:
                    return normalized
                # Fall back to region_type in the hyphenated, lower-case
                # form; a missing or None region_type becomes ''.
                raw_type = getattr(element, 'region_type', '') or ''
                return raw_type.lower().replace(' ', '-')
            if name == 'model':
                return getattr(element, 'model', None)
        # Hyphenated selector names map to underscore Python attributes.
        return getattr(element, name.replace('-', '_'), None)

    def attributes_match(element):
        """Apply every attribute test of the selector."""
        for name, attr_info in selector['attributes'].items():
            op = attr_info['op']
            value = attr_info['value']

            # fontname *= value: case-insensitive substring match.
            if name == 'fontname' and op == '*=':
                fontname = getattr(element, name, None)
                if fontname is None or value.lower() not in fontname.lower():
                    return False
                continue

            element_value = attribute_value(element, name)
            if element_value is None:
                return False

            if op == '=':
                if element_value != value:
                    return False
            elif op == '~=':
                # Approximate match (e.g., for colors)
                if not _is_approximate_match(element_value, value):
                    return False
            elif op in numeric_comparisons:
                both_numeric = (isinstance(element_value, (int, float)) and
                                isinstance(value, (int, float)))
                if not (both_numeric and numeric_comparisons[op](element_value, value)):
                    return False
            # Any other operator is not evaluated and excludes nothing.
        return True

    def contains(text, needle):
        """:contains test honouring the regex and case options."""
        ignore_case = not kwargs.get('case', True)
        if kwargs.get('regex', False):
            try:
                flags = re.IGNORECASE if ignore_case else 0
                return re.search(needle, text, flags) is not None
            except re.error:
                # Invalid pattern: fall back to a literal substring search.
                pass
        if ignore_case:
            text = text.lower()
            needle = needle.lower()
        return needle in text

    # Pseudo-classes that inspect element.text. The prefix/suffix tests are
    # always case-sensitive; only :contains reads the case/regex options.
    text_tests = {
        'contains': contains,
        'starts-with': lambda text, prefix: text.startswith(prefix),
        'ends-with': lambda text, suffix: text.endswith(suffix),
    }

    def pseudo_classes_match(element):
        """Apply the simple (non-spatial) pseudo-classes of the selector."""
        for pseudo in selector['pseudo_classes']:
            name = pseudo['name']
            args = pseudo['args']

            if name in text_tests:
                # An element with no text (attribute missing, None or empty)
                # can never satisfy a text pseudo-class.
                text = getattr(element, 'text', None)
                if not text or not text_tests[name](text, args):
                    return False
            elif name in flag_attributes:
                if not getattr(element, flag_attributes[name], False):
                    return False
            elif name in PSEUDO_CLASS_FUNCTIONS:
                if not PSEUDO_CLASS_FUNCTIONS[name](element):
                    return False
            # Spatial pseudo-classes (:above, :below, :near, :left-of,
            # :right-of) and unknown names are not evaluated here.
        return True

    def filter_func(element):
        # Element passes only if type, attributes and simple pseudo-classes agree.
        return (type_matches(element)
                and attributes_match(element)
                and pseudo_classes_match(element))

    return filter_func
|
@@ -127,10 +127,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
|
|
127
127
|
# Try to load a font, use default if not available
|
128
128
|
try:
|
129
129
|
# Use a commonly available font, adjust size
|
130
|
-
font = ImageFont.truetype("DejaVuSans.ttf",
|
130
|
+
font = ImageFont.truetype("DejaVuSans.ttf", 14)
|
131
131
|
except IOError:
|
132
132
|
try:
|
133
|
-
font = ImageFont.truetype("Arial.ttf",
|
133
|
+
font = ImageFont.truetype("Arial.ttf", 14)
|
134
134
|
except IOError:
|
135
135
|
font = ImageFont.load_default()
|
136
136
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -48,7 +48,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
48
48
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
49
49
|
|
50
50
|
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
51
|
-
- [Live
|
51
|
+
- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
|
52
52
|
|
53
53
|
## Features
|
54
54
|
|
@@ -74,18 +74,16 @@ pip install natural-pdf
|
|
74
74
|
|
75
75
|
# Installs the core library along with required AI dependencies (PyTorch, Transformers)
|
76
76
|
```bash
|
77
|
-
# Install with support for specific OCR engines
|
78
|
-
pip install natural-pdf[easyocr]
|
79
|
-
pip install natural-pdf[paddle]
|
80
|
-
pip install natural-pdf[surya]
|
81
|
-
|
82
|
-
# Install with support for YOLO layout detection model
|
77
|
+
# Install with support for specific OCR and layout engines
|
78
|
+
pip install natural-pdf[easyocr]
|
79
|
+
pip install natural-pdf[paddle]
|
80
|
+
pip install natural-pdf[surya]
|
83
81
|
pip install natural-pdf[layout_yolo]
|
84
82
|
|
85
83
|
# Install with support for the interactive Jupyter widget
|
86
84
|
pip install natural-pdf[interactive]
|
87
85
|
|
88
|
-
#
|
86
|
+
# Just install everything
|
89
87
|
pip install natural-pdf[all]
|
90
88
|
```
|
91
89
|
|
@@ -119,6 +117,8 @@ clean_text = page.extract_text()
|
|
119
117
|
print(clean_text)
|
120
118
|
```
|
121
119
|
|
120
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
121
|
+
|
122
122
|
## Selectors
|
123
123
|
|
124
124
|
The library supports CSS-like selectors for finding elements:
|
@@ -185,7 +185,7 @@ Exclusions work efficiently with different region types:
|
|
185
185
|
|
186
186
|
## OCR Integration
|
187
187
|
|
188
|
-
Extract text from scanned documents using OCR, with support for multiple engines (EasyOCR, PaddleOCR, Surya):
|
188
|
+
Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
|
189
189
|
|
190
190
|
```python
|
191
191
|
# Apply OCR using a specific engine (e.g., PaddleOCR)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=hsSosbPnvDRCfyYAL9bf1haVS6oBxLAl7cbKTWRTHkU,1784
|
2
2
|
natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
|
3
3
|
natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
|
4
4
|
natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
|
@@ -15,15 +15,15 @@ natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3
|
|
15
15
|
natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
|
16
16
|
natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
|
17
17
|
natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
|
18
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
19
|
-
natural_pdf/core/page.py,sha256=
|
20
|
-
natural_pdf/core/pdf.py,sha256=
|
18
|
+
natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
|
19
|
+
natural_pdf/core/page.py,sha256=tnxG-5OhFVuFHt0p-a9YSLU-nXjA8fftg5ViQdH5sOU,68512
|
20
|
+
natural_pdf/core/pdf.py,sha256=UzxVfVeCnhSN7rxdJresUj_UNFkcFkeaEjLvwZMJS-c,28532
|
21
21
|
natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
|
22
|
-
natural_pdf/elements/base.py,sha256=
|
23
|
-
natural_pdf/elements/collections.py,sha256=
|
22
|
+
natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
|
23
|
+
natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
|
24
24
|
natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
|
25
25
|
natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
|
26
|
-
natural_pdf/elements/region.py,sha256=
|
26
|
+
natural_pdf/elements/region.py,sha256=MXQK00LLMvwuq94NigeeCVFoGov_RWFe9ZylnIMpzB0,72453
|
27
27
|
natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
|
28
28
|
natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
|
29
29
|
natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
|
@@ -33,20 +33,20 @@ natural_pdf/ocr/engine_surya.py,sha256=gWV_BEuLMqmJcKVlag9i45SsO2uLAtI-dayBm1xbD
|
|
33
33
|
natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW8,9934
|
34
34
|
natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
|
35
35
|
natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
|
36
|
-
natural_pdf/qa/document_qa.py,sha256=
|
36
|
+
natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
|
37
37
|
natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
|
38
|
-
natural_pdf/selectors/parser.py,sha256=
|
38
|
+
natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
|
39
39
|
natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
|
40
40
|
natural_pdf/templates/ocr_debug.html,sha256=Zy9StzBeHFQU8ity6cjFSZLe3TY0QOabUux4c5WQUzs,19171
|
41
41
|
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
42
42
|
natural_pdf/utils/highlighting.py,sha256=9H8vbWhwgxzjrL7MhAePXUWZZctLPboNocJzy-1TE_g,675
|
43
43
|
natural_pdf/utils/reading_order.py,sha256=1oihH9ZTqQvIVDYc2oVEYqIXyPzi94ERtelp6TyzmWU,7594
|
44
|
-
natural_pdf/utils/visualization.py,sha256=
|
44
|
+
natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_8aOZ4,8876
|
45
45
|
natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
|
46
46
|
natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
|
47
47
|
natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
|
48
|
-
natural_pdf-0.1.
|
49
|
-
natural_pdf-0.1.
|
50
|
-
natural_pdf-0.1.
|
51
|
-
natural_pdf-0.1.
|
52
|
-
natural_pdf-0.1.
|
48
|
+
natural_pdf-0.1.1.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
49
|
+
natural_pdf-0.1.1.dist-info/METADATA,sha256=8o22GEPtEqlSqexFQxy6tVoHTB35LmT63sjbjbjORRE,10009
|
50
|
+
natural_pdf-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
51
|
+
natural_pdf-0.1.1.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
|
52
|
+
natural_pdf-0.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|