natural-pdf 0.1.35__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +16 -4
- natural_pdf/analyzers/guides.py +1053 -26
- natural_pdf/core/page.py +205 -45
- natural_pdf/core/pdf.py +16 -1
- natural_pdf/elements/collections.py +10 -0
- natural_pdf/elements/region.py +106 -14
- natural_pdf/elements/text.py +36 -2
- natural_pdf/flows/region.py +128 -26
- natural_pdf/selectors/parser.py +24 -0
- natural_pdf/utils/layout.py +26 -0
- natural_pdf/utils/text_extraction.py +76 -1
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/RECORD +17 -16
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.35.dist-info → natural_pdf-0.1.37.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -312,6 +312,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
312
312
|
self,
|
313
313
|
exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
|
314
314
|
label: Optional[str] = None,
|
315
|
+
method: str = "region",
|
315
316
|
) -> "Page":
|
316
317
|
"""
|
317
318
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
@@ -321,54 +322,146 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
321
322
|
exclusion_func_or_region: Either a callable function returning a Region,
|
322
323
|
a Region object, or another object with a valid .bbox attribute.
|
323
324
|
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
325
|
+
method: Exclusion method - 'region' (exclude all elements in bounding box) or
|
326
|
+
'element' (exclude only the specific elements). Default: 'region'.
|
324
327
|
|
325
328
|
Returns:
|
326
329
|
Self for method chaining
|
327
330
|
|
328
331
|
Raises:
|
329
332
|
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
333
|
+
ValueError: If method is not 'region' or 'element'.
|
330
334
|
"""
|
335
|
+
# Validate method parameter
|
336
|
+
if method not in ("region", "element"):
|
337
|
+
raise ValueError(f"Invalid exclusion method '{method}'. Must be 'region' or 'element'.")
|
338
|
+
|
339
|
+
# ------------------------------------------------------------------
|
340
|
+
# NEW: Handle selector strings and ElementCollection instances
|
341
|
+
# ------------------------------------------------------------------
|
342
|
+
# If a user supplies a selector string (e.g. "text:bold") we resolve it
|
343
|
+
# immediately *on this page* to the matching elements and turn each into
|
344
|
+
# a Region object which is added to the internal exclusions list.
|
345
|
+
#
|
346
|
+
# Likewise, if an ElementCollection is passed we iterate over its
|
347
|
+
# elements and create Regions for each one.
|
348
|
+
# ------------------------------------------------------------------
|
349
|
+
from natural_pdf.elements.collections import ElementCollection # local import to avoid cycle
|
350
|
+
|
351
|
+
# Selector string ---------------------------------------------------
|
352
|
+
if isinstance(exclusion_func_or_region, str):
|
353
|
+
selector_str = exclusion_func_or_region
|
354
|
+
matching_elements = self.find_all(selector_str, apply_exclusions=False)
|
355
|
+
|
356
|
+
if not matching_elements:
|
357
|
+
logger.warning(
|
358
|
+
f"Page {self.index}: Selector '{selector_str}' returned no elements – no exclusions added."
|
359
|
+
)
|
360
|
+
else:
|
361
|
+
if method == "element":
|
362
|
+
# Store the actual elements for element-based exclusion
|
363
|
+
for el in matching_elements:
|
364
|
+
self._exclusions.append((el, label, method))
|
365
|
+
logger.debug(
|
366
|
+
f"Page {self.index}: Added element exclusion from selector '{selector_str}' -> {el}"
|
367
|
+
)
|
368
|
+
else: # method == "region"
|
369
|
+
for el in matching_elements:
|
370
|
+
try:
|
371
|
+
bbox_coords = (float(el.x0), float(el.top), float(el.x1), float(el.bottom))
|
372
|
+
region = Region(self, bbox_coords, label=label)
|
373
|
+
# Store directly as a Region tuple so we don't recurse endlessly
|
374
|
+
self._exclusions.append((region, label, method))
|
375
|
+
logger.debug(
|
376
|
+
f"Page {self.index}: Added exclusion region from selector '{selector_str}' -> {bbox_coords}"
|
377
|
+
)
|
378
|
+
except Exception as e:
|
379
|
+
logger.warning(
|
380
|
+
f"Page {self.index}: Failed to create exclusion region from element {el}: {e}"
|
381
|
+
)
|
382
|
+
return self # Completed processing for selector input
|
383
|
+
|
384
|
+
# ElementCollection -----------------------------------------------
|
385
|
+
if isinstance(exclusion_func_or_region, ElementCollection):
|
386
|
+
if method == "element":
|
387
|
+
# Store the actual elements for element-based exclusion
|
388
|
+
for el in exclusion_func_or_region:
|
389
|
+
self._exclusions.append((el, label, method))
|
390
|
+
logger.debug(
|
391
|
+
f"Page {self.index}: Added element exclusion from ElementCollection -> {el}"
|
392
|
+
)
|
393
|
+
else: # method == "region"
|
394
|
+
# Convert each element to a Region and add
|
395
|
+
for el in exclusion_func_or_region:
|
396
|
+
try:
|
397
|
+
if not (hasattr(el, "bbox") and len(el.bbox) == 4):
|
398
|
+
logger.warning(
|
399
|
+
f"Page {self.index}: Skipping element without bbox in ElementCollection exclusion: {el}"
|
400
|
+
)
|
401
|
+
continue
|
402
|
+
bbox_coords = tuple(float(v) for v in el.bbox)
|
403
|
+
region = Region(self, bbox_coords, label=label)
|
404
|
+
self._exclusions.append((region, label, method))
|
405
|
+
logger.debug(
|
406
|
+
f"Page {self.index}: Added exclusion region from ElementCollection element {bbox_coords}"
|
407
|
+
)
|
408
|
+
except Exception as e:
|
409
|
+
logger.warning(
|
410
|
+
f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}"
|
411
|
+
)
|
412
|
+
return self # Completed processing for ElementCollection input
|
413
|
+
|
414
|
+
# ------------------------------------------------------------------
|
415
|
+
# Existing logic (callable, Region, bbox-bearing objects)
|
416
|
+
# ------------------------------------------------------------------
|
331
417
|
exclusion_data = None # Initialize exclusion data
|
332
418
|
|
333
419
|
if callable(exclusion_func_or_region):
|
334
|
-
# Store callable functions along with their label
|
335
|
-
exclusion_data = (exclusion_func_or_region, label)
|
420
|
+
# Store callable functions along with their label and method
|
421
|
+
exclusion_data = (exclusion_func_or_region, label, method)
|
336
422
|
logger.debug(
|
337
|
-
f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
|
423
|
+
f"Page {self.index}: Added callable exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
|
338
424
|
)
|
339
425
|
elif isinstance(exclusion_func_or_region, Region):
|
340
426
|
# Store Region objects directly, assigning the label
|
341
427
|
exclusion_func_or_region.label = label # Assign label
|
342
|
-
exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
|
428
|
+
exclusion_data = (exclusion_func_or_region, label, method) # Store as tuple for consistency
|
343
429
|
logger.debug(
|
344
|
-
f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
|
430
|
+
f"Page {self.index}: Added Region exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
|
345
431
|
)
|
346
432
|
elif (
|
347
433
|
hasattr(exclusion_func_or_region, "bbox")
|
348
434
|
and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
|
349
435
|
and len(exclusion_func_or_region.bbox) == 4
|
350
436
|
):
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
# Pass the label to the Region constructor
|
355
|
-
region_to_add = Region(self, bbox_coords, label=label)
|
356
|
-
exclusion_data = (region_to_add, label) # Store as tuple
|
437
|
+
if method == "element":
|
438
|
+
# For element method, store the element directly
|
439
|
+
exclusion_data = (exclusion_func_or_region, label, method)
|
357
440
|
logger.debug(
|
358
|
-
f"Page {self.index}: Added exclusion '{label}'
|
441
|
+
f"Page {self.index}: Added element exclusion '{label}': {exclusion_func_or_region}"
|
359
442
|
)
|
360
|
-
|
361
|
-
#
|
362
|
-
|
363
|
-
|
364
|
-
|
443
|
+
else: # method == "region"
|
444
|
+
# Convert objects with a valid bbox to a Region before storing
|
445
|
+
try:
|
446
|
+
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
447
|
+
# Pass the label to the Region constructor
|
448
|
+
region_to_add = Region(self, bbox_coords, label=label)
|
449
|
+
exclusion_data = (region_to_add, label, method) # Store as tuple
|
450
|
+
logger.debug(
|
451
|
+
f"Page {self.index}: Added exclusion '{label}' with method '{method}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
|
452
|
+
)
|
453
|
+
except (ValueError, TypeError, Exception) as e:
|
454
|
+
# Raise an error if conversion fails
|
455
|
+
raise TypeError(
|
456
|
+
f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
|
457
|
+
) from e
|
365
458
|
else:
|
366
459
|
# Reject invalid types
|
367
460
|
raise TypeError(
|
368
461
|
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
|
369
462
|
)
|
370
463
|
|
371
|
-
# Append the stored data (tuple of object/callable and
|
464
|
+
# Append the stored data (tuple of object/callable, label, and method)
|
372
465
|
if exclusion_data:
|
373
466
|
self._exclusions.append(exclusion_data)
|
374
467
|
|
@@ -430,7 +523,8 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
430
523
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
|
431
524
|
"""
|
432
525
|
Get all exclusion regions for this page.
|
433
|
-
|
526
|
+
Now handles both region-based and element-based exclusions.
|
527
|
+
Assumes self._exclusions contains tuples of (callable/Region/Element, label, method).
|
434
528
|
|
435
529
|
Args:
|
436
530
|
include_callable: Whether to evaluate callable exclusion functions
|
@@ -445,8 +539,15 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
445
539
|
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
446
540
|
|
447
541
|
for i, exclusion_data in enumerate(self._exclusions):
|
448
|
-
#
|
449
|
-
|
542
|
+
# Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
|
543
|
+
if len(exclusion_data) == 2:
|
544
|
+
# Old format: (exclusion_item, label)
|
545
|
+
exclusion_item, label = exclusion_data
|
546
|
+
method = "region" # Default to region for old format
|
547
|
+
else:
|
548
|
+
# New format: (exclusion_item, label, method)
|
549
|
+
exclusion_item, label, method = exclusion_data
|
550
|
+
|
450
551
|
exclusion_label = label if label else f"exclusion {i}"
|
451
552
|
|
452
553
|
# Process callable exclusion functions
|
@@ -495,7 +596,8 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
495
596
|
regions.append(exclusion_item) # Label is already on the Region object
|
496
597
|
if debug:
|
497
598
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
498
|
-
#
|
599
|
+
# Element-based exclusions are not converted to regions here
|
600
|
+
# They will be handled separately in _filter_elements_by_exclusions
|
499
601
|
|
500
602
|
if debug:
|
501
603
|
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
|
@@ -506,14 +608,16 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
506
608
|
self, elements: List["Element"], debug_exclusions: bool = False
|
507
609
|
) -> List["Element"]:
|
508
610
|
"""
|
509
|
-
Filters a list of elements, removing those
|
611
|
+
Filters a list of elements, removing those based on exclusion rules.
|
612
|
+
Handles both region-based exclusions (exclude all in area) and
|
613
|
+
element-based exclusions (exclude only specific elements).
|
510
614
|
|
511
615
|
Args:
|
512
616
|
elements: The list of elements to filter.
|
513
617
|
debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
|
514
618
|
|
515
619
|
Returns:
|
516
|
-
A new list containing only the elements not
|
620
|
+
A new list containing only the elements not excluded.
|
517
621
|
"""
|
518
622
|
if not self._exclusions:
|
519
623
|
if debug_exclusions:
|
@@ -527,34 +631,68 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
527
631
|
include_callable=True, debug=debug_exclusions
|
528
632
|
)
|
529
633
|
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
634
|
+
# Collect element-based exclusions
|
635
|
+
excluded_elements = set() # Use set for O(1) lookup
|
636
|
+
|
637
|
+
for exclusion_data in self._exclusions:
|
638
|
+
# Handle both old format (2-tuple) and new format (3-tuple)
|
639
|
+
if len(exclusion_data) == 2:
|
640
|
+
exclusion_item, label = exclusion_data
|
641
|
+
method = "region"
|
642
|
+
else:
|
643
|
+
exclusion_item, label, method = exclusion_data
|
644
|
+
|
645
|
+
# Skip callables (already handled in _get_exclusion_regions)
|
646
|
+
if callable(exclusion_item):
|
647
|
+
continue
|
648
|
+
|
649
|
+
# Skip regions (already in exclusion_regions)
|
650
|
+
if isinstance(exclusion_item, Region):
|
651
|
+
continue
|
652
|
+
|
653
|
+
# Handle element-based exclusions
|
654
|
+
if method == "element" and hasattr(exclusion_item, "bbox"):
|
655
|
+
excluded_elements.add(id(exclusion_item))
|
656
|
+
if debug_exclusions:
|
657
|
+
print(f" - Added element exclusion: {exclusion_item}")
|
536
658
|
|
537
659
|
if debug_exclusions:
|
538
660
|
print(
|
539
|
-
f"Page {self.index}: Applying {len(exclusion_regions)}
|
661
|
+
f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
|
662
|
+
f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
|
540
663
|
)
|
541
664
|
|
542
665
|
filtered_elements = []
|
543
|
-
|
666
|
+
region_excluded_count = 0
|
667
|
+
element_excluded_count = 0
|
668
|
+
|
544
669
|
for element in elements:
|
545
670
|
exclude = False
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
671
|
+
|
672
|
+
# Check element-based exclusions first (faster)
|
673
|
+
if id(element) in excluded_elements:
|
674
|
+
exclude = True
|
675
|
+
element_excluded_count += 1
|
676
|
+
if debug_exclusions:
|
677
|
+
print(f" Element {element} excluded by element-based rule")
|
678
|
+
else:
|
679
|
+
# Check region-based exclusions
|
680
|
+
for region in exclusion_regions:
|
681
|
+
# Use the region's method to check if the element is inside
|
682
|
+
if region._is_element_in_region(element):
|
683
|
+
exclude = True
|
684
|
+
region_excluded_count += 1
|
685
|
+
if debug_exclusions:
|
686
|
+
print(f" Element {element} excluded by region {region}")
|
687
|
+
break # No need to check other regions for this element
|
688
|
+
|
552
689
|
if not exclude:
|
553
690
|
filtered_elements.append(element)
|
554
691
|
|
555
692
|
if debug_exclusions:
|
556
693
|
print(
|
557
|
-
f"Page {self.index}: Excluded {
|
694
|
+
f"Page {self.index}: Excluded {region_excluded_count} by regions, "
|
695
|
+
f"{element_excluded_count} by elements, keeping {len(filtered_elements)}."
|
558
696
|
)
|
559
697
|
|
560
698
|
return filtered_elements
|
@@ -1186,7 +1324,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1186
1324
|
return self._page.crop(bbox, **kwargs)
|
1187
1325
|
|
1188
1326
|
def extract_text(
|
1189
|
-
self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
|
1327
|
+
self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, content_filter=None, **kwargs
|
1190
1328
|
) -> str:
|
1191
1329
|
"""
|
1192
1330
|
Extract text from this page, respecting exclusions and using pdfplumber's
|
@@ -1196,6 +1334,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1196
1334
|
use_exclusions: Whether to apply exclusion regions (default: True).
|
1197
1335
|
Note: Filtering logic is now always applied if exclusions exist.
|
1198
1336
|
debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
|
1337
|
+
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
1338
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1339
|
+
- A callable that takes text and returns True to KEEP the character
|
1340
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1199
1341
|
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
1200
1342
|
`chars_to_textmap` function. Common parameters include:
|
1201
1343
|
- layout (bool): If True (default), inserts spaces/newlines.
|
@@ -1219,22 +1361,30 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1219
1361
|
logger.debug(f"Page {self.number}: No word elements found.")
|
1220
1362
|
return ""
|
1221
1363
|
|
1222
|
-
# 2.
|
1223
|
-
|
1364
|
+
# 2. Apply element-based exclusions if enabled
|
1365
|
+
if use_exclusions and self._exclusions:
|
1366
|
+
# Filter word elements through _filter_elements_by_exclusions
|
1367
|
+
# This handles both element-based and region-based exclusions
|
1368
|
+
word_elements = self._filter_elements_by_exclusions(word_elements, debug_exclusions=debug)
|
1369
|
+
if debug:
|
1370
|
+
logger.debug(f"Page {self.number}: {len(word_elements)} words remaining after exclusion filtering.")
|
1371
|
+
|
1372
|
+
# 3. Get region-based exclusions for spatial filtering
|
1373
|
+
apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
|
1224
1374
|
exclusion_regions = []
|
1225
1375
|
if apply_exclusions_flag and self._exclusions:
|
1226
1376
|
exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
|
1227
1377
|
if debug:
|
1228
|
-
logger.debug(f"Page {self.number}:
|
1378
|
+
logger.debug(f"Page {self.number}: Found {len(exclusion_regions)} region exclusions for spatial filtering.")
|
1229
1379
|
elif debug:
|
1230
1380
|
logger.debug(f"Page {self.number}: Not applying exclusions.")
|
1231
1381
|
|
1232
|
-
#
|
1382
|
+
# 4. Collect All Character Dictionaries from remaining Word Elements
|
1233
1383
|
all_char_dicts = []
|
1234
1384
|
for word in word_elements:
|
1235
1385
|
all_char_dicts.extend(getattr(word, "_char_dicts", []))
|
1236
1386
|
|
1237
|
-
#
|
1387
|
+
# 5. Spatially Filter Characters (only by regions, elements already filtered above)
|
1238
1388
|
filtered_chars = filter_chars_spatially(
|
1239
1389
|
char_dicts=all_char_dicts,
|
1240
1390
|
exclusion_regions=exclusion_regions,
|
@@ -1255,6 +1405,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1255
1405
|
elif k in getattr(self._parent, "_config", {}):
|
1256
1406
|
merged_kwargs[k] = self._parent._config[k]
|
1257
1407
|
|
1408
|
+
# Add content_filter to kwargs if provided
|
1409
|
+
if content_filter is not None:
|
1410
|
+
merged_kwargs["content_filter"] = content_filter
|
1411
|
+
|
1258
1412
|
result = generate_text_layout(
|
1259
1413
|
char_dicts=filtered_chars,
|
1260
1414
|
layout_context_bbox=page_bbox,
|
@@ -1307,6 +1461,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1307
1461
|
text_options: Optional[Dict] = None,
|
1308
1462
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1309
1463
|
show_progress: bool = False,
|
1464
|
+
content_filter=None,
|
1310
1465
|
) -> List[List[Optional[str]]]:
|
1311
1466
|
"""
|
1312
1467
|
Extract the largest table from this page using enhanced region-based extraction.
|
@@ -1320,6 +1475,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1320
1475
|
cell_extraction_func: Optional callable function that takes a cell Region object
|
1321
1476
|
and returns its string content. For 'text' method only.
|
1322
1477
|
show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
|
1478
|
+
content_filter: Optional content filter to apply during cell text extraction. Can be:
|
1479
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1480
|
+
- A callable that takes text and returns True to KEEP the character
|
1481
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1323
1482
|
|
1324
1483
|
Returns:
|
1325
1484
|
Table data as a list of rows, where each row is a list of cell values (str or None).
|
@@ -1334,6 +1493,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1334
1493
|
text_options=text_options,
|
1335
1494
|
cell_extraction_func=cell_extraction_func,
|
1336
1495
|
show_progress=show_progress,
|
1496
|
+
content_filter=content_filter,
|
1337
1497
|
)
|
1338
1498
|
|
1339
1499
|
def extract_tables(
|
natural_pdf/core/pdf.py
CHANGED
@@ -561,7 +561,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
561
561
|
return self
|
562
562
|
|
563
563
|
def add_exclusion(
|
564
|
-
self, exclusion_func
|
564
|
+
self, exclusion_func, label: str = None
|
565
565
|
) -> "PDF":
|
566
566
|
"""Add an exclusion function to the PDF.
|
567
567
|
|
@@ -607,6 +607,21 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
607
607
|
if not hasattr(self, "_pages"):
|
608
608
|
raise AttributeError("PDF pages not yet initialized.")
|
609
609
|
|
610
|
+
# ------------------------------------------------------------------
|
611
|
+
# NEW: Support selector strings and ElementCollection objects directly.
|
612
|
+
# We simply forward the same object to each page's add_exclusion which
|
613
|
+
# now knows how to interpret these inputs.
|
614
|
+
# ------------------------------------------------------------------
|
615
|
+
from natural_pdf.elements.collections import ElementCollection # local import
|
616
|
+
|
617
|
+
if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
|
618
|
+
# Store for bookkeeping
|
619
|
+
self._exclusions.append((exclusion_func, label))
|
620
|
+
for page in self._pages:
|
621
|
+
page.add_exclusion(exclusion_func, label=label)
|
622
|
+
return self
|
623
|
+
|
624
|
+
# Fallback to original callable / Region behaviour ------------------
|
610
625
|
exclusion_data = (exclusion_func, label)
|
611
626
|
self._exclusions.append(exclusion_data)
|
612
627
|
|
natural_pdf/elements/collections.py
CHANGED
@@ -369,6 +369,7 @@ class ElementCollection(
|
|
369
369
|
preserve_whitespace: bool = True,
|
370
370
|
use_exclusions: bool = True,
|
371
371
|
strip: Optional[bool] = None,
|
372
|
+
content_filter=None,
|
372
373
|
**kwargs,
|
373
374
|
) -> str:
|
374
375
|
"""
|
@@ -379,6 +380,10 @@ class ElementCollection(
|
|
379
380
|
preserve_whitespace: Deprecated. Use layout=False for simple joining.
|
380
381
|
use_exclusions: Deprecated. Exclusions should be applied *before* creating
|
381
382
|
the collection or by filtering the collection itself.
|
383
|
+
content_filter: Optional content filter to exclude specific text patterns. Can be:
|
384
|
+
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
385
|
+
- A callable that takes text and returns True to KEEP the character
|
386
|
+
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
382
387
|
**kwargs: Additional layout parameters passed directly to pdfplumber's
|
383
388
|
`chars_to_textmap` function ONLY if `layout=True` is passed.
|
384
389
|
See Page.extract_text docstring for common parameters.
|
@@ -412,6 +417,11 @@ class ElementCollection(
|
|
412
417
|
getattr(el, "text", "") for el in text_elements
|
413
418
|
) # Fallback to simple join of word text
|
414
419
|
|
420
|
+
# Apply content filtering if provided
|
421
|
+
if content_filter is not None:
|
422
|
+
from natural_pdf.utils.text_extraction import _apply_content_filter
|
423
|
+
all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
|
424
|
+
|
415
425
|
# Check if layout is requested
|
416
426
|
use_layout = kwargs.get("layout", False)
|
417
427
|
|