natural-pdf 0.1.36__py3-none-any.whl → 0.1.37__py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -312,6 +312,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
312
312
  self,
313
313
  exclusion_func_or_region: Union[Callable[["Page"], "Region"], "Region", Any],
314
314
  label: Optional[str] = None,
315
+ method: str = "region",
315
316
  ) -> "Page":
316
317
  """
317
318
  Add an exclusion to the page. Text from these regions will be excluded from extraction.
@@ -321,54 +322,146 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
321
322
  exclusion_func_or_region: Either a callable function returning a Region,
322
323
  a Region object, or another object with a valid .bbox attribute.
323
324
  label: Optional label for this exclusion (e.g., 'header', 'footer').
325
+ method: Exclusion method - 'region' (exclude all elements in bounding box) or
326
+ 'element' (exclude only the specific elements). Default: 'region'.
324
327
 
325
328
  Returns:
326
329
  Self for method chaining
327
330
 
328
331
  Raises:
329
332
  TypeError: If a non-callable, non-Region object without a valid bbox is provided.
333
+ ValueError: If method is not 'region' or 'element'.
330
334
  """
335
+ # Validate method parameter
336
+ if method not in ("region", "element"):
337
+ raise ValueError(f"Invalid exclusion method '{method}'. Must be 'region' or 'element'.")
338
+
339
+ # ------------------------------------------------------------------
340
+ # NEW: Handle selector strings and ElementCollection instances
341
+ # ------------------------------------------------------------------
342
+ # If a user supplies a selector string (e.g. "text:bold") we resolve it
343
+ # immediately *on this page* to the matching elements and turn each into
344
+ # a Region object which is added to the internal exclusions list.
345
+ #
346
+ # Likewise, if an ElementCollection is passed we iterate over its
347
+ # elements and create Regions for each one.
348
+ # ------------------------------------------------------------------
349
+ from natural_pdf.elements.collections import ElementCollection # local import to avoid cycle
350
+
351
+ # Selector string ---------------------------------------------------
352
+ if isinstance(exclusion_func_or_region, str):
353
+ selector_str = exclusion_func_or_region
354
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
355
+
356
+ if not matching_elements:
357
+ logger.warning(
358
+ f"Page {self.index}: Selector '{selector_str}' returned no elements – no exclusions added."
359
+ )
360
+ else:
361
+ if method == "element":
362
+ # Store the actual elements for element-based exclusion
363
+ for el in matching_elements:
364
+ self._exclusions.append((el, label, method))
365
+ logger.debug(
366
+ f"Page {self.index}: Added element exclusion from selector '{selector_str}' -> {el}"
367
+ )
368
+ else: # method == "region"
369
+ for el in matching_elements:
370
+ try:
371
+ bbox_coords = (float(el.x0), float(el.top), float(el.x1), float(el.bottom))
372
+ region = Region(self, bbox_coords, label=label)
373
+ # Store directly as a Region tuple so we don't recurse endlessly
374
+ self._exclusions.append((region, label, method))
375
+ logger.debug(
376
+ f"Page {self.index}: Added exclusion region from selector '{selector_str}' -> {bbox_coords}"
377
+ )
378
+ except Exception as e:
379
+ logger.warning(
380
+ f"Page {self.index}: Failed to create exclusion region from element {el}: {e}"
381
+ )
382
+ return self # Completed processing for selector input
383
+
384
+ # ElementCollection -----------------------------------------------
385
+ if isinstance(exclusion_func_or_region, ElementCollection):
386
+ if method == "element":
387
+ # Store the actual elements for element-based exclusion
388
+ for el in exclusion_func_or_region:
389
+ self._exclusions.append((el, label, method))
390
+ logger.debug(
391
+ f"Page {self.index}: Added element exclusion from ElementCollection -> {el}"
392
+ )
393
+ else: # method == "region"
394
+ # Convert each element to a Region and add
395
+ for el in exclusion_func_or_region:
396
+ try:
397
+ if not (hasattr(el, "bbox") and len(el.bbox) == 4):
398
+ logger.warning(
399
+ f"Page {self.index}: Skipping element without bbox in ElementCollection exclusion: {el}"
400
+ )
401
+ continue
402
+ bbox_coords = tuple(float(v) for v in el.bbox)
403
+ region = Region(self, bbox_coords, label=label)
404
+ self._exclusions.append((region, label, method))
405
+ logger.debug(
406
+ f"Page {self.index}: Added exclusion region from ElementCollection element {bbox_coords}"
407
+ )
408
+ except Exception as e:
409
+ logger.warning(
410
+ f"Page {self.index}: Failed to convert ElementCollection element to Region: {e}"
411
+ )
412
+ return self # Completed processing for ElementCollection input
413
+
414
+ # ------------------------------------------------------------------
415
+ # Existing logic (callable, Region, bbox-bearing objects)
416
+ # ------------------------------------------------------------------
331
417
  exclusion_data = None # Initialize exclusion data
332
418
 
333
419
  if callable(exclusion_func_or_region):
334
- # Store callable functions along with their label
335
- exclusion_data = (exclusion_func_or_region, label)
420
+ # Store callable functions along with their label and method
421
+ exclusion_data = (exclusion_func_or_region, label, method)
336
422
  logger.debug(
337
- f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
423
+ f"Page {self.index}: Added callable exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
338
424
  )
339
425
  elif isinstance(exclusion_func_or_region, Region):
340
426
  # Store Region objects directly, assigning the label
341
427
  exclusion_func_or_region.label = label # Assign label
342
- exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
428
+ exclusion_data = (exclusion_func_or_region, label, method) # Store as tuple for consistency
343
429
  logger.debug(
344
- f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
430
+ f"Page {self.index}: Added Region exclusion '{label}' with method '{method}': {exclusion_func_or_region}"
345
431
  )
346
432
  elif (
347
433
  hasattr(exclusion_func_or_region, "bbox")
348
434
  and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
349
435
  and len(exclusion_func_or_region.bbox) == 4
350
436
  ):
351
- # Convert objects with a valid bbox to a Region before storing
352
- try:
353
- bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
354
- # Pass the label to the Region constructor
355
- region_to_add = Region(self, bbox_coords, label=label)
356
- exclusion_data = (region_to_add, label) # Store as tuple
437
+ if method == "element":
438
+ # For element method, store the element directly
439
+ exclusion_data = (exclusion_func_or_region, label, method)
357
440
  logger.debug(
358
- f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
441
+ f"Page {self.index}: Added element exclusion '{label}': {exclusion_func_or_region}"
359
442
  )
360
- except (ValueError, TypeError, Exception) as e:
361
- # Raise an error if conversion fails
362
- raise TypeError(
363
- f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
364
- ) from e
443
+ else: # method == "region"
444
+ # Convert objects with a valid bbox to a Region before storing
445
+ try:
446
+ bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
447
+ # Pass the label to the Region constructor
448
+ region_to_add = Region(self, bbox_coords, label=label)
449
+ exclusion_data = (region_to_add, label, method) # Store as tuple
450
+ logger.debug(
451
+ f"Page {self.index}: Added exclusion '{label}' with method '{method}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
452
+ )
453
+ except (ValueError, TypeError, Exception) as e:
454
+ # Raise an error if conversion fails
455
+ raise TypeError(
456
+ f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
457
+ ) from e
365
458
  else:
366
459
  # Reject invalid types
367
460
  raise TypeError(
368
461
  f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
369
462
  )
370
463
 
371
- # Append the stored data (tuple of object/callable and label)
464
+ # Append the stored data (tuple of object/callable, label, and method)
372
465
  if exclusion_data:
373
466
  self._exclusions.append(exclusion_data)
374
467
 
@@ -430,7 +523,8 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
430
523
  def _get_exclusion_regions(self, include_callable=True, debug=False) -> List["Region"]:
431
524
  """
432
525
  Get all exclusion regions for this page.
433
- Assumes self._exclusions contains tuples of (callable/Region, label).
526
+ Now handles both region-based and element-based exclusions.
527
+ Assumes self._exclusions contains tuples of (callable/Region/Element, label, method).
434
528
 
435
529
  Args:
436
530
  include_callable: Whether to evaluate callable exclusion functions
@@ -445,8 +539,15 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
445
539
  print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
446
540
 
447
541
  for i, exclusion_data in enumerate(self._exclusions):
448
- # Unpack the exclusion object/callable and its label
449
- exclusion_item, label = exclusion_data
542
+ # Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
543
+ if len(exclusion_data) == 2:
544
+ # Old format: (exclusion_item, label)
545
+ exclusion_item, label = exclusion_data
546
+ method = "region" # Default to region for old format
547
+ else:
548
+ # New format: (exclusion_item, label, method)
549
+ exclusion_item, label, method = exclusion_data
550
+
450
551
  exclusion_label = label if label else f"exclusion {i}"
451
552
 
452
553
  # Process callable exclusion functions
@@ -495,7 +596,8 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
495
596
  regions.append(exclusion_item) # Label is already on the Region object
496
597
  if debug:
497
598
  print(f" - Added direct region '{label}': {exclusion_item}")
498
- # No else needed, add_exclusion should prevent invalid types
599
+ # Element-based exclusions are not converted to regions here
600
+ # They will be handled separately in _filter_elements_by_exclusions
499
601
 
500
602
  if debug:
501
603
  print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
@@ -506,14 +608,16 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
506
608
  self, elements: List["Element"], debug_exclusions: bool = False
507
609
  ) -> List["Element"]:
508
610
  """
509
- Filters a list of elements, removing those within the page's exclusion regions.
611
+ Filters a list of elements, removing those based on exclusion rules.
612
+ Handles both region-based exclusions (exclude all in area) and
613
+ element-based exclusions (exclude only specific elements).
510
614
 
511
615
  Args:
512
616
  elements: The list of elements to filter.
513
617
  debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
514
618
 
515
619
  Returns:
516
- A new list containing only the elements not falling within any exclusion region.
620
+ A new list containing only the elements not excluded.
517
621
  """
518
622
  if not self._exclusions:
519
623
  if debug_exclusions:
@@ -527,34 +631,68 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
527
631
  include_callable=True, debug=debug_exclusions
528
632
  )
529
633
 
530
- if not exclusion_regions:
531
- if debug_exclusions:
532
- print(
533
- f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements."
534
- )
535
- return elements
634
+ # Collect element-based exclusions
635
+ excluded_elements = set() # Use set for O(1) lookup
636
+
637
+ for exclusion_data in self._exclusions:
638
+ # Handle both old format (2-tuple) and new format (3-tuple)
639
+ if len(exclusion_data) == 2:
640
+ exclusion_item, label = exclusion_data
641
+ method = "region"
642
+ else:
643
+ exclusion_item, label, method = exclusion_data
644
+
645
+ # Skip callables (already handled in _get_exclusion_regions)
646
+ if callable(exclusion_item):
647
+ continue
648
+
649
+ # Skip regions (already in exclusion_regions)
650
+ if isinstance(exclusion_item, Region):
651
+ continue
652
+
653
+ # Handle element-based exclusions
654
+ if method == "element" and hasattr(exclusion_item, "bbox"):
655
+ excluded_elements.add(id(exclusion_item))
656
+ if debug_exclusions:
657
+ print(f" - Added element exclusion: {exclusion_item}")
536
658
 
537
659
  if debug_exclusions:
538
660
  print(
539
- f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements."
661
+ f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
662
+ f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
540
663
  )
541
664
 
542
665
  filtered_elements = []
543
- excluded_count = 0
666
+ region_excluded_count = 0
667
+ element_excluded_count = 0
668
+
544
669
  for element in elements:
545
670
  exclude = False
546
- for region in exclusion_regions:
547
- # Use the region's method to check if the element is inside
548
- if region._is_element_in_region(element):
549
- exclude = True
550
- excluded_count += 1
551
- break # No need to check other regions for this element
671
+
672
+ # Check element-based exclusions first (faster)
673
+ if id(element) in excluded_elements:
674
+ exclude = True
675
+ element_excluded_count += 1
676
+ if debug_exclusions:
677
+ print(f" Element {element} excluded by element-based rule")
678
+ else:
679
+ # Check region-based exclusions
680
+ for region in exclusion_regions:
681
+ # Use the region's method to check if the element is inside
682
+ if region._is_element_in_region(element):
683
+ exclude = True
684
+ region_excluded_count += 1
685
+ if debug_exclusions:
686
+ print(f" Element {element} excluded by region {region}")
687
+ break # No need to check other regions for this element
688
+
552
689
  if not exclude:
553
690
  filtered_elements.append(element)
554
691
 
555
692
  if debug_exclusions:
556
693
  print(
557
- f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}."
694
+ f"Page {self.index}: Excluded {region_excluded_count} by regions, "
695
+ f"{element_excluded_count} by elements, keeping {len(filtered_elements)}."
558
696
  )
559
697
 
560
698
  return filtered_elements
@@ -1186,7 +1324,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1186
1324
  return self._page.crop(bbox, **kwargs)
1187
1325
 
1188
1326
  def extract_text(
1189
- self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
1327
+ self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, content_filter=None, **kwargs
1190
1328
  ) -> str:
1191
1329
  """
1192
1330
  Extract text from this page, respecting exclusions and using pdfplumber's
@@ -1196,6 +1334,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1196
1334
  use_exclusions: Whether to apply exclusion regions (default: True).
1197
1335
  Note: Filtering logic is now always applied if exclusions exist.
1198
1336
  debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
1337
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
1338
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
1339
+ - A callable that takes text and returns True to KEEP the character
1340
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1199
1341
  **kwargs: Additional layout parameters passed directly to pdfplumber's
1200
1342
  `chars_to_textmap` function. Common parameters include:
1201
1343
  - layout (bool): If True (default), inserts spaces/newlines.
@@ -1219,22 +1361,30 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1219
1361
  logger.debug(f"Page {self.number}: No word elements found.")
1220
1362
  return ""
1221
1363
 
1222
- # 2. Get Exclusions
1223
- apply_exclusions_flag = kwargs.get("use_exclusions", True)
1364
+ # 2. Apply element-based exclusions if enabled
1365
+ if use_exclusions and self._exclusions:
1366
+ # Filter word elements through _filter_elements_by_exclusions
1367
+ # This handles both element-based and region-based exclusions
1368
+ word_elements = self._filter_elements_by_exclusions(word_elements, debug_exclusions=debug)
1369
+ if debug:
1370
+ logger.debug(f"Page {self.number}: {len(word_elements)} words remaining after exclusion filtering.")
1371
+
1372
+ # 3. Get region-based exclusions for spatial filtering
1373
+ apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
1224
1374
  exclusion_regions = []
1225
1375
  if apply_exclusions_flag and self._exclusions:
1226
1376
  exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
1227
1377
  if debug:
1228
- logger.debug(f"Page {self.number}: Applying {len(exclusion_regions)} exclusions.")
1378
+ logger.debug(f"Page {self.number}: Found {len(exclusion_regions)} region exclusions for spatial filtering.")
1229
1379
  elif debug:
1230
1380
  logger.debug(f"Page {self.number}: Not applying exclusions.")
1231
1381
 
1232
- # 3. Collect All Character Dictionaries from Word Elements
1382
+ # 4. Collect All Character Dictionaries from remaining Word Elements
1233
1383
  all_char_dicts = []
1234
1384
  for word in word_elements:
1235
1385
  all_char_dicts.extend(getattr(word, "_char_dicts", []))
1236
1386
 
1237
- # 4. Spatially Filter Characters
1387
+ # 5. Spatially Filter Characters (only by regions, elements already filtered above)
1238
1388
  filtered_chars = filter_chars_spatially(
1239
1389
  char_dicts=all_char_dicts,
1240
1390
  exclusion_regions=exclusion_regions,
@@ -1255,6 +1405,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1255
1405
  elif k in getattr(self._parent, "_config", {}):
1256
1406
  merged_kwargs[k] = self._parent._config[k]
1257
1407
 
1408
+ # Add content_filter to kwargs if provided
1409
+ if content_filter is not None:
1410
+ merged_kwargs["content_filter"] = content_filter
1411
+
1258
1412
  result = generate_text_layout(
1259
1413
  char_dicts=filtered_chars,
1260
1414
  layout_context_bbox=page_bbox,
@@ -1307,6 +1461,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1307
1461
  text_options: Optional[Dict] = None,
1308
1462
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1309
1463
  show_progress: bool = False,
1464
+ content_filter=None,
1310
1465
  ) -> List[List[Optional[str]]]:
1311
1466
  """
1312
1467
  Extract the largest table from this page using enhanced region-based extraction.
@@ -1320,6 +1475,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1320
1475
  cell_extraction_func: Optional callable function that takes a cell Region object
1321
1476
  and returns its string content. For 'text' method only.
1322
1477
  show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
1478
+ content_filter: Optional content filter to apply during cell text extraction. Can be:
1479
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
1480
+ - A callable that takes text and returns True to KEEP the character
1481
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1323
1482
 
1324
1483
  Returns:
1325
1484
  Table data as a list of rows, where each row is a list of cell values (str or None).
@@ -1334,6 +1493,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1334
1493
  text_options=text_options,
1335
1494
  cell_extraction_func=cell_extraction_func,
1336
1495
  show_progress=show_progress,
1496
+ content_filter=content_filter,
1337
1497
  )
1338
1498
 
1339
1499
  def extract_tables(
natural_pdf/core/pdf.py CHANGED
@@ -561,7 +561,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
561
561
  return self
562
562
 
563
563
  def add_exclusion(
564
- self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
564
+ self, exclusion_func, label: str = None
565
565
  ) -> "PDF":
566
566
  """Add an exclusion function to the PDF.
567
567
 
@@ -607,6 +607,21 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
607
607
  if not hasattr(self, "_pages"):
608
608
  raise AttributeError("PDF pages not yet initialized.")
609
609
 
610
+ # ------------------------------------------------------------------
611
+ # NEW: Support selector strings and ElementCollection objects directly.
612
+ # We simply forward the same object to each page's add_exclusion which
613
+ # now knows how to interpret these inputs.
614
+ # ------------------------------------------------------------------
615
+ from natural_pdf.elements.collections import ElementCollection # local import
616
+
617
+ if isinstance(exclusion_func, str) or isinstance(exclusion_func, ElementCollection):
618
+ # Store for bookkeeping
619
+ self._exclusions.append((exclusion_func, label))
620
+ for page in self._pages:
621
+ page.add_exclusion(exclusion_func, label=label)
622
+ return self
623
+
624
+ # Fallback to original callable / Region behaviour ------------------
610
625
  exclusion_data = (exclusion_func, label)
611
626
  self._exclusions.append(exclusion_data)
612
627
 
natural_pdf/elements/collections.py CHANGED
@@ -369,6 +369,7 @@ class ElementCollection(
369
369
  preserve_whitespace: bool = True,
370
370
  use_exclusions: bool = True,
371
371
  strip: Optional[bool] = None,
372
+ content_filter=None,
372
373
  **kwargs,
373
374
  ) -> str:
374
375
  """
@@ -379,6 +380,10 @@ class ElementCollection(
379
380
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
380
381
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
381
382
  the collection or by filtering the collection itself.
383
+ content_filter: Optional content filter to exclude specific text patterns. Can be:
384
+ - A regex pattern string (characters matching the pattern are EXCLUDED)
385
+ - A callable that takes text and returns True to KEEP the character
386
+ - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
382
387
  **kwargs: Additional layout parameters passed directly to pdfplumber's
383
388
  `chars_to_textmap` function ONLY if `layout=True` is passed.
384
389
  See Page.extract_text docstring for common parameters.
@@ -412,6 +417,11 @@ class ElementCollection(
412
417
  getattr(el, "text", "") for el in text_elements
413
418
  ) # Fallback to simple join of word text
414
419
 
420
+ # Apply content filtering if provided
421
+ if content_filter is not None:
422
+ from natural_pdf.utils.text_extraction import _apply_content_filter
423
+ all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
424
+
415
425
  # Check if layout is requested
416
426
  use_layout = kwargs.get("layout", False)
417
427