natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import base64
2
2
  import concurrent.futures # Added import
3
+ import contextlib
3
4
  import hashlib
4
5
  import io
5
6
  import json
@@ -30,6 +31,7 @@ from tqdm.auto import tqdm # Added tqdm import
30
31
  from natural_pdf.elements.element_collection import ElementCollection
31
32
  from natural_pdf.elements.region import Region
32
33
  from natural_pdf.selectors.parser import parse_selector
34
+ from natural_pdf.tables.result import TableResult
33
35
  from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
34
36
  from natural_pdf.utils.visualization import render_plain_page
35
37
 
@@ -274,6 +276,9 @@ class Page(
274
276
  self._load_elements()
275
277
  self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
276
278
 
279
+ # Flag to prevent infinite recursion when computing exclusions
280
+ self._computing_exclusions = False
281
+
277
282
  def _get_render_specs(
278
283
  self,
279
284
  mode: Literal["show", "render"] = "show",
@@ -411,6 +416,35 @@ class Page(
411
416
  self._exclusions = []
412
417
  return self
413
418
 
419
+ @contextlib.contextmanager
420
+ def without_exclusions(self):
421
+ """
422
+ Context manager that temporarily disables exclusion processing.
423
+
424
+ This prevents infinite recursion when exclusion callables themselves
425
+ use find() operations. While in this context, all find operations
426
+ will skip exclusion filtering.
427
+
428
+ Example:
429
+ ```python
430
+ # This exclusion would normally cause infinite recursion:
431
+ page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())
432
+
433
+ # But internally, it's safe because we use:
434
+ with page.without_exclusions():
435
+ region = exclusion_callable(page)
436
+ ```
437
+
438
+ Yields:
439
+ The page object with exclusions temporarily disabled.
440
+ """
441
+ old_value = self._computing_exclusions
442
+ self._computing_exclusions = True
443
+ try:
444
+ yield self
445
+ finally:
446
+ self._computing_exclusions = old_value
447
+
414
448
  def add_exclusion(
415
449
  self,
416
450
  exclusion_func_or_region: Union[
@@ -758,15 +792,10 @@ class Page(
758
792
  if debug:
759
793
  print(f" - Evaluating callable '{exclusion_label}'...")
760
794
 
761
- # Temporarily clear exclusions (consider if really needed)
762
- temp_original_exclusions = self._exclusions
763
- self._exclusions = []
764
-
765
- # Call the function - Expects it to return a Region or None
766
- region_result = exclusion_item(self)
767
-
768
- # Restore exclusions
769
- self._exclusions = temp_original_exclusions
795
+ # Use context manager to prevent infinite recursion
796
+ with self.without_exclusions():
797
+ # Call the function - Expects it to return a Region or None
798
+ region_result = exclusion_item(self)
770
799
 
771
800
  if isinstance(region_result, Region):
772
801
  # Assign the label to the returned region
@@ -866,26 +895,33 @@ class Page(
866
895
  if debug:
867
896
  print(f" - Added direct region '{label}': {exclusion_item}")
868
897
 
869
- # Process direct Element objects - convert to Region
898
+ # Process direct Element objects - only convert to Region if method is "region"
870
899
  elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
871
- try:
872
- # Convert Element to Region using expand()
873
- expanded_region = exclusion_item.expand()
874
- if isinstance(expanded_region, Region):
875
- expanded_region.label = label
876
- regions.append(expanded_region)
877
- if debug:
878
- print(
879
- f" - Converted direct Element to Region '{label}': {expanded_region}"
880
- )
881
- else:
900
+ if method == "region":
901
+ try:
902
+ # Convert Element to Region using expand()
903
+ expanded_region = exclusion_item.expand()
904
+ if isinstance(expanded_region, Region):
905
+ expanded_region.label = label
906
+ regions.append(expanded_region)
907
+ if debug:
908
+ print(
909
+ f" - Converted direct Element to Region '{label}': {expanded_region}"
910
+ )
911
+ else:
912
+ if debug:
913
+ print(
914
+ f" - Element.expand() did not return a Region: {type(expanded_region)}"
915
+ )
916
+ except Exception as e:
882
917
  if debug:
883
- print(
884
- f" - Element.expand() did not return a Region: {type(expanded_region)}"
885
- )
886
- except Exception as e:
918
+ print(f" - Failed to convert Element to Region: {e}")
919
+ else:
920
+ # method == "element" - will be handled in _filter_elements_by_exclusions
887
921
  if debug:
888
- print(f" - Failed to convert Element to Region: {e}")
922
+ print(
923
+ f" - Skipping element '{label}' (will be handled as element-based exclusion)"
924
+ )
889
925
 
890
926
  # Process string selectors (from PDF-level exclusions)
891
927
  elif isinstance(exclusion_item, str):
@@ -939,6 +975,11 @@ class Page(
939
975
  Returns:
940
976
  A new list containing only the elements not excluded.
941
977
  """
978
+ # Skip exclusion filtering if we're currently computing exclusions
979
+ # This prevents infinite recursion when exclusion callables use find operations
980
+ if self._computing_exclusions:
981
+ return elements
982
+
942
983
  # Check both page-level and PDF-level exclusions
943
984
  has_page_exclusions = bool(self._exclusions)
944
985
  has_pdf_exclusions = (
@@ -1245,15 +1286,46 @@ class Page(
1245
1286
  Returns:
1246
1287
  ElementCollection of matching elements (unfiltered by exclusions)
1247
1288
  """
1248
- from natural_pdf.selectors.parser import selector_to_filter_func
1289
+ from natural_pdf.selectors.parser import _calculate_aggregates, selector_to_filter_func
1249
1290
 
1250
1291
  # Handle compound OR selectors
1251
1292
  if selector_obj.get("type") == "or":
1252
1293
  # For OR selectors, search all elements and let the filter function decide
1253
1294
  elements_to_search = self._element_mgr.get_all_elements()
1254
1295
 
1296
+ # Check if any sub-selector contains aggregate functions
1297
+ has_aggregates = False
1298
+ for sub_selector in selector_obj.get("selectors", []):
1299
+ for attr in sub_selector.get("attributes", []):
1300
+ value = attr.get("value")
1301
+ if isinstance(value, dict) and value.get("type") == "aggregate":
1302
+ has_aggregates = True
1303
+ break
1304
+ if has_aggregates:
1305
+ break
1306
+
1307
+ # Calculate aggregates if needed - for OR selectors we calculate on ALL elements
1308
+ aggregates = {}
1309
+ if has_aggregates:
1310
+ # Need to calculate aggregates for each sub-selector type
1311
+ for sub_selector in selector_obj.get("selectors", []):
1312
+ sub_type = sub_selector.get("type", "any").lower()
1313
+ if sub_type == "text":
1314
+ sub_elements = self._element_mgr.words
1315
+ elif sub_type == "rect":
1316
+ sub_elements = self._element_mgr.rects
1317
+ elif sub_type == "line":
1318
+ sub_elements = self._element_mgr.lines
1319
+ elif sub_type == "region":
1320
+ sub_elements = self._element_mgr.regions
1321
+ else:
1322
+ sub_elements = elements_to_search
1323
+
1324
+ sub_aggregates = _calculate_aggregates(sub_elements, sub_selector)
1325
+ aggregates.update(sub_aggregates)
1326
+
1255
1327
  # Create filter function from compound selector
1256
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1328
+ filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
1257
1329
 
1258
1330
  # Apply the filter to all elements
1259
1331
  matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1309,8 +1381,23 @@ class Page(
1309
1381
  else:
1310
1382
  elements_to_search = self._element_mgr.get_all_elements()
1311
1383
 
1384
+ # Check if selector contains aggregate functions
1385
+ has_aggregates = False
1386
+ for attr in selector_obj.get("attributes", []):
1387
+ value = attr.get("value")
1388
+ if isinstance(value, dict) and value.get("type") == "aggregate":
1389
+ has_aggregates = True
1390
+ break
1391
+
1392
+ # Calculate aggregates if needed
1393
+ aggregates = {}
1394
+ if has_aggregates:
1395
+ # For aggregates, we need to calculate based on ALL elements of the same type
1396
+ # not just the filtered subset
1397
+ aggregates = _calculate_aggregates(elements_to_search, selector_obj)
1398
+
1312
1399
  # Create filter function from selector, passing any additional parameters
1313
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1400
+ filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
1314
1401
 
1315
1402
  # Apply the filter to matching elements
1316
1403
  matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1857,7 +1944,9 @@ class Page(
1857
1944
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1858
1945
  show_progress: bool = False,
1859
1946
  content_filter=None,
1860
- ) -> List[List[Optional[str]]]:
1947
+ verticals: Optional[List[float]] = None,
1948
+ horizontals: Optional[List[float]] = None,
1949
+ ) -> TableResult:
1861
1950
  """
1862
1951
  Extract the largest table from this page using enhanced region-based extraction.
1863
1952
 
@@ -1874,9 +1963,11 @@ class Page(
1874
1963
  - A regex pattern string (characters matching the pattern are EXCLUDED)
1875
1964
  - A callable that takes text and returns True to KEEP the character
1876
1965
  - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1966
+ verticals: Optional list of x-coordinates for explicit vertical table lines.
1967
+ horizontals: Optional list of y-coordinates for explicit horizontal table lines.
1877
1968
 
1878
1969
  Returns:
1879
- Table data as a list of rows, where each row is a list of cell values (str or None).
1970
+ TableResult: A sequence-like object containing table rows that also provides .to_df() for pandas conversion.
1880
1971
  """
1881
1972
  # Create a full-page region and delegate to its enhanced extract_table method
1882
1973
  page_region = self.create_region(0, 0, self.width, self.height)
@@ -1889,6 +1980,8 @@ class Page(
1889
1980
  cell_extraction_func=cell_extraction_func,
1890
1981
  show_progress=show_progress,
1891
1982
  content_filter=content_filter,
1983
+ verticals=verticals,
1984
+ horizontals=horizontals,
1892
1985
  )
1893
1986
 
1894
1987
  def extract_tables(
@@ -2768,6 +2861,7 @@ class Page(
2768
2861
  region.start_element = current_start_element
2769
2862
  region.end_element = end_boundary_el # Mark the element that ended it
2770
2863
  region.is_end_next_start = True # Mark how it ended
2864
+ region._boundary_exclusions = include_boundaries
2771
2865
  regions.append(region)
2772
2866
  else: # horizontal
2773
2867
  sec_left = (
@@ -2787,6 +2881,7 @@ class Page(
2787
2881
  region.start_element = current_start_element
2788
2882
  region.end_element = end_boundary_el # Mark the element that ended it
2789
2883
  region.is_end_next_start = True # Mark how it ended
2884
+ region._boundary_exclusions = include_boundaries
2790
2885
  regions.append(region)
2791
2886
  active_section_started = False # Reset for the new start
2792
2887
 
@@ -2815,6 +2910,7 @@ class Page(
2815
2910
  region.start_element = current_start_element
2816
2911
  region.end_element = end_boundary_el
2817
2912
  region.is_end_next_start = False
2913
+ region._boundary_exclusions = include_boundaries
2818
2914
  regions.append(region)
2819
2915
  else: # horizontal
2820
2916
  sec_left = (
@@ -2834,6 +2930,7 @@ class Page(
2834
2930
  region.start_element = current_start_element
2835
2931
  region.end_element = end_boundary_el
2836
2932
  region.is_end_next_start = False
2933
+ region._boundary_exclusions = include_boundaries
2837
2934
  regions.append(region)
2838
2935
 
2839
2936
  # Reset: section ended explicitly
@@ -2854,6 +2951,7 @@ class Page(
2854
2951
  region.start_element = current_start_element
2855
2952
  region.end_element = None # Ended by page end
2856
2953
  region.is_end_next_start = False
2954
+ region._boundary_exclusions = include_boundaries
2857
2955
  regions.append(region)
2858
2956
  else: # horizontal
2859
2957
  sec_left = (
@@ -2867,6 +2965,7 @@ class Page(
2867
2965
  region.start_element = current_start_element
2868
2966
  region.end_element = None # Ended by page end
2869
2967
  region.is_end_next_start = False
2968
+ region._boundary_exclusions = include_boundaries
2870
2969
  regions.append(region)
2871
2970
 
2872
2971
  return ElementCollection(regions)
@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
789
789
  start_element.page,
790
790
  (0, top, start_element.page.width, bottom),
791
791
  )
792
+ section._boundary_exclusions = include_boundaries
792
793
  else: # horizontal
793
794
  left = start_element.x0
794
795
  right = end_element.x1
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
821
822
  start_element.page,
822
823
  (left, 0, right, start_element.page.height),
823
824
  )
825
+ section._boundary_exclusions = include_boundaries
824
826
  section.start_element = start_element
825
827
  section.boundary_element_found = end_element
826
828
  else:
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
865
867
  start_element.page, (0, top, start_element.page.width, bottom)
866
868
  )
867
869
  section.start_element = start_element
870
+ section.end_element = (
871
+ next_start # The next start is the end of this section
872
+ )
873
+ section._boundary_exclusions = include_boundaries
868
874
  sections.append(section)
869
875
  else: # horizontal
870
876
  # Determine horizontal bounds
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
882
888
  start_element.page, (left, 0, right, start_element.page.height)
883
889
  )
884
890
  section.start_element = start_element
891
+ section.end_element = (
892
+ next_start # The next start is the end of this section
893
+ )
894
+ section._boundary_exclusions = include_boundaries
885
895
  sections.append(section)
886
896
  else:
887
897
  # Cross-page section - create from current_start to the end of its page
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
982
992
 
983
993
  return ElementCollection(sections)
984
994
 
995
+ def split(self, divider, **kwargs) -> "ElementCollection[Region]":
996
+ """
997
+ Divide this page collection into sections based on the provided divider elements.
998
+
999
+ Args:
1000
+ divider: Elements or selector string that mark section boundaries
1001
+ **kwargs: Additional parameters passed to get_sections()
1002
+ - include_boundaries: How to include boundary elements (default: 'start')
1003
+ - orientation: 'vertical' or 'horizontal' (default: 'vertical')
1004
+ - new_section_on_page_break: Whether to split at page boundaries (default: False)
1005
+
1006
+ Returns:
1007
+ ElementCollection of Region objects representing the sections
1008
+
1009
+ Example:
1010
+ # Split a PDF by chapter titles
1011
+ chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")
1012
+
1013
+ # Split by page breaks
1014
+ page_sections = pdf.pages.split(None, new_section_on_page_break=True)
1015
+
1016
+ # Split multi-page document by section headers
1017
+ sections = pdf.pages[10:20].split("text:bold:contains('Section')")
1018
+ """
1019
+ # Default to 'start' boundaries for split (include divider at start of each section)
1020
+ if "include_boundaries" not in kwargs:
1021
+ kwargs["include_boundaries"] = "start"
1022
+
1023
+ sections = self.get_sections(start_elements=divider, **kwargs)
1024
+
1025
+ # Add initial section if there's content before the first divider
1026
+ if sections and divider is not None:
1027
+ # Get all elements across all pages
1028
+ all_elements = []
1029
+ for page in self.pages:
1030
+ all_elements.extend(page.get_elements())
1031
+
1032
+ if all_elements:
1033
+ # Find first divider
1034
+ if isinstance(divider, str):
1035
+ # Search for first matching element
1036
+ first_divider = None
1037
+ for page in self.pages:
1038
+ match = page.find(divider)
1039
+ if match:
1040
+ first_divider = match
1041
+ break
1042
+ else:
1043
+ # divider is already elements
1044
+ first_divider = divider[0] if hasattr(divider, "__getitem__") else divider
1045
+
1046
+ if first_divider and all_elements[0] != first_divider:
1047
+ # There's content before the first divider
1048
+ # Get section from start to first divider
1049
+ initial_sections = self.get_sections(
1050
+ start_elements=None,
1051
+ end_elements=[first_divider],
1052
+ include_boundaries="none",
1053
+ orientation=kwargs.get("orientation", "vertical"),
1054
+ )
1055
+ if initial_sections:
1056
+ sections = ElementCollection([initial_sections[0]] + list(sections))
1057
+
1058
+ return sections
1059
+
985
1060
  def _gather_analysis_data(
986
1061
  self,
987
1062
  analysis_keys: List[str],
natural_pdf/core/pdf.py CHANGED
@@ -1333,6 +1333,39 @@ class PDF(
1333
1333
  orientation=orientation,
1334
1334
  )
1335
1335
 
1336
+ def split(self, divider, **kwargs) -> "ElementCollection":
1337
+ """
1338
+ Divide the PDF into sections based on the provided divider elements.
1339
+
1340
+ Args:
1341
+ divider: Elements or selector string that mark section boundaries
1342
+ **kwargs: Additional parameters passed to get_sections()
1343
+ - include_boundaries: How to include boundary elements (default: 'start')
1344
+ - orientation: 'vertical' or 'horizontal' (default: 'vertical')
1345
+ - new_section_on_page_break: Whether to split at page boundaries (default: False)
1346
+
1347
+ Returns:
1348
+ ElementCollection of Region objects representing the sections
1349
+
1350
+ Example:
1351
+ # Split a PDF by chapter titles
1352
+ chapters = pdf.split("text[size>20]:contains('Chapter')")
1353
+
1354
+ # Export each chapter to a separate file
1355
+ for i, chapter in enumerate(chapters):
1356
+ chapter_text = chapter.extract_text()
1357
+ with open(f"chapter_{i+1}.txt", "w") as f:
1358
+ f.write(chapter_text)
1359
+
1360
+ # Split by horizontal rules/lines
1361
+ sections = pdf.split("line[orientation=horizontal]")
1362
+
1363
+ # Split only by page breaks (no divider elements)
1364
+ pages = pdf.split(None, new_section_on_page_break=True)
1365
+ """
1366
+ # Delegate to pages collection
1367
+ return self.pages.split(divider, **kwargs)
1368
+
1336
1369
  def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
1337
1370
  """
1338
1371
  DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
272
272
  "font_family",
273
273
  "font_variant",
274
274
  "size",
275
- "bold",
276
- "italic",
277
- "strike",
278
- "underline",
279
- "highlight",
275
+ "styles",
280
276
  "source",
281
277
  "confidence",
278
+ "color",
282
279
  ]
283
280
  )
284
- # Add foreground text colour too
285
- columns.append("color")
286
281
  elif element_type == "rect":
287
282
  columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
288
283
  elif element_type == "line":
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
358
353
  return str(col_val)
359
354
  return ""
360
355
 
356
+ elif column == "styles":
357
+ # Collect all active text decorations
358
+ styles = []
359
+
360
+ if getattr(element, "bold", False):
361
+ styles.append("bold")
362
+ if getattr(element, "italic", False):
363
+ styles.append("italic")
364
+ if getattr(element, "strike", False):
365
+ styles.append("strike")
366
+ if getattr(element, "underline", False):
367
+ styles.append("underline")
368
+
369
+ # Handle highlight specially - include color if not default yellow
370
+ if getattr(element, "is_highlighted", False):
371
+ highlight_color = getattr(element, "highlight_color", None)
372
+ if highlight_color is not None:
373
+ # Convert color to hex if needed
374
+ if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
375
+ try:
376
+ r, g, b = [
377
+ int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
378
+ ]
379
+ hex_color = f"#{r:02x}{g:02x}{b:02x}"
380
+ styles.append(f"highlight({hex_color})")
381
+ except Exception:
382
+ styles.append("highlight")
383
+ elif isinstance(highlight_color, (int, float)):
384
+ # Grayscale value
385
+ try:
386
+ gray = (
387
+ int(highlight_color * 255)
388
+ if highlight_color <= 1
389
+ else int(highlight_color)
390
+ )
391
+ hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
392
+ styles.append(f"highlight({hex_color})")
393
+ except Exception:
394
+ styles.append("highlight")
395
+ else:
396
+ styles.append("highlight")
397
+ else:
398
+ styles.append("highlight")
399
+
400
+ return ", ".join(styles) if styles else ""
401
+
361
402
  elif column in ["stroke", "fill", "color"]:
362
403
  value = getattr(element, column, None)
363
404
  # If already a string (e.g. '#ff00aa' or 'red') return as is