natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1191,6 +1191,82 @@ class FlowRegion(Visualizable):
1191
1191
 
1192
1192
  return all_tables
1193
1193
 
1194
def get_sections(
    self,
    start_elements=None,
    end_elements=None,
    new_section_on_page_break: bool = False,
    include_boundaries: str = "both",
    orientation: str = "vertical",
) -> "ElementCollection":
    """
    Split this FlowRegion into logical sections delimited by boundary elements.

    The work is done by Flow.get_sections(): a temporary Flow is assembled from
    this region's constituent regions (reusing the parent flow's arrangement,
    alignment and segment gap), so only the segments belonging to this
    FlowRegion are searched.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        new_section_on_page_break: Whether to start a new section at page boundaries
        include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
        orientation: 'vertical' (default) or 'horizontal' - determines section direction

    Returns:
        The result of Flow.get_sections() on the temporary flow (annotated as an
        ElementCollection of the extracted sections).

    Example:
        # Split a multi-page table region by headers
        table_region = flow.find("text:contains('Table 4')").below(until="text:contains('Table 5')")
        sections = table_region.get_sections(start_elements="text:bold")
    """
    # Imported here rather than at module level, as in the original code.
    from natural_pdf.flows.flow import Flow

    # Restrict the search space to this region's own segments while keeping
    # the layout settings of the flow it was cut from.
    scoped_flow = Flow(
        segments=self.constituent_regions,
        arrangement=self.flow.arrangement,
        alignment=self.flow.alignment,
        segment_gap=self.flow.segment_gap,
    )

    section_options = {
        "start_elements": start_elements,
        "end_elements": end_elements,
        "new_section_on_page_break": new_section_on_page_break,
        "include_boundaries": include_boundaries,
        "orientation": orientation,
    }
    return scoped_flow.get_sections(**section_options)
1241
+
1242
def split(
    self, by: Optional[str] = None, page_breaks: bool = True, **kwargs
) -> "ElementCollection":
    """
    Split this FlowRegion into sections.

    Convenience front-end for get_sections(): ``by`` is forwarded as
    ``start_elements`` and ``page_breaks`` as ``new_section_on_page_break``.

    Args:
        by: Selector string for elements that mark section boundaries (e.g., "text:bold")
        page_breaks: Whether to also split at page boundaries (default: True)
        **kwargs: Additional arguments passed to get_sections()

    Returns:
        Whatever get_sections() returns for the forwarded arguments.

    Example:
        # Split by bold headers
        sections = flow_region.split(by="text:bold")

        # Split only by specific text pattern, ignoring page breaks
        sections = flow_region.split(
            by="text:contains('Section')",
            page_breaks=False
        )
    """
    extract_sections = self.get_sections
    return extract_sections(
        start_elements=by,
        new_section_on_page_break=page_breaks,
        **kwargs,
    )
1269
+
1194
1270
  @property
1195
1271
  def normalized_type(self) -> Optional[str]:
1196
1272
  """
@@ -30,6 +30,7 @@ This enables powerful document navigation like:
30
30
  import ast
31
31
  import logging
32
32
  import re
33
+ from collections import Counter
33
34
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
34
35
 
35
36
  from colormath2.color_conversions import convert_color
@@ -86,6 +87,47 @@ def safe_parse_value(value_str: str) -> Any:
86
87
  return value_str
87
88
 
88
89
 
90
def _parse_aggregate_function(value_str: str) -> Optional[Dict[str, Any]]:
    """Recognise an aggregate-function value such as min(), avg() or closest("red").

    Accepted names (case-insensitive): min, max, avg, mean, median, mode,
    most_common and closest. ``mean`` is reported as ``avg`` and
    ``most_common`` as ``mode``.

    Returns:
        ``{"type": "aggregate", "func": <name>, "args": <parsed args or None>}``
        when the whole (stripped) string is a call to one of the accepted
        names, otherwise None. The argument of closest() goes through
        safe_parse_color(); any other non-empty argument goes through
        safe_parse_value().
    """
    match = re.match(
        r"^(min|max|avg|mean|median|mode|most_common|closest)\s*\((.*?)\)$",
        value_str.strip(),
        re.IGNORECASE,
    )
    if match is None:
        return None

    # Collapse aliases onto their canonical function name.
    canonical_names = {"mean": "avg", "most_common": "mode"}
    matched_name = match.group(1).lower()
    func_name = canonical_names.get(matched_name, matched_name)

    raw_args = match.group(2).strip()
    if not raw_args:
        args = None
    elif func_name == "closest":
        # closest() takes a colour reference
        args = safe_parse_color(raw_args)
    else:
        args = safe_parse_value(raw_args)

    return {"type": "aggregate", "func": func_name, "args": args}
129
+
130
+
89
131
  def safe_parse_color(value_str: str) -> tuple:
90
132
  """
91
133
  Parse a color value which could be an RGB tuple, color name, hex code, or CSS-style rgb(...)/rgba(...).
@@ -362,9 +404,14 @@ def parse_selector(selector: str) -> Dict[str, Any]:
362
404
  raise ValueError(
363
405
  f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
364
406
  )
365
- # Parse value
407
+ # Parse value - check for aggregate functions first
366
408
  parsed_value: Any
367
- if name in [
409
+ aggregate_func = _parse_aggregate_function(value_str)
410
+
411
+ if aggregate_func:
412
+ # Store aggregate function info
413
+ parsed_value = aggregate_func
414
+ elif name in [
368
415
  "color",
369
416
  "non_stroking_color",
370
417
  "fill",
@@ -564,12 +611,15 @@ PSEUDO_CLASS_FUNCTIONS = {
564
611
  }
565
612
 
566
613
 
567
- def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
614
+ def _build_filter_list(
615
+ selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
616
+ ) -> List[Dict[str, Any]]:
568
617
  """
569
618
  Convert a parsed selector to a list of named filter functions.
570
619
 
571
620
  Args:
572
621
  selector: Parsed selector dictionary
622
+ aggregates: Pre-calculated aggregate values (optional)
573
623
  **kwargs: Additional filter parameters including:
574
624
  - regex: Whether to use regex for text search
575
625
  - case: Whether to do case-sensitive text search
@@ -581,6 +631,9 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
581
631
  filters: List[Dict[str, Any]] = []
582
632
  selector_type = selector["type"]
583
633
 
634
+ if aggregates is None:
635
+ aggregates = {}
636
+
584
637
  # Filter by element type
585
638
  if selector_type != "any":
586
639
  filter_name = f"type is '{selector_type}'"
@@ -611,6 +664,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
611
664
  value = attr_filter["value"]
612
665
  python_name = name.replace("-", "_") # Convert CSS-style names
613
666
 
667
+ # Check if value is an aggregate function
668
+ if isinstance(value, dict) and value.get("type") == "aggregate":
669
+ # Use pre-calculated aggregate value
670
+ aggregate_value = aggregates.get(name)
671
+ if aggregate_value is None:
672
+ # Skip this filter if aggregate couldn't be calculated
673
+ continue
674
+ value = aggregate_value
675
+
614
676
  # --- Define the core value retrieval logic ---
615
677
  def get_element_value(
616
678
  element, name=name, python_name=python_name, selector_type=selector_type
@@ -761,15 +823,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
761
823
  )
762
824
 
763
825
  # Recursively get the filter function for the inner selector
764
- # Pass kwargs down in case regex/case flags affect the inner selector
765
- inner_filter_func = selector_to_filter_func(args, **kwargs)
826
+ # Pass kwargs and aggregates down in case regex/case flags affect the inner selector
827
+ inner_filter_func = selector_to_filter_func(args, aggregates=aggregates, **kwargs)
766
828
 
767
829
  # The filter lambda applies the inner function and inverts the result
768
830
  filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
769
831
 
770
832
  # Try to create a descriptive name (can be long)
771
833
  # Maybe simplify this later if needed
772
- inner_filter_list = _build_filter_list(args, **kwargs)
834
+ inner_filter_list = _build_filter_list(args, aggregates=aggregates, **kwargs)
773
835
  inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
774
836
  filter_name = f"pseudo-class :not({inner_filter_names})"
775
837
 
@@ -929,7 +991,113 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
929
991
  return combined_filter
930
992
 
931
993
 
932
- def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
994
def _collect_attribute_values(elements: List[Any], attr_name: str) -> List[Any]:
    """Gather the usable values of ``attr_name`` from ``elements`` (best effort)."""
    # x0/y0/x1/y1 are read from the element's bounding box rather than as
    # plain attributes.
    bbox_index = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}.get(attr_name)
    python_name = attr_name.replace("-", "_")  # CSS-style -> Python-style name

    values: List[Any] = []
    for el in elements:
        try:
            if bbox_index is not None:
                bbox = getattr(el, "_bbox", None) or getattr(el, "bbox", None)
                if bbox:
                    values.append(bbox[bbox_index])
            else:
                val = getattr(el, python_name, None)
                if val is not None:
                    values.append(val)
        except Exception:
            # An element whose attribute cannot be read is simply left out.
            continue
    return values


def _calculate_aggregates(elements: List[Any], selector: Dict[str, Any]) -> Dict[str, Any]:
    """Calculate aggregate values for a selector.

    Every attribute filter whose value is an aggregate descriptor
    (``{"type": "aggregate", "func": ..., "args": ...}``) is evaluated over
    ``elements``.

    Args:
        elements: List of elements to calculate aggregates from
        selector: Parsed selector dictionary

    Returns:
        Dict mapping attribute names to their aggregate values. An entry is
        None when no element provides a value, or when the values cannot be
        aggregated (non-numeric for avg/median, unorderable for min/max,
        unhashable for mode, closest() on a non-colour attribute). No entry is
        written for closest() without an argument or for an unknown function
        name.

    Note:
        Results are keyed by attribute name only, so if a selector applies two
        aggregate filters to the same attribute the later one overwrites the
        earlier one.
    """
    aggregates: Dict[str, Any] = {}

    for attr in selector.get("attributes", []):
        value = attr.get("value")
        if not (isinstance(value, dict) and value.get("type") == "aggregate"):
            continue

        attr_name = attr["name"]
        func_name = value["func"]
        func_args = value.get("args")

        values = _collect_attribute_values(elements, attr_name)
        if not values:
            # No valid values found, aggregate is None
            aggregates[attr_name] = None
            continue

        if func_name in ("min", "max"):
            try:
                aggregates[attr_name] = min(values) if func_name == "min" else max(values)
            except TypeError:
                # Mixed / unorderable values: degrade like avg and median do
                aggregates[attr_name] = None
        elif func_name == "avg":
            try:
                aggregates[attr_name] = sum(values) / len(values)
            except TypeError:
                # Non-numeric values
                aggregates[attr_name] = None
        elif func_name == "median":
            try:
                ordered = sorted(values)
                middle = len(ordered) // 2
                if len(ordered) % 2 == 0:
                    aggregates[attr_name] = (ordered[middle - 1] + ordered[middle]) / 2
                else:
                    aggregates[attr_name] = ordered[middle]
            except TypeError:
                # Non-numeric values
                aggregates[attr_name] = None
        elif func_name == "mode":
            try:
                # Ties resolve to the value encountered first.
                aggregates[attr_name] = Counter(values).most_common(1)[0][0]
            except TypeError:
                # Counter needs hashable values (e.g. list colours are not)
                aggregates[attr_name] = None
        elif func_name == "closest" and func_args is not None:
            # For colors, find the value with minimum distance
            if attr_name in [
                "color",
                "non_stroking_color",
                "fill",
                "stroke",
                "strokeColor",
                "fillColor",
            ]:
                min_distance = float("inf")
                closest_value = None
                for val in values:
                    try:
                        distance = _color_distance(val, func_args)
                    except Exception:
                        # Skip values that cannot be compared as colours, but
                        # let KeyboardInterrupt / SystemExit propagate.
                        continue
                    if distance < min_distance:
                        min_distance = distance
                        closest_value = val
                aggregates[attr_name] = closest_value
            else:
                # For non-colors, closest doesn't make sense
                aggregates[attr_name] = None

    return aggregates
1096
+
1097
+
1098
+ def selector_to_filter_func(
1099
+ selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
1100
+ ) -> Callable[[Any], bool]:
933
1101
  """
934
1102
  Convert a parsed selector to a single filter function.
935
1103
 
@@ -938,6 +1106,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
938
1106
 
939
1107
  Args:
940
1108
  selector: Parsed selector dictionary (single or compound OR selector)
1109
+ aggregates: Pre-calculated aggregate values (optional)
941
1110
  **kwargs: Additional filter parameters (e.g., regex, case).
942
1111
 
943
1112
  Returns:
@@ -953,7 +1122,9 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
953
1122
  # Create filter functions for each sub-selector
954
1123
  sub_filter_funcs = []
955
1124
  for sub_selector in sub_selectors:
956
- sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
1125
+ sub_filter_funcs.append(
1126
+ selector_to_filter_func(sub_selector, aggregates=aggregates, **kwargs)
1127
+ )
957
1128
 
958
1129
  if logger.isEnabledFor(logging.DEBUG):
959
1130
  logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
@@ -973,7 +1144,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
973
1144
  return or_filter
974
1145
 
975
1146
  # Handle single selectors (existing logic)
976
- filter_list = _build_filter_list(selector, **kwargs)
1147
+ filter_list = _build_filter_list(selector, aggregates=aggregates, **kwargs)
977
1148
 
978
1149
  if logger.isEnabledFor(logging.DEBUG):
979
1150
  filter_names = [f["name"] for f in filter_list]
@@ -0,0 +1,136 @@
1
+ """Monkey patches for pdfminer.six bugs.
2
+
3
+ This module contains patches for known bugs in pdfminer.six that affect
4
+ natural_pdf functionality. These patches are applied automatically when
5
+ natural_pdf is imported.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Optional, Tuple, Union
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Track if patches have been applied
14
+ _patches_applied = False
15
+
16
+ # Allow disabling patches via environment variable
17
+ import os
18
+
19
# True when NATURAL_PDF_DISABLE_PDFMINER_PATCHES is set to "1", "true" or "yes"
# (compared case-insensitively); an unset or different value leaves patches on.
DISABLE_PATCHES = os.getenv("NATURAL_PDF_DISABLE_PDFMINER_PATCHES", "").lower() in {
    "1",
    "true",
    "yes",
}
24
+
25
+
26
def _patch_color_space_bug():
    """
    Fix pdfminer.six color parsing bug for bare 'sc' commands.

    Bug: When a PDF uses 'sc' without an explicit color space (e.g., '1 1 0 sc'),
    pdfminer defaults to DeviceGray (1 component) and only reads one value,
    resulting in wrong colors.

    The patch wraps PDFPageInterpreter.do_scn. When the current non-stroking
    color space has one component but the last three operands on the argument
    stack are all numbers in the 0-1 range, they are consumed as an RGB
    triple. In every other case the original do_scn runs unchanged.

    Reference: https://github.com/jsvine/pdfplumber/issues/XXX
    (NOTE: the issue number is a placeholder that was never filled in.)

    Returns:
        True if the patch is installed (or was already installed), False if
        installing it failed for any reason (e.g. pdfminer is not importable).
    """
    try:
        import pdfminer.pdfinterp
        from pdfminer.casting import safe_rgb

        interpreter_cls = pdfminer.pdfinterp.PDFPageInterpreter

        # Save original method
        original_do_scn = interpreter_cls.do_scn

        # Never wrap our own wrapper: a repeated call would otherwise stack
        # one patched layer on top of another.
        if getattr(original_do_scn, "_natural_pdf_patched", False):
            return True

        def patched_do_scn(self):
            """Patched do_scn that handles RGB colors without explicit color space."""
            # Get expected components from current color space
            n = self.graphicstate.ncs.ncomponents

            # Special handling for DeviceGray with potential RGB values
            if n == 1 and len(self.argstack) >= 3:
                # Peek only; the stack is left untouched until we are sure
                # the operands really form an RGB colour.
                last_three = self.argstack[-3:]
                try:
                    looks_like_rgb = all(
                        isinstance(v, (int, float)) and 0 <= float(v) <= 1 for v in last_three
                    )
                    if looks_like_rgb:
                        rgb = safe_rgb(*last_three)
                        if rgb is not None:
                            # Consume the operands only after a successful
                            # conversion, so the fallback below always sees
                            # an intact argument stack.
                            self.pop(3)
                            self.graphicstate.ncolor = rgb
                            return
                except (ValueError, TypeError, AttributeError):
                    # Any error, fall back to original
                    pass

            # Use original behavior for all other cases
            return original_do_scn(self)

        patched_do_scn._natural_pdf_patched = True

        # Apply the patch
        interpreter_cls.do_scn = patched_do_scn
        logger.debug("Applied pdfminer color space bug patch")
        return True

    except Exception as e:
        logger.warning(f"Failed to apply pdfminer color patch: {e}")
        return False
90
+
91
+
92
def apply_patches():
    """Install every registered pdfminer patch once.

    Does nothing when the patches were already processed or when
    DISABLE_PATCHES is set. Each patch function reports success with a truthy
    return value; the outcome is logged and the module is then marked as
    processed, so later calls are no-ops even if a patch failed.
    """
    global _patches_applied

    if DISABLE_PATCHES or _patches_applied:
        return

    registry = [
        ("color_space_bug", _patch_color_space_bug),
        # Add more patches here as needed
    ]

    # Run the patches in registration order, remembering each outcome.
    outcomes = [(label, bool(install())) for label, install in registry]
    applied = [label for label, succeeded in outcomes if succeeded]
    failed = [label for label, succeeded in outcomes if not succeeded]

    if applied:
        logger.info(f"Applied pdfminer patches: {', '.join(applied)}")
    if failed:
        logger.warning(f"Failed to apply patches: {', '.join(failed)}")

    _patches_applied = True
119
+
120
+
121
def get_patch_status() -> dict:
    """Report the patch state.

    Returns:
        A dict with ``patches_applied`` (the module's _patches_applied flag)
        and ``pdfminer_version`` (the result of _get_pdfminer_version()).
    """
    status = {"patches_applied": _patches_applied}
    status["pdfminer_version"] = _get_pdfminer_version()
    return status
127
+
128
+
129
def _get_pdfminer_version() -> str:
    """Return pdfminer's ``__version__``.

    Yields "unknown" when the module has no ``__version__`` attribute and
    "not installed" when pdfminer cannot be imported.
    """
    version = "not installed"
    try:
        import pdfminer

        version = getattr(pdfminer, "__version__", "unknown")
    except ImportError:
        pass
    return version