natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +48 -46
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
natural_pdf/flows/region.py
CHANGED
@@ -1191,6 +1191,82 @@ class FlowRegion(Visualizable):
|
|
1191
1191
|
|
1192
1192
|
return all_tables
|
1193
1193
|
|
1194
|
+
def get_sections(
    self,
    start_elements=None,
    end_elements=None,
    new_section_on_page_break: bool = False,
    include_boundaries: str = "both",
    orientation: str = "vertical",
) -> "ElementCollection":
    """
    Split this FlowRegion into logical sections delimited by boundary elements.

    The work is handed to ``Flow.get_sections()``. A throwaway ``Flow`` is built
    whose segments are this region's ``constituent_regions`` and whose
    arrangement, alignment and segment gap are copied from ``self.flow``, so
    only the segments belonging to this FlowRegion are searched.

    Args:
        start_elements: Elements or selector string marking where sections start
        end_elements: Elements or selector string marking where sections end
        new_section_on_page_break: Whether a page boundary also starts a new section
        include_boundaries: Which boundary elements to keep in each section:
            'start', 'end', 'both', or 'none'
        orientation: 'vertical' (default) or 'horizontal' section direction

    Returns:
        The result of ``Flow.get_sections()`` on the temporary flow
        (annotated as an ElementCollection of section regions).

    Example:
        # Split a multi-page table region by headers
        table_region = flow.find("text:contains('Table 4')").below(until="text:contains('Table 5')")
        sections = table_region.get_sections(start_elements="text:bold")
    """
    # Function-scope import, kept as in the original
    # (presumably avoids a circular import with flows.flow - TODO confirm).
    from natural_pdf.flows.flow import Flow

    # All sectioning options are forwarded unchanged.
    section_options = {
        "start_elements": start_elements,
        "end_elements": end_elements,
        "new_section_on_page_break": new_section_on_page_break,
        "include_boundaries": include_boundaries,
        "orientation": orientation,
    }

    # A Flow restricted to this region's own segments, laid out like the parent flow.
    scoped_flow = Flow(
        segments=self.constituent_regions,
        arrangement=self.flow.arrangement,
        alignment=self.flow.alignment,
        segment_gap=self.flow.segment_gap,
    )

    return scoped_flow.get_sections(**section_options)
|
1241
|
+
|
1242
|
+
def split(
    self, by: Optional[str] = None, page_breaks: bool = True, **kwargs
) -> "ElementCollection":
    """
    Break this FlowRegion into sections.

    Shorthand for ``get_sections()``: ``by`` is passed as ``start_elements``
    and ``page_breaks`` as ``new_section_on_page_break``.

    Args:
        by: Selector string for the elements that open a new section (e.g., "text:bold")
        page_breaks: Whether page boundaries also split sections (default: True)
        **kwargs: Extra keyword arguments forwarded to get_sections()

    Returns:
        Whatever get_sections() returns (annotated as an ElementCollection)

    Example:
        # Split by bold headers
        sections = flow_region.split(by="text:bold")

        # Split only by specific text pattern, ignoring page breaks
        sections = flow_region.split(
            by="text:contains('Section')",
            page_breaks=False
        )
    """
    sectioner = self.get_sections
    return sectioner(
        start_elements=by,
        new_section_on_page_break=page_breaks,
        **kwargs,
    )
|
1269
|
+
|
1194
1270
|
@property
|
1195
1271
|
def normalized_type(self) -> Optional[str]:
|
1196
1272
|
"""
|
natural_pdf/selectors/parser.py
CHANGED
@@ -30,6 +30,7 @@ This enables powerful document navigation like:
|
|
30
30
|
import ast
|
31
31
|
import logging
|
32
32
|
import re
|
33
|
+
from collections import Counter
|
33
34
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
34
35
|
|
35
36
|
from colormath2.color_conversions import convert_color
|
@@ -86,6 +87,47 @@ def safe_parse_value(value_str: str) -> Any:
|
|
86
87
|
return value_str
|
87
88
|
|
88
89
|
|
90
|
+
def _parse_aggregate_function(value_str: str) -> Optional[Dict[str, Any]]:
    """Recognise aggregate-function syntax such as min(), max(), avg(), closest("red").

    The function name is matched case-insensitively and lower-cased; the
    aliases ``mean`` and ``most_common`` are normalised to ``avg`` and ``mode``.
    A non-empty argument is parsed with ``safe_parse_color`` for ``closest``
    and with ``safe_parse_value`` for every other function.

    Returns:
        Dict with 'type': 'aggregate', 'func': function name, 'args': parsed
        argument (None when the parentheses are empty), or None if the value
        is not an aggregate function call.
    """
    # Accepts: funcname() or funcname(args), with optional space before "(".
    match = re.match(
        r"^(min|max|avg|mean|median|mode|most_common|closest)\s*\((.*?)\)$",
        value_str.strip(),
        re.IGNORECASE,
    )
    if match is None:
        return None

    aliases = {"mean": "avg", "most_common": "mode"}
    raw_name = match.group(1).lower()
    func_name = aliases.get(raw_name, raw_name)
    raw_args = match.group(2).strip()

    if not raw_args:
        parsed_args = None
    elif func_name == "closest":
        # closest() takes a colour to compare against
        parsed_args = safe_parse_color(raw_args)
    else:
        parsed_args = safe_parse_value(raw_args)

    return {"type": "aggregate", "func": func_name, "args": parsed_args}
|
129
|
+
|
130
|
+
|
89
131
|
def safe_parse_color(value_str: str) -> tuple:
|
90
132
|
"""
|
91
133
|
Parse a color value which could be an RGB tuple, color name, hex code, or CSS-style rgb(...)/rgba(...).
|
@@ -362,9 +404,14 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
362
404
|
raise ValueError(
|
363
405
|
f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
|
364
406
|
)
|
365
|
-
# Parse value
|
407
|
+
# Parse value - check for aggregate functions first
|
366
408
|
parsed_value: Any
|
367
|
-
|
409
|
+
aggregate_func = _parse_aggregate_function(value_str)
|
410
|
+
|
411
|
+
if aggregate_func:
|
412
|
+
# Store aggregate function info
|
413
|
+
parsed_value = aggregate_func
|
414
|
+
elif name in [
|
368
415
|
"color",
|
369
416
|
"non_stroking_color",
|
370
417
|
"fill",
|
@@ -564,12 +611,15 @@ PSEUDO_CLASS_FUNCTIONS = {
|
|
564
611
|
}
|
565
612
|
|
566
613
|
|
567
|
-
def _build_filter_list(
|
614
|
+
def _build_filter_list(
|
615
|
+
selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
|
616
|
+
) -> List[Dict[str, Any]]:
|
568
617
|
"""
|
569
618
|
Convert a parsed selector to a list of named filter functions.
|
570
619
|
|
571
620
|
Args:
|
572
621
|
selector: Parsed selector dictionary
|
622
|
+
aggregates: Pre-calculated aggregate values (optional)
|
573
623
|
**kwargs: Additional filter parameters including:
|
574
624
|
- regex: Whether to use regex for text search
|
575
625
|
- case: Whether to do case-sensitive text search
|
@@ -581,6 +631,9 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
581
631
|
filters: List[Dict[str, Any]] = []
|
582
632
|
selector_type = selector["type"]
|
583
633
|
|
634
|
+
if aggregates is None:
|
635
|
+
aggregates = {}
|
636
|
+
|
584
637
|
# Filter by element type
|
585
638
|
if selector_type != "any":
|
586
639
|
filter_name = f"type is '{selector_type}'"
|
@@ -611,6 +664,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
611
664
|
value = attr_filter["value"]
|
612
665
|
python_name = name.replace("-", "_") # Convert CSS-style names
|
613
666
|
|
667
|
+
# Check if value is an aggregate function
|
668
|
+
if isinstance(value, dict) and value.get("type") == "aggregate":
|
669
|
+
# Use pre-calculated aggregate value
|
670
|
+
aggregate_value = aggregates.get(name)
|
671
|
+
if aggregate_value is None:
|
672
|
+
# Skip this filter if aggregate couldn't be calculated
|
673
|
+
continue
|
674
|
+
value = aggregate_value
|
675
|
+
|
614
676
|
# --- Define the core value retrieval logic ---
|
615
677
|
def get_element_value(
|
616
678
|
element, name=name, python_name=python_name, selector_type=selector_type
|
@@ -761,15 +823,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
761
823
|
)
|
762
824
|
|
763
825
|
# Recursively get the filter function for the inner selector
|
764
|
-
# Pass kwargs down in case regex/case flags affect the inner selector
|
765
|
-
inner_filter_func = selector_to_filter_func(args, **kwargs)
|
826
|
+
# Pass kwargs and aggregates down in case regex/case flags affect the inner selector
|
827
|
+
inner_filter_func = selector_to_filter_func(args, aggregates=aggregates, **kwargs)
|
766
828
|
|
767
829
|
# The filter lambda applies the inner function and inverts the result
|
768
830
|
filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
|
769
831
|
|
770
832
|
# Try to create a descriptive name (can be long)
|
771
833
|
# Maybe simplify this later if needed
|
772
|
-
inner_filter_list = _build_filter_list(args, **kwargs)
|
834
|
+
inner_filter_list = _build_filter_list(args, aggregates=aggregates, **kwargs)
|
773
835
|
inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
|
774
836
|
filter_name = f"pseudo-class :not({inner_filter_names})"
|
775
837
|
|
@@ -929,7 +991,113 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
|
|
929
991
|
return combined_filter
|
930
992
|
|
931
993
|
|
932
|
-
def
|
994
|
+
def _collect_attribute_values(elements: List[Any], attr_name: str) -> List[Any]:
    """Collect the values of ``attr_name`` from ``elements`` for aggregation.

    ``x0``/``y0``/``x1``/``y1`` are read from positions 0-3 of the element's
    ``_bbox`` (falling back to ``bbox``). Any other name is read with
    ``getattr`` after replacing ``-`` with ``_``. Elements that lack the
    attribute, or whose access raises, are skipped.
    """
    bbox_index = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}.get(attr_name)
    python_name = attr_name.replace("-", "_")

    values = []
    for el in elements:
        try:
            if bbox_index is not None:
                bbox = getattr(el, "_bbox", None) or getattr(el, "bbox", None)
                if bbox:
                    values.append(bbox[bbox_index])
            else:
                val = getattr(el, python_name, None)
                if val is not None:
                    values.append(val)
        except Exception:
            # Best effort: one misbehaving element must not abort the aggregate
            continue
    return values


def _calculate_aggregates(elements: List[Any], selector: Dict[str, Any]) -> Dict[str, Any]:
    """Calculate aggregate values for a selector.

    Every attribute filter whose value is an aggregate descriptor
    (``{"type": "aggregate", "func": ..., "args": ...}``) is evaluated over
    ``elements``. Results are keyed by attribute name, so a later aggregate on
    the same attribute replaces an earlier one.

    Args:
        elements: List of elements to calculate aggregates from
        selector: Parsed selector dictionary

    Returns:
        Dict mapping attribute names to their aggregate values. The value is
        None when no element supplies the attribute or when the values cannot
        be aggregated (non-numeric for avg/median, non-comparable for
        min/max, unhashable for mode, ``closest`` on a non-colour attribute).
    """
    color_attributes = (
        "color",
        "non_stroking_color",
        "fill",
        "stroke",
        "strokeColor",
        "fillColor",
    )
    aggregates = {}

    for attr in selector.get("attributes", []):
        value = attr.get("value")
        if not (isinstance(value, dict) and value.get("type") == "aggregate"):
            continue

        attr_name = attr["name"]
        func_name = value["func"]
        func_args = value.get("args")

        values = _collect_attribute_values(elements, attr_name)
        if not values:
            # No valid values found, aggregate is None
            aggregates[attr_name] = None
            continue

        try:
            if func_name == "min":
                aggregates[attr_name] = min(values)
            elif func_name == "max":
                aggregates[attr_name] = max(values)
            elif func_name == "avg":
                aggregates[attr_name] = sum(values) / len(values)
            elif func_name == "median":
                sorted_values = sorted(values)
                n = len(sorted_values)
                if n % 2 == 0:
                    aggregates[attr_name] = (
                        sorted_values[n // 2 - 1] + sorted_values[n // 2]
                    ) / 2
                else:
                    aggregates[attr_name] = sorted_values[n // 2]
            elif func_name == "mode":
                # values is non-empty here, so most_common(1) has one entry
                aggregates[attr_name] = Counter(values).most_common(1)[0][0]
            elif func_name == "closest" and func_args is not None:
                if attr_name in color_attributes:
                    # For colors, keep the value with minimum distance to the target
                    min_distance = float("inf")
                    closest_value = None
                    for val in values:
                        try:
                            distance = _color_distance(val, func_args)
                        except Exception:
                            # Skip colours that cannot be compared; unlike a bare
                            # ``except:`` this lets KeyboardInterrupt/SystemExit through
                            continue
                        if distance < min_distance:
                            min_distance = distance
                            closest_value = val
                    aggregates[attr_name] = closest_value
                else:
                    # For non-colors, closest doesn't make sense
                    aggregates[attr_name] = None
        except TypeError:
            # Non-numeric, non-comparable or unhashable values: no usable aggregate
            aggregates[attr_name] = None

    return aggregates
|
1096
|
+
|
1097
|
+
|
1098
|
+
def selector_to_filter_func(
|
1099
|
+
selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
|
1100
|
+
) -> Callable[[Any], bool]:
|
933
1101
|
"""
|
934
1102
|
Convert a parsed selector to a single filter function.
|
935
1103
|
|
@@ -938,6 +1106,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
938
1106
|
|
939
1107
|
Args:
|
940
1108
|
selector: Parsed selector dictionary (single or compound OR selector)
|
1109
|
+
aggregates: Pre-calculated aggregate values (optional)
|
941
1110
|
**kwargs: Additional filter parameters (e.g., regex, case).
|
942
1111
|
|
943
1112
|
Returns:
|
@@ -953,7 +1122,9 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
953
1122
|
# Create filter functions for each sub-selector
|
954
1123
|
sub_filter_funcs = []
|
955
1124
|
for sub_selector in sub_selectors:
|
956
|
-
sub_filter_funcs.append(
|
1125
|
+
sub_filter_funcs.append(
|
1126
|
+
selector_to_filter_func(sub_selector, aggregates=aggregates, **kwargs)
|
1127
|
+
)
|
957
1128
|
|
958
1129
|
if logger.isEnabledFor(logging.DEBUG):
|
959
1130
|
logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
|
@@ -973,7 +1144,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
973
1144
|
return or_filter
|
974
1145
|
|
975
1146
|
# Handle single selectors (existing logic)
|
976
|
-
filter_list = _build_filter_list(selector, **kwargs)
|
1147
|
+
filter_list = _build_filter_list(selector, aggregates=aggregates, **kwargs)
|
977
1148
|
|
978
1149
|
if logger.isEnabledFor(logging.DEBUG):
|
979
1150
|
filter_names = [f["name"] for f in filter_list]
|
@@ -0,0 +1,136 @@
|
|
1
|
+
"""Monkey patches for pdfminer.six bugs.
|
2
|
+
|
3
|
+
This module contains patches for known bugs in pdfminer.six that affect
|
4
|
+
natural_pdf functionality. These patches are applied automatically when
|
5
|
+
natural_pdf is imported.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
import os
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

# Track if patches have been applied
_patches_applied = False

# Allow disabling patches via environment variable: setting
# NATURAL_PDF_DISABLE_PDFMINER_PATCHES to "1", "true" or "yes"
# (case-insensitive) turns patching off.
DISABLE_PATCHES = os.environ.get("NATURAL_PDF_DISABLE_PDFMINER_PATCHES", "").lower() in (
    "1",
    "true",
    "yes",
)
|
24
|
+
|
25
|
+
|
26
|
+
def _patch_color_space_bug():
    """
    Fix pdfminer.six color parsing bug for bare 'sc' commands.

    Bug: When a PDF uses 'sc' without an explicit color space (e.g., '1 1 0 sc'),
    pdfminer defaults to DeviceGray (1 component) and only reads one value,
    resulting in wrong colors.

    This patch replaces ``PDFPageInterpreter.do_scn`` with a wrapper that, when
    the current color space expects one component but the last three operands
    on the stack are all numbers in the 0-1 range, consumes them as an RGB
    color. Every other case is delegated to the original ``do_scn``.

    Reference: https://github.com/jsvine/pdfplumber/issues/XXX
    NOTE(review): the issue number above is a placeholder - link the real issue.

    Returns:
        True if the patch was installed, False if installation failed for any
        reason (the failure is logged as a warning).
    """
    try:
        import pdfminer.pdfinterp
        from pdfminer.casting import safe_rgb

        # Save original method so the wrapper can fall back to it
        original_do_scn = pdfminer.pdfinterp.PDFPageInterpreter.do_scn

        def patched_do_scn(self):
            """Patched do_scn that handles RGB colors without explicit color space."""
            # Get expected components from current color space
            n = self.graphicstate.ncs.ncomponents

            # Special handling for DeviceGray with potential RGB values
            if n == 1 and len(self.argstack) >= 3:
                # Peek at the last 3 values without consuming them
                last_three = self.argstack[-3:]

                try:
                    looks_like_rgb = all(
                        isinstance(v, (int, float)) and 0 <= v <= 1 for v in last_three
                    )
                    if looks_like_rgb:
                        rgb = safe_rgb(*last_three)
                        if rgb is not None:
                            # Only touch interpreter state once the color is known
                            # to be valid, so the fallback below always sees an
                            # untouched operand stack.
                            self.graphicstate.ncolor = rgb
                            self.pop(3)
                            return
                except (ValueError, TypeError, AttributeError):
                    # Any error, fall back to original
                    pass

            # Use original behavior for all other cases
            return original_do_scn(self)

        # Apply the patch
        pdfminer.pdfinterp.PDFPageInterpreter.do_scn = patched_do_scn
        logger.debug("Applied pdfminer color space bug patch")
        return True

    except Exception as e:
        logger.warning(f"Failed to apply pdfminer color patch: {e}")
        return False
|
90
|
+
|
91
|
+
|
92
|
+
def apply_patches():
    """Apply all pdfminer patches. Safe to call multiple times.

    Does nothing when patches were already processed or when DISABLE_PATCHES
    is set. Otherwise every registered patch function is run once, the outcome
    is logged, and the module is marked as patched (even if some patches
    failed, so they are not retried).
    """
    global _patches_applied

    if DISABLE_PATCHES or _patches_applied:
        return

    # (name, callable returning truthy on success); add more patches here as needed
    registry = [
        ("color_space_bug", _patch_color_space_bug),
    ]

    # Run each patch exactly once and remember whether it succeeded.
    outcomes = [(patch_name, bool(installer())) for patch_name, installer in registry]
    applied = [patch_name for patch_name, ok in outcomes if ok]
    failed = [patch_name for patch_name, ok in outcomes if not ok]

    if applied:
        logger.info(f"Applied pdfminer patches: {', '.join(applied)}")
    if failed:
        logger.warning(f"Failed to apply patches: {', '.join(failed)}")

    _patches_applied = True
|
119
|
+
|
120
|
+
|
121
|
+
def get_patch_status() -> dict:
    """Report whether patches were processed and which pdfminer is installed.

    Returns:
        Dict with 'patches_applied' (the module-level flag set by
        apply_patches) and 'pdfminer_version' (from _get_pdfminer_version).
    """
    status = {"patches_applied": _patches_applied}
    status["pdfminer_version"] = _get_pdfminer_version()
    return status
|
127
|
+
|
128
|
+
|
129
|
+
def _get_pdfminer_version() -> str:
    """Return pdfminer's ``__version__``.

    Yields "unknown" when the module has no ``__version__`` attribute and
    "not installed" when pdfminer cannot be imported.
    """
    version = "not installed"
    try:
        import pdfminer

        version = getattr(pdfminer, "__version__", "unknown")
    except ImportError:
        pass
    return version
|