natural-pdf 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +94 -42
- natural_pdf/core/page.py +110 -44
- natural_pdf/core/page_collection.py +223 -34
- natural_pdf/core/page_groupby.py +20 -2
- natural_pdf/core/pdf.py +3 -0
- natural_pdf/core/render_spec.py +20 -5
- natural_pdf/describe/base.py +1 -1
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +84 -8
- natural_pdf/elements/element_collection.py +730 -12
- natural_pdf/elements/region.py +181 -48
- natural_pdf/flows/flow.py +3 -0
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/utils/color_utils.py +100 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/RECORD +20 -19
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.5.dist-info → natural_pdf-0.2.6.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -222,7 +222,9 @@ class Region(
|
|
222
222
|
mode: Literal["show", "render"] = "show",
|
223
223
|
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
224
224
|
highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
|
225
|
-
crop: Union[
|
225
|
+
crop: Union[
|
226
|
+
bool, int, str, "Region", Literal["wide"]
|
227
|
+
] = True, # Default to True for regions
|
226
228
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
227
229
|
**kwargs,
|
228
230
|
) -> List[RenderSpec]:
|
@@ -232,7 +234,12 @@ class Region(
|
|
232
234
|
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
233
235
|
color: Color for highlighting this region in show mode
|
234
236
|
highlights: Additional highlight groups to show, or False to disable all highlights
|
235
|
-
crop:
|
237
|
+
crop: Cropping mode:
|
238
|
+
- False: No cropping
|
239
|
+
- True: Crop to region bounds (default for regions)
|
240
|
+
- int: Padding in pixels around region
|
241
|
+
- 'wide': Full page width, cropped vertically to region
|
242
|
+
- Region: Crop to the bounds of another region
|
236
243
|
crop_bbox: Explicit crop bounds (overrides region bounds)
|
237
244
|
**kwargs: Additional parameters
|
238
245
|
|
@@ -247,15 +254,34 @@ class Region(
|
|
247
254
|
if crop_bbox:
|
248
255
|
spec.crop_bbox = crop_bbox
|
249
256
|
elif crop:
|
250
|
-
|
251
|
-
|
257
|
+
x0, y0, x1, y1 = self.bbox
|
258
|
+
|
259
|
+
if crop is True:
|
260
|
+
# Crop to region bounds
|
261
|
+
spec.crop_bbox = self.bbox
|
262
|
+
elif isinstance(crop, (int, float)):
|
263
|
+
# Add padding around region
|
264
|
+
padding = float(crop)
|
265
|
+
spec.crop_bbox = (
|
266
|
+
max(0, x0 - padding),
|
267
|
+
max(0, y0 - padding),
|
268
|
+
min(self.page.width, x1 + padding),
|
269
|
+
min(self.page.height, y1 + padding),
|
270
|
+
)
|
271
|
+
elif crop == "wide":
|
272
|
+
# Full page width, cropped vertically to region
|
273
|
+
spec.crop_bbox = (0, y0, self.page.width, y1)
|
274
|
+
elif hasattr(crop, "bbox"):
|
275
|
+
# Crop to another region's bounds
|
276
|
+
spec.crop_bbox = crop.bbox
|
252
277
|
|
253
278
|
# Add highlights in show mode (unless explicitly disabled with highlights=False)
|
254
279
|
if mode == "show" and highlights is not False:
|
255
280
|
# Only highlight this region if:
|
256
281
|
# 1. We're not cropping, OR
|
257
|
-
# 2. We're cropping but color was explicitly specified
|
258
|
-
|
282
|
+
# 2. We're cropping but color was explicitly specified, OR
|
283
|
+
# 3. We're cropping to another region (not tight crop)
|
284
|
+
if not crop or color is not None or (crop and not isinstance(crop, bool)):
|
259
285
|
spec.add_highlight(
|
260
286
|
bbox=self.bbox,
|
261
287
|
polygon=self.polygon if self.has_polygon else None,
|
@@ -1237,6 +1263,8 @@ class Region(
|
|
1237
1263
|
Union[str, Callable[[str], bool], List[str]]
|
1238
1264
|
] = None, # NEW: Content filtering
|
1239
1265
|
apply_exclusions: bool = True, # Whether to apply exclusion regions during extraction
|
1266
|
+
verticals: Optional[List] = None, # Explicit vertical lines
|
1267
|
+
horizontals: Optional[List] = None, # Explicit horizontal lines
|
1240
1268
|
) -> TableResult: # Return type allows Optional[str] for cells
|
1241
1269
|
"""
|
1242
1270
|
Extract a table from this region.
|
@@ -1263,6 +1291,10 @@ class Region(
|
|
1263
1291
|
Works with all extraction methods by filtering cell content.
|
1264
1292
|
apply_exclusions: Whether to apply exclusion regions during text extraction (default: True).
|
1265
1293
|
When True, text within excluded regions (e.g., headers/footers) will not be extracted.
|
1294
|
+
verticals: Optional list of explicit vertical lines for table extraction. When provided,
|
1295
|
+
automatically sets vertical_strategy='explicit' and explicit_vertical_lines.
|
1296
|
+
horizontals: Optional list of explicit horizontal lines for table extraction. When provided,
|
1297
|
+
automatically sets horizontal_strategy='explicit' and explicit_horizontal_lines.
|
1266
1298
|
|
1267
1299
|
Returns:
|
1268
1300
|
Table data as a list of rows, where each row is a list of cell values (str or None).
|
@@ -1273,6 +1305,14 @@ class Region(
|
|
1273
1305
|
if text_options is None:
|
1274
1306
|
text_options = {} # Initialize empty dict
|
1275
1307
|
|
1308
|
+
# Handle explicit vertical and horizontal lines
|
1309
|
+
if verticals is not None:
|
1310
|
+
table_settings["vertical_strategy"] = "explicit"
|
1311
|
+
table_settings["explicit_vertical_lines"] = verticals
|
1312
|
+
if horizontals is not None:
|
1313
|
+
table_settings["horizontal_strategy"] = "explicit"
|
1314
|
+
table_settings["explicit_horizontal_lines"] = horizontals
|
1315
|
+
|
1276
1316
|
# Auto-detect method if not specified
|
1277
1317
|
if method is None:
|
1278
1318
|
# If this is a TATR-detected region, use TATR method
|
@@ -2547,7 +2587,13 @@ class Region(
|
|
2547
2587
|
|
2548
2588
|
return self
|
2549
2589
|
|
2550
|
-
def get_section_between(
|
2590
|
+
def get_section_between(
|
2591
|
+
self,
|
2592
|
+
start_element=None,
|
2593
|
+
end_element=None,
|
2594
|
+
include_boundaries="both",
|
2595
|
+
orientation="vertical",
|
2596
|
+
):
|
2551
2597
|
"""
|
2552
2598
|
Get a section between two elements within this region.
|
2553
2599
|
|
@@ -2555,6 +2601,7 @@ class Region(
|
|
2555
2601
|
start_element: Element marking the start of the section
|
2556
2602
|
end_element: Element marking the end of the section
|
2557
2603
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2604
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2558
2605
|
|
2559
2606
|
Returns:
|
2560
2607
|
Region representing the section
|
@@ -2599,41 +2646,67 @@ class Region(
|
|
2599
2646
|
else:
|
2600
2647
|
end_element = elements[-1] # Default end is last element
|
2601
2648
|
|
2602
|
-
#
|
2603
|
-
|
2604
|
-
|
2605
|
-
|
2606
|
-
|
2607
|
-
|
2608
|
-
|
2609
|
-
|
2610
|
-
|
2611
|
-
|
2612
|
-
|
2613
|
-
|
2614
|
-
|
2615
|
-
|
2616
|
-
|
2617
|
-
|
2618
|
-
|
2619
|
-
|
2620
|
-
|
2621
|
-
|
2622
|
-
|
2623
|
-
|
2624
|
-
|
2625
|
-
#
|
2626
|
-
|
2627
|
-
|
2628
|
-
|
2629
|
-
|
2630
|
-
|
2631
|
-
|
2632
|
-
|
2633
|
-
|
2634
|
-
|
2635
|
-
|
2636
|
-
|
2649
|
+
# Validate orientation parameter
|
2650
|
+
if orientation not in ["vertical", "horizontal"]:
|
2651
|
+
raise ValueError(f"orientation must be 'vertical' or 'horizontal', got '{orientation}'")
|
2652
|
+
|
2653
|
+
# Calculate the section boundaries based on orientation and include_boundaries
|
2654
|
+
if orientation == "vertical":
|
2655
|
+
# Use full width of the parent region for vertical sections
|
2656
|
+
x0 = self.x0 # Use parent region's left boundary
|
2657
|
+
x1 = self.x1 # Use parent region's right boundary
|
2658
|
+
|
2659
|
+
# Determine vertical boundaries based on include_boundaries
|
2660
|
+
if include_boundaries == "both":
|
2661
|
+
# Include both boundary elements
|
2662
|
+
top = start_element.top
|
2663
|
+
bottom = end_element.bottom
|
2664
|
+
elif include_boundaries == "start":
|
2665
|
+
# Include start element, exclude end element
|
2666
|
+
top = start_element.top
|
2667
|
+
bottom = end_element.top # Stop at the top of end element
|
2668
|
+
elif include_boundaries == "end":
|
2669
|
+
# Exclude start element, include end element
|
2670
|
+
top = start_element.bottom # Start at the bottom of start element
|
2671
|
+
bottom = end_element.bottom
|
2672
|
+
else: # "none"
|
2673
|
+
# Exclude both boundary elements
|
2674
|
+
top = start_element.bottom # Start at the bottom of start element
|
2675
|
+
bottom = end_element.top # Stop at the top of end element
|
2676
|
+
|
2677
|
+
# Ensure valid boundaries
|
2678
|
+
if top >= bottom:
|
2679
|
+
logger.debug(f"Invalid section boundaries: top={top} >= bottom={bottom}")
|
2680
|
+
# Return an empty region
|
2681
|
+
return Region(self.page, (x0, top, x0, top))
|
2682
|
+
else: # horizontal
|
2683
|
+
# Use full height of the parent region for horizontal sections
|
2684
|
+
top = self.top # Use parent region's top boundary
|
2685
|
+
bottom = self.bottom # Use parent region's bottom boundary
|
2686
|
+
|
2687
|
+
# Determine horizontal boundaries based on include_boundaries
|
2688
|
+
if include_boundaries == "both":
|
2689
|
+
# Include both boundary elements
|
2690
|
+
x0 = start_element.x0
|
2691
|
+
x1 = end_element.x1
|
2692
|
+
elif include_boundaries == "start":
|
2693
|
+
# Include start element, exclude end element
|
2694
|
+
x0 = start_element.x0
|
2695
|
+
x1 = end_element.x0 # Stop at the left of end element
|
2696
|
+
elif include_boundaries == "end":
|
2697
|
+
# Exclude start element, include end element
|
2698
|
+
x0 = start_element.x1 # Start at the right of start element
|
2699
|
+
x1 = end_element.x1
|
2700
|
+
else: # "none"
|
2701
|
+
# Exclude both boundary elements
|
2702
|
+
x0 = start_element.x1 # Start at the right of start element
|
2703
|
+
x1 = end_element.x0 # Stop at the left of end element
|
2704
|
+
|
2705
|
+
# Ensure valid boundaries
|
2706
|
+
if x0 >= x1:
|
2707
|
+
logger.debug(f"Invalid section boundaries: x0={x0} >= x1={x1}")
|
2708
|
+
# Return an empty region
|
2709
|
+
return Region(self.page, (x0, top, x0, top))
|
2637
2710
|
|
2638
2711
|
# Create new region
|
2639
2712
|
section = Region(self.page, (x0, top, x1, bottom))
|
@@ -2644,7 +2717,11 @@ class Region(
|
|
2644
2717
|
return section
|
2645
2718
|
|
2646
2719
|
def get_sections(
|
2647
|
-
self,
|
2720
|
+
self,
|
2721
|
+
start_elements=None,
|
2722
|
+
end_elements=None,
|
2723
|
+
include_boundaries="both",
|
2724
|
+
orientation="vertical",
|
2648
2725
|
) -> "ElementCollection[Region]":
|
2649
2726
|
"""
|
2650
2727
|
Get sections within this region based on start/end elements.
|
@@ -2653,6 +2730,7 @@ class Region(
|
|
2653
2730
|
start_elements: Elements or selector string that mark the start of sections
|
2654
2731
|
end_elements: Elements or selector string that mark the end of sections
|
2655
2732
|
include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
|
2733
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction
|
2656
2734
|
|
2657
2735
|
Returns:
|
2658
2736
|
List of Region objects representing the extracted sections
|
@@ -2687,9 +2765,12 @@ class Region(
|
|
2687
2765
|
if not start_elements:
|
2688
2766
|
return []
|
2689
2767
|
|
2690
|
-
# Sort all elements within the region
|
2768
|
+
# Sort all elements within the region based on orientation
|
2691
2769
|
all_elements_in_region = self.get_elements()
|
2692
|
-
|
2770
|
+
if orientation == "vertical":
|
2771
|
+
all_elements_in_region.sort(key=lambda e: (e.top, e.x0))
|
2772
|
+
else: # horizontal
|
2773
|
+
all_elements_in_region.sort(key=lambda e: (e.x0, e.top))
|
2693
2774
|
|
2694
2775
|
if not all_elements_in_region:
|
2695
2776
|
return [] # Cannot create sections if region is empty
|
@@ -2731,7 +2812,9 @@ class Region(
|
|
2731
2812
|
start_element = current_start_boundary["element"]
|
2732
2813
|
end_element = boundary["element"]
|
2733
2814
|
# Use the helper, ensuring elements are from within the region
|
2734
|
-
section = self.get_section_between(
|
2815
|
+
section = self.get_section_between(
|
2816
|
+
start_element, end_element, include_boundaries, orientation
|
2817
|
+
)
|
2735
2818
|
sections.append(section)
|
2736
2819
|
current_start_boundary = None # Reset
|
2737
2820
|
|
@@ -2748,7 +2831,7 @@ class Region(
|
|
2748
2831
|
if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
|
2749
2832
|
end_element = all_elements_in_region[end_idx]
|
2750
2833
|
section = self.get_section_between(
|
2751
|
-
start_element, end_element, include_boundaries
|
2834
|
+
start_element, end_element, include_boundaries, orientation
|
2752
2835
|
)
|
2753
2836
|
sections.append(section)
|
2754
2837
|
# Else: Section started and ended by consecutive start elements? Create empty?
|
@@ -2762,7 +2845,9 @@ class Region(
|
|
2762
2845
|
start_element = current_start_boundary["element"]
|
2763
2846
|
# End at the last element within the region
|
2764
2847
|
end_element = all_elements_in_region[-1]
|
2765
|
-
section = self.get_section_between(
|
2848
|
+
section = self.get_section_between(
|
2849
|
+
start_element, end_element, include_boundaries, orientation
|
2850
|
+
)
|
2766
2851
|
sections.append(section)
|
2767
2852
|
|
2768
2853
|
return ElementCollection(sections)
|
@@ -3016,6 +3101,54 @@ class Region(
|
|
3016
3101
|
|
3017
3102
|
return all_descendants
|
3018
3103
|
|
3104
|
+
def __add__(
|
3105
|
+
self, other: Union["Element", "Region", "ElementCollection"]
|
3106
|
+
) -> "ElementCollection":
|
3107
|
+
"""Add regions/elements together to create an ElementCollection.
|
3108
|
+
|
3109
|
+
This allows intuitive combination of regions using the + operator:
|
3110
|
+
```python
|
3111
|
+
complainant = section.find("text:contains(Complainant)").right(until='text')
|
3112
|
+
dob = section.find("text:contains(DOB)").right(until='text')
|
3113
|
+
combined = complainant + dob # Creates ElementCollection with both regions
|
3114
|
+
```
|
3115
|
+
|
3116
|
+
Args:
|
3117
|
+
other: Another Region, Element or ElementCollection to combine
|
3118
|
+
|
3119
|
+
Returns:
|
3120
|
+
ElementCollection containing all elements
|
3121
|
+
"""
|
3122
|
+
from natural_pdf.elements.base import Element
|
3123
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3124
|
+
|
3125
|
+
# Create a list starting with self
|
3126
|
+
elements = [self]
|
3127
|
+
|
3128
|
+
# Add the other element(s)
|
3129
|
+
if isinstance(other, (Element, Region)):
|
3130
|
+
elements.append(other)
|
3131
|
+
elif isinstance(other, ElementCollection):
|
3132
|
+
elements.extend(other)
|
3133
|
+
elif hasattr(other, "__iter__") and not isinstance(other, (str, bytes)):
|
3134
|
+
# Handle other iterables but exclude strings
|
3135
|
+
elements.extend(other)
|
3136
|
+
else:
|
3137
|
+
raise TypeError(f"Cannot add Region with {type(other)}")
|
3138
|
+
|
3139
|
+
return ElementCollection(elements)
|
3140
|
+
|
3141
|
+
def __radd__(
|
3142
|
+
self, other: Union["Element", "Region", "ElementCollection"]
|
3143
|
+
) -> "ElementCollection":
|
3144
|
+
"""Right-hand addition to support ElementCollection + Region."""
|
3145
|
+
if other == 0:
|
3146
|
+
# This handles sum() which starts with 0
|
3147
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
3148
|
+
|
3149
|
+
return ElementCollection([self])
|
3150
|
+
return self.__add__(other)
|
3151
|
+
|
3019
3152
|
def __repr__(self) -> str:
|
3020
3153
|
"""String representation of the region."""
|
3021
3154
|
poly_info = " (Polygon)" if self.has_polygon else ""
|
natural_pdf/flows/flow.py
CHANGED
@@ -1296,6 +1296,7 @@ class Flow(Visualizable):
|
|
1296
1296
|
end_elements=None,
|
1297
1297
|
new_section_on_page_break: bool = False,
|
1298
1298
|
include_boundaries: str = "both",
|
1299
|
+
orientation: str = "vertical",
|
1299
1300
|
) -> "ElementCollection":
|
1300
1301
|
"""
|
1301
1302
|
Extract logical sections from the Flow based on *start* and *end* boundary
|
@@ -1317,6 +1318,7 @@ class Flow(Visualizable):
|
|
1317
1318
|
boundaries (default: False).
|
1318
1319
|
include_boundaries: How to include boundary elements: 'start',
|
1319
1320
|
'end', 'both', or 'none' (default: 'both').
|
1321
|
+
orientation: 'vertical' (default) or 'horizontal' - determines section direction.
|
1320
1322
|
|
1321
1323
|
Returns:
|
1322
1324
|
ElementCollection of Region/FlowRegion objects representing the
|
@@ -1408,6 +1410,7 @@ class Flow(Visualizable):
|
|
1408
1410
|
start_elements=seg_start_elems,
|
1409
1411
|
end_elements=seg_end_elems,
|
1410
1412
|
include_boundaries=include_boundaries,
|
1413
|
+
orientation=orientation,
|
1411
1414
|
)
|
1412
1415
|
|
1413
1416
|
if seg_sections:
|
natural_pdf/selectors/parser.py
CHANGED
@@ -530,8 +530,8 @@ PSEUDO_CLASS_FUNCTIONS = {
|
|
530
530
|
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
531
531
|
"first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
|
532
532
|
"last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
|
533
|
-
"empty": lambda el: not el.text,
|
534
|
-
"not-empty": lambda el: el.text,
|
533
|
+
"empty": lambda el: not hasattr(el, "text") or not el.text or not el.text.strip(),
|
534
|
+
"not-empty": lambda el: bool(hasattr(el, "text") and el.text and el.text.strip()),
|
535
535
|
"not-bold": lambda el: hasattr(el, "bold") and not el.bold,
|
536
536
|
"not-italic": lambda el: hasattr(el, "italic") and not el.italic,
|
537
537
|
}
|
@@ -0,0 +1,100 @@
|
|
1
|
+
"""
|
2
|
+
Utility functions for color formatting and conversion.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Any, List, Optional, Tuple, Union
|
6
|
+
|
7
|
+
# List of known color attribute names in natural-pdf
|
8
|
+
COLOR_ATTRIBUTES = [
|
9
|
+
"color",
|
10
|
+
"fill",
|
11
|
+
"stroke",
|
12
|
+
"non_stroking_color",
|
13
|
+
"stroking_color",
|
14
|
+
"text_color",
|
15
|
+
"background_color",
|
16
|
+
"highlight_color",
|
17
|
+
"border_color",
|
18
|
+
]
|
19
|
+
|
20
|
+
|
21
|
+
def rgb_to_hex(color: Union[Tuple[float, ...], List[float]]) -> str:
|
22
|
+
"""
|
23
|
+
Convert an RGB/RGBA color tuple to hex string.
|
24
|
+
|
25
|
+
Args:
|
26
|
+
color: RGB tuple with values either in [0,1] or [0,255] range
|
27
|
+
Can be RGB (3 values) or RGBA (4 values)
|
28
|
+
|
29
|
+
Returns:
|
30
|
+
Hex color string (e.g., '#ff0000' for red)
|
31
|
+
"""
|
32
|
+
if not isinstance(color, (tuple, list)) or len(color) < 3:
|
33
|
+
raise ValueError(f"Invalid color format: {color}")
|
34
|
+
|
35
|
+
# Take first 3 values (RGB), ignore alpha if present
|
36
|
+
r, g, b = color[:3]
|
37
|
+
|
38
|
+
# Determine if values are in [0,1] or [0,255] range
|
39
|
+
# If any positive value is > 1, assume [0,255] range
|
40
|
+
max_val = max(abs(r), abs(g), abs(b))
|
41
|
+
|
42
|
+
if max_val > 1:
|
43
|
+
# Values are in 0-255 range
|
44
|
+
r_int = int(min(255, max(0, r)))
|
45
|
+
g_int = int(min(255, max(0, g)))
|
46
|
+
b_int = int(min(255, max(0, b)))
|
47
|
+
else:
|
48
|
+
# Values are in 0-1 range, convert to 0-255
|
49
|
+
r_int = int(min(255, max(0, r * 255)))
|
50
|
+
g_int = int(min(255, max(0, g * 255)))
|
51
|
+
b_int = int(min(255, max(0, b * 255)))
|
52
|
+
|
53
|
+
return f"#{r_int:02x}{g_int:02x}{b_int:02x}"
|
54
|
+
|
55
|
+
|
56
|
+
def is_color_attribute(attr_name: str) -> bool:
|
57
|
+
"""
|
58
|
+
Check if an attribute name is a known color attribute.
|
59
|
+
|
60
|
+
Args:
|
61
|
+
attr_name: The attribute name to check
|
62
|
+
|
63
|
+
Returns:
|
64
|
+
True if this is a known color attribute
|
65
|
+
"""
|
66
|
+
return attr_name.lower() in [attr.lower() for attr in COLOR_ATTRIBUTES]
|
67
|
+
|
68
|
+
|
69
|
+
def format_color_value(value: Any, attr_name: Optional[str] = None) -> str:
|
70
|
+
"""
|
71
|
+
Format a color value for display, converting tuples to hex when appropriate.
|
72
|
+
|
73
|
+
Args:
|
74
|
+
value: The value to format
|
75
|
+
attr_name: Optional attribute name to help determine if this is a color
|
76
|
+
|
77
|
+
Returns:
|
78
|
+
Formatted string representation
|
79
|
+
"""
|
80
|
+
# If attr_name is provided and it's not a color attribute, return as-is
|
81
|
+
if attr_name and not is_color_attribute(attr_name):
|
82
|
+
return str(value)
|
83
|
+
|
84
|
+
# Check if value looks like an RGB color tuple
|
85
|
+
if isinstance(value, (tuple, list)):
|
86
|
+
# Must have 3 or 4 values (RGB or RGBA)
|
87
|
+
if len(value) in (3, 4):
|
88
|
+
# Check if all values are numeric
|
89
|
+
if all(isinstance(v, (int, float)) for v in value):
|
90
|
+
# Additional validation: values should be in reasonable ranges
|
91
|
+
# Either all in [0,1] or all in [0,255]
|
92
|
+
if all(0 <= v <= 1 for v in value[:3]) or all(0 <= v <= 255 for v in value[:3]):
|
93
|
+
try:
|
94
|
+
return rgb_to_hex(value)
|
95
|
+
except Exception:
|
96
|
+
# If conversion fails, fall back to string representation
|
97
|
+
pass
|
98
|
+
|
99
|
+
# Default: convert to string
|
100
|
+
return str(value)
|
@@ -2,7 +2,7 @@ natural_pdf/__init__.py,sha256=N4pR0LbuPEnUYFZqbdVqc_FGKldgwPQc1wjJhYKTBBM,3417
|
|
2
2
|
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
3
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
4
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
|
-
natural_pdf/analyzers/guides.py,sha256=
|
5
|
+
natural_pdf/analyzers/guides.py,sha256=RHFTc2n6kzKrjsd2pk-1MfG1esuEpnTJr8GrsTqlF3A,160441
|
6
6
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
|
7
7
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
8
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
@@ -27,24 +27,24 @@ natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=KPuKM7SstfErTkRnGq4vrgE0Tv8iazN13Jp7yAXGKso,55575
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=7on8nErhi50CEH2L4XzGIZ6tIqZtMzmmFlp-2lmwnYE,68856
|
30
|
-
natural_pdf/core/page.py,sha256=
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
32
|
-
natural_pdf/core/page_groupby.py,sha256=
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=U0wAEw6z_lFuv6BBY8DKIpD5Y4wiZCo7x7qtjPf3hcM,148300
|
31
|
+
natural_pdf/core/page_collection.py,sha256=itVSWeY6285G7_bIP7vjrMygnGQTX2SdNbJxYW5Eypc,62196
|
32
|
+
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
|
+
natural_pdf/core/pdf.py,sha256=XMEPyd6LlwAhFvnTAU5ZtE_Hr4WpkExxw16DpYsZpvQ,104410
|
34
34
|
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
|
-
natural_pdf/core/render_spec.py,sha256=
|
35
|
+
natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
|
36
36
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
|
-
natural_pdf/describe/base.py,sha256=
|
38
|
-
natural_pdf/describe/elements.py,sha256=
|
37
|
+
natural_pdf/describe/base.py,sha256=LYbDjjQYOIZsYBbBQH3UP6XyWArJJvRc8LUugeVdJw0,18178
|
38
|
+
natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
|
39
39
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
40
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
41
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
42
|
+
natural_pdf/elements/base.py,sha256=92ukTtRCQFsa5KvKflChCt4mt0ZGS4ecGYCQTNMO4zU,58907
|
43
|
+
natural_pdf/elements/element_collection.py,sha256=42SUzjD2nYFPNEQA-4oMi2QOwwwsxBmcrY4FKgGumJ0,128700
|
44
44
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
45
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
46
|
natural_pdf/elements/rect.py,sha256=0lNkVkPkvbRbrFED856RXoUcTcDkeeOIs5xldKGAQT8,3324
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
47
|
+
natural_pdf/elements/region.py,sha256=XLbaMEQ-DXzbh4Xnv72ebS1ZlT5EuWpistz0O6bOSag,162583
|
48
48
|
natural_pdf/elements/text.py,sha256=829uSJv9E-8cC6T6iR_Va7Xtv54pJoyRN78fq4NN1d4,20687
|
49
49
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
50
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -63,7 +63,7 @@ natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M
|
|
63
63
|
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
64
64
|
natural_pdf/flows/collections.py,sha256=ErkHWdX6W_y1SjkcA_bGM0uUYRGPWWpRkHip6LHpej0,25740
|
65
65
|
natural_pdf/flows/element.py,sha256=T-9uXsIBe7mIim-mQQMep6Ja5dRfWaYIj8g1ak_Bv8c,24892
|
66
|
-
natural_pdf/flows/flow.py,sha256=
|
66
|
+
natural_pdf/flows/flow.py,sha256=BuT3DBqNvLEqYle66-nZFO91i_1s98CAat28Dg-JjGU,86149
|
67
67
|
natural_pdf/flows/region.py,sha256=r_cFtBlmPi7ADN3k8oYA1s_vyz8GeQLCnYcv58Zt5eM,52263
|
68
68
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
69
69
|
natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
|
@@ -85,12 +85,13 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
85
85
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
86
86
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
87
87
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
88
|
-
natural_pdf/selectors/parser.py,sha256=
|
88
|
+
natural_pdf/selectors/parser.py,sha256=yV5Eb0VyNZocoYIXi7SMKsf8o66vrGNb-MeT27aEj-M,38977
|
89
89
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
90
90
|
natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
|
91
91
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
92
92
|
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
93
93
|
natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3HCE,1151
|
94
|
+
natural_pdf/utils/color_utils.py,sha256=6v2hqk4WdaUR85fFbOej_T4AeASpVeze3xVRWgNKqOk,3128
|
94
95
|
natural_pdf/utils/debug.py,sha256=Epwie_jmRgknUSaEoxEyvr1lBXpfYTFOe2UQh_zSj_0,1026
|
95
96
|
natural_pdf/utils/highlighting.py,sha256=c9SvvPaJDI9bWXzq1A7zdh_0s3C4GCMngrJdkL2AMeM,719
|
96
97
|
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
@@ -106,7 +107,7 @@ natural_pdf/vision/results.py,sha256=F2zXG3MVZIpOUvPkJHotOq6-9rFz68BaO_8pnSndlOs
|
|
106
107
|
natural_pdf/vision/similarity.py,sha256=YH8legN-t9uf1b_XULi4JLNDaRfPNKQwU1FZ4Qu08jY,11740
|
107
108
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
108
109
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
109
|
-
natural_pdf-0.2.
|
110
|
+
natural_pdf-0.2.6.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
110
111
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
111
112
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
112
113
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -123,8 +124,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
123
124
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
124
125
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
125
126
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
126
|
-
natural_pdf-0.2.
|
127
|
-
natural_pdf-0.2.
|
128
|
-
natural_pdf-0.2.
|
129
|
-
natural_pdf-0.2.
|
130
|
-
natural_pdf-0.2.
|
127
|
+
natural_pdf-0.2.6.dist-info/METADATA,sha256=SJ7AqaSiRD-4NYz9Pk0Iz7IlEMiiv1aha3V8do8qvbo,6959
|
128
|
+
natural_pdf-0.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
129
|
+
natural_pdf-0.2.6.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
130
|
+
natural_pdf-0.2.6.dist-info/top_level.txt,sha256=80t0F2ZeX4vN4Ke5iTflcOk_PN_0USn33ha3X6X86Ik,36
|
131
|
+
natural_pdf-0.2.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|