natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +130 -31
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +172 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import base64
|
2
2
|
import concurrent.futures # Added import
|
3
|
+
import contextlib
|
3
4
|
import hashlib
|
4
5
|
import io
|
5
6
|
import json
|
@@ -30,6 +31,7 @@ from tqdm.auto import tqdm # Added tqdm import
|
|
30
31
|
from natural_pdf.elements.element_collection import ElementCollection
|
31
32
|
from natural_pdf.elements.region import Region
|
32
33
|
from natural_pdf.selectors.parser import parse_selector
|
34
|
+
from natural_pdf.tables.result import TableResult
|
33
35
|
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
34
36
|
from natural_pdf.utils.visualization import render_plain_page
|
35
37
|
|
@@ -274,6 +276,9 @@ class Page(
|
|
274
276
|
self._load_elements()
|
275
277
|
self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
|
276
278
|
|
279
|
+
# Flag to prevent infinite recursion when computing exclusions
|
280
|
+
self._computing_exclusions = False
|
281
|
+
|
277
282
|
def _get_render_specs(
|
278
283
|
self,
|
279
284
|
mode: Literal["show", "render"] = "show",
|
@@ -411,6 +416,35 @@ class Page(
|
|
411
416
|
self._exclusions = []
|
412
417
|
return self
|
413
418
|
|
419
|
+
@contextlib.contextmanager
def without_exclusions(self):
    """
    Temporarily switch off exclusion filtering for this page.

    While the ``with`` block is active the ``_computing_exclusions`` flag is
    set, which makes element filtering return its input untouched. This is
    what lets an exclusion callable run ``find()`` on the page without
    re-triggering exclusion evaluation (and recursing forever).

    Example:
        ```python
        # This exclusion would normally cause infinite recursion:
        page.add_exclusion(lambda p: p.find("text:contains('Header')").expand())

        # But internally, it's safe because we use:
        with page.without_exclusions():
            region = exclusion_callable(page)
        ```

    Yields:
        The page object itself, with exclusions temporarily disabled.
    """
    # Remember the incoming flag so nested uses restore the outer state
    # instead of unconditionally resetting it to False.
    previous_flag, self._computing_exclusions = self._computing_exclusions, True
    try:
        yield self
    finally:
        # Always restore, even if the body raised.
        self._computing_exclusions = previous_flag
|
447
|
+
|
414
448
|
def add_exclusion(
|
415
449
|
self,
|
416
450
|
exclusion_func_or_region: Union[
|
@@ -758,15 +792,10 @@ class Page(
|
|
758
792
|
if debug:
|
759
793
|
print(f" - Evaluating callable '{exclusion_label}'...")
|
760
794
|
|
761
|
-
#
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
# Call the function - Expects it to return a Region or None
|
766
|
-
region_result = exclusion_item(self)
|
767
|
-
|
768
|
-
# Restore exclusions
|
769
|
-
self._exclusions = temp_original_exclusions
|
795
|
+
# Use context manager to prevent infinite recursion
|
796
|
+
with self.without_exclusions():
|
797
|
+
# Call the function - Expects it to return a Region or None
|
798
|
+
region_result = exclusion_item(self)
|
770
799
|
|
771
800
|
if isinstance(region_result, Region):
|
772
801
|
# Assign the label to the returned region
|
@@ -866,26 +895,33 @@ class Page(
|
|
866
895
|
if debug:
|
867
896
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
868
897
|
|
869
|
-
# Process direct Element objects - convert to Region
|
898
|
+
# Process direct Element objects - only convert to Region if method is "region"
|
870
899
|
elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
expanded_region
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
900
|
+
if method == "region":
|
901
|
+
try:
|
902
|
+
# Convert Element to Region using expand()
|
903
|
+
expanded_region = exclusion_item.expand()
|
904
|
+
if isinstance(expanded_region, Region):
|
905
|
+
expanded_region.label = label
|
906
|
+
regions.append(expanded_region)
|
907
|
+
if debug:
|
908
|
+
print(
|
909
|
+
f" - Converted direct Element to Region '{label}': {expanded_region}"
|
910
|
+
)
|
911
|
+
else:
|
912
|
+
if debug:
|
913
|
+
print(
|
914
|
+
f" - Element.expand() did not return a Region: {type(expanded_region)}"
|
915
|
+
)
|
916
|
+
except Exception as e:
|
882
917
|
if debug:
|
883
|
-
print(
|
884
|
-
|
885
|
-
|
886
|
-
except Exception as e:
|
918
|
+
print(f" - Failed to convert Element to Region: {e}")
|
919
|
+
else:
|
920
|
+
# method == "element" - will be handled in _filter_elements_by_exclusions
|
887
921
|
if debug:
|
888
|
-
print(
|
922
|
+
print(
|
923
|
+
f" - Skipping element '{label}' (will be handled as element-based exclusion)"
|
924
|
+
)
|
889
925
|
|
890
926
|
# Process string selectors (from PDF-level exclusions)
|
891
927
|
elif isinstance(exclusion_item, str):
|
@@ -939,6 +975,11 @@ class Page(
|
|
939
975
|
Returns:
|
940
976
|
A new list containing only the elements not excluded.
|
941
977
|
"""
|
978
|
+
# Skip exclusion filtering if we're currently computing exclusions
|
979
|
+
# This prevents infinite recursion when exclusion callables use find operations
|
980
|
+
if self._computing_exclusions:
|
981
|
+
return elements
|
982
|
+
|
942
983
|
# Check both page-level and PDF-level exclusions
|
943
984
|
has_page_exclusions = bool(self._exclusions)
|
944
985
|
has_pdf_exclusions = (
|
@@ -1245,15 +1286,46 @@ class Page(
|
|
1245
1286
|
Returns:
|
1246
1287
|
ElementCollection of matching elements (unfiltered by exclusions)
|
1247
1288
|
"""
|
1248
|
-
from natural_pdf.selectors.parser import selector_to_filter_func
|
1289
|
+
from natural_pdf.selectors.parser import _calculate_aggregates, selector_to_filter_func
|
1249
1290
|
|
1250
1291
|
# Handle compound OR selectors
|
1251
1292
|
if selector_obj.get("type") == "or":
|
1252
1293
|
# For OR selectors, search all elements and let the filter function decide
|
1253
1294
|
elements_to_search = self._element_mgr.get_all_elements()
|
1254
1295
|
|
1296
|
+
# Check if any sub-selector contains aggregate functions
|
1297
|
+
has_aggregates = False
|
1298
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1299
|
+
for attr in sub_selector.get("attributes", []):
|
1300
|
+
value = attr.get("value")
|
1301
|
+
if isinstance(value, dict) and value.get("type") == "aggregate":
|
1302
|
+
has_aggregates = True
|
1303
|
+
break
|
1304
|
+
if has_aggregates:
|
1305
|
+
break
|
1306
|
+
|
1307
|
+
# Calculate aggregates if needed - for OR selectors we calculate on ALL elements
|
1308
|
+
aggregates = {}
|
1309
|
+
if has_aggregates:
|
1310
|
+
# Need to calculate aggregates for each sub-selector type
|
1311
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1312
|
+
sub_type = sub_selector.get("type", "any").lower()
|
1313
|
+
if sub_type == "text":
|
1314
|
+
sub_elements = self._element_mgr.words
|
1315
|
+
elif sub_type == "rect":
|
1316
|
+
sub_elements = self._element_mgr.rects
|
1317
|
+
elif sub_type == "line":
|
1318
|
+
sub_elements = self._element_mgr.lines
|
1319
|
+
elif sub_type == "region":
|
1320
|
+
sub_elements = self._element_mgr.regions
|
1321
|
+
else:
|
1322
|
+
sub_elements = elements_to_search
|
1323
|
+
|
1324
|
+
sub_aggregates = _calculate_aggregates(sub_elements, sub_selector)
|
1325
|
+
aggregates.update(sub_aggregates)
|
1326
|
+
|
1255
1327
|
# Create filter function from compound selector
|
1256
|
-
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1328
|
+
filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
|
1257
1329
|
|
1258
1330
|
# Apply the filter to all elements
|
1259
1331
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
@@ -1309,8 +1381,23 @@ class Page(
|
|
1309
1381
|
else:
|
1310
1382
|
elements_to_search = self._element_mgr.get_all_elements()
|
1311
1383
|
|
1384
|
+
# Check if selector contains aggregate functions
|
1385
|
+
has_aggregates = False
|
1386
|
+
for attr in selector_obj.get("attributes", []):
|
1387
|
+
value = attr.get("value")
|
1388
|
+
if isinstance(value, dict) and value.get("type") == "aggregate":
|
1389
|
+
has_aggregates = True
|
1390
|
+
break
|
1391
|
+
|
1392
|
+
# Calculate aggregates if needed
|
1393
|
+
aggregates = {}
|
1394
|
+
if has_aggregates:
|
1395
|
+
# For aggregates, we need to calculate based on ALL elements of the same type
|
1396
|
+
# not just the filtered subset
|
1397
|
+
aggregates = _calculate_aggregates(elements_to_search, selector_obj)
|
1398
|
+
|
1312
1399
|
# Create filter function from selector, passing any additional parameters
|
1313
|
-
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1400
|
+
filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
|
1314
1401
|
|
1315
1402
|
# Apply the filter to matching elements
|
1316
1403
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
@@ -1857,7 +1944,9 @@ class Page(
|
|
1857
1944
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1858
1945
|
show_progress: bool = False,
|
1859
1946
|
content_filter=None,
|
1860
|
-
|
1947
|
+
verticals: Optional[List[float]] = None,
|
1948
|
+
horizontals: Optional[List[float]] = None,
|
1949
|
+
) -> TableResult:
|
1861
1950
|
"""
|
1862
1951
|
Extract the largest table from this page using enhanced region-based extraction.
|
1863
1952
|
|
@@ -1874,9 +1963,11 @@ class Page(
|
|
1874
1963
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1875
1964
|
- A callable that takes text and returns True to KEEP the character
|
1876
1965
|
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1966
|
+
verticals: Optional list of x-coordinates for explicit vertical table lines.
|
1967
|
+
horizontals: Optional list of y-coordinates for explicit horizontal table lines.
|
1877
1968
|
|
1878
1969
|
Returns:
|
1879
|
-
|
1970
|
+
TableResult: A sequence-like object containing table rows that also provides .to_df() for pandas conversion.
|
1880
1971
|
"""
|
1881
1972
|
# Create a full-page region and delegate to its enhanced extract_table method
|
1882
1973
|
page_region = self.create_region(0, 0, self.width, self.height)
|
@@ -1889,6 +1980,8 @@ class Page(
|
|
1889
1980
|
cell_extraction_func=cell_extraction_func,
|
1890
1981
|
show_progress=show_progress,
|
1891
1982
|
content_filter=content_filter,
|
1983
|
+
verticals=verticals,
|
1984
|
+
horizontals=horizontals,
|
1892
1985
|
)
|
1893
1986
|
|
1894
1987
|
def extract_tables(
|
@@ -2768,6 +2861,7 @@ class Page(
|
|
2768
2861
|
region.start_element = current_start_element
|
2769
2862
|
region.end_element = end_boundary_el # Mark the element that ended it
|
2770
2863
|
region.is_end_next_start = True # Mark how it ended
|
2864
|
+
region._boundary_exclusions = include_boundaries
|
2771
2865
|
regions.append(region)
|
2772
2866
|
else: # horizontal
|
2773
2867
|
sec_left = (
|
@@ -2787,6 +2881,7 @@ class Page(
|
|
2787
2881
|
region.start_element = current_start_element
|
2788
2882
|
region.end_element = end_boundary_el # Mark the element that ended it
|
2789
2883
|
region.is_end_next_start = True # Mark how it ended
|
2884
|
+
region._boundary_exclusions = include_boundaries
|
2790
2885
|
regions.append(region)
|
2791
2886
|
active_section_started = False # Reset for the new start
|
2792
2887
|
|
@@ -2815,6 +2910,7 @@ class Page(
|
|
2815
2910
|
region.start_element = current_start_element
|
2816
2911
|
region.end_element = end_boundary_el
|
2817
2912
|
region.is_end_next_start = False
|
2913
|
+
region._boundary_exclusions = include_boundaries
|
2818
2914
|
regions.append(region)
|
2819
2915
|
else: # horizontal
|
2820
2916
|
sec_left = (
|
@@ -2834,6 +2930,7 @@ class Page(
|
|
2834
2930
|
region.start_element = current_start_element
|
2835
2931
|
region.end_element = end_boundary_el
|
2836
2932
|
region.is_end_next_start = False
|
2933
|
+
region._boundary_exclusions = include_boundaries
|
2837
2934
|
regions.append(region)
|
2838
2935
|
|
2839
2936
|
# Reset: section ended explicitly
|
@@ -2854,6 +2951,7 @@ class Page(
|
|
2854
2951
|
region.start_element = current_start_element
|
2855
2952
|
region.end_element = None # Ended by page end
|
2856
2953
|
region.is_end_next_start = False
|
2954
|
+
region._boundary_exclusions = include_boundaries
|
2857
2955
|
regions.append(region)
|
2858
2956
|
else: # horizontal
|
2859
2957
|
sec_left = (
|
@@ -2867,6 +2965,7 @@ class Page(
|
|
2867
2965
|
region.start_element = current_start_element
|
2868
2966
|
region.end_element = None # Ended by page end
|
2869
2967
|
region.is_end_next_start = False
|
2968
|
+
region._boundary_exclusions = include_boundaries
|
2870
2969
|
regions.append(region)
|
2871
2970
|
|
2872
2971
|
return ElementCollection(regions)
|
@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
789
789
|
start_element.page,
|
790
790
|
(0, top, start_element.page.width, bottom),
|
791
791
|
)
|
792
|
+
section._boundary_exclusions = include_boundaries
|
792
793
|
else: # horizontal
|
793
794
|
left = start_element.x0
|
794
795
|
right = end_element.x1
|
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
821
822
|
start_element.page,
|
822
823
|
(left, 0, right, start_element.page.height),
|
823
824
|
)
|
825
|
+
section._boundary_exclusions = include_boundaries
|
824
826
|
section.start_element = start_element
|
825
827
|
section.boundary_element_found = end_element
|
826
828
|
else:
|
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
865
867
|
start_element.page, (0, top, start_element.page.width, bottom)
|
866
868
|
)
|
867
869
|
section.start_element = start_element
|
870
|
+
section.end_element = (
|
871
|
+
next_start # The next start is the end of this section
|
872
|
+
)
|
873
|
+
section._boundary_exclusions = include_boundaries
|
868
874
|
sections.append(section)
|
869
875
|
else: # horizontal
|
870
876
|
# Determine horizontal bounds
|
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
882
888
|
start_element.page, (left, 0, right, start_element.page.height)
|
883
889
|
)
|
884
890
|
section.start_element = start_element
|
891
|
+
section.end_element = (
|
892
|
+
next_start # The next start is the end of this section
|
893
|
+
)
|
894
|
+
section._boundary_exclusions = include_boundaries
|
885
895
|
sections.append(section)
|
886
896
|
else:
|
887
897
|
# Cross-page section - create from current_start to the end of its page
|
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
982
992
|
|
983
993
|
return ElementCollection(sections)
|
984
994
|
|
995
|
+
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
    """
    Divide this page collection into sections based on the provided divider elements.

    Sections are produced by ``get_sections(start_elements=divider, ...)``.
    When the first element of the collection is not the first divider, an
    additional leading section (start of content up to the first divider) is
    prepended to the result.

    Args:
        divider: Elements or selector string that mark section boundaries.
            When ``None``, the result of ``get_sections()`` is returned as is.
        **kwargs: Additional parameters passed to get_sections()
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")

        # Split by page breaks
        page_sections = pdf.pages.split(None, new_section_on_page_break=True)

        # Split multi-page document by section headers
        sections = pdf.pages[10:20].split("text:bold:contains('Section')")
    """
    from itertools import chain

    # Default to 'start' boundaries for split (include divider at start of each section)
    kwargs.setdefault("include_boundaries", "start")

    sections = self.get_sections(start_elements=divider, **kwargs)

    # Nothing to prepend when no sections were found or no divider was given.
    if not sections or divider is None:
        return sections

    # Only the very first element matters, so fetch it lazily instead of
    # materialising every element of every page.
    missing = object()
    first_element = next(
        chain.from_iterable(page.get_elements() for page in self.pages), missing
    )
    if first_element is missing:
        return sections

    # Locate the first divider.
    first_divider = None
    if isinstance(divider, str):
        # Selector string: take the first match in page order.
        for page in self.pages:
            candidate = page.find(divider)
            if candidate is not None:
                first_divider = candidate
                break
    elif hasattr(divider, "__getitem__"):
        # Sequence/collection of elements; an empty one has no first divider
        # (previously this raised IndexError).
        try:
            first_divider = divider[0]
        except IndexError:
            first_divider = None
    else:
        # A single element was passed directly.
        first_divider = divider

    if first_divider is not None and first_element != first_divider:
        # There's content before the first divider: build a section running
        # from the start of the content up to (not including) that divider.
        initial_sections = self.get_sections(
            start_elements=None,
            end_elements=[first_divider],
            include_boundaries="none",
            orientation=kwargs.get("orientation", "vertical"),
        )
        if initial_sections:
            sections = ElementCollection([initial_sections[0]] + list(sections))

    return sections
|
1059
|
+
|
985
1060
|
def _gather_analysis_data(
|
986
1061
|
self,
|
987
1062
|
analysis_keys: List[str],
|
natural_pdf/core/pdf.py
CHANGED
@@ -1333,6 +1333,39 @@ class PDF(
|
|
1333
1333
|
orientation=orientation,
|
1334
1334
|
)
|
1335
1335
|
|
1336
|
+
def split(self, divider, **kwargs) -> "ElementCollection":
    """
    Split the whole PDF into sections at the given divider elements.

    This is a convenience wrapper: the call is forwarded unchanged to
    ``self.pages.split(divider, **kwargs)``.

    Args:
        divider: Elements or selector string that mark section boundaries
        **kwargs: Additional parameters passed to get_sections()
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.split("text[size>20]:contains('Chapter')")

        # Export each chapter to a separate file
        for i, chapter in enumerate(chapters):
            chapter_text = chapter.extract_text()
            with open(f"chapter_{i+1}.txt", "w") as f:
                f.write(chapter_text)

        # Split by horizontal rules/lines
        sections = pdf.split("line[orientation=horizontal]")

        # Split only by page breaks (no divider elements)
        pages = pdf.split(None, new_section_on_page_break=True)
    """
    # The page collection owns the actual splitting logic.
    page_collection = self.pages
    return page_collection.split(divider, **kwargs)
|
1368
|
+
|
1336
1369
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
1337
1370
|
"""
|
1338
1371
|
DEPRECATED: Use save_pdf(..., ocr=True) instead.
|
natural_pdf/describe/base.py
CHANGED
@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
|
|
272
272
|
"font_family",
|
273
273
|
"font_variant",
|
274
274
|
"size",
|
275
|
-
"
|
276
|
-
"italic",
|
277
|
-
"strike",
|
278
|
-
"underline",
|
279
|
-
"highlight",
|
275
|
+
"styles",
|
280
276
|
"source",
|
281
277
|
"confidence",
|
278
|
+
"color",
|
282
279
|
]
|
283
280
|
)
|
284
|
-
# Add foreground text colour too
|
285
|
-
columns.append("color")
|
286
281
|
elif element_type == "rect":
|
287
282
|
columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
|
288
283
|
elif element_type == "line":
|
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
358
353
|
return str(col_val)
|
359
354
|
return ""
|
360
355
|
|
356
|
+
elif column == "styles":
|
357
|
+
# Collect all active text decorations
|
358
|
+
styles = []
|
359
|
+
|
360
|
+
if getattr(element, "bold", False):
|
361
|
+
styles.append("bold")
|
362
|
+
if getattr(element, "italic", False):
|
363
|
+
styles.append("italic")
|
364
|
+
if getattr(element, "strike", False):
|
365
|
+
styles.append("strike")
|
366
|
+
if getattr(element, "underline", False):
|
367
|
+
styles.append("underline")
|
368
|
+
|
369
|
+
# Handle highlight specially - include color if not default yellow
|
370
|
+
if getattr(element, "is_highlighted", False):
|
371
|
+
highlight_color = getattr(element, "highlight_color", None)
|
372
|
+
if highlight_color is not None:
|
373
|
+
# Convert color to hex if needed
|
374
|
+
if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
|
375
|
+
try:
|
376
|
+
r, g, b = [
|
377
|
+
int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
|
378
|
+
]
|
379
|
+
hex_color = f"#{r:02x}{g:02x}{b:02x}"
|
380
|
+
styles.append(f"highlight({hex_color})")
|
381
|
+
except Exception:
|
382
|
+
styles.append("highlight")
|
383
|
+
elif isinstance(highlight_color, (int, float)):
|
384
|
+
# Grayscale value
|
385
|
+
try:
|
386
|
+
gray = (
|
387
|
+
int(highlight_color * 255)
|
388
|
+
if highlight_color <= 1
|
389
|
+
else int(highlight_color)
|
390
|
+
)
|
391
|
+
hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
|
392
|
+
styles.append(f"highlight({hex_color})")
|
393
|
+
except Exception:
|
394
|
+
styles.append("highlight")
|
395
|
+
else:
|
396
|
+
styles.append("highlight")
|
397
|
+
else:
|
398
|
+
styles.append("highlight")
|
399
|
+
|
400
|
+
return ", ".join(styles) if styles else ""
|
401
|
+
|
361
402
|
elif column in ["stroke", "fill", "color"]:
|
362
403
|
value = getattr(element, column, None)
|
363
404
|
# If already a string (e.g. '#ff00aa' or 'red') return as is
|