natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
789
789
|
start_element.page,
|
790
790
|
(0, top, start_element.page.width, bottom),
|
791
791
|
)
|
792
|
+
section._boundary_exclusions = include_boundaries
|
792
793
|
else: # horizontal
|
793
794
|
left = start_element.x0
|
794
795
|
right = end_element.x1
|
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
821
822
|
start_element.page,
|
822
823
|
(left, 0, right, start_element.page.height),
|
823
824
|
)
|
825
|
+
section._boundary_exclusions = include_boundaries
|
824
826
|
section.start_element = start_element
|
825
827
|
section.boundary_element_found = end_element
|
826
828
|
else:
|
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
865
867
|
start_element.page, (0, top, start_element.page.width, bottom)
|
866
868
|
)
|
867
869
|
section.start_element = start_element
|
870
|
+
section.end_element = (
|
871
|
+
next_start # The next start is the end of this section
|
872
|
+
)
|
873
|
+
section._boundary_exclusions = include_boundaries
|
868
874
|
sections.append(section)
|
869
875
|
else: # horizontal
|
870
876
|
# Determine horizontal bounds
|
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
882
888
|
start_element.page, (left, 0, right, start_element.page.height)
|
883
889
|
)
|
884
890
|
section.start_element = start_element
|
891
|
+
section.end_element = (
|
892
|
+
next_start # The next start is the end of this section
|
893
|
+
)
|
894
|
+
section._boundary_exclusions = include_boundaries
|
885
895
|
sections.append(section)
|
886
896
|
else:
|
887
897
|
# Cross-page section - create from current_start to the end of its page
|
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
|
|
982
992
|
|
983
993
|
return ElementCollection(sections)
|
984
994
|
|
995
|
+
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
    """
    Divide this page collection into sections based on the provided divider elements.

    The sections are produced by ``get_sections(start_elements=divider, ...)``.
    If the first element of the first non-empty page is not the first divider,
    an extra leading section (from the start up to, but excluding, the first
    divider) is prepended to the result.

    Args:
        divider: Elements or selector string that mark section boundaries.
            Pass ``None`` to rely solely on the other get_sections() options.
        **kwargs: Additional parameters passed to get_sections()
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")

        # Split by page breaks
        page_sections = pdf.pages.split(None, new_section_on_page_break=True)

        # Split multi-page document by section headers
        sections = pdf.pages[10:20].split("text:bold:contains('Section')")
    """
    # Default to 'start' boundaries for split (include divider at start of each section)
    kwargs.setdefault("include_boundaries", "start")

    sections = self.get_sections(start_elements=divider, **kwargs)

    # Nothing to prepend when there are no sections or no divider to anchor on.
    if not sections or divider is None:
        return sections

    # Only the very first element is needed, so stop at the first non-empty page
    # instead of collecting the elements of every page.
    first_element = None
    has_elements = False
    for page in self.pages:
        page_elements = page.get_elements()
        if page_elements:
            first_element = page_elements[0]
            has_elements = True
            break
    if not has_elements:
        return sections

    # Resolve the first divider element.
    if isinstance(divider, str):
        # Selector string: take the first match in page order.
        first_divider = None
        for page in self.pages:
            match = page.find(divider)
            if match:
                first_divider = match
                break
    elif hasattr(divider, "__getitem__"):
        # Collection of elements: an empty one simply has no first divider.
        try:
            first_divider = divider[0]
        except IndexError:
            first_divider = None
    else:
        # A single element was passed directly.
        first_divider = divider

    if first_divider and first_element != first_divider:
        # There's content before the first divider: build the section that runs
        # from the start to the first divider. Only `orientation` is forwarded
        # from kwargs; boundaries are always excluded for this leading section.
        initial_sections = self.get_sections(
            start_elements=None,
            end_elements=[first_divider],
            include_boundaries="none",
            orientation=kwargs.get("orientation", "vertical"),
        )
        if initial_sections:
            sections = ElementCollection([initial_sections[0]] + list(sections))

    return sections
|
1059
|
+
|
985
1060
|
def _gather_analysis_data(
|
986
1061
|
self,
|
987
1062
|
analysis_keys: List[str],
|
natural_pdf/core/pdf.py
CHANGED
@@ -1333,6 +1333,39 @@ class PDF(
|
|
1333
1333
|
orientation=orientation,
|
1334
1334
|
)
|
1335
1335
|
|
1336
|
+
def split(self, divider, **kwargs) -> "ElementCollection":
    """
    Break the PDF into sections at the given divider elements.

    This is a convenience wrapper: the call is forwarded unchanged to
    ``self.pages.split(divider, **kwargs)``.

    Args:
        divider: Elements or selector string that mark section boundaries
        **kwargs: Additional parameters passed to get_sections()
            - include_boundaries: How to include boundary elements (default: 'start')
            - orientation: 'vertical' or 'horizontal' (default: 'vertical')
            - new_section_on_page_break: Whether to split at page boundaries (default: False)

    Returns:
        ElementCollection of Region objects representing the sections

    Example:
        # Split a PDF by chapter titles
        chapters = pdf.split("text[size>20]:contains('Chapter')")

        # Export each chapter to a separate file
        for i, chapter in enumerate(chapters):
            chapter_text = chapter.extract_text()
            with open(f"chapter_{i+1}.txt", "w") as f:
                f.write(chapter_text)

        # Split by horizontal rules/lines
        sections = pdf.split("line[orientation=horizontal]")

        # Split only by page breaks (no divider elements)
        pages = pdf.split(None, new_section_on_page_break=True)
    """
    # The page collection owns the splitting logic; hand everything over to it.
    page_collection = self.pages
    result = page_collection.split(divider, **kwargs)
    return result
|
1368
|
+
|
1336
1369
|
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
1337
1370
|
"""
|
1338
1371
|
DEPRECATED: Use save_pdf(..., ocr=True) instead.
|
natural_pdf/describe/base.py
CHANGED
@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
|
|
272
272
|
"font_family",
|
273
273
|
"font_variant",
|
274
274
|
"size",
|
275
|
-
"bold",
|
276
|
-
"italic",
|
277
|
-
"strike",
|
278
|
-
"underline",
|
279
|
-
"highlight",
|
275
|
+
"styles",
|
280
276
|
"source",
|
281
277
|
"confidence",
|
278
|
+
"color",
|
282
279
|
]
|
283
280
|
)
|
284
|
-
# Add foreground text colour too
|
285
|
-
columns.append("color")
|
286
281
|
elif element_type == "rect":
|
287
282
|
columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
|
288
283
|
elif element_type == "line":
|
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
358
353
|
return str(col_val)
|
359
354
|
return ""
|
360
355
|
|
356
|
+
elif column == "styles":
|
357
|
+
# Collect all active text decorations
|
358
|
+
styles = []
|
359
|
+
|
360
|
+
if getattr(element, "bold", False):
|
361
|
+
styles.append("bold")
|
362
|
+
if getattr(element, "italic", False):
|
363
|
+
styles.append("italic")
|
364
|
+
if getattr(element, "strike", False):
|
365
|
+
styles.append("strike")
|
366
|
+
if getattr(element, "underline", False):
|
367
|
+
styles.append("underline")
|
368
|
+
|
369
|
+
# Handle highlight specially - include color if not default yellow
|
370
|
+
if getattr(element, "is_highlighted", False):
|
371
|
+
highlight_color = getattr(element, "highlight_color", None)
|
372
|
+
if highlight_color is not None:
|
373
|
+
# Convert color to hex if needed
|
374
|
+
if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
|
375
|
+
try:
|
376
|
+
r, g, b = [
|
377
|
+
int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
|
378
|
+
]
|
379
|
+
hex_color = f"#{r:02x}{g:02x}{b:02x}"
|
380
|
+
styles.append(f"highlight({hex_color})")
|
381
|
+
except Exception:
|
382
|
+
styles.append("highlight")
|
383
|
+
elif isinstance(highlight_color, (int, float)):
|
384
|
+
# Grayscale value
|
385
|
+
try:
|
386
|
+
gray = (
|
387
|
+
int(highlight_color * 255)
|
388
|
+
if highlight_color <= 1
|
389
|
+
else int(highlight_color)
|
390
|
+
)
|
391
|
+
hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
|
392
|
+
styles.append(f"highlight({hex_color})")
|
393
|
+
except Exception:
|
394
|
+
styles.append("highlight")
|
395
|
+
else:
|
396
|
+
styles.append("highlight")
|
397
|
+
else:
|
398
|
+
styles.append("highlight")
|
399
|
+
|
400
|
+
return ", ".join(styles) if styles else ""
|
401
|
+
|
361
402
|
elif column in ["stroke", "fill", "color"]:
|
362
403
|
value = getattr(element, column, None)
|
363
404
|
# If already a string (e.g. '#ff00aa' or 'red') return as is
|