natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
Base Element class for natural-pdf.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overload
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union, overload
|
6
6
|
|
7
7
|
from PIL import Image
|
8
8
|
|
9
9
|
from natural_pdf.classification.mixin import ClassificationMixin
|
10
|
+
from natural_pdf.core.render_spec import RenderSpec, Visualizable
|
10
11
|
from natural_pdf.describe.mixin import DescribeMixin
|
11
12
|
|
12
13
|
# Import selector parsing functions
|
@@ -15,7 +16,7 @@ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
|
15
16
|
if TYPE_CHECKING:
|
16
17
|
from natural_pdf.classification.manager import ClassificationManager # noqa: F401
|
17
18
|
from natural_pdf.core.page import Page
|
18
|
-
from natural_pdf.elements.collections import ElementCollection
|
19
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
19
20
|
from natural_pdf.elements.region import Region
|
20
21
|
|
21
22
|
|
@@ -563,7 +564,56 @@ class DirectionalMixin:
|
|
563
564
|
return matches[0]
|
564
565
|
|
565
566
|
|
566
|
-
class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
567
|
+
class HighlightableMixin:
|
568
|
+
"""
|
569
|
+
Mixin that provides the highlighting protocol for elements.
|
570
|
+
|
571
|
+
This protocol enables ElementCollection.show() to work with mixed content
|
572
|
+
including FlowRegions and elements from multiple pages by providing a
|
573
|
+
standard way to get highlight specifications.
|
574
|
+
"""
|
575
|
+
|
576
|
+
def get_highlight_specs(self) -> List[Dict[str, Any]]:
|
577
|
+
"""
|
578
|
+
Get highlight specifications for this element.
|
579
|
+
|
580
|
+
Returns a list of dictionaries, each containing:
|
581
|
+
- page: The Page object to highlight on
|
582
|
+
- page_index: The 0-based index of the page
|
583
|
+
- bbox: The bounding box (x0, y0, x1, y1) to highlight
|
584
|
+
- polygon: Optional polygon coordinates for non-rectangular highlights
|
585
|
+
- element: Reference to the element being highlighted
|
586
|
+
|
587
|
+
For regular elements, this returns a single spec.
|
588
|
+
For FlowRegions, this returns specs for all constituent regions.
|
589
|
+
|
590
|
+
Returns:
|
591
|
+
List of highlight specification dictionaries
|
592
|
+
"""
|
593
|
+
# Default implementation for regular elements
|
594
|
+
if not hasattr(self, "page") or self.page is None:
|
595
|
+
return []
|
596
|
+
|
597
|
+
if not hasattr(self, "bbox") or self.bbox is None:
|
598
|
+
return []
|
599
|
+
|
600
|
+
spec = {
|
601
|
+
"page": self.page,
|
602
|
+
"page_index": self.page.index if hasattr(self.page, "index") else 0,
|
603
|
+
"bbox": self.bbox,
|
604
|
+
"element": self,
|
605
|
+
}
|
606
|
+
|
607
|
+
# Add polygon if available
|
608
|
+
if hasattr(self, "polygon") and hasattr(self, "has_polygon") and self.has_polygon:
|
609
|
+
spec["polygon"] = self.polygon
|
610
|
+
|
611
|
+
return [spec]
|
612
|
+
|
613
|
+
|
614
|
+
class Element(
|
615
|
+
DirectionalMixin, ClassificationMixin, DescribeMixin, HighlightableMixin, Visualizable
|
616
|
+
):
|
567
617
|
"""Base class for all PDF elements.
|
568
618
|
|
569
619
|
This class provides common properties and methods for all PDF elements,
|
@@ -1024,7 +1074,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1024
1074
|
label: str = "",
|
1025
1075
|
color: Optional[Tuple[float, float, float]] = None,
|
1026
1076
|
use_color_cycling: bool = True,
|
1027
|
-
|
1077
|
+
annotate: Optional[List[str]] = None,
|
1028
1078
|
existing: str = "append",
|
1029
1079
|
) -> "Element":
|
1030
1080
|
"""Highlight the element with the specified colour.
|
@@ -1042,7 +1092,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1042
1092
|
"label": label,
|
1043
1093
|
"use_color_cycling": use_color_cycling,
|
1044
1094
|
"element": self, # Pass the element itself so attributes can be accessed
|
1045
|
-
"
|
1095
|
+
"annotate": annotate,
|
1046
1096
|
"existing": existing,
|
1047
1097
|
}
|
1048
1098
|
|
@@ -1056,84 +1106,67 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1056
1106
|
|
1057
1107
|
return self
|
1058
1108
|
|
1059
|
-
def
|
1109
|
+
def _get_render_specs(
|
1060
1110
|
self,
|
1061
|
-
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1111
|
+
mode: Literal["show", "render"] = "show",
|
1112
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
1113
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
1114
|
+
crop: Union[bool, Literal["content"]] = False,
|
1115
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
1065
1116
|
label: Optional[str] = None,
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
"""
|
1070
|
-
Show the page with only this element highlighted temporarily.
|
1117
|
+
**kwargs,
|
1118
|
+
) -> List[RenderSpec]:
|
1119
|
+
"""Get render specifications for this element.
|
1071
1120
|
|
1072
1121
|
Args:
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1080
|
-
bounding box before legends/overlays are added.
|
1122
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
1123
|
+
color: Color for highlighting this element in show mode
|
1124
|
+
highlights: Additional highlight groups to show
|
1125
|
+
crop: Whether to crop to element bounds
|
1126
|
+
crop_bbox: Explicit crop bounds
|
1127
|
+
label: Optional label for this element
|
1128
|
+
**kwargs: Additional parameters
|
1081
1129
|
|
1082
1130
|
Returns:
|
1083
|
-
|
1131
|
+
List with single RenderSpec for this element's page
|
1084
1132
|
"""
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
"bbox": self.bbox if not self.has_polygon else None,
|
1109
|
-
"polygon": self.polygon if self.has_polygon else None,
|
1110
|
-
"color": color, # Use provided or default color
|
1111
|
-
"label": display_label,
|
1112
|
-
"use_color_cycling": False, # Explicitly false for single preview
|
1113
|
-
}
|
1133
|
+
if not hasattr(self, "page") or self.page is None:
|
1134
|
+
return []
|
1135
|
+
|
1136
|
+
spec = RenderSpec(page=self.page)
|
1137
|
+
|
1138
|
+
# Handle cropping
|
1139
|
+
if crop_bbox:
|
1140
|
+
spec.crop_bbox = crop_bbox
|
1141
|
+
elif crop == "content" or crop is True:
|
1142
|
+
# Crop to element bounds
|
1143
|
+
if hasattr(self, "bbox") and self.bbox:
|
1144
|
+
spec.crop_bbox = self.bbox
|
1145
|
+
|
1146
|
+
# Add highlight in show mode
|
1147
|
+
if mode == "show":
|
1148
|
+
# Use provided label or generate one
|
1149
|
+
element_label = label if label is not None else self.__class__.__name__
|
1150
|
+
|
1151
|
+
spec.add_highlight(
|
1152
|
+
element=self,
|
1153
|
+
color=color or "red", # Default red for single element
|
1154
|
+
label=element_label,
|
1155
|
+
)
|
1114
1156
|
|
1115
|
-
|
1116
|
-
|
1157
|
+
# Add additional highlight groups if provided
|
1158
|
+
if highlights:
|
1159
|
+
for group in highlights:
|
1160
|
+
group_elements = group.get("elements", [])
|
1161
|
+
group_color = group.get("color", color)
|
1162
|
+
group_label = group.get("label")
|
1117
1163
|
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1164
|
+
for elem in group_elements:
|
1165
|
+
# Only add if element is on same page
|
1166
|
+
if hasattr(elem, "page") and elem.page == self.page:
|
1167
|
+
spec.add_highlight(element=elem, color=group_color, label=group_label)
|
1122
1168
|
|
1123
|
-
|
1124
|
-
try:
|
1125
|
-
return service.render_preview(
|
1126
|
-
page_index=self.page.index,
|
1127
|
-
temporary_highlights=[temp_highlight_data],
|
1128
|
-
resolution=resolution,
|
1129
|
-
width=width, # Pass the width parameter
|
1130
|
-
labels=labels,
|
1131
|
-
legend_position=legend_position,
|
1132
|
-
crop_bbox=crop_bbox,
|
1133
|
-
)
|
1134
|
-
except Exception as e:
|
1135
|
-
logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
|
1136
|
-
return None
|
1169
|
+
return [spec]
|
1137
1170
|
|
1138
1171
|
def save(
|
1139
1172
|
self,
|
@@ -1346,22 +1379,14 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1346
1379
|
resolution = kwargs.get("resolution", 150)
|
1347
1380
|
from natural_pdf.elements.region import Region # Local import to avoid cycles
|
1348
1381
|
|
1349
|
-
|
1382
|
+
# Use render() for clean image without highlights
|
1383
|
+
return self.expand().render(
|
1350
1384
|
resolution=resolution,
|
1351
|
-
include_highlights=False,
|
1352
1385
|
crop=True,
|
1353
1386
|
)
|
1354
1387
|
else:
|
1355
1388
|
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
1356
1389
|
|
1357
|
-
# ------------------------------------------------------------------
|
1358
|
-
# Lightweight to_image proxy (vision models, previews, etc.)
|
1359
|
-
# ------------------------------------------------------------------
|
1360
|
-
|
1361
|
-
def to_image(self, *args, **kwargs): # type: ignore[override]
|
1362
|
-
"""Generate an image of this element by delegating to a temporary Region."""
|
1363
|
-
return self.expand().to_image(*args, **kwargs)
|
1364
|
-
|
1365
1390
|
# ------------------------------------------------------------------
|
1366
1391
|
# Unified analysis storage (maps to metadata["analysis"])
|
1367
1392
|
# ------------------------------------------------------------------
|