natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -86,9 +86,8 @@ class LayoutAnalyzer:
|
|
86
86
|
layout_resolution = getattr(self._page._parent, "_config", {}).get(
|
87
87
|
"layout_image_resolution", 72
|
88
88
|
)
|
89
|
-
|
90
|
-
|
91
|
-
)
|
89
|
+
# Use render() for clean image without highlights
|
90
|
+
std_res_page_image = self._page.render(resolution=layout_resolution)
|
92
91
|
if not std_res_page_image:
|
93
92
|
raise ValueError("Initial page rendering returned None")
|
94
93
|
logger.debug(
|
@@ -128,7 +128,17 @@ class LayoutManager:
|
|
128
128
|
engine_class = engine_class_or_factory
|
129
129
|
|
130
130
|
detector_instance = engine_class() # Instantiate
|
131
|
-
|
131
|
+
|
132
|
+
# Try to check availability and capture any errors
|
133
|
+
availability_error = None
|
134
|
+
is_available = False
|
135
|
+
try:
|
136
|
+
is_available = detector_instance.is_available()
|
137
|
+
except Exception as e:
|
138
|
+
availability_error = e
|
139
|
+
logger.error(f"Error checking availability of {engine_name}: {e}", exc_info=True)
|
140
|
+
|
141
|
+
if not is_available:
|
132
142
|
# Check availability before storing
|
133
143
|
# Construct helpful error message with install hint
|
134
144
|
install_hint = ""
|
@@ -141,9 +151,13 @@ class LayoutManager:
|
|
141
151
|
else:
|
142
152
|
install_hint = f"(Check installation requirements for {engine_name})"
|
143
153
|
|
144
|
-
|
145
|
-
|
146
|
-
|
154
|
+
error_msg = f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
|
155
|
+
|
156
|
+
# If we have an availability error, include it
|
157
|
+
if availability_error:
|
158
|
+
error_msg += f"\nAvailability check error: {availability_error}"
|
159
|
+
|
160
|
+
raise RuntimeError(error_msg)
|
147
161
|
self._detector_instances[engine_name] = detector_instance # Store if available
|
148
162
|
|
149
163
|
return self._detector_instances[engine_name]
|
@@ -42,13 +42,21 @@ logger = logging.getLogger(__name__)
|
|
42
42
|
paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
|
43
43
|
paddleocr_spec = importlib.util.find_spec("paddleocr")
|
44
44
|
PPStructureV3 = None
|
45
|
+
_paddle_import_error = None # Store the import error for debugging
|
45
46
|
|
46
47
|
if paddle_spec and paddleocr_spec:
|
47
48
|
try:
|
48
49
|
from paddleocr import PPStructureV3
|
49
50
|
except ImportError as e:
|
51
|
+
_paddle_import_error = str(e)
|
50
52
|
logger.warning(f"Could not import Paddle dependencies: {e}")
|
51
53
|
else:
|
54
|
+
if not paddle_spec:
|
55
|
+
_paddle_import_error = "paddlepaddle not found"
|
56
|
+
elif not paddleocr_spec:
|
57
|
+
_paddle_import_error = "paddleocr not found"
|
58
|
+
else:
|
59
|
+
_paddle_import_error = "Unknown import issue"
|
52
60
|
logger.warning(
|
53
61
|
"paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
|
54
62
|
)
|
@@ -82,6 +90,9 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
82
90
|
|
83
91
|
def is_available(self) -> bool:
|
84
92
|
"""Check if dependencies are installed."""
|
93
|
+
if PPStructureV3 is None and _paddle_import_error:
|
94
|
+
# Raise an informative error instead of just returning False
|
95
|
+
raise RuntimeError(f"Paddle dependencies check failed: {_paddle_import_error}")
|
85
96
|
return PPStructureV3 is not None
|
86
97
|
|
87
98
|
def _get_cache_key(self, options: BaseLayoutOptions) -> str:
|
@@ -188,9 +188,8 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
188
188
|
high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
|
189
189
|
"surya_table_rec_dpi", 192
|
190
190
|
)
|
191
|
-
|
192
|
-
|
193
|
-
)
|
191
|
+
# Use render() for clean image without highlights
|
192
|
+
high_res_page_image = self._page_ref.render(resolution=high_res_dpi)
|
194
193
|
|
195
194
|
# Render high-res page ONCE
|
196
195
|
self.logger.debug(
|
@@ -10,8 +10,9 @@ from sklearn.cluster import MiniBatchKMeans
|
|
10
10
|
|
11
11
|
if TYPE_CHECKING:
|
12
12
|
from natural_pdf.core.page import Page
|
13
|
+
from natural_pdf.core.page_collection import PageCollection
|
13
14
|
from natural_pdf.core.pdf import PDF
|
14
|
-
from natural_pdf.elements.
|
15
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
15
16
|
from natural_pdf.elements.line import LineElement
|
16
17
|
|
17
18
|
# from natural_pdf.elements.rect import RectangleElement # Removed
|
@@ -59,14 +60,13 @@ class ShapeDetectionMixin:
|
|
59
60
|
|
60
61
|
# Determine the type of self and get the appropriate image and page context
|
61
62
|
if (
|
62
|
-
hasattr(self, "
|
63
|
+
hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height")
|
63
64
|
): # Page or Region
|
64
65
|
if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"): # Region
|
65
66
|
logger.debug(f"Shape detection on Region: {self}")
|
66
67
|
page_obj = self._page
|
67
|
-
|
68
|
-
|
69
|
-
)
|
68
|
+
# Use render() for clean image without highlights, with cropping
|
69
|
+
pil_image = self.render(resolution=resolution, crop=True)
|
70
70
|
if pil_image: # Ensure pil_image is not None before accessing attributes
|
71
71
|
origin_offset_pdf = (self.x0, self.top)
|
72
72
|
logger.debug(
|
@@ -75,7 +75,8 @@ class ShapeDetectionMixin:
|
|
75
75
|
else: # Page
|
76
76
|
logger.debug(f"Shape detection on Page: {self}")
|
77
77
|
page_obj = self
|
78
|
-
|
78
|
+
# Use render() for clean image without highlights
|
79
|
+
pil_image = self.render(resolution=resolution)
|
79
80
|
logger.debug(
|
80
81
|
f"Page image rendered successfully: {pil_image.width}x{pil_image.height}"
|
81
82
|
)
|
@@ -150,6 +151,12 @@ class ShapeDetectionMixin:
|
|
150
151
|
origin_offset_pdf[1] + line_data_img["y2"] * effective_scale
|
151
152
|
) # y2 is the second y-coord
|
152
153
|
|
154
|
+
# Clamp coords to image dimensions
|
155
|
+
x0 = max(0, min(x0, page_obj.width))
|
156
|
+
top = max(0, min(top, page_obj.height))
|
157
|
+
x1 = max(0, min(x1, page_obj.width))
|
158
|
+
bottom = max(0, min(bottom, page_obj.height))
|
159
|
+
|
153
160
|
# For lines, width attribute in PDF points
|
154
161
|
line_width_pdf = line_data_img["width"] * effective_scale
|
155
162
|
|
@@ -158,7 +165,7 @@ class ShapeDetectionMixin:
|
|
158
165
|
getattr(page_obj._page, "initial_doctop", 0) if hasattr(page_obj, "_page") else 0
|
159
166
|
)
|
160
167
|
|
161
|
-
|
168
|
+
attrs = {
|
162
169
|
"x0": x0,
|
163
170
|
"top": top,
|
164
171
|
"x1": x1,
|
@@ -179,6 +186,8 @@ class ShapeDetectionMixin:
|
|
179
186
|
"raw_line_position_px": line_data_img.get("line_position_px"), # Added for clarity
|
180
187
|
}
|
181
188
|
|
189
|
+
return attrs
|
190
|
+
|
182
191
|
def _find_lines_on_image_data(
|
183
192
|
self,
|
184
193
|
cv_image: np.ndarray,
|
@@ -680,13 +689,12 @@ class ShapeDetectionMixin:
|
|
680
689
|
return self
|
681
690
|
|
682
691
|
pil_image_for_dims = None
|
683
|
-
if hasattr(self, "
|
692
|
+
if hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height"):
|
684
693
|
if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
|
685
|
-
pil_image_for_dims = self.
|
686
|
-
resolution=resolution, crop=True, include_highlights=False
|
687
|
-
)
|
694
|
+
pil_image_for_dims = self.render(resolution=resolution, crop=True)
|
688
695
|
else:
|
689
|
-
|
696
|
+
# Use render() for clean image without highlights
|
697
|
+
pil_image_for_dims = self.render(resolution=resolution)
|
690
698
|
if pil_image_for_dims is None:
|
691
699
|
logger.warning(f"Could not re-render PIL image for dimensions for {self}.")
|
692
700
|
pil_image_for_dims = Image.fromarray(cv_image) # Ensure it's not None
|
@@ -710,7 +718,6 @@ class ShapeDetectionMixin:
|
|
710
718
|
logger.info(
|
711
719
|
f"Removed {removed_count} existing lines with source '{source_label}' from {page_object_ctx}"
|
712
720
|
)
|
713
|
-
|
714
721
|
lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
|
715
722
|
cv_image=cv_image,
|
716
723
|
pil_image_rgb=pil_image_for_dims,
|
@@ -733,7 +740,6 @@ class ShapeDetectionMixin:
|
|
733
740
|
smoothing_sigma_v=smoothing_sigma_v,
|
734
741
|
peak_width_rel_height=peak_width_rel_height,
|
735
742
|
)
|
736
|
-
|
737
743
|
from natural_pdf.elements.line import LineElement
|
738
744
|
|
739
745
|
element_manager = page_object_ctx._element_mgr
|
@@ -742,14 +748,8 @@ class ShapeDetectionMixin:
|
|
742
748
|
element_constructor_data = self._convert_line_to_element_data(
|
743
749
|
line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
|
744
750
|
)
|
745
|
-
|
746
|
-
|
747
|
-
element_manager.add_element(line_element, element_type="lines")
|
748
|
-
except Exception as e:
|
749
|
-
logger.error(
|
750
|
-
f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
|
751
|
-
exc_info=True,
|
752
|
-
)
|
751
|
+
line_element = LineElement(element_constructor_data, page_object_ctx)
|
752
|
+
element_manager.add_element(line_element, element_type="lines")
|
753
753
|
|
754
754
|
logger.info(
|
755
755
|
f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using projection profiling."
|
@@ -826,14 +826,8 @@ class ShapeDetectionMixin:
|
|
826
826
|
element_constructor_data = self._convert_line_to_element_data(
|
827
827
|
line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
|
828
828
|
)
|
829
|
-
|
830
|
-
|
831
|
-
element_manager.add_element(line_element, element_type="lines")
|
832
|
-
except Exception as e:
|
833
|
-
logger.error(
|
834
|
-
f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
|
835
|
-
exc_info=True,
|
836
|
-
)
|
829
|
+
line_element = LineElement(element_constructor_data, page_object_ctx)
|
830
|
+
element_manager.add_element(line_element, element_type="lines")
|
837
831
|
|
838
832
|
logger.info(
|
839
833
|
f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using LSD."
|
@@ -1256,10 +1250,7 @@ class ShapeDetectionMixin:
|
|
1256
1250
|
and getattr(r, "source", None) == source_label
|
1257
1251
|
]
|
1258
1252
|
for r in old_blobs:
|
1259
|
-
|
1260
|
-
page_obj._element_mgr.regions.remove(r)
|
1261
|
-
except ValueError:
|
1262
|
-
pass
|
1253
|
+
page_obj._element_mgr.regions.remove(r)
|
1263
1254
|
|
1264
1255
|
# ── iterate clusters ───────────────────────────────────────────────────
|
1265
1256
|
unique_clusters = [cid for cid in np.unique(labels_img) if cid >= 0]
|
@@ -14,7 +14,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
|
|
14
14
|
if TYPE_CHECKING:
|
15
15
|
from natural_pdf.core.page import Page
|
16
16
|
from natural_pdf.elements.base import Element
|
17
|
-
from natural_pdf.elements.
|
17
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
18
18
|
|
19
19
|
logger = logging.getLogger(__name__)
|
20
20
|
|
@@ -282,7 +282,7 @@ class TextStyleAnalyzer:
|
|
282
282
|
def analyze(
|
283
283
|
self, page: "Page", options: Optional[TextStyleOptions] = None
|
284
284
|
) -> "ElementCollection":
|
285
|
-
from natural_pdf.elements.
|
285
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
286
286
|
|
287
287
|
current_options = options or self.options
|
288
288
|
logger.info(
|
@@ -92,9 +92,10 @@ class ApplyMixin:
|
|
92
92
|
|
93
93
|
# Import here to avoid circular imports
|
94
94
|
from natural_pdf import PDF, Page
|
95
|
-
from natural_pdf.
|
95
|
+
from natural_pdf.core.page_collection import PageCollection
|
96
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
96
97
|
from natural_pdf.elements.base import Element
|
97
|
-
from natural_pdf.elements.
|
98
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
98
99
|
from natural_pdf.elements.region import Region
|
99
100
|
|
100
101
|
first_non_none = next((r for r in results if r is not None), None)
|