natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -86,9 +86,8 @@ class LayoutAnalyzer:
86
86
  layout_resolution = getattr(self._page._parent, "_config", {}).get(
87
87
  "layout_image_resolution", 72
88
88
  )
89
- std_res_page_image = self._page.to_image(
90
- resolution=layout_resolution, include_highlights=False
91
- )
89
+ # Use render() for clean image without highlights
90
+ std_res_page_image = self._page.render(resolution=layout_resolution)
92
91
  if not std_res_page_image:
93
92
  raise ValueError("Initial page rendering returned None")
94
93
  logger.debug(
@@ -128,7 +128,17 @@ class LayoutManager:
128
128
  engine_class = engine_class_or_factory
129
129
 
130
130
  detector_instance = engine_class() # Instantiate
131
- if not detector_instance.is_available():
131
+
132
+ # Try to check availability and capture any errors
133
+ availability_error = None
134
+ is_available = False
135
+ try:
136
+ is_available = detector_instance.is_available()
137
+ except Exception as e:
138
+ availability_error = e
139
+ logger.error(f"Error checking availability of {engine_name}: {e}", exc_info=True)
140
+
141
+ if not is_available:
132
142
  # Check availability before storing
133
143
  # Construct helpful error message with install hint
134
144
  install_hint = ""
@@ -141,9 +151,13 @@ class LayoutManager:
141
151
  else:
142
152
  install_hint = f"(Check installation requirements for {engine_name})"
143
153
 
144
- raise RuntimeError(
145
- f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
146
- )
154
+ error_msg = f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
155
+
156
+ # If we have an availability error, include it
157
+ if availability_error:
158
+ error_msg += f"\nAvailability check error: {availability_error}"
159
+
160
+ raise RuntimeError(error_msg)
147
161
  self._detector_instances[engine_name] = detector_instance # Store if available
148
162
 
149
163
  return self._detector_instances[engine_name]
@@ -42,13 +42,21 @@ logger = logging.getLogger(__name__)
42
42
  paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
43
43
  paddleocr_spec = importlib.util.find_spec("paddleocr")
44
44
  PPStructureV3 = None
45
+ _paddle_import_error = None # Store the import error for debugging
45
46
 
46
47
  if paddle_spec and paddleocr_spec:
47
48
  try:
48
49
  from paddleocr import PPStructureV3
49
50
  except ImportError as e:
51
+ _paddle_import_error = str(e)
50
52
  logger.warning(f"Could not import Paddle dependencies: {e}")
51
53
  else:
54
+ if not paddle_spec:
55
+ _paddle_import_error = "paddlepaddle not found"
56
+ elif not paddleocr_spec:
57
+ _paddle_import_error = "paddleocr not found"
58
+ else:
59
+ _paddle_import_error = "Unknown import issue"
52
60
  logger.warning(
53
61
  "paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
54
62
  )
@@ -82,6 +90,9 @@ class PaddleLayoutDetector(LayoutDetector):
82
90
 
83
91
  def is_available(self) -> bool:
84
92
  """Check if dependencies are installed."""
93
+ if PPStructureV3 is None and _paddle_import_error:
94
+ # Raise an informative error instead of just returning False
95
+ raise RuntimeError(f"Paddle dependencies check failed: {_paddle_import_error}")
85
96
  return PPStructureV3 is not None
86
97
 
87
98
  def _get_cache_key(self, options: BaseLayoutOptions) -> str:
@@ -188,9 +188,8 @@ class SuryaLayoutDetector(LayoutDetector):
188
188
  high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
189
189
  "surya_table_rec_dpi", 192
190
190
  )
191
- high_res_page_image = self._page_ref.to_image(
192
- resolution=high_res_dpi, include_highlights=False
193
- )
191
+ # Use render() for clean image without highlights
192
+ high_res_page_image = self._page_ref.render(resolution=high_res_dpi)
194
193
 
195
194
  # Render high-res page ONCE
196
195
  self.logger.debug(
@@ -10,8 +10,9 @@ from sklearn.cluster import MiniBatchKMeans
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  from natural_pdf.core.page import Page
13
+ from natural_pdf.core.page_collection import PageCollection
13
14
  from natural_pdf.core.pdf import PDF
14
- from natural_pdf.elements.collections import ElementCollection, PageCollection
15
+ from natural_pdf.elements.element_collection import ElementCollection
15
16
  from natural_pdf.elements.line import LineElement
16
17
 
17
18
  # from natural_pdf.elements.rect import RectangleElement # Removed
@@ -59,14 +60,13 @@ class ShapeDetectionMixin:
59
60
 
60
61
  # Determine the type of self and get the appropriate image and page context
61
62
  if (
62
- hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height")
63
+ hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height")
63
64
  ): # Page or Region
64
65
  if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"): # Region
65
66
  logger.debug(f"Shape detection on Region: {self}")
66
67
  page_obj = self._page
67
- pil_image = self.to_image(
68
- resolution=resolution, crop=True, include_highlights=False
69
- )
68
+ # Use render() for clean image without highlights, with cropping
69
+ pil_image = self.render(resolution=resolution, crop=True)
70
70
  if pil_image: # Ensure pil_image is not None before accessing attributes
71
71
  origin_offset_pdf = (self.x0, self.top)
72
72
  logger.debug(
@@ -75,7 +75,8 @@ class ShapeDetectionMixin:
75
75
  else: # Page
76
76
  logger.debug(f"Shape detection on Page: {self}")
77
77
  page_obj = self
78
- pil_image = self.to_image(resolution=resolution, include_highlights=False)
78
+ # Use render() for clean image without highlights
79
+ pil_image = self.render(resolution=resolution)
79
80
  logger.debug(
80
81
  f"Page image rendered successfully: {pil_image.width}x{pil_image.height}"
81
82
  )
@@ -150,6 +151,12 @@ class ShapeDetectionMixin:
150
151
  origin_offset_pdf[1] + line_data_img["y2"] * effective_scale
151
152
  ) # y2 is the second y-coord
152
153
 
154
+ # Clamp coords to image dimensions
155
+ x0 = max(0, min(x0, page_obj.width))
156
+ top = max(0, min(top, page_obj.height))
157
+ x1 = max(0, min(x1, page_obj.width))
158
+ bottom = max(0, min(bottom, page_obj.height))
159
+
153
160
  # For lines, width attribute in PDF points
154
161
  line_width_pdf = line_data_img["width"] * effective_scale
155
162
 
@@ -158,7 +165,7 @@ class ShapeDetectionMixin:
158
165
  getattr(page_obj._page, "initial_doctop", 0) if hasattr(page_obj, "_page") else 0
159
166
  )
160
167
 
161
- return {
168
+ attrs = {
162
169
  "x0": x0,
163
170
  "top": top,
164
171
  "x1": x1,
@@ -179,6 +186,8 @@ class ShapeDetectionMixin:
179
186
  "raw_line_position_px": line_data_img.get("line_position_px"), # Added for clarity
180
187
  }
181
188
 
189
+ return attrs
190
+
182
191
  def _find_lines_on_image_data(
183
192
  self,
184
193
  cv_image: np.ndarray,
@@ -680,13 +689,12 @@ class ShapeDetectionMixin:
680
689
  return self
681
690
 
682
691
  pil_image_for_dims = None
683
- if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
692
+ if hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height"):
684
693
  if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
685
- pil_image_for_dims = self.to_image(
686
- resolution=resolution, crop=True, include_highlights=False
687
- )
694
+ pil_image_for_dims = self.render(resolution=resolution, crop=True)
688
695
  else:
689
- pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
696
+ # Use render() for clean image without highlights
697
+ pil_image_for_dims = self.render(resolution=resolution)
690
698
  if pil_image_for_dims is None:
691
699
  logger.warning(f"Could not re-render PIL image for dimensions for {self}.")
692
700
  pil_image_for_dims = Image.fromarray(cv_image) # Ensure it's not None
@@ -710,7 +718,6 @@ class ShapeDetectionMixin:
710
718
  logger.info(
711
719
  f"Removed {removed_count} existing lines with source '{source_label}' from {page_object_ctx}"
712
720
  )
713
-
714
721
  lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
715
722
  cv_image=cv_image,
716
723
  pil_image_rgb=pil_image_for_dims,
@@ -733,7 +740,6 @@ class ShapeDetectionMixin:
733
740
  smoothing_sigma_v=smoothing_sigma_v,
734
741
  peak_width_rel_height=peak_width_rel_height,
735
742
  )
736
-
737
743
  from natural_pdf.elements.line import LineElement
738
744
 
739
745
  element_manager = page_object_ctx._element_mgr
@@ -742,14 +748,8 @@ class ShapeDetectionMixin:
742
748
  element_constructor_data = self._convert_line_to_element_data(
743
749
  line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
744
750
  )
745
- try:
746
- line_element = LineElement(element_constructor_data, page_object_ctx)
747
- element_manager.add_element(line_element, element_type="lines")
748
- except Exception as e:
749
- logger.error(
750
- f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
751
- exc_info=True,
752
- )
751
+ line_element = LineElement(element_constructor_data, page_object_ctx)
752
+ element_manager.add_element(line_element, element_type="lines")
753
753
 
754
754
  logger.info(
755
755
  f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using projection profiling."
@@ -826,14 +826,8 @@ class ShapeDetectionMixin:
826
826
  element_constructor_data = self._convert_line_to_element_data(
827
827
  line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
828
828
  )
829
- try:
830
- line_element = LineElement(element_constructor_data, page_object_ctx)
831
- element_manager.add_element(line_element, element_type="lines")
832
- except Exception as e:
833
- logger.error(
834
- f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
835
- exc_info=True,
836
- )
829
+ line_element = LineElement(element_constructor_data, page_object_ctx)
830
+ element_manager.add_element(line_element, element_type="lines")
837
831
 
838
832
  logger.info(
839
833
  f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using LSD."
@@ -1256,10 +1250,7 @@ class ShapeDetectionMixin:
1256
1250
  and getattr(r, "source", None) == source_label
1257
1251
  ]
1258
1252
  for r in old_blobs:
1259
- try:
1260
- page_obj._element_mgr.regions.remove(r)
1261
- except ValueError:
1262
- pass
1253
+ page_obj._element_mgr.regions.remove(r)
1263
1254
 
1264
1255
  # ── iterate clusters ───────────────────────────────────────────────────
1265
1256
  unique_clusters = [cid for cid in np.unique(labels_img) if cid >= 0]
@@ -14,7 +14,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
14
14
  if TYPE_CHECKING:
15
15
  from natural_pdf.core.page import Page
16
16
  from natural_pdf.elements.base import Element
17
- from natural_pdf.elements.collections import ElementCollection
17
+ from natural_pdf.elements.element_collection import ElementCollection
18
18
 
19
19
  logger = logging.getLogger(__name__)
20
20
 
@@ -282,7 +282,7 @@ class TextStyleAnalyzer:
282
282
  def analyze(
283
283
  self, page: "Page", options: Optional[TextStyleOptions] = None
284
284
  ) -> "ElementCollection":
285
- from natural_pdf.elements.collections import ElementCollection
285
+ from natural_pdf.elements.element_collection import ElementCollection
286
286
 
287
287
  current_options = options or self.options
288
288
  logger.info(
@@ -1,6 +1,6 @@
1
1
  import logging
2
- import time
3
2
  import threading # Add threading for locks
3
+ import time
4
4
  from datetime import datetime
5
5
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
6
6
 
@@ -92,9 +92,10 @@ class ApplyMixin:
92
92
 
93
93
  # Import here to avoid circular imports
94
94
  from natural_pdf import PDF, Page
95
- from natural_pdf.collections.pdf_collection import PDFCollection
95
+ from natural_pdf.core.page_collection import PageCollection
96
+ from natural_pdf.core.pdf_collection import PDFCollection
96
97
  from natural_pdf.elements.base import Element
97
- from natural_pdf.elements.collections import ElementCollection, PageCollection
98
+ from natural_pdf.elements.element_collection import ElementCollection
98
99
  from natural_pdf.elements.region import Region
99
100
 
100
101
  first_non_none = next((r for r in results if r is not None), None)