natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -588,24 +588,25 @@ class PDFCollection(
588
588
  # Get classification manager from first PDF
589
589
  try:
590
590
  first_pdf = self._pdfs[0]
591
- if not hasattr(first_pdf, 'get_manager'):
591
+ if not hasattr(first_pdf, "get_manager"):
592
592
  raise RuntimeError("PDFs do not support classification manager")
593
- manager = first_pdf.get_manager('classification')
593
+ manager = first_pdf.get_manager("classification")
594
594
  if not manager or not manager.is_available():
595
595
  raise RuntimeError("ClassificationManager is not available")
596
596
  except Exception as e:
597
597
  from natural_pdf.classification.manager import ClassificationError
598
+
598
599
  raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
599
600
 
600
601
  # Determine processing mode early
601
602
  inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
602
-
603
+
603
604
  # Gather content from all PDFs
604
605
  pdf_contents = []
605
606
  valid_pdfs = []
606
-
607
+
607
608
  logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
608
-
609
+
609
610
  for pdf in self._pdfs:
610
611
  try:
611
612
  # Get the content for classification - use the same logic as individual PDF classify
@@ -618,16 +619,18 @@ class PDFCollection(
618
619
  elif inferred_using == "vision":
619
620
  # For vision, we need single-page PDFs only
620
621
  if len(pdf.pages) != 1:
621
- logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
622
+ logger.warning(
623
+ f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs"
624
+ )
622
625
  continue
623
626
  # Get first page image
624
- content = pdf.pages[0].to_image()
627
+ content = pdf.pages[0].render()
625
628
  else:
626
629
  raise ValueError(f"Unsupported using mode: {inferred_using}")
627
-
630
+
628
631
  pdf_contents.append(content)
629
632
  valid_pdfs.append(pdf)
630
-
633
+
631
634
  except Exception as e:
632
635
  logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
633
636
  continue
@@ -636,7 +639,9 @@ class PDFCollection(
636
639
  logger.warning("No valid content could be gathered from PDFs for classification.")
637
640
  return self
638
641
 
639
- logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
642
+ logger.info(
643
+ f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification..."
644
+ )
640
645
 
641
646
  # Run batch classification
642
647
  try:
@@ -651,6 +656,7 @@ class PDFCollection(
651
656
  except Exception as e:
652
657
  logger.error(f"Batch classification failed: {e}")
653
658
  from natural_pdf.classification.manager import ClassificationError
659
+
654
660
  raise ClassificationError(f"Batch classification failed: {e}") from e
655
661
 
656
662
  # Assign results back to PDFs
@@ -660,10 +666,11 @@ class PDFCollection(
660
666
  f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
661
667
  )
662
668
  from natural_pdf.classification.manager import ClassificationError
669
+
663
670
  raise ClassificationError("Batch result count mismatch with input PDFs")
664
671
 
665
672
  logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
666
-
673
+
667
674
  processed_count = 0
668
675
  for pdf, result_obj in zip(valid_pdfs, batch_results):
669
676
  try:
@@ -0,0 +1,335 @@
1
+ """Unified rendering infrastructure for natural-pdf.
2
+
3
+ This module provides the core components for the unified image generation system:
4
+ - RenderSpec: Data structure describing what to render
5
+ - Visualizable: Mixin providing show/render/export methods
6
+ """
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
12
+
13
+ if TYPE_CHECKING:
14
+ from PIL import Image as PIL_Image
15
+
16
+ from natural_pdf.core.page import Page
17
+ from natural_pdf.elements.base import Element
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class RenderSpec:
    """Describe one page (or a cropped part of it) plus the highlights to draw.

    Objects that can be visualised express what they want drawn as one or
    more ``RenderSpec`` instances; the rendering pipeline consumes them.

    Attributes:
        page: The page to render.
        crop_bbox: Optional ``(x0, y0, x1, y1)`` box to crop the output to.
        highlights: Highlight dictionaries. Each one holds only the keys that
            were actually supplied, out of ``bbox``, ``polygon``, ``color``,
            ``label`` and ``element``.
    """

    page: "Page"
    crop_bbox: Optional[Tuple[float, float, float, float]] = None
    highlights: List[Dict[str, Any]] = field(default_factory=list)

    def add_highlight(
        self,
        bbox: Optional[Tuple[float, float, float, float]] = None,
        polygon: Optional[List[Tuple[float, float]]] = None,
        color: Optional[Union[str, Tuple[int, int, int]]] = None,
        label: Optional[str] = None,
        element: Optional["Element"] = None,
    ) -> None:
        """Append a highlight entry to ``self.highlights``.

        When neither ``bbox`` nor ``polygon`` is given, the geometry is taken
        from ``element``: its ``polygon`` if it reports ``has_polygon``,
        otherwise its ``bbox``.

        Args:
            bbox: Bounding box to highlight.
            polygon: Polygon points to highlight (alternative to ``bbox``).
            color: Colour for the highlight.
            label: Label text for the highlight.
            element: Source element; also used as a geometry fallback.

        Raises:
            ValueError: If no geometry is available after the fallback.
        """
        geometry_missing = bbox is None and polygon is None
        if geometry_missing and element is not None:
            # A polygon is only trusted when the element says it has one.
            prefers_polygon = (
                hasattr(element, "polygon")
                and hasattr(element, "has_polygon")
                and element.has_polygon
            )
            if prefers_polygon:
                polygon = element.polygon
            elif hasattr(element, "bbox"):
                bbox = element.bbox

        if bbox is None and polygon is None:
            raise ValueError("Must provide bbox, polygon, or element with geometry")

        # Build the entry key by key so unset (None) options never appear.
        entry: Dict[str, Any] = {}
        candidates = (
            ("bbox", bbox),
            ("polygon", polygon),
            ("color", color),
            ("label", label),
            ("element", element),
        )
        for key, value in candidates:
            if value is not None:
                entry[key] = value
        self.highlights.append(entry)
86
+
87
+
88
class Visualizable:
    """Mixin providing unified ``show`` / ``render`` / ``export`` methods.

    A subclass implements ``_get_render_specs()``; the public methods here
    turn the returned specs into an image by delegating to the highlighting
    service returned by ``_get_highlighter()``.
    """

    def _get_render_specs(
        self, mode: Literal["show", "render"] = "show", **kwargs
    ) -> List["RenderSpec"]:
        """Return the render specifications for this object.

        Subclasses must override this; the base implementation always raises.

        Args:
            mode: ``"show"`` when called from ``show()``, ``"render"`` when
                called from ``render()``.
            **kwargs: Options forwarded from ``show()`` / ``render()``.

        Returns:
            List of RenderSpec objects describing what to draw.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(f"{self.__class__.__name__} must implement _get_render_specs()")

    def _get_highlighter(self):
        """Locate the highlighting service used for rendering.

        Looks, in order, at ``self._highlighter``, ``self.page._highlighter``
        and the ``_highlighter`` of the first item in ``self.pages``.
        Override this when the service lives somewhere else.

        Raises:
            RuntimeError: If none of the lookups finds a highlighter.
        """
        if hasattr(self, "_highlighter"):
            return self._highlighter
        elif hasattr(self, "page") and hasattr(self.page, "_highlighter"):
            return self.page._highlighter
        elif hasattr(self, "pages") and self.pages:
            # For collections, use the first page's highlighter.
            first_page = next(iter(self.pages))
            if hasattr(first_page, "_highlighter"):
                return first_page._highlighter

        raise RuntimeError(
            f"Cannot find HighlightingService for {self.__class__.__name__}. "
            "Override _get_highlighter() to provide access."
        )

    def show(
        self,
        *,
        # Basic rendering options
        resolution: Optional[float] = None,
        width: Optional[int] = None,
        # Highlight options
        color: Optional[Union[str, Tuple[int, int, int]]] = None,
        labels: bool = True,
        label_format: Optional[str] = None,
        highlights: Optional[List[Dict[str, Any]]] = None,
        legend_position: str = "right",
        annotate: Optional[Union[str, List[str]]] = None,
        # Layout options for multi-page/region
        layout: Literal["stack", "grid", "single"] = "stack",
        stack_direction: Literal["vertical", "horizontal"] = "vertical",
        gap: int = 5,
        columns: Optional[int] = None,  # For grid layout
        # Cropping options
        crop: Union[bool, Literal["content"]] = False,
        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
        **kwargs,
    ) -> Optional["PIL_Image"]:
        """Generate a preview image with highlights.

        Specs are requested with ``mode="show"`` and handed to the
        highlighter's ``unified_render()``.

        Args:
            resolution: DPI for rendering.
            width: Target width in pixels.
            color: Default highlight color (passed to ``_get_render_specs``).
            labels: Whether to show labels for highlights.
            label_format: Format string for labels (e.g., "Element {index}").
            highlights: Additional highlight groups (passed to
                ``_get_render_specs``).
            legend_position: Position of legend/colorbar
                ('right', 'left', 'top', 'bottom').
            annotate: Attribute name(s) to display on highlights; a single
                string is wrapped in a list before being passed on.
            layout: How to arrange multiple pages/regions.
            stack_direction: Direction for stack layout.
            gap: Pixels between stacked images.
            columns: Number of columns for grid layout.
            crop: Cropping mode (True, False, or 'content'); passed to
                ``_get_render_specs``.
            crop_bbox: Explicit crop bounds; passed to ``_get_render_specs``.
            **kwargs: Extra options, forwarded to both ``_get_render_specs``
                and ``unified_render``.

        Returns:
            Whatever ``unified_render()`` returns, or None when no render
            specs were produced.
        """
        # Downstream code receives a list of attribute names (or None).
        if isinstance(annotate, str):
            annotate = [annotate]

        specs = self._get_render_specs(
            mode="show",
            color=color,
            highlights=highlights,
            crop=crop,
            crop_bbox=crop_bbox,
            annotate=annotate,
            **kwargs,
        )

        if not specs:
            logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
            return None

        highlighter = self._get_highlighter()
        return highlighter.unified_render(
            specs=specs,
            resolution=resolution,
            width=width,
            labels=labels,
            label_format=label_format,
            legend_position=legend_position,
            layout=layout,
            stack_direction=stack_direction,
            gap=gap,
            columns=columns,
            **kwargs,
        )

    def render(
        self,
        *,
        # Basic rendering options
        resolution: Optional[float] = None,
        width: Optional[int] = None,
        # Layout options for multi-page/region
        layout: Literal["stack", "grid", "single"] = "stack",
        stack_direction: Literal["vertical", "horizontal"] = "vertical",
        gap: int = 5,
        columns: Optional[int] = None,
        # Cropping options
        crop: Union[bool, Literal["content"]] = False,
        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
        **kwargs,
    ) -> Optional["PIL_Image"]:
        """Generate a clean image without highlight labels.

        Specs are requested with ``mode="render"`` and ``unified_render()``
        is always called with ``labels=False``.

        Args:
            resolution: DPI for rendering.
            width: Target width in pixels.
            layout: How to arrange multiple pages/regions.
            stack_direction: Direction for stack layout.
            gap: Pixels between stacked images.
            columns: Number of columns for grid layout.
            crop: Cropping mode; passed to ``_get_render_specs``.
            crop_bbox: Explicit crop bounds; passed to ``_get_render_specs``.
            **kwargs: Extra options, forwarded to both ``_get_render_specs``
                and ``unified_render``.

        Returns:
            Whatever ``unified_render()`` returns, or None when no render
            specs were produced.
        """
        specs = self._get_render_specs(mode="render", crop=crop, crop_bbox=crop_bbox, **kwargs)

        if not specs:
            logger.warning(f"{self.__class__.__name__}.render() generated no render specs")
            return None

        highlighter = self._get_highlighter()
        return highlighter.unified_render(
            specs=specs,
            resolution=resolution,
            width=width,
            labels=False,  # Never show labels in render mode
            layout=layout,
            stack_direction=stack_direction,
            gap=gap,
            columns=columns,
            **kwargs,
        )

    def export(
        self,
        path: Union[str, Path],
        *,
        # All the same options as render()
        resolution: Optional[float] = None,
        width: Optional[int] = None,
        layout: Literal["stack", "grid", "single"] = "stack",
        stack_direction: Literal["vertical", "horizontal"] = "vertical",
        gap: int = 5,
        columns: Optional[int] = None,
        crop: Union[bool, Literal["content"]] = False,
        crop_bbox: Optional[Tuple[float, float, float, float]] = None,
        format: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Render a clean image and save it to ``path``.

        Args:
            path: Output file path.
            resolution: DPI for rendering.
            width: Target width in pixels.
            layout: How to arrange multiple pages/regions.
            stack_direction: Direction for stack layout.
            gap: Pixels between stacked images.
            columns: Number of columns for grid layout.
            crop: Cropping mode.
            crop_bbox: Explicit crop bounds.
            format: Image format name, case-insensitive; taken from the
                path's suffix when omitted. ``jpg`` and ``tif`` are accepted
                as aliases for ``JPEG`` and ``TIFF``.
            **kwargs: Extra options forwarded to ``render()``. ``quality``
                (JPEG, default 95) and ``compress_level`` (PNG, default 6)
                are also read from here and passed to ``image.save()``.

        Raises:
            ValueError: If ``render()`` returns None.
        """
        image = self.render(
            resolution=resolution,
            width=width,
            layout=layout,
            stack_direction=stack_direction,
            gap=gap,
            columns=columns,
            crop=crop,
            crop_bbox=crop_bbox,
            **kwargs,
        )

        if image is None:
            raise ValueError(f"No image generated by {self.__class__.__name__}.render()")

        # Ensure path is a Path object
        path = Path(path)

        # Normalise the format name on BOTH paths (explicit argument and
        # suffix-derived). Previously an explicit "jpg"/"jpeg"/"png" skipped
        # the save defaults below, and "jpg" / ".tif" produced names that are
        # not canonical Pillow format names.
        format_name = (path.suffix.lstrip(".") if format is None else format).upper()
        format_name = {"JPG": "JPEG", "TIF": "TIFF"}.get(format_name, format_name)

        # NOTE(review): quality / compress_level were also forwarded to
        # render() through **kwargs above - confirm the rendering pipeline
        # tolerates them.
        save_kwargs = {}
        if format_name == "JPEG":
            save_kwargs["quality"] = kwargs.get("quality", 95)
        elif format_name == "PNG":
            save_kwargs["compress_level"] = kwargs.get("compress_level", 6)

        image.save(path, format=format_name, **save_kwargs)
        logger.info(f"Exported {self.__class__.__name__} to {path}")
@@ -17,7 +17,7 @@ from .summary import ElementSummary, InspectionSummary
17
17
  if TYPE_CHECKING:
18
18
  from natural_pdf.core.page import Page
19
19
  from natural_pdf.elements.base import Element
20
- from natural_pdf.elements.collections import ElementCollection
20
+ from natural_pdf.elements.element_collection import ElementCollection
21
21
  from natural_pdf.elements.region import Region
22
22
 
23
23
  logger = logging.getLogger(__name__)
@@ -1,3 +1,4 @@
1
1
  """
2
2
  Element classes for Natural PDF.
3
+
3
4
  """