natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -588,24 +588,25 @@ class PDFCollection(
|
|
588
588
|
# Get classification manager from first PDF
|
589
589
|
try:
|
590
590
|
first_pdf = self._pdfs[0]
|
591
|
-
if not hasattr(first_pdf,
|
591
|
+
if not hasattr(first_pdf, "get_manager"):
|
592
592
|
raise RuntimeError("PDFs do not support classification manager")
|
593
|
-
manager = first_pdf.get_manager(
|
593
|
+
manager = first_pdf.get_manager("classification")
|
594
594
|
if not manager or not manager.is_available():
|
595
595
|
raise RuntimeError("ClassificationManager is not available")
|
596
596
|
except Exception as e:
|
597
597
|
from natural_pdf.classification.manager import ClassificationError
|
598
|
+
|
598
599
|
raise ClassificationError(f"Cannot access ClassificationManager: {e}") from e
|
599
600
|
|
600
601
|
# Determine processing mode early
|
601
602
|
inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
|
602
|
-
|
603
|
+
|
603
604
|
# Gather content from all PDFs
|
604
605
|
pdf_contents = []
|
605
606
|
valid_pdfs = []
|
606
|
-
|
607
|
+
|
607
608
|
logger.info(f"Gathering content from {len(self._pdfs)} PDFs for batch classification...")
|
608
|
-
|
609
|
+
|
609
610
|
for pdf in self._pdfs:
|
610
611
|
try:
|
611
612
|
# Get the content for classification - use the same logic as individual PDF classify
|
@@ -618,16 +619,18 @@ class PDFCollection(
|
|
618
619
|
elif inferred_using == "vision":
|
619
620
|
# For vision, we need single-page PDFs only
|
620
621
|
if len(pdf.pages) != 1:
|
621
|
-
logger.warning(f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs")
|
622
|
+
logger.warning(
|
623
|
+
f"Skipping PDF {pdf.path}: Vision classification requires single-page PDFs"
|
624
|
+
)
|
622
625
|
continue
|
623
626
|
# Get first page image
|
624
|
-
content = pdf.pages[0].
|
627
|
+
content = pdf.pages[0].render()
|
625
628
|
else:
|
626
629
|
raise ValueError(f"Unsupported using mode: {inferred_using}")
|
627
|
-
|
630
|
+
|
628
631
|
pdf_contents.append(content)
|
629
632
|
valid_pdfs.append(pdf)
|
630
|
-
|
633
|
+
|
631
634
|
except Exception as e:
|
632
635
|
logger.warning(f"Skipping PDF {pdf.path}: Error getting content - {e}")
|
633
636
|
continue
|
@@ -636,7 +639,9 @@ class PDFCollection(
|
|
636
639
|
logger.warning("No valid content could be gathered from PDFs for classification.")
|
637
640
|
return self
|
638
641
|
|
639
|
-
logger.info(f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification...")
|
642
|
+
logger.info(
|
643
|
+
f"Gathered content from {len(valid_pdfs)} PDFs. Running batch classification..."
|
644
|
+
)
|
640
645
|
|
641
646
|
# Run batch classification
|
642
647
|
try:
|
@@ -651,6 +656,7 @@ class PDFCollection(
|
|
651
656
|
except Exception as e:
|
652
657
|
logger.error(f"Batch classification failed: {e}")
|
653
658
|
from natural_pdf.classification.manager import ClassificationError
|
659
|
+
|
654
660
|
raise ClassificationError(f"Batch classification failed: {e}") from e
|
655
661
|
|
656
662
|
# Assign results back to PDFs
|
@@ -660,10 +666,11 @@ class PDFCollection(
|
|
660
666
|
f"with PDFs processed ({len(valid_pdfs)}). Cannot assign results."
|
661
667
|
)
|
662
668
|
from natural_pdf.classification.manager import ClassificationError
|
669
|
+
|
663
670
|
raise ClassificationError("Batch result count mismatch with input PDFs")
|
664
671
|
|
665
672
|
logger.info(f"Assigning {len(batch_results)} results to PDFs under key '{analysis_key}'.")
|
666
|
-
|
673
|
+
|
667
674
|
processed_count = 0
|
668
675
|
for pdf, result_obj in zip(valid_pdfs, batch_results):
|
669
676
|
try:
|
@@ -0,0 +1,335 @@
|
|
1
|
+
"""Unified rendering infrastructure for natural-pdf.
|
2
|
+
|
3
|
+
This module provides the core components for the unified image generation system:
|
4
|
+
- RenderSpec: Data structure describing what to render
|
5
|
+
- Visualizable: Mixin providing show/render/export methods
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
from dataclasses import dataclass, field
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from PIL import Image as PIL_Image
|
15
|
+
|
16
|
+
from natural_pdf.core.page import Page
|
17
|
+
from natural_pdf.elements.base import Element
|
18
|
+
|
19
|
+
logger = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
@dataclass
|
23
|
+
class RenderSpec:
|
24
|
+
"""Specification for rendering a single page or region.
|
25
|
+
|
26
|
+
This is the core data structure that unifies all rendering operations.
|
27
|
+
Every visual object in natural-pdf converts its display requirements
|
28
|
+
into one or more RenderSpecs, which are then processed by the
|
29
|
+
unified rendering pipeline.
|
30
|
+
|
31
|
+
Attributes:
|
32
|
+
page: The page to render
|
33
|
+
crop_bbox: Optional bounding box (x0, y0, x1, y1) to crop to
|
34
|
+
highlights: List of highlight specifications, each containing:
|
35
|
+
- bbox or polygon: The geometry to highlight
|
36
|
+
- color: Optional color for the highlight
|
37
|
+
- label: Optional label text
|
38
|
+
- element: Optional reference to the source element
|
39
|
+
"""
|
40
|
+
|
41
|
+
page: "Page"
|
42
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None
|
43
|
+
highlights: List[Dict[str, Any]] = field(default_factory=list)
|
44
|
+
|
45
|
+
def add_highlight(
|
46
|
+
self,
|
47
|
+
bbox: Optional[Tuple[float, float, float, float]] = None,
|
48
|
+
polygon: Optional[List[Tuple[float, float]]] = None,
|
49
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
50
|
+
label: Optional[str] = None,
|
51
|
+
element: Optional["Element"] = None,
|
52
|
+
) -> None:
|
53
|
+
"""Add a highlight to this render spec.
|
54
|
+
|
55
|
+
Args:
|
56
|
+
bbox: Bounding box to highlight
|
57
|
+
polygon: Polygon points to highlight (alternative to bbox)
|
58
|
+
color: Color for the highlight
|
59
|
+
label: Label text for the highlight
|
60
|
+
element: Source element reference
|
61
|
+
"""
|
62
|
+
if bbox is None and polygon is None and element is not None:
|
63
|
+
# Extract geometry from element
|
64
|
+
if (
|
65
|
+
hasattr(element, "polygon")
|
66
|
+
and hasattr(element, "has_polygon")
|
67
|
+
and element.has_polygon
|
68
|
+
):
|
69
|
+
polygon = element.polygon
|
70
|
+
elif hasattr(element, "bbox"):
|
71
|
+
bbox = element.bbox
|
72
|
+
|
73
|
+
if bbox is None and polygon is None:
|
74
|
+
raise ValueError("Must provide bbox, polygon, or element with geometry")
|
75
|
+
|
76
|
+
highlight = {
|
77
|
+
"bbox": bbox,
|
78
|
+
"polygon": polygon,
|
79
|
+
"color": color,
|
80
|
+
"label": label,
|
81
|
+
"element": element,
|
82
|
+
}
|
83
|
+
# Remove None values
|
84
|
+
highlight = {k: v for k, v in highlight.items() if v is not None}
|
85
|
+
self.highlights.append(highlight)
|
86
|
+
|
87
|
+
|
88
|
+
class Visualizable:
|
89
|
+
"""Mixin class providing unified show/render/export methods.
|
90
|
+
|
91
|
+
Classes that inherit from Visualizable need only implement
|
92
|
+
_get_render_specs() to gain full image generation capabilities.
|
93
|
+
"""
|
94
|
+
|
95
|
+
def _get_render_specs(
|
96
|
+
self, mode: Literal["show", "render"] = "show", **kwargs
|
97
|
+
) -> List[RenderSpec]:
|
98
|
+
"""Get render specifications for this object.
|
99
|
+
|
100
|
+
This is the only method subclasses need to implement.
|
101
|
+
It should return a list of RenderSpec objects describing
|
102
|
+
what needs to be rendered.
|
103
|
+
|
104
|
+
Args:
|
105
|
+
mode: Rendering mode - 'show' includes highlights, 'render' is clean
|
106
|
+
**kwargs: Additional parameters from show/render methods
|
107
|
+
|
108
|
+
Returns:
|
109
|
+
List of RenderSpec objects
|
110
|
+
"""
|
111
|
+
raise NotImplementedError(f"{self.__class__.__name__} must implement _get_render_specs()")
|
112
|
+
|
113
|
+
def _get_highlighter(self):
|
114
|
+
"""Get the highlighting service for rendering.
|
115
|
+
|
116
|
+
This method should be overridden by classes that have
|
117
|
+
a different way of accessing the highlighter.
|
118
|
+
"""
|
119
|
+
# Try common patterns
|
120
|
+
if hasattr(self, "_highlighter"):
|
121
|
+
return self._highlighter
|
122
|
+
elif hasattr(self, "page") and hasattr(self.page, "_highlighter"):
|
123
|
+
return self.page._highlighter
|
124
|
+
elif hasattr(self, "pages") and self.pages:
|
125
|
+
# For collections, use first page's highlighter
|
126
|
+
first_page = next(iter(self.pages))
|
127
|
+
if hasattr(first_page, "_highlighter"):
|
128
|
+
return first_page._highlighter
|
129
|
+
|
130
|
+
raise RuntimeError(
|
131
|
+
f"Cannot find HighlightingService for {self.__class__.__name__}. "
|
132
|
+
"Override _get_highlighter() to provide access."
|
133
|
+
)
|
134
|
+
|
135
|
+
def show(
|
136
|
+
self,
|
137
|
+
*,
|
138
|
+
# Basic rendering options
|
139
|
+
resolution: Optional[float] = None,
|
140
|
+
width: Optional[int] = None,
|
141
|
+
# Highlight options
|
142
|
+
color: Optional[Union[str, Tuple[int, int, int]]] = None,
|
143
|
+
labels: bool = True,
|
144
|
+
label_format: Optional[str] = None,
|
145
|
+
highlights: Optional[List[Dict[str, Any]]] = None,
|
146
|
+
legend_position: str = "right",
|
147
|
+
annotate: Optional[Union[str, List[str]]] = None,
|
148
|
+
# Layout options for multi-page/region
|
149
|
+
layout: Literal["stack", "grid", "single"] = "stack",
|
150
|
+
stack_direction: Literal["vertical", "horizontal"] = "vertical",
|
151
|
+
gap: int = 5,
|
152
|
+
columns: Optional[int] = None, # For grid layout
|
153
|
+
# Cropping options
|
154
|
+
crop: Union[bool, Literal["content"]] = False,
|
155
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
156
|
+
**kwargs,
|
157
|
+
) -> Optional["PIL_Image"]:
|
158
|
+
"""Generate a preview image with highlights.
|
159
|
+
|
160
|
+
This method is for interactive debugging and visualization.
|
161
|
+
Elements are highlighted to show what's selected or being worked with.
|
162
|
+
|
163
|
+
Args:
|
164
|
+
resolution: DPI for rendering (default from global settings)
|
165
|
+
width: Target width in pixels (overrides resolution)
|
166
|
+
color: Default highlight color
|
167
|
+
labels: Whether to show labels for highlights
|
168
|
+
label_format: Format string for labels (e.g., "Element {index}")
|
169
|
+
highlights: Additional highlight groups to show
|
170
|
+
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
171
|
+
annotate: Attribute name(s) to display on highlights (string or list)
|
172
|
+
layout: How to arrange multiple pages/regions
|
173
|
+
stack_direction: Direction for stack layout
|
174
|
+
gap: Pixels between stacked images
|
175
|
+
columns: Number of columns for grid layout
|
176
|
+
crop: Whether to crop (True, False, or 'content' for bbox of elements)
|
177
|
+
crop_bbox: Explicit crop bounds
|
178
|
+
**kwargs: Additional parameters passed to rendering
|
179
|
+
|
180
|
+
Returns:
|
181
|
+
PIL Image object or None if nothing to render
|
182
|
+
"""
|
183
|
+
# Convert string to list if needed
|
184
|
+
if isinstance(annotate, str):
|
185
|
+
annotate = [annotate]
|
186
|
+
|
187
|
+
specs = self._get_render_specs(
|
188
|
+
mode="show",
|
189
|
+
color=color,
|
190
|
+
highlights=highlights,
|
191
|
+
crop=crop,
|
192
|
+
crop_bbox=crop_bbox,
|
193
|
+
annotate=annotate,
|
194
|
+
**kwargs,
|
195
|
+
)
|
196
|
+
|
197
|
+
if not specs:
|
198
|
+
logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
|
199
|
+
return None
|
200
|
+
|
201
|
+
highlighter = self._get_highlighter()
|
202
|
+
return highlighter.unified_render(
|
203
|
+
specs=specs,
|
204
|
+
resolution=resolution,
|
205
|
+
width=width,
|
206
|
+
labels=labels,
|
207
|
+
label_format=label_format,
|
208
|
+
legend_position=legend_position,
|
209
|
+
layout=layout,
|
210
|
+
stack_direction=stack_direction,
|
211
|
+
gap=gap,
|
212
|
+
columns=columns,
|
213
|
+
**kwargs,
|
214
|
+
)
|
215
|
+
|
216
|
+
def render(
|
217
|
+
self,
|
218
|
+
*,
|
219
|
+
# Basic rendering options
|
220
|
+
resolution: Optional[float] = None,
|
221
|
+
width: Optional[int] = None,
|
222
|
+
# Layout options for multi-page/region
|
223
|
+
layout: Literal["stack", "grid", "single"] = "stack",
|
224
|
+
stack_direction: Literal["vertical", "horizontal"] = "vertical",
|
225
|
+
gap: int = 5,
|
226
|
+
columns: Optional[int] = None,
|
227
|
+
# Cropping options
|
228
|
+
crop: Union[bool, Literal["content"]] = False,
|
229
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
230
|
+
**kwargs,
|
231
|
+
) -> Optional["PIL_Image"]:
|
232
|
+
"""Generate a clean image without highlights.
|
233
|
+
|
234
|
+
This method produces publication-ready images without
|
235
|
+
any debugging annotations or highlights.
|
236
|
+
|
237
|
+
Args:
|
238
|
+
resolution: DPI for rendering (default from global settings)
|
239
|
+
width: Target width in pixels (overrides resolution)
|
240
|
+
layout: How to arrange multiple pages/regions
|
241
|
+
stack_direction: Direction for stack layout
|
242
|
+
gap: Pixels between stacked images
|
243
|
+
columns: Number of columns for grid layout
|
244
|
+
crop: Whether to crop
|
245
|
+
crop_bbox: Explicit crop bounds
|
246
|
+
**kwargs: Additional parameters passed to rendering
|
247
|
+
|
248
|
+
Returns:
|
249
|
+
PIL Image object or None if nothing to render
|
250
|
+
"""
|
251
|
+
specs = self._get_render_specs(mode="render", crop=crop, crop_bbox=crop_bbox, **kwargs)
|
252
|
+
|
253
|
+
if not specs:
|
254
|
+
logger.warning(f"{self.__class__.__name__}.render() generated no render specs")
|
255
|
+
return None
|
256
|
+
|
257
|
+
highlighter = self._get_highlighter()
|
258
|
+
return highlighter.unified_render(
|
259
|
+
specs=specs,
|
260
|
+
resolution=resolution,
|
261
|
+
width=width,
|
262
|
+
labels=False, # Never show labels in render mode
|
263
|
+
layout=layout,
|
264
|
+
stack_direction=stack_direction,
|
265
|
+
gap=gap,
|
266
|
+
columns=columns,
|
267
|
+
**kwargs,
|
268
|
+
)
|
269
|
+
|
270
|
+
def export(
|
271
|
+
self,
|
272
|
+
path: Union[str, Path],
|
273
|
+
*,
|
274
|
+
# All the same options as render()
|
275
|
+
resolution: Optional[float] = None,
|
276
|
+
width: Optional[int] = None,
|
277
|
+
layout: Literal["stack", "grid", "single"] = "stack",
|
278
|
+
stack_direction: Literal["vertical", "horizontal"] = "vertical",
|
279
|
+
gap: int = 5,
|
280
|
+
columns: Optional[int] = None,
|
281
|
+
crop: Union[bool, Literal["content"]] = False,
|
282
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
283
|
+
format: Optional[str] = None,
|
284
|
+
**kwargs,
|
285
|
+
) -> None:
|
286
|
+
"""Export a clean image to file.
|
287
|
+
|
288
|
+
This is a convenience method that renders and saves in one step.
|
289
|
+
|
290
|
+
Args:
|
291
|
+
path: Output file path
|
292
|
+
resolution: DPI for rendering
|
293
|
+
width: Target width in pixels
|
294
|
+
layout: How to arrange multiple pages/regions
|
295
|
+
stack_direction: Direction for stack layout
|
296
|
+
gap: Pixels between stacked images
|
297
|
+
columns: Number of columns for grid layout
|
298
|
+
crop: Whether to crop
|
299
|
+
crop_bbox: Explicit crop bounds
|
300
|
+
format: Image format (inferred from path if not specified)
|
301
|
+
**kwargs: Additional parameters passed to rendering
|
302
|
+
"""
|
303
|
+
image = self.render(
|
304
|
+
resolution=resolution,
|
305
|
+
width=width,
|
306
|
+
layout=layout,
|
307
|
+
stack_direction=stack_direction,
|
308
|
+
gap=gap,
|
309
|
+
columns=columns,
|
310
|
+
crop=crop,
|
311
|
+
crop_bbox=crop_bbox,
|
312
|
+
**kwargs,
|
313
|
+
)
|
314
|
+
|
315
|
+
if image is None:
|
316
|
+
raise ValueError(f"No image generated by {self.__class__.__name__}.render()")
|
317
|
+
|
318
|
+
# Ensure path is a Path object
|
319
|
+
path = Path(path)
|
320
|
+
|
321
|
+
# Determine format
|
322
|
+
if format is None:
|
323
|
+
format = path.suffix.lstrip(".").upper()
|
324
|
+
if format == "JPG":
|
325
|
+
format = "JPEG"
|
326
|
+
|
327
|
+
# Save image
|
328
|
+
save_kwargs = {}
|
329
|
+
if format == "JPEG":
|
330
|
+
save_kwargs["quality"] = kwargs.get("quality", 95)
|
331
|
+
elif format == "PNG":
|
332
|
+
save_kwargs["compress_level"] = kwargs.get("compress_level", 6)
|
333
|
+
|
334
|
+
image.save(path, format=format, **save_kwargs)
|
335
|
+
logger.info(f"Exported {self.__class__.__name__} to {path}")
|
natural_pdf/describe/base.py
CHANGED
@@ -17,7 +17,7 @@ from .summary import ElementSummary, InspectionSummary
|
|
17
17
|
if TYPE_CHECKING:
|
18
18
|
from natural_pdf.core.page import Page
|
19
19
|
from natural_pdf.elements.base import Element
|
20
|
-
from natural_pdf.elements.collections import ElementCollection
|
20
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
21
21
|
from natural_pdf.elements.region import Region
|
22
22
|
|
23
23
|
logger = logging.getLogger(__name__)
|
natural_pdf/elements/__init__.py
CHANGED