natural-pdf 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +3 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +9 -27
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +16 -0
- natural_pdf/core/pdf.py +55 -49
- natural_pdf/describe/base.py +2 -2
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +79 -1
- natural_pdf/elements/collections.py +23 -1
- natural_pdf/elements/region.py +54 -148
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +1 -1
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +33 -37
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/METADATA +21 -14
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/RECORD +29 -28
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/top_level.txt +0 -0
@@ -63,7 +63,7 @@ class ShapeDetectionMixin:
|
|
63
63
|
logger.debug(f"Shape detection on Region: {self}")
|
64
64
|
page_obj = self._page
|
65
65
|
pil_image = self.to_image(
|
66
|
-
resolution=resolution,
|
66
|
+
resolution=resolution, crop=True, include_highlights=False
|
67
67
|
)
|
68
68
|
if pil_image: # Ensure pil_image is not None before accessing attributes
|
69
69
|
origin_offset_pdf = (self.x0, self.top)
|
@@ -681,7 +681,7 @@ class ShapeDetectionMixin:
|
|
681
681
|
if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
|
682
682
|
if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
|
683
683
|
pil_image_for_dims = self.to_image(
|
684
|
-
resolution=resolution,
|
684
|
+
resolution=resolution, crop=True, include_highlights=False
|
685
685
|
)
|
686
686
|
else:
|
687
687
|
pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
|
@@ -1204,7 +1204,7 @@ class ShapeDetectionMixin:
|
|
1204
1204
|
if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
|
1205
1205
|
if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
|
1206
1206
|
pil_image_for_dims = self.to_image(
|
1207
|
-
resolution=resolution,
|
1207
|
+
resolution=resolution, crop=True, include_highlights=False
|
1208
1208
|
)
|
1209
1209
|
else:
|
1210
1210
|
pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
|
@@ -90,7 +90,7 @@ class ClassificationManager:
|
|
90
90
|
if not _check_classification_dependencies():
|
91
91
|
raise ImportError(
|
92
92
|
"Classification dependencies missing. "
|
93
|
-
'Install with: pip install "natural-pdf[
|
93
|
+
'Install with: pip install "natural-pdf[ai]"'
|
94
94
|
)
|
95
95
|
|
96
96
|
self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (
|
@@ -2,6 +2,7 @@ import logging
|
|
2
2
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
3
|
|
4
4
|
from PIL import Image
|
5
|
+
import warnings
|
5
6
|
|
6
7
|
from .results import ClassificationResult
|
7
8
|
|
@@ -74,32 +75,52 @@ class ClassificationMixin:
|
|
74
75
|
try:
|
75
76
|
manager = self._get_classification_manager()
|
76
77
|
|
77
|
-
#
|
78
|
+
# ------------------------------------------------------------
|
79
|
+
# Resolve engine ('text' vs 'vision')
|
80
|
+
# ------------------------------------------------------------
|
81
|
+
engine: Optional[str] = using # rename for clarity
|
82
|
+
|
83
|
+
content = None # will hold final content
|
84
|
+
|
85
|
+
if engine is None:
|
86
|
+
# Try text first
|
87
|
+
try:
|
88
|
+
tentative_text = self._get_classification_content("text", **kwargs)
|
89
|
+
if tentative_text and not (isinstance(tentative_text, str) and tentative_text.isspace()):
|
90
|
+
engine = "text"
|
91
|
+
content = tentative_text
|
92
|
+
else:
|
93
|
+
raise ValueError("Empty text")
|
94
|
+
except Exception:
|
95
|
+
warnings.warn(
|
96
|
+
"No text found for classification; falling back to vision model. "
|
97
|
+
"Pass using='vision' explicitly to silence this message.",
|
98
|
+
UserWarning,
|
99
|
+
)
|
100
|
+
engine = "vision"
|
101
|
+
|
102
|
+
# If engine determined but content not yet retrieved, get it now
|
103
|
+
if content is None:
|
104
|
+
content = self._get_classification_content(model_type=engine, **kwargs)
|
105
|
+
|
106
|
+
# ------------------------------------------------------------
|
107
|
+
# Determine model ID default based on engine
|
108
|
+
# ------------------------------------------------------------
|
78
109
|
effective_model_id = model
|
79
|
-
inferred_using = manager.infer_using(
|
80
|
-
model if model else manager.DEFAULT_TEXT_MODEL, using
|
81
|
-
)
|
82
|
-
|
83
|
-
# If model was not provided, use the manager's default for the inferred engine type
|
84
110
|
if effective_model_id is None:
|
85
111
|
effective_model_id = (
|
86
|
-
manager.DEFAULT_TEXT_MODEL
|
87
|
-
if inferred_using == "text"
|
88
|
-
else manager.DEFAULT_VISION_MODEL
|
112
|
+
manager.DEFAULT_TEXT_MODEL if engine == "text" else manager.DEFAULT_VISION_MODEL
|
89
113
|
)
|
90
114
|
logger.debug(
|
91
|
-
f"No model provided, using default for mode '{
|
115
|
+
f"No model provided, using default for mode '{engine}': '{effective_model_id}'"
|
92
116
|
)
|
93
117
|
|
94
|
-
# Get content based on the *final* determined engine type
|
95
|
-
content = self._get_classification_content(model_type=inferred_using, **kwargs)
|
96
|
-
|
97
118
|
# Manager now returns a ClassificationResult object
|
98
119
|
result_obj: ClassificationResult = manager.classify_item(
|
99
120
|
item_content=content,
|
100
121
|
labels=labels,
|
101
122
|
model_id=effective_model_id,
|
102
|
-
using=
|
123
|
+
using=engine,
|
103
124
|
min_confidence=min_confidence,
|
104
125
|
multi_label=multi_label,
|
105
126
|
**kwargs,
|
@@ -3,6 +3,7 @@ import logging
|
|
3
3
|
from dataclasses import dataclass
|
4
4
|
from datetime import datetime
|
5
5
|
from typing import Any, Dict, List, Optional
|
6
|
+
from collections.abc import Mapping
|
6
7
|
|
7
8
|
logger = logging.getLogger(__name__)
|
8
9
|
|
@@ -20,7 +21,7 @@ class CategoryScore:
|
|
20
21
|
|
21
22
|
|
22
23
|
@dataclass
|
23
|
-
class ClassificationResult:
|
24
|
+
class ClassificationResult(Mapping):
|
24
25
|
"""Results from a classification operation."""
|
25
26
|
|
26
27
|
category: Optional[str] # Can be None if scores are empty
|
@@ -86,3 +87,17 @@ class ClassificationResult:
|
|
86
87
|
|
87
88
|
def __repr__(self) -> str:
|
88
89
|
return f"<ClassificationResult category='{self.category}' score={self.score:.3f} model='{self.model_id}'>"
|
90
|
+
|
91
|
+
def __iter__(self):
|
92
|
+
"""Iterate over mapping keys (linked to ``to_dict`` so it stays in sync)."""
|
93
|
+
return iter(self.to_dict())
|
94
|
+
|
95
|
+
def __getitem__(self, key):
|
96
|
+
"""Dictionary-style access to attributes."""
|
97
|
+
try:
|
98
|
+
return self.to_dict()[key]
|
99
|
+
except KeyError as exc:
|
100
|
+
raise KeyError(key) from exc
|
101
|
+
|
102
|
+
def __len__(self):
|
103
|
+
return len(self.to_dict())
|
natural_pdf/cli.py
CHANGED
@@ -11,7 +11,9 @@ from packaging.requirements import Requirement
|
|
11
11
|
# ---------------------------------------------------------------------------
|
12
12
|
INSTALL_RECIPES: Dict[str, list[str]] = {
|
13
13
|
# heavyweight stacks
|
14
|
-
"paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2"],
|
14
|
+
"paddle": ["paddlepaddle>=3.0.0", "paddleocr>=3.0.1", "paddlex>=3.0.2", "pandas>=2.2.0"],
|
15
|
+
"numpy-high": ["numpy>=2.0"],
|
16
|
+
"numpy-low": ["numpy<1.27"],
|
15
17
|
"surya": ["surya-ocr>=0.13.0"],
|
16
18
|
"yolo": ["doclayout_yolo", "huggingface_hub>=0.29.3"],
|
17
19
|
"docling": ["docling"],
|
@@ -19,12 +21,13 @@ INSTALL_RECIPES: Dict[str, list[str]] = {
|
|
19
21
|
"deskew": [f"{__package__.split('.')[0]}[deskew]"],
|
20
22
|
"search": [f"{__package__.split('.')[0]}[search]"],
|
21
23
|
"easyocr": ["easyocr"],
|
24
|
+
"ai": [f"{__package__.split('.')[0]}[ai]"],
|
22
25
|
}
|
23
26
|
|
24
27
|
|
25
28
|
def _build_pip_install_args(requirements: list[str], upgrade: bool = True):
|
26
29
|
"""Return the pip command list to install/upgrade the given requirement strings."""
|
27
|
-
cmd = [sys.executable, "-m", "pip", "install"]
|
30
|
+
cmd = [sys.executable, "-m", "pip", "--quiet", "install"]
|
28
31
|
if upgrade:
|
29
32
|
cmd.append("--upgrade")
|
30
33
|
cmd.extend(requirements)
|
@@ -48,34 +51,13 @@ def cmd_install(args):
|
|
48
51
|
|
49
52
|
requirements = INSTALL_RECIPES[group_key]
|
50
53
|
|
51
|
-
# Skip paddlex upgrade if already satisfied
|
52
|
-
if group_key == "paddle":
|
53
|
-
try:
|
54
|
-
dist = distribution("paddlex")
|
55
|
-
from packaging.version import parse as V
|
56
|
-
if V(dist.version) >= V("3.0.2"):
|
57
|
-
print("✓ paddlex already ≥ 3.0.2 – nothing to do.")
|
58
|
-
continue
|
59
|
-
except PackageNotFoundError:
|
60
|
-
pass
|
61
|
-
|
62
54
|
# Special handling for paddle stack: install paddlepaddle & paddleocr first
|
63
55
|
# each in its own resolver run, then paddlex.
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
pip_cmd = _build_pip_install_args([req])
|
68
|
-
_run(pip_cmd)
|
69
|
-
|
70
|
-
# paddlex last to override the strict pin
|
71
|
-
pip_cmd = _build_pip_install_args(["paddlex==3.0.2"])
|
56
|
+
base_reqs = [r for r in requirements]
|
57
|
+
for req in base_reqs:
|
58
|
+
pip_cmd = _build_pip_install_args([req])
|
72
59
|
_run(pip_cmd)
|
73
|
-
|
74
|
-
else:
|
75
|
-
for req in requirements:
|
76
|
-
pip_cmd = _build_pip_install_args([req])
|
77
|
-
_run(pip_cmd)
|
78
|
-
print("✔ Finished installing extra dependencies for", group_key)
|
60
|
+
print("✔ Finished installing extra dependencies for", group_key)
|
79
61
|
|
80
62
|
|
81
63
|
def main():
|
@@ -727,6 +727,7 @@ class HighlightingService:
|
|
727
727
|
legend_position: str = "right",
|
728
728
|
render_ocr: bool = False,
|
729
729
|
resolution: Optional[float] = None,
|
730
|
+
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
730
731
|
**kwargs,
|
731
732
|
) -> Optional[Image.Image]:
|
732
733
|
"""
|
@@ -741,6 +742,9 @@ class HighlightingService:
|
|
741
742
|
legend_position: Position of the legend.
|
742
743
|
render_ocr: Whether to render OCR text.
|
743
744
|
resolution: Resolution for base page image rendering if width/height not used.
|
745
|
+
crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
|
746
|
+
space to crop the output image to, before legends or other overlays are
|
747
|
+
applied. If None, no cropping is performed.
|
744
748
|
**kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
|
745
749
|
|
746
750
|
Returns:
|
@@ -855,6 +859,25 @@ class HighlightingService:
|
|
855
859
|
)
|
856
860
|
rendered_image = renderer.render()
|
857
861
|
|
862
|
+
# --- Optional Cropping BEFORE legend addition ---
|
863
|
+
if crop_bbox is not None:
|
864
|
+
cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
|
865
|
+
# Convert to pixel coordinates using actual scales
|
866
|
+
left_px = int(cb_x0 * actual_scale_x) - 2
|
867
|
+
top_px = int(cb_top * actual_scale_y) - 2
|
868
|
+
right_px = int(cb_x1 * actual_scale_x) + 2
|
869
|
+
bottom_px = int(cb_bottom * actual_scale_y) + 2
|
870
|
+
|
871
|
+
# Safeguard coordinates within bounds
|
872
|
+
left_px = max(0, min(left_px, rendered_image.width - 1))
|
873
|
+
top_px = max(0, min(top_px, rendered_image.height - 1))
|
874
|
+
right_px = max(left_px + 1, min(right_px, rendered_image.width))
|
875
|
+
bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
|
876
|
+
|
877
|
+
rendered_image = rendered_image.crop(
|
878
|
+
(left_px, top_px, right_px, bottom_px)
|
879
|
+
)
|
880
|
+
|
858
881
|
legend = None
|
859
882
|
if labels:
|
860
883
|
preview_labels = {h.label: h.color for h in preview_highlights if h.label}
|
natural_pdf/core/page.py
CHANGED
@@ -2808,3 +2808,19 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2808
2808
|
return None
|
2809
2809
|
|
2810
2810
|
# --- End Skew Detection and Correction --- #
|
2811
|
+
|
2812
|
+
# ------------------------------------------------------------------
|
2813
|
+
# Unified analysis storage (maps to metadata["analysis"])
|
2814
|
+
# ------------------------------------------------------------------
|
2815
|
+
|
2816
|
+
@property
|
2817
|
+
def analyses(self) -> Dict[str, Any]:
|
2818
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2819
|
+
self.metadata = {}
|
2820
|
+
return self.metadata.setdefault("analysis", {})
|
2821
|
+
|
2822
|
+
@analyses.setter
|
2823
|
+
def analyses(self, value: Dict[str, Any]):
|
2824
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2825
|
+
self.metadata = {}
|
2826
|
+
self.metadata["analysis"] = value
|
natural_pdf/core/pdf.py
CHANGED
@@ -263,7 +263,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
263
263
|
|
264
264
|
self._initialize_managers()
|
265
265
|
self._initialize_highlighter()
|
266
|
-
self.analyses
|
266
|
+
# Analysis results accessed via self.analyses property (see below)
|
267
267
|
|
268
268
|
# --- Automatic cleanup when object is garbage-collected ---
|
269
269
|
self._finalizer = weakref.finalize(
|
@@ -275,56 +275,42 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
275
275
|
)
|
276
276
|
|
277
277
|
def _initialize_managers(self):
|
278
|
-
"""
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
# Resolve the entry in DEFAULT_MANAGERS which can be:
|
283
|
-
# 1. A class -> instantiate directly
|
284
|
-
# 2. A factory (callable) returning a class -> call then instantiate
|
285
|
-
# 3. A factory returning a **ready instance** -> use as-is
|
286
|
-
|
287
|
-
resolved = manager_class_or_factory
|
288
|
-
|
289
|
-
# If we have a callable that is *not* a class, call it to obtain the real target
|
290
|
-
# (This is the lazy-import factory case.)
|
291
|
-
if not isinstance(resolved, type) and callable(resolved):
|
292
|
-
resolved = resolved()
|
293
|
-
|
294
|
-
# At this point `resolved` is either a class or an already-created instance
|
295
|
-
if isinstance(resolved, type):
|
296
|
-
instance = resolved() # Instantiate class
|
297
|
-
self._managers[key] = instance
|
298
|
-
logger.debug(f"Initialized manager for key '{key}': {resolved.__name__}")
|
299
|
-
else:
|
300
|
-
# Assume factory already returned an instance
|
301
|
-
self._managers[key] = resolved
|
302
|
-
logger.debug(
|
303
|
-
f"Initialized manager instance for key '{key}': {type(resolved).__name__} (factory-provided instance)"
|
304
|
-
)
|
305
|
-
except Exception as e:
|
306
|
-
logger.error(f"Failed to initialize manager for key '{key}': {e}")
|
307
|
-
self._managers[key] = None
|
278
|
+
"""Set up manager factories for lazy instantiation."""
|
279
|
+
# Store factories/classes for each manager key
|
280
|
+
self._manager_factories = dict(DEFAULT_MANAGERS)
|
281
|
+
self._managers = {} # Will hold instantiated managers
|
308
282
|
|
309
283
|
def get_manager(self, key: str) -> Any:
|
310
|
-
"""Retrieve a manager instance by its key."""
|
311
|
-
|
284
|
+
"""Retrieve a manager instance by its key, instantiating it lazily if needed."""
|
285
|
+
# Check if already instantiated
|
286
|
+
if key in self._managers:
|
287
|
+
manager_instance = self._managers[key]
|
288
|
+
if manager_instance is None:
|
289
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize previously.")
|
290
|
+
return manager_instance
|
291
|
+
|
292
|
+
# Not instantiated yet: get factory/class
|
293
|
+
if not hasattr(self, "_manager_factories") or key not in self._manager_factories:
|
312
294
|
raise KeyError(
|
313
|
-
f"No manager registered for key '{key}'. Available: {list(self.
|
295
|
+
f"No manager registered for key '{key}'. Available: {list(getattr(self, '_manager_factories', {}).keys())}"
|
314
296
|
)
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
)
|
297
|
+
factory_or_class = self._manager_factories[key]
|
298
|
+
try:
|
299
|
+
resolved = factory_or_class
|
300
|
+
# If it's a callable that's not a class, call it to get the class/instance
|
301
|
+
if not isinstance(resolved, type) and callable(resolved):
|
302
|
+
resolved = resolved()
|
303
|
+
# If it's a class, instantiate it
|
304
|
+
if isinstance(resolved, type):
|
305
|
+
instance = resolved()
|
324
306
|
else:
|
325
|
-
|
326
|
-
|
327
|
-
|
307
|
+
instance = resolved # Already an instance
|
308
|
+
self._managers[key] = instance
|
309
|
+
return instance
|
310
|
+
except Exception as e:
|
311
|
+
logger.error(f"Failed to initialize manager for key '{key}': {e}")
|
312
|
+
self._managers[key] = None
|
313
|
+
raise RuntimeError(f"Manager '{key}' failed to initialize: {e}") from e
|
328
314
|
|
329
315
|
def _initialize_highlighter(self):
|
330
316
|
pass
|
@@ -1504,7 +1490,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1504
1490
|
if not is_classification_available():
|
1505
1491
|
raise ImportError(
|
1506
1492
|
"Classification dependencies missing. "
|
1507
|
-
'Install with: pip install "natural-pdf[
|
1493
|
+
'Install with: pip install "natural-pdf[ai]"'
|
1508
1494
|
)
|
1509
1495
|
raise ClassificationError("ClassificationManager not available.")
|
1510
1496
|
|
@@ -1816,6 +1802,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1816
1802
|
|
1817
1803
|
# --- End Classification Mixin Implementation ---
|
1818
1804
|
|
1805
|
+
# ------------------------------------------------------------------
|
1806
|
+
# Unified analysis storage (maps to metadata["analysis"])
|
1807
|
+
# ------------------------------------------------------------------
|
1808
|
+
|
1809
|
+
@property
|
1810
|
+
def analyses(self) -> Dict[str, Any]:
|
1811
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
1812
|
+
# For PDF, metadata property returns self._pdf.metadata which may be None
|
1813
|
+
self._pdf.metadata = self._pdf.metadata or {}
|
1814
|
+
if self.metadata is None:
|
1815
|
+
# Fallback safeguard
|
1816
|
+
self._pdf.metadata = {}
|
1817
|
+
return self.metadata.setdefault("analysis", {}) # type: ignore[attr-defined]
|
1818
|
+
|
1819
|
+
@analyses.setter
|
1820
|
+
def analyses(self, value: Dict[str, Any]):
|
1821
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
1822
|
+
self._pdf.metadata = self._pdf.metadata or {}
|
1823
|
+
self.metadata["analysis"] = value # type: ignore[attr-defined]
|
1824
|
+
|
1819
1825
|
# Static helper for weakref.finalize to avoid capturing 'self'
|
1820
1826
|
@staticmethod
|
1821
1827
|
def _finalize_cleanup(plumber_pdf, temp_file_obj, is_stream):
|
@@ -1830,5 +1836,5 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1830
1836
|
path = temp_file_obj.name if hasattr(temp_file_obj, "name") else None
|
1831
1837
|
if path and os.path.exists(path):
|
1832
1838
|
os.unlink(path)
|
1833
|
-
except Exception:
|
1834
|
-
|
1839
|
+
except Exception as e:
|
1840
|
+
logger.warning(f"Failed to clean up temporary file '{path}': {e}")
|
natural_pdf/describe/base.py
CHANGED
@@ -292,8 +292,8 @@ def _extract_element_value(element: "Element", column: str) -> Any:
|
|
292
292
|
try:
|
293
293
|
if column == 'text':
|
294
294
|
text = getattr(element, 'text', '')
|
295
|
-
if text and len(text) >
|
296
|
-
return text[:
|
295
|
+
if text and len(text) > 60:
|
296
|
+
return text[:60] + "..."
|
297
297
|
return text or ""
|
298
298
|
|
299
299
|
elif column == 'page':
|
natural_pdf/describe/elements.py
CHANGED
@@ -396,7 +396,7 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
|
|
396
396
|
text = getattr(element, 'text', '').strip()
|
397
397
|
if text:
|
398
398
|
# Truncate long text
|
399
|
-
display_text = text[:
|
399
|
+
display_text = text[:60] + "..." if len(text) > 60 else text
|
400
400
|
element_confidences.append((confidence, display_text))
|
401
401
|
|
402
402
|
if element_confidences:
|
natural_pdf/elements/base.py
CHANGED
@@ -9,11 +9,13 @@ from PIL import Image
|
|
9
9
|
# Import selector parsing functions
|
10
10
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
11
11
|
from natural_pdf.describe.mixin import DescribeMixin
|
12
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
12
13
|
|
13
14
|
if TYPE_CHECKING:
|
14
15
|
from natural_pdf.core.page import Page
|
15
16
|
from natural_pdf.elements.collections import ElementCollection
|
16
17
|
from natural_pdf.elements.region import Region
|
18
|
+
from natural_pdf.classification.manager import ClassificationManager # noqa: F401
|
17
19
|
|
18
20
|
|
19
21
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
@@ -413,7 +415,7 @@ class DirectionalMixin:
|
|
413
415
|
return new_region
|
414
416
|
|
415
417
|
|
416
|
-
class Element(DirectionalMixin, DescribeMixin):
|
418
|
+
class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
417
419
|
"""
|
418
420
|
Base class for all PDF elements.
|
419
421
|
|
@@ -432,6 +434,10 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
432
434
|
self._obj = obj
|
433
435
|
self._page = page
|
434
436
|
|
437
|
+
# Containers for per-element metadata and analysis results (e.g., classification)
|
438
|
+
self.metadata: Dict[str, Any] = {}
|
439
|
+
# Access analysis results via self.analyses property (see below)
|
440
|
+
|
435
441
|
@property
|
436
442
|
def type(self) -> str:
|
437
443
|
"""Element type."""
|
@@ -850,6 +856,7 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
850
856
|
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
851
857
|
label: Optional[str] = None,
|
852
858
|
width: Optional[int] = None, # Add width parameter
|
859
|
+
crop: bool = False, # NEW: Crop to element bounds before legend
|
853
860
|
) -> Optional["Image.Image"]:
|
854
861
|
"""
|
855
862
|
Show the page with only this element highlighted temporarily.
|
@@ -861,6 +868,8 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
861
868
|
color: Color to highlight this element (default: red)
|
862
869
|
label: Optional label for this element in the legend
|
863
870
|
width: Optional width for the output image in pixels
|
871
|
+
crop: If True, crop the rendered image to this element's
|
872
|
+
bounding box before legends/overlays are added.
|
864
873
|
|
865
874
|
Returns:
|
866
875
|
PIL Image of the page with only this element highlighted, or None if error.
|
@@ -887,6 +896,9 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
887
896
|
"use_color_cycling": False, # Explicitly false for single preview
|
888
897
|
}
|
889
898
|
|
899
|
+
# Determine crop bbox
|
900
|
+
crop_bbox = self.bbox if crop else None
|
901
|
+
|
890
902
|
# Check if we actually got geometry data
|
891
903
|
if temp_highlight_data["bbox"] is None and temp_highlight_data["polygon"] is None:
|
892
904
|
logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
|
@@ -901,6 +913,7 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
901
913
|
width=width, # Pass the width parameter
|
902
914
|
labels=labels,
|
903
915
|
legend_position=legend_position,
|
916
|
+
crop_bbox=crop_bbox,
|
904
917
|
)
|
905
918
|
except Exception as e:
|
906
919
|
logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
|
@@ -1070,3 +1083,68 @@ class Element(DirectionalMixin, DescribeMixin):
|
|
1070
1083
|
case=case,
|
1071
1084
|
**kwargs,
|
1072
1085
|
)
|
1086
|
+
|
1087
|
+
# ------------------------------------------------------------------
|
1088
|
+
# ClassificationMixin requirements
|
1089
|
+
# ------------------------------------------------------------------
|
1090
|
+
|
1091
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
1092
|
+
"""Access the shared ClassificationManager via the parent PDF."""
|
1093
|
+
if (
|
1094
|
+
not hasattr(self, "page")
|
1095
|
+
or not hasattr(self.page, "pdf")
|
1096
|
+
or not hasattr(self.page.pdf, "get_manager")
|
1097
|
+
):
|
1098
|
+
raise AttributeError(
|
1099
|
+
"ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
|
1100
|
+
)
|
1101
|
+
|
1102
|
+
return self.page.pdf.get_manager("classification")
|
1103
|
+
|
1104
|
+
def _get_classification_content(self, model_type: str, **kwargs): # type: ignore[override]
|
1105
|
+
"""Return either text or an image, depending on model_type (text|vision)."""
|
1106
|
+
if model_type == "text":
|
1107
|
+
text_content = self.extract_text(layout=False) # type: ignore[arg-type]
|
1108
|
+
if not text_content or text_content.isspace():
|
1109
|
+
raise ValueError(
|
1110
|
+
"Cannot classify element with 'text' model: No text content found."
|
1111
|
+
)
|
1112
|
+
return text_content
|
1113
|
+
|
1114
|
+
elif model_type == "vision":
|
1115
|
+
# Delegate to Region implementation via a temporary expand()
|
1116
|
+
resolution = kwargs.get("resolution", 150)
|
1117
|
+
from natural_pdf.elements.region import Region # Local import to avoid cycles
|
1118
|
+
|
1119
|
+
return self.expand().to_image(
|
1120
|
+
resolution=resolution,
|
1121
|
+
include_highlights=False,
|
1122
|
+
crop=True,
|
1123
|
+
)
|
1124
|
+
else:
|
1125
|
+
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
1126
|
+
|
1127
|
+
# ------------------------------------------------------------------
|
1128
|
+
# Lightweight to_image proxy (vision models, previews, etc.)
|
1129
|
+
# ------------------------------------------------------------------
|
1130
|
+
|
1131
|
+
def to_image(self, *args, **kwargs): # type: ignore[override]
|
1132
|
+
"""Generate an image of this element by delegating to a temporary Region."""
|
1133
|
+
return self.expand().to_image(*args, **kwargs)
|
1134
|
+
|
1135
|
+
# ------------------------------------------------------------------
|
1136
|
+
# Unified analysis storage (maps to metadata["analysis"])
|
1137
|
+
# ------------------------------------------------------------------
|
1138
|
+
|
1139
|
+
@property
|
1140
|
+
def analyses(self) -> Dict[str, Any]:
|
1141
|
+
"""Dictionary holding model-generated analysis objects (classification, extraction, …)."""
|
1142
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
1143
|
+
self.metadata = {}
|
1144
|
+
return self.metadata.setdefault("analysis", {})
|
1145
|
+
|
1146
|
+
@analyses.setter
|
1147
|
+
def analyses(self, value: Dict[str, Any]):
|
1148
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
1149
|
+
self.metadata = {}
|
1150
|
+
self.metadata["analysis"] = value
|
@@ -852,6 +852,7 @@ class ElementCollection(
|
|
852
852
|
render_ocr: bool = False,
|
853
853
|
width: Optional[int] = None, # Add width parameter
|
854
854
|
page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
|
855
|
+
crop: bool = False, # NEW: If True, crop output to element bounds
|
855
856
|
) -> Optional["Image.Image"]:
|
856
857
|
"""
|
857
858
|
Generates a temporary preview image highlighting elements in this collection
|
@@ -875,6 +876,9 @@ class ElementCollection(
|
|
875
876
|
legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
|
876
877
|
render_ocr: Whether to render OCR text.
|
877
878
|
width: Optional width for the output image in pixels.
|
879
|
+
crop: If True, crop the resulting image to the tight bounding box
|
880
|
+
containing all elements in the collection. The elements are
|
881
|
+
still highlighted first, then the image is cropped.
|
878
882
|
|
879
883
|
Returns:
|
880
884
|
PIL Image object of the temporary preview, or None if rendering fails or
|
@@ -931,7 +935,23 @@ class ElementCollection(
|
|
931
935
|
|
932
936
|
# 2. Call render_preview on the HighlightingService
|
933
937
|
try:
|
934
|
-
|
938
|
+
# Calculate crop bounding box in PDF coordinates if crop is requested
|
939
|
+
crop_bbox = None
|
940
|
+
if crop:
|
941
|
+
try:
|
942
|
+
crop_bbox = (
|
943
|
+
min(el.x0 for el in self._elements),
|
944
|
+
min(el.top for el in self._elements),
|
945
|
+
max(el.x1 for el in self._elements),
|
946
|
+
max(el.bottom for el in self._elements),
|
947
|
+
)
|
948
|
+
except Exception as bbox_err:
|
949
|
+
logger.error(
|
950
|
+
f"Error determining crop bbox for collection show: {bbox_err}",
|
951
|
+
exc_info=True,
|
952
|
+
)
|
953
|
+
|
954
|
+
img = service.render_preview(
|
935
955
|
page_index=page.index,
|
936
956
|
temporary_highlights=highlight_data_list,
|
937
957
|
scale=scale,
|
@@ -939,7 +959,9 @@ class ElementCollection(
|
|
939
959
|
labels=labels, # Use 'labels'
|
940
960
|
legend_position=legend_position,
|
941
961
|
render_ocr=render_ocr,
|
962
|
+
crop_bbox=crop_bbox,
|
942
963
|
)
|
964
|
+
return img
|
943
965
|
except Exception as e:
|
944
966
|
logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
|
945
967
|
return None
|