natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
natural_pdf/__init__.py
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
"""
|
2
|
+
Natural PDF - A more intuitive interface for working with PDFs.
|
3
|
+
"""
|
4
|
+
import logging
|
5
|
+
|
6
|
+
# Create library logger
|
7
|
+
logger = logging.getLogger("natural_pdf")
|
8
|
+
|
9
|
+
# Add a NullHandler to prevent "No handler found" warnings
|
10
|
+
# (Best practice for libraries)
|
11
|
+
logger.addHandler(logging.NullHandler())
|
12
|
+
|
13
|
+
# Utility function for users to easily configure logging
|
14
|
+
def configure_logging(level=logging.INFO, handler=None):
    """Configure Natural PDF's logging.

    Safe to call repeatedly: the default stream handler is created at most
    once, so re-configuring (e.g. to change the level) never duplicates output.

    Args:
        level: The logging level (e.g., logging.INFO, logging.DEBUG)
        handler: A custom handler, or None to use StreamHandler
    """
    # Resolve by name so this always targets the same logger object as the
    # module-level ``logger``.
    lib_logger = logging.getLogger("natural_pdf")

    # Remove every NullHandler, not just one sitting at index 0.
    for existing in list(lib_logger.handlers):
        if isinstance(existing, logging.NullHandler):
            lib_logger.removeHandler(existing)

    if handler is None:
        # Reuse the default handler installed by an earlier call, if any.
        handler = next(
            (h for h in lib_logger.handlers if getattr(h, "_natural_pdf_default", False)),
            None,
        )
        if handler is None:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            # Marker that lets later calls recognise this handler.
            handler._natural_pdf_default = True

    # addHandler is a no-op when the handler is already attached.
    lib_logger.addHandler(handler)
    lib_logger.setLevel(level)

    # Propagate level to all child loggers. Iterate over a snapshot because
    # getLogger() may touch the manager's registry while we loop.
    for name in list(logging.root.manager.loggerDict):
        if name.startswith("natural_pdf."):
            logging.getLogger(name).setLevel(level)
|
37
|
+
|
38
|
+
from natural_pdf.core.pdf import PDF
|
39
|
+
from natural_pdf.core.page import Page
|
40
|
+
from natural_pdf.elements.region import Region
|
41
|
+
from natural_pdf.elements.collections import ElementCollection
|
42
|
+
|
43
|
+
# Question answering is optional: expose it only when natural_pdf.qa imports.
try:
    from natural_pdf.qa import DocumentQA, get_qa_engine
except ImportError:
    HAS_QA = False
else:
    HAS_QA = True

__version__ = "0.1.0"

# Public API; the QA names are appended only when they were imported above.
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
if HAS_QA:
    __all__ += ["DocumentQA", "get_qa_engine"]
|
@@ -0,0 +1 @@
|
|
1
|
+
from .base import LayoutDetector
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# natural_pdf/analyzers/layout/base.py
|
2
|
+
import logging
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from typing import Dict, List, Any, Optional, Set, Union
|
5
|
+
from PIL import Image
|
6
|
+
|
7
|
+
# The real options classes live in the sibling ``layout_options`` module.
try:
    from .layout_options import BaseLayoutOptions
except ImportError:
    class BaseLayoutOptions:
        """Stand-in used when ``layout_options`` cannot be imported.

        Carries the attributes the layout detectors read from an options
        object, so standalone use does not fail with AttributeError.
        """

        # NOTE(review): these defaults are placeholders, not the real
        # dataclass defaults -- confirm against layout_options.BaseLayoutOptions.
        def __init__(self, confidence=0.5, classes=None, exclude_classes=None,
                     device=None, extra_args=None):
            self.confidence = confidence
            self.classes = classes
            self.exclude_classes = exclude_classes
            self.device = device
            # Fresh dict per instance; never share a mutable default.
            self.extra_args = {} if extra_args is None else extra_args
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
class LayoutDetector(ABC):
    """
    Abstract Base Class for layout detection engines.

    Subclasses should implement is_available, _load_model_from_options, detect,
    and override _get_cache_key if model loading depends on options beyond device.
    They should also populate the 'supported_classes' set.
    """

    def __init__(self):
        """Initializes the base layout detector."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self.supported_classes: Set[str] = set()  # Subclasses should populate this
        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models

    @abstractmethod
    def detect(self, image: "Image.Image", options: "BaseLayoutOptions") -> List[Dict[str, Any]]:
        """
        Detect layout elements in a given PIL Image.

        Args:
            image: PIL Image of the page to analyze.
            options: An instance of a dataclass inheriting from BaseLayoutOptions
                     containing configuration for this run.

        Returns:
            List of standardized detection dictionaries with at least:
            - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image size
            - 'class': str - Original class name from the model
            - 'confidence': float - Confidence score (0.0-1.0)
            - 'normalized_class': str - Hyphenated, lowercase class name
            - 'model': str - Name of the model used (e.g., 'yolo', 'tatr')
            - 'source': str - Usually 'layout'
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def is_available(self) -> bool:
        """
        Check if the detector's dependencies are installed and usable.

        Returns:
            True if the detector is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: "BaseLayoutOptions") -> str:
        """
        Generates a cache key for model loading based on relevant options.
        Subclasses MUST override this to include options that change the loaded model
        (e.g., model path, model name, specific configurations like TATR structure model).

        Args:
            options: The options dataclass instance.

        Returns:
            A string cache key of the form "<ClassName>_<device>".
        """
        # Base key only includes device, subclasses MUST add model specifics.
        # An options object without a 'device' attribute is keyed like device=None.
        device_key = str(getattr(options, "device", None)).lower()
        return f"{self.__class__.__name__}_{device_key}"

    def _get_model(self, options: "BaseLayoutOptions") -> Any:
        """
        Gets or initializes the underlying model based on options, using caching.
        Subclasses must implement _load_model_from_options.

        Raises:
            RuntimeError: If is_available() reports that dependencies are missing.
            Exception: Whatever _load_model_from_options raises, re-raised after logging.
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._model_cache:
            self.logger.debug(f"Using cached model for key: {cache_key}")
            return self._model_cache[cache_key]

        self.logger.info(f"Loading model for cache key: {cache_key}")
        try:
            # Ensure dependencies are met before loading
            if not self.is_available():
                raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
            self._model_cache[cache_key] = self._load_model_from_options(options)
            self.logger.info(f"Model loaded successfully for key: {cache_key}")
        except Exception as e:
            self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
            # Remove potentially corrupted cache entry
            self._model_cache.pop(cache_key, None)
            raise  # Re-raise exception after logging
        return self._model_cache[cache_key]

    @abstractmethod
    def _load_model_from_options(self, options: "BaseLayoutOptions") -> Any:
        """
        Abstract method for subclasses to implement the actual model loading logic
        based on the provided options. Should return the loaded model object(s).
        Should handle necessary imports internally.
        """
        raise NotImplementedError("Subclasses must implement _load_model_from_options")

    def _normalize_class_name(self, name: str) -> str:
        """Convert class names with spaces/underscores to hyphenated lowercase format."""
        if not isinstance(name, str):
            name = str(name)  # Ensure string
        return name.lower().replace(' ', '-').replace('_', '-')

    def validate_classes(self, classes: List[str]) -> None:
        """
        Validate that the requested classes are supported by this detector.

        Validation is skipped (with a warning) when the detector has not
        declared any supported classes, and is a no-op for an empty request.

        Args:
            classes: List of class names to validate.

        Raises:
            ValueError: If any class is not supported.
        """
        if not self.supported_classes:
            self.logger.warning("Supported classes not defined for this detector. Skipping class validation.")
            return

        if not classes:
            return

        # Normalize both requested and supported classes for comparison
        normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
        normalized_requested = {self._normalize_class_name(c) for c in classes}
        unsupported_normalized = normalized_requested - normalized_supported

        if unsupported_normalized:
            # Find original names of unsupported classes for better error message
            unsupported_original = [
                c for c in classes if self._normalize_class_name(c) in unsupported_normalized
            ]
            raise ValueError(f"Classes not supported by {self.__class__.__name__}: {unsupported_original}. "
                             f"Supported (normalized): {sorted(normalized_supported)}")

    def __del__(self):
        """Cleanup resources: drop all cached models.

        Must never raise. A finalizer also runs for instances whose __init__
        did not complete (so the attributes may be missing) and during
        interpreter shutdown (when logging may already be torn down).
        """
        try:
            log = getattr(self, "logger", None)
            if log is not None:
                log.info(f"Cleaning up {self.__class__.__name__} resources.")
            cache = getattr(self, "_model_cache", None)
            if cache is not None:
                # Clear model cache to free up memory/GPU resources if models are large
                cache.clear()
        except Exception:
            # Best-effort cleanup; errors inside a finalizer cannot be handled by callers.
            pass
|
151
|
+
|
@@ -0,0 +1,247 @@
|
|
1
|
+
# natural_pdf/analyzers/layout/docling.py
|
2
|
+
import logging
|
3
|
+
import importlib.util
|
4
|
+
import os
|
5
|
+
import tempfile
|
6
|
+
from typing import List, Dict, Any, Optional
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
# Package-relative imports. If they fail (e.g. the module is run standalone),
# inert placeholders keep the module importable.
try:
    from .base import LayoutDetector
    from .layout_options import DoclingLayoutOptions, BaseLayoutOptions
except ImportError:
    class BaseLayoutOptions:
        """Placeholder options base class."""

    class DoclingLayoutOptions(BaseLayoutOptions):
        """Placeholder Docling options class."""

    class LayoutDetector:
        """Placeholder detector base exposing the members the subclass touches."""

        def __init__(self):
            self.logger = logging.getLogger()
            self.supported_classes = set()
            self._model_cache = {}

        def _get_model(self, options):
            raise NotImplementedError

        def _normalize_class_name(self, n):
            return n

        def validate_classes(self, c):
            pass

    # Deliberately no logging.basicConfig() here: a library module must not
    # reconfigure the root logger as an import side effect.

logger = logging.getLogger(__name__)

# Check for dependency. DocumentConverter stays None when docling is missing or
# fails to import; DoclingLayoutDetector.is_available() relies on that.
docling_spec = importlib.util.find_spec("docling")
DocumentConverter = None
if docling_spec:
    try:
        from docling.document_converter import DocumentConverter
    except ImportError as e:
        logger.warning(f"Could not import Docling dependencies: {e}")
else:
    logger.warning("docling not found. DoclingLayoutDetector will not be available.")
|
36
|
+
|
37
|
+
|
38
|
+
class DoclingLayoutDetector(LayoutDetector):
    """Document layout and text recognition using Docling."""

    def __init__(self):
        super().__init__()
        # Docling classes are dynamic/hierarchical, define common ones
        self.supported_classes = {
            'Header', 'Footer', 'Paragraph', 'Heading', 'List', 'ListItem',
            'Table', 'Figure', 'Caption', 'Footnote', 'PageNumber', 'Equation',
            'Code', 'Title', 'Author', 'Abstract', 'Section', 'Unknown', 'Metadata'  # Add more as needed
        }
        self._docling_document_cache = {}  # Cache the output doc per image/options if needed
        # Most recent document produced by detect(); read by get_docling_document().
        self._docling_document = None

    def is_available(self) -> bool:
        """Check if docling is installed."""
        return DocumentConverter is not None

    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
        """Generate cache key based on device and potentially converter args."""
        if not isinstance(options, DoclingLayoutOptions):
            options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)

        device_key = str(options.device).lower() if options.device else 'default_device'
        # Include hash of extra_args if they affect model loading/converter init
        extra_args = options.extra_args or {}
        try:
            extra_args_key = hash(frozenset(extra_args.items()))
        except TypeError:
            # Values such as lists or dicts are unhashable; key on their repr instead.
            extra_args_key = hash(repr(sorted(extra_args.items(), key=lambda item: str(item[0]))))
        return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"

    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
        """Load the Docling DocumentConverter.

        Raises:
            RuntimeError: If docling is not installed.
            TypeError: If options is not a DoclingLayoutOptions instance.
        """
        if not self.is_available():
            raise RuntimeError("Docling dependency not installed.")

        if not isinstance(options, DoclingLayoutOptions):
            raise TypeError("Incorrect options type provided for Docling model loading.")

        self.logger.info("Initializing Docling DocumentConverter...")
        try:
            # Copy so the converter never sees (or mutates) the caller's dict.
            converter_args = dict(options.extra_args or {})

            converter = DocumentConverter(**converter_args)
            self.logger.info("Docling DocumentConverter initialized.")
            return converter
        except Exception as e:
            self.logger.error(f"Failed to initialize Docling DocumentConverter: {e}", exc_info=True)
            raise

    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
        """Detect document structure and text using Docling.

        The resulting DoclingDocument is kept on the instance so that
        get_docling_document() can return it.

        Raises:
            RuntimeError: If docling is not installed.
            Exception: Any error raised while saving the image or converting it,
                re-raised after logging.
        """
        if not self.is_available():
            raise RuntimeError("Docling dependency not installed.")

        if not isinstance(options, DoclingLayoutOptions):
            self.logger.warning("Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults.")
            options = DoclingLayoutOptions(
                confidence=options.confidence, classes=options.classes,
                exclude_classes=options.exclude_classes, device=options.device,
                extra_args=options.extra_args, verbose=options.extra_args.get('verbose', False)
            )

        # Class validation is intentionally skipped: Docling labels are case-sensitive.

        # Get the cached/loaded converter instance
        converter = self._get_model(options)

        # Docling convert method requires an image path. Save temp file.
        # The TemporaryDirectory context removes the file on exit.
        detections = []
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
            try:
                self.logger.debug(f"Saving temporary image for Docling detector to: {temp_image_path}")
                image.convert("RGB").save(temp_image_path)  # Ensure RGB

                # Convert the document using Docling's DocumentConverter
                self.logger.debug("Running Docling conversion...")
                # Docling convert returns a Result object with a 'document' attribute
                result = converter.convert(temp_image_path)
                docling_doc = result.document
                self._docling_document = docling_doc
                self.logger.info("Docling conversion complete.")

                # Convert Docling document to our detection format
                detections = self._convert_docling_to_detections(docling_doc, options)

            except Exception as e:
                self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
                raise  # Re-raise the exception

        self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
        return detections

    def _convert_docling_to_detections(self, doc, options: DoclingLayoutOptions) -> List[Dict[str, Any]]:
        """Convert a Docling document to our standard detection format.

        Elements without provenance, without a bbox, on a page with no size,
        filtered out by class, or below the confidence threshold are skipped.
        An element that fails to convert is logged and skipped.
        """
        if not doc or not hasattr(doc, 'pages') or not doc.pages:
            self.logger.warning("Invalid or empty Docling document for conversion.")
            return []

        detections = []

        # Prepare normalized class filters once
        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()

        # Gather elements from the flat lists exposed by the document.
        elements_to_process = []
        for list_name in ('texts', 'tables', 'pictures'):
            if hasattr(doc, list_name):
                elements_to_process.extend(getattr(doc, list_name))
        # Add other element types from DoclingDocument as needed

        self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")

        for elem in elements_to_process:
            try:
                # Get Provenance (bbox and page number)
                if not hasattr(elem, 'prov') or not elem.prov:
                    continue
                prov = elem.prov[0]  # Use first provenance
                if not hasattr(prov, 'bbox') or not prov.bbox:
                    continue
                bbox = prov.bbox
                page_no = prov.page_no

                # Get Page Dimensions (crucial for coordinate conversion)
                page = doc.pages.get(page_no)
                if not hasattr(page, 'size'):
                    continue
                page_height = page.size.height

                # Convert from a bottom-left origin (bbox l, b, r, t) to a
                # top-left origin.
                # NOTE(review): assumes the bbox really uses a bottom-left origin;
                # if Docling reports a top-left origin for image inputs this flip
                # is wrong -- confirm against bbox.coord_origin.
                x0 = float(bbox.l)
                x1 = float(bbox.r)
                y0 = float(page_height - bbox.t)  # Top y
                y1 = float(page_height - bbox.b)  # Bottom y

                # Ensure y0 < y1 and x0 < x1
                if y0 > y1:
                    y0, y1 = y1, y0
                if x0 > x1:
                    x0, x1 = x1, x0

                # Get Class Label
                label_orig = str(getattr(elem, 'label', 'Unknown'))  # Default if no label
                normalized_label = self._normalize_class_name(label_orig)

                # Apply Class Filtering
                if normalized_classes_req and normalized_label not in normalized_classes_req:
                    continue
                if normalized_label in normalized_classes_excl:
                    continue

                # Get Confidence (Docling often doesn't provide per-element confidence)
                confidence = getattr(elem, 'confidence', 0.95)  # Assign default confidence
                if confidence < options.confidence:
                    continue  # Apply confidence threshold

                # Get Text Content
                text_content = getattr(elem, 'text', None)

                # Get IDs for hierarchy
                docling_id = getattr(elem, 'self_ref', None)
                parent_id_obj = getattr(elem, 'parent', None)
                parent_id = getattr(parent_id_obj, 'self_ref', None) if parent_id_obj else None

                # Create Detection Dictionary
                detections.append({
                    'bbox': (x0, y0, x1, y1),
                    'class': label_orig,
                    'normalized_class': normalized_label,
                    'confidence': confidence,
                    'text': text_content,
                    'docling_id': docling_id,
                    'parent_id': parent_id,
                    'page_number': page_no,  # Add page number if useful
                    'source': 'layout',
                    'model': 'docling'
                })

            except Exception as conv_e:
                self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
                continue

        return detections

    def get_docling_document(self, image: Image.Image, options: BaseLayoutOptions):
        """
        Get the raw DoclingDocument object after running detection.

        Detection is always re-run for the given image/options; the document
        stored by that run is returned.
        """
        self.logger.warning("get_docling_document: Re-running detection to ensure document is generated.")
        self.detect(image, options)  # Run detect to populate internal doc
        return getattr(self, '_docling_document', None)  # Return the stored doc
|
247
|
+
|
@@ -0,0 +1,166 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import List, Dict, Any, Optional, Union
|
3
|
+
from PIL import Image
|
4
|
+
|
5
|
+
from natural_pdf.elements.region import Region
|
6
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
7
|
+
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
class LayoutAnalyzer:
    """
    Handles layout analysis for PDF pages, including image rendering,
    coordinate scaling, region creation, and result storage.
    """

    # Same logger object as the module-level ``logger`` (both resolve __name__).
    _log = logging.getLogger(__name__)

    def __init__(self, page, layout_manager: "Optional[LayoutManager]" = None):
        """
        Initialize the layout analyzer.

        Args:
            page: The Page object to analyze
            layout_manager: Optional LayoutManager instance. If None, will try to get from page's parent.
        """
        self._page = page
        self._layout_manager = layout_manager or getattr(page._parent, '_layout_manager', None)

        if not self._layout_manager:
            self._log.warning(f"LayoutManager not available for page {page.number}. Layout analysis will fail.")

    def analyze_layout(
        self,
        engine: Optional[str] = None,
        options: "Optional[LayoutOptions]" = None,
        confidence: Optional[float] = None,
        classes: Optional[List[str]] = None,
        exclude_classes: Optional[List[str]] = None,
        device: Optional[str] = None,
        existing: str = "replace"
    ) -> "List[Region]":
        """
        Analyze the page layout using the configured LayoutManager.

        Args:
            engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
            options: Specific LayoutOptions object for advanced configuration.
            confidence: Minimum confidence threshold (simple mode).
            classes: Specific classes to detect (simple mode).
            exclude_classes: Classes to exclude (simple mode).
            device: Device for inference (simple mode).
            existing: How to handle existing detected regions: 'replace' (default) or 'append'.
                Any other value is treated as 'replace' and logged as a warning.

        Returns:
            List of created Region objects. Empty when no LayoutManager is
            available, rendering fails, the rendered image has a zero
            dimension, or the layout manager raises.
        """
        log = self._log
        page = self._page

        if not self._layout_manager:
            log.error(f"Page {page.number}: LayoutManager not available. Cannot analyze layout.")
            return []

        log.info(f"Page {page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")

        # --- Render Page Image ---
        log.debug(f" Rendering page {page.number} to image for layout analysis...")
        try:
            # Scale is relative to 72 DPI; the default 1.5 gives ~108 DPI.
            layout_scale = getattr(page._parent, '_config', {}).get('layout_image_scale', 1.5)
            layout_resolution = layout_scale * 72
            # Render without existing highlights to avoid interference
            page_image = page.to_image(resolution=layout_resolution, include_highlights=False)
            log.debug(f" Rendered image size: {page_image.width}x{page_image.height}")
        except Exception as e:
            log.error(f" Failed to render page {page.number} to image: {e}", exc_info=True)
            return []

        # Reject a degenerate image before the expensive model call; its
        # detections could not be scaled back to PDF space anyway.
        if page_image.width == 0 or page_image.height == 0:
            log.error(f"Page {page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
            return []

        # --- Prepare Arguments for Layout Manager ---
        manager_args = {'image': page_image, 'options': options, 'engine': engine}
        simple_overrides = {
            'confidence': confidence,
            'classes': classes,
            'exclude_classes': exclude_classes,
            'device': device,
        }
        # Only forward the simple-mode arguments the caller actually set.
        manager_args.update({key: value for key, value in simple_overrides.items() if value is not None})

        # --- Call Layout Manager ---
        log.debug(" Calling Layout Manager...")
        try:
            detections = self._layout_manager.analyze_layout(**manager_args)
            log.info(f" Layout Manager returned {len(detections)} detections.")
        except Exception as e:
            log.error(f" Layout analysis failed: {e}", exc_info=True)
            return []

        # --- Process Detections (Convert to Regions, Scale Coords) ---
        # Scale factors convert image pixel coordinates back to PDF coordinates.
        scale_x = page.width / page_image.width
        scale_y = page.height / page_image.height
        log.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")

        layout_regions = []
        docling_id_to_region = {}  # For hierarchy if using Docling

        for detection in detections:
            try:
                x_min, y_min, x_max, y_max = detection['bbox']

                # Create a Region object in PDF space
                region = Region(page, (x_min * scale_x, y_min * scale_y, x_max * scale_x, y_max * scale_y))
                region.region_type = detection.get('class', 'unknown')  # Original class name
                region.normalized_type = detection.get('normalized_class', 'unknown')  # Hyphenated name
                region.confidence = detection.get('confidence', 0.0)
                region.model = detection.get('model', engine or 'unknown')  # Store model name
                region.source = 'detected'

                # Add extra info if available
                if 'text' in detection:
                    region.text_content = detection['text']
                if 'docling_id' in detection:
                    region.docling_id = detection['docling_id']
                if 'parent_id' in detection:
                    region.parent_id = detection['parent_id']

                layout_regions.append(region)

                # Track Docling IDs for hierarchy
                if hasattr(region, 'docling_id') and region.docling_id:
                    docling_id_to_region[region.docling_id] = region

            except (KeyError, IndexError, TypeError, ValueError) as e:
                log.warning(f"Could not process layout detection: {detection}. Error: {e}")
                continue

        # --- Build Hierarchy (if Docling results detected) ---
        if docling_id_to_region:
            log.debug("Building Docling region hierarchy...")
            for region in layout_regions:
                if not (hasattr(region, 'parent_id') and region.parent_id):
                    continue
                parent_region = docling_id_to_region.get(region.parent_id)
                if not parent_region:
                    continue
                if hasattr(parent_region, 'add_child'):
                    parent_region.add_child(region)
                else:
                    log.warning("Region object missing add_child method for hierarchy.")

        # --- Store Results ---
        log.debug(f"Storing {len(layout_regions)} processed layout regions (mode: {existing}).")
        mode = existing.lower()
        if mode not in ('replace', 'append'):
            log.warning(f"Unknown 'existing' mode {existing!r}; treating it as 'replace'.")
        if mode == 'append':
            page._regions.setdefault('detected', []).extend(layout_regions)
        else:  # Default is 'replace'
            page._regions['detected'] = layout_regions

        # Add regions to the element manager
        # NOTE(review): in 'replace' mode the previously detected regions are not
        # removed from the element manager here -- confirm whether that is intended.
        for region in layout_regions:
            page._element_mgr.add_region(region)

        # Store layout regions in a dedicated attribute for easier access
        page.detected_layout_regions = page._regions['detected']
        log.info(f"Layout analysis complete for page {page.number}.")

        return layout_regions
|