natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,55 @@
1
+ """
2
+ Natural PDF - A more intuitive interface for working with PDFs.
3
+ """
4
+ import logging
5
+
6
# Create library logger
logger = logging.getLogger("natural_pdf")

# Add a NullHandler to prevent "No handler found" warnings
# (Best practice for libraries)
logger.addHandler(logging.NullHandler())

# Attribute set on the StreamHandler that configure_logging() creates itself,
# so that a later call can replace it instead of stacking a second one.
_DEFAULT_HANDLER_FLAG = "_natural_pdf_default_handler"


# Utility function for users to easily configure logging
def configure_logging(level=logging.INFO, handler=None):
    """Configure Natural PDF's logging.

    Safe to call repeatedly: the placeholder NullHandler is removed wherever
    it sits in the handler list, and the StreamHandler created by a previous
    default call is replaced rather than duplicated (which would emit every
    record twice).

    Args:
        level: The logging level (e.g., logging.INFO, logging.DEBUG)
        handler: A custom handler, or None to use StreamHandler
    """
    use_default = handler is None

    # Iterate over a copy because removeHandler mutates logger.handlers.
    for attached in list(logger.handlers):
        is_placeholder = isinstance(attached, logging.NullHandler)
        # A previously created default handler is only dropped when a new
        # default is about to take its place; custom handlers are never removed.
        is_stale_default = use_default and getattr(attached, _DEFAULT_HANDLER_FLAG, False)
        if is_placeholder or is_stale_default:
            logger.removeHandler(attached)

    if use_default:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
        setattr(handler, _DEFAULT_HANDLER_FLAG, True)

    # Logger.addHandler ignores a handler that is already attached.
    logger.addHandler(handler)
    logger.setLevel(level)

    # Propagate level to all child loggers
    for name in list(logging.root.manager.loggerDict):
        if name.startswith("natural_pdf."):
            logging.getLogger(name).setLevel(level)
37
+
38
+ from natural_pdf.core.pdf import PDF
39
+ from natural_pdf.core.page import Page
40
+ from natural_pdf.elements.region import Region
41
+ from natural_pdf.elements.collections import ElementCollection
42
+
43
# Question answering is optional: record whether its import succeeded so the
# public export list below can be extended accordingly.
try:
    from natural_pdf.qa import DocumentQA, get_qa_engine
except ImportError:
    HAS_QA = False
else:
    HAS_QA = True

__version__ = "0.1.0"

# Core names are always exported; the QA names only when they were imported.
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
if HAS_QA:
    __all__ += ["DocumentQA", "get_qa_engine"]
@@ -0,0 +1,6 @@
1
+ """
2
+ Analyzers for natural-pdf.
3
+ """
4
+ from .layout import *
5
+ from .text_structure import TextStyleAnalyzer
6
+ from .utils import convert_to_regions
@@ -0,0 +1 @@
1
+ from .base import LayoutDetector
@@ -0,0 +1,151 @@
1
+ # layout_detector_base.py
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List, Any, Optional, Set, Union
5
+ from PIL import Image
6
+
7
+ # Assuming layout_options defines BaseLayoutOptions
8
+ try:
9
+ from .layout_options import BaseLayoutOptions
10
+ except ImportError:
11
+ # Placeholder if run standalone or options not found
12
+ class BaseLayoutOptions: pass
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class LayoutDetector(ABC):
    """
    Abstract Base Class for layout detection engines.

    Subclasses should implement is_available, _load_model_from_options, detect,
    and override _get_cache_key if model loading depends on options beyond device.
    They should also populate the 'supported_classes' set.
    """

    def __init__(self):
        """Initializes the base layout detector."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self.supported_classes: Set[str] = set()  # Subclasses should populate this
        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models

    @abstractmethod
    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
        """
        Detect layout elements in a given PIL Image.

        Args:
            image: PIL Image of the page to analyze.
            options: An instance of a dataclass inheriting from BaseLayoutOptions
                containing configuration for this run.

        Returns:
            List of standardized detection dictionaries with at least:
            - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image size
            - 'class': str - Original class name from the model
            - 'confidence': float - Confidence score (0.0-1.0)
            - 'normalized_class': str - Hyphenated, lowercase class name
            - 'model': str - Name of the model used (e.g., 'yolo', 'tatr')
            - 'source': str - Usually 'layout'
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def is_available(self) -> bool:
        """
        Check if the detector's dependencies are installed and usable.

        Returns:
            True if the detector is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
        """
        Generates a cache key for model loading based on relevant options.
        Subclasses MUST override this to include options that change the loaded model
        (e.g., model path, model name, specific configurations like TATR structure model).

        Args:
            options: The options dataclass instance.

        Returns:
            A string cache key.
        """
        # Base key only includes device, subclasses MUST add model specifics
        device_key = str(options.device).lower()
        return f"{self.__class__.__name__}_{device_key}"

    def _get_model(self, options: BaseLayoutOptions) -> Any:
        """
        Gets or initializes the underlying model based on options, using caching.
        Subclasses must implement _load_model_from_options.

        Args:
            options: The options instance used both to derive the cache key and
                to load the model on a cache miss.

        Returns:
            Whatever object _load_model_from_options returned for this key.

        Raises:
            RuntimeError: If is_available() reports missing dependencies.
            Exception: Any error raised while loading is logged and re-raised.
        """
        cache_key = self._get_cache_key(options)
        if cache_key not in self._model_cache:
            self.logger.info(f"Loading model for cache key: {cache_key}")
            try:
                # Ensure dependencies are met before loading
                if not self.is_available():
                    raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
                self._model_cache[cache_key] = self._load_model_from_options(options)
                self.logger.info(f"Model loaded successfully for key: {cache_key}")
            except Exception as e:
                self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
                # Remove potentially corrupted cache entry
                self._model_cache.pop(cache_key, None)
                raise  # Re-raise exception after logging
        else:
            self.logger.debug(f"Using cached model for key: {cache_key}")
        return self._model_cache[cache_key]

    @abstractmethod
    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
        """
        Abstract method for subclasses to implement the actual model loading logic
        based on the provided options. Should return the loaded model object(s).
        Should handle necessary imports internally.
        """
        raise NotImplementedError("Subclasses must implement _load_model_from_options")

    def _normalize_class_name(self, name: str) -> str:
        """Convert class names with spaces/underscores to hyphenated lowercase format."""
        if not isinstance(name, str):
            name = str(name)  # Ensure string
        return name.lower().replace(' ', '-').replace('_', '-')

    def validate_classes(self, classes: List[str]) -> None:
        """
        Validate that the requested classes are supported by this detector.

        Validation is skipped (with a warning) when the detector has not
        declared any supported classes, and is a no-op for an empty request.

        Args:
            classes: List of class names to validate.

        Raises:
            ValueError: If any class is not supported.
        """
        if not self.supported_classes:
            self.logger.warning("Supported classes not defined for this detector. Skipping class validation.")
            return

        if not classes:
            return

        # Normalize both requested and supported classes for comparison
        normalized_supported = {self._normalize_class_name(c) for c in self.supported_classes}
        normalized_requested = {self._normalize_class_name(c) for c in classes}
        unsupported_normalized = normalized_requested - normalized_supported

        if unsupported_normalized:
            # Find original names of unsupported classes for better error message
            unsupported_original = [
                c for c in classes if self._normalize_class_name(c) in unsupported_normalized
            ]
            raise ValueError(f"Classes not supported by {self.__class__.__name__}: {unsupported_original}. "
                             f"Supported (normalized): {sorted(normalized_supported)}")

    def __del__(self):
        """Cleanup resources.

        A finalizer can run on an instance whose __init__ never completed
        (e.g. a subclass constructor raised before calling super().__init__())
        and during interpreter shutdown, so attributes are looked up
        defensively and logging failures are not allowed to escape.
        """
        log = getattr(self, "logger", None)
        if log is not None:
            try:
                log.info(f"Cleaning up {self.__class__.__name__} resources.")
            except Exception:
                # Deliberate best-effort: the logging machinery may already be
                # torn down at shutdown, and a finalizer cannot usefully raise.
                pass

        # Clear model cache to free up memory/GPU resources if models are large
        cache = getattr(self, "_model_cache", None)
        if cache is not None:
            cache.clear()
151
+
@@ -0,0 +1,247 @@
1
+ # layout_detector_docling.py
2
+ import logging
3
+ import importlib.util
4
+ import os
5
+ import tempfile
6
+ from typing import List, Dict, Any, Optional
7
+ from PIL import Image
8
+
9
+ # Assuming base class and options are importable
10
+ try:
11
+ from .base import LayoutDetector
12
+ from .layout_options import DoclingLayoutOptions, BaseLayoutOptions
13
+ except ImportError:
14
+ # Placeholders if run standalone or imports fail
15
+ class BaseLayoutOptions: pass
16
+ class DoclingLayoutOptions(BaseLayoutOptions): pass
17
+ class LayoutDetector:
18
+ def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
19
+ def _get_model(self, options): raise NotImplementedError
20
+ def _normalize_class_name(self, n): return n
21
+ def validate_classes(self, c): pass
22
+ logging.basicConfig()
23
+
24
logger = logging.getLogger(__name__)

# Probe for the optional docling package. DocumentConverter stays None unless
# the import below succeeds; the detector's is_available() relies on that.
DocumentConverter = None
docling_spec = importlib.util.find_spec("docling")
if not docling_spec:
    logger.warning("docling not found. DoclingLayoutDetector will not be available.")
else:
    try:
        from docling.document_converter import DocumentConverter
    except ImportError as import_error:
        logger.warning(f"Could not import Docling dependencies: {import_error}")
36
+
37
+
38
class DoclingLayoutDetector(LayoutDetector):
    """Document layout and text recognition using Docling."""

    def __init__(self):
        super().__init__()
        # Docling classes are dynamic/hierarchical, define common ones
        self.supported_classes = {
            'Header', 'Footer', 'Paragraph', 'Heading', 'List', 'ListItem',
            'Table', 'Figure', 'Caption', 'Footnote', 'PageNumber', 'Equation',
            'Code', 'Title', 'Author', 'Abstract', 'Section', 'Unknown', 'Metadata'  # Add more as needed
        }
        self._docling_document_cache = {}  # Cache the output doc per image/options if needed
        # Raw document produced by the most recent successful detect() call;
        # get_docling_document() returns it.
        self._docling_document = None

    def is_available(self) -> bool:
        """Check if docling is installed."""
        return DocumentConverter is not None

    def _get_cache_key(self, options: BaseLayoutOptions) -> str:
        """Generate cache key based on device and potentially converter args.

        The extra_args part is built from the sorted repr of each item rather
        than hash(frozenset(...)), so unhashable values (lists, dicts) and a
        missing extra_args no longer raise TypeError/AttributeError.
        """
        if not isinstance(options, DoclingLayoutOptions):
            options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)

        device_key = str(options.device).lower() if options.device else 'default_device'
        # Include extra_args because they are passed to the converter constructor
        extra_args = options.extra_args or {}
        extra_args_key = repr(sorted((str(k), repr(v)) for k, v in extra_args.items()))
        return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"

    def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
        """Load the Docling DocumentConverter.

        Args:
            options: Must be a DoclingLayoutOptions; its extra_args are passed
                as keyword arguments to DocumentConverter.

        Returns:
            The initialized DocumentConverter.

        Raises:
            RuntimeError: If docling is not installed.
            TypeError: If options is not a DoclingLayoutOptions.
        """
        if not self.is_available():
            raise RuntimeError("Docling dependency not installed.")

        if not isinstance(options, DoclingLayoutOptions):
            raise TypeError("Incorrect options type provided for Docling model loading.")

        self.logger.info("Initializing Docling DocumentConverter...")
        try:
            # Copy so the caller's dict is never mutated; tolerate extra_args=None
            converter_args = dict(options.extra_args or {})

            converter = DocumentConverter(**converter_args)
            self.logger.info("Docling DocumentConverter initialized.")
            return converter
        except Exception as e:
            self.logger.error(f"Failed to initialize Docling DocumentConverter: {e}", exc_info=True)
            raise

    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
        """Detect document structure and text using Docling.

        The image is written to a temporary PNG because the converter is
        called with a file path. The resulting document is kept on the
        instance so get_docling_document() can return it.

        Args:
            image: PIL Image of the page to analyze.
            options: DoclingLayoutOptions; other option objects are converted
                with a warning.

        Returns:
            List of detection dictionaries (see _convert_docling_to_detections).

        Raises:
            RuntimeError: If docling is not installed.
            Exception: Any conversion error is logged and re-raised.
        """
        if not self.is_available():
            raise RuntimeError("Docling dependency not installed.")

        if not isinstance(options, DoclingLayoutOptions):
            self.logger.warning("Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults.")
            options = DoclingLayoutOptions(
                confidence=options.confidence, classes=options.classes,
                exclude_classes=options.exclude_classes, device=options.device,
                extra_args=options.extra_args, verbose=options.extra_args.get('verbose', False)
            )

        # Class validation is intentionally skipped: Docling labels are
        # case-sensitive and dynamic, so filtering happens during conversion.

        # Get the cached/loaded converter instance
        converter = self._get_model(options)

        # Forget the previous run so a failure cannot leave a stale document behind.
        self._docling_document = None
        detections = []
        # TemporaryDirectory removes the image file together with the directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
            try:
                self.logger.debug(f"Saving temporary image for Docling detector to: {temp_image_path}")
                image.convert("RGB").save(temp_image_path)  # Ensure RGB

                self.logger.debug("Running Docling conversion...")
                result = converter.convert(temp_image_path)
                docling_doc = result.document
                self._docling_document = docling_doc
                self.logger.info("Docling conversion complete.")

                # Convert Docling document to our detection format
                detections = self._convert_docling_to_detections(docling_doc, options)

            except Exception as e:
                self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
                raise  # Re-raise the exception

        self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
        return detections

    def _convert_docling_to_detections(self, doc, options: DoclingLayoutOptions) -> List[Dict[str, Any]]:
        """Convert a Docling document to our standard detection format.

        Elements are taken from doc.texts, doc.tables and doc.pictures (when
        present). Elements without provenance, bbox or page size are skipped,
        as are those filtered out by class or confidence. An element that
        fails to convert is logged and skipped.

        Args:
            doc: The document object returned by the converter.
            options: Supplies classes, exclude_classes and confidence filters.

        Returns:
            List of detection dictionaries with 'bbox', 'class',
            'normalized_class', 'confidence', 'text', 'docling_id',
            'parent_id', 'page_number', 'source' and 'model' keys.
        """
        if not doc or not hasattr(doc, 'pages') or not doc.pages:
            self.logger.warning("Invalid or empty Docling document for conversion.")
            return []

        detections = []

        # Prepare normalized class filters once
        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()

        # Gather the element lists this converter knows about
        elements_to_process = []
        for list_name in ('texts', 'tables', 'pictures'):
            if hasattr(doc, list_name):
                elements_to_process.extend(getattr(doc, list_name))

        self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")

        for elem in elements_to_process:
            try:
                # Get Provenance (bbox and page number)
                if not hasattr(elem, 'prov') or not elem.prov:
                    continue
                prov = elem.prov[0]  # Use first provenance
                if not hasattr(prov, 'bbox') or not prov.bbox:
                    continue
                bbox = prov.bbox
                page_no = prov.page_no

                # Page height is required for the vertical flip below
                page = doc.pages.get(page_no)
                if not hasattr(page, 'size'):
                    continue
                page_height = page.size.height

                # Flip from a bottom-left origin (l, b, r, t) to a top-left origin.
                # NOTE(review): this assumes the bbox is bottom-left based; confirm
                # against bbox.coord_origin for the installed docling version.
                x0 = float(bbox.l)
                x1 = float(bbox.r)
                y0 = float(page_height - bbox.t)  # Top y
                y1 = float(page_height - bbox.b)  # Bottom y

                # Ensure y0 < y1 and x0 < x1
                if y0 > y1:
                    y0, y1 = y1, y0
                if x0 > x1:
                    x0, x1 = x1, x0

                # Get Class Label
                label_orig = str(getattr(elem, 'label', 'Unknown'))  # Default if no label
                normalized_label = self._normalize_class_name(label_orig)

                # Apply Class Filtering
                if normalized_classes_req and normalized_label not in normalized_classes_req:
                    continue
                if normalized_label in normalized_classes_excl:
                    continue

                # Elements without a confidence attribute get a default of 0.95
                confidence = getattr(elem, 'confidence', 0.95)
                if confidence < options.confidence:
                    continue  # Apply confidence threshold

                # Get Text Content
                text_content = getattr(elem, 'text', None)

                # Get IDs for hierarchy
                docling_id = getattr(elem, 'self_ref', None)
                parent_ref = getattr(elem, 'parent', None)
                parent_id = None
                if parent_ref:
                    # A parent may be a full item (self_ref) or a reference object.
                    # NOTE(review): docling-core reference items appear to expose the
                    # target as `cref`; confirm against the installed version.
                    parent_id = getattr(parent_ref, 'self_ref', None) or getattr(parent_ref, 'cref', None)

                # Create Detection Dictionary
                detections.append({
                    'bbox': (x0, y0, x1, y1),
                    'class': label_orig,
                    'normalized_class': normalized_label,
                    'confidence': confidence,
                    'text': text_content,
                    'docling_id': docling_id,
                    'parent_id': parent_id,
                    'page_number': page_no,  # Add page number if useful
                    'source': 'layout',
                    'model': 'docling'
                })

            except Exception as conv_e:
                self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
                continue

        return detections

    def get_docling_document(self, image: Image.Image, options: BaseLayoutOptions):
        """
        Get the raw DoclingDocument object after running detection.

        Detection is always re-run for the given image/options (documents are
        not cached per image), and the document stored by that run is returned.
        """
        self.logger.warning("get_docling_document: Re-running detection to ensure document is generated.")
        self.detect(image, options)  # Populates self._docling_document
        return self._docling_document
247
+
@@ -0,0 +1,166 @@
1
+ import logging
2
+ from typing import List, Dict, Any, Optional, Union
3
+ from PIL import Image
4
+
5
+ from natural_pdf.elements.region import Region
6
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
7
+ from natural_pdf.analyzers.layout.layout_options import LayoutOptions
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class LayoutAnalyzer:
    """
    Runs layout detection for one page: renders the page to an image, hands it
    to the LayoutManager, scales the detections back to PDF coordinates,
    wraps them in Region objects and stores them on the page.
    """

    def __init__(self, page, layout_manager: Optional[LayoutManager] = None):
        """
        Bind the analyzer to a page and a layout manager.

        Args:
            page: The Page object to analyze
            layout_manager: Optional LayoutManager instance. If None, the
                page parent's `_layout_manager` attribute is used when present.
        """
        self._page = page
        manager = layout_manager
        if not manager:
            manager = getattr(page._parent, '_layout_manager', None)
        self._layout_manager = manager

        if not self._layout_manager:
            logger.warning(f"LayoutManager not available for page {page.number}. Layout analysis will fail.")

    def analyze_layout(
        self,
        engine: Optional[str] = None,
        options: Optional[LayoutOptions] = None,
        confidence: Optional[float] = None,
        classes: Optional[List[str]] = None,
        exclude_classes: Optional[List[str]] = None,
        device: Optional[str] = None,
        existing: str = "replace"
    ) -> List[Region]:
        """
        Detect layout regions on the page and store them.

        Args:
            engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
            options: Specific LayoutOptions object for advanced configuration.
            confidence: Minimum confidence threshold (simple mode).
            classes: Specific classes to detect (simple mode).
            exclude_classes: Classes to exclude (simple mode).
            device: Device for inference (simple mode).
            existing: 'append' extends the page's detected regions; any other
                value replaces them.

        Returns:
            The Region objects created by this call; an empty list when the
            manager is missing, rendering fails, detection fails or the
            rendered image has a zero dimension.
        """
        page = self._page
        manager = self._layout_manager

        if not manager:
            logger.error(f"Page {page.number}: LayoutManager not available. Cannot analyze layout.")
            return []

        logger.info(f"Page {page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")

        # Step 1: render the page without highlights so they cannot disturb detection.
        logger.debug(f" Rendering page {page.number} to image for layout analysis...")
        try:
            parent_config = getattr(page._parent, '_config', {})
            layout_scale = parent_config.get('layout_image_scale', 1.5)  # ~108 DPI default
            layout_resolution = layout_scale * 72
            page_image = page.to_image(resolution=layout_resolution, include_highlights=False)
            logger.debug(f" Rendered image size: {page_image.width}x{page_image.height}")
        except Exception as render_error:
            logger.error(f" Failed to render page {page.number} to image: {render_error}", exc_info=True)
            return []

        # Step 2: assemble manager arguments; simple-mode values are only
        # forwarded when the caller actually supplied them.
        manager_args = {'image': page_image, 'options': options, 'engine': engine}
        simple_mode = {
            'confidence': confidence,
            'classes': classes,
            'exclude_classes': exclude_classes,
            'device': device,
        }
        manager_args.update({key: value for key, value in simple_mode.items() if value is not None})

        # Step 3: run detection.
        logger.debug(f" Calling Layout Manager...")
        try:
            detections = manager.analyze_layout(**manager_args)
            logger.info(f" Layout Manager returned {len(detections)} detections.")
        except Exception as detect_error:
            logger.error(f" Layout analysis failed: {detect_error}", exc_info=True)
            return []

        # Step 4: image -> PDF scale factors.
        if page_image.width == 0 or page_image.height == 0:
            logger.error(f"Page {page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
            return []
        scale_x = page.width / page_image.width
        scale_y = page.height / page_image.height
        logger.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")

        layout_regions = []
        docling_id_to_region = {}  # docling_id -> Region, used for hierarchy

        # Step 5: turn each detection into a Region in PDF coordinates.
        for detection in detections:
            try:
                x_min, y_min, x_max, y_max = detection['bbox']
                pdf_bbox = (x_min * scale_x, y_min * scale_y, x_max * scale_x, y_max * scale_y)

                region = Region(page, pdf_bbox)
                region.region_type = detection.get('class', 'unknown')  # Original class name
                region.normalized_type = detection.get('normalized_class', 'unknown')  # Hyphenated name
                region.confidence = detection.get('confidence', 0.0)
                region.model = detection.get('model', engine or 'unknown')  # Store model name
                region.source = 'detected'

                # Optional extras are copied only when the detection carries them.
                if 'text' in detection:
                    region.text_content = detection['text']
                if 'docling_id' in detection:
                    region.docling_id = detection['docling_id']
                if 'parent_id' in detection:
                    region.parent_id = detection['parent_id']

                layout_regions.append(region)

                region_docling_id = getattr(region, 'docling_id', None)
                if region_docling_id:
                    docling_id_to_region[region_docling_id] = region

            except (KeyError, IndexError, TypeError, ValueError) as e:
                logger.warning(f"Could not process layout detection: {detection}. Error: {e}")
                continue

        # Step 6: link children to parents when Docling IDs were seen.
        if docling_id_to_region:
            logger.debug("Building Docling region hierarchy...")
            for region in layout_regions:
                parent_id = getattr(region, 'parent_id', None)
                if not parent_id:
                    continue
                parent_region = docling_id_to_region.get(parent_id)
                if not parent_region:
                    continue
                if hasattr(parent_region, 'add_child'):
                    parent_region.add_child(region)
                else:
                    logger.warning("Region object missing add_child method for hierarchy.")

        # Step 7: store results on the page.
        logger.debug(f"Storing {len(layout_regions)} processed layout regions (mode: {existing}).")
        if existing.lower() == 'append':
            if 'detected' not in page._regions:
                page._regions['detected'] = []
            page._regions['detected'].extend(layout_regions)
        else:  # Default is 'replace'
            page._regions['detected'] = layout_regions

        # Register every new region with the element manager
        for region in layout_regions:
            page._element_mgr.add_region(region)

        # Expose the stored list through a dedicated attribute as well
        page.detected_layout_regions = page._regions['detected']
        logger.info(f"Layout analysis complete for page {page.number}.")

        return layout_regions