natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,34 +1,42 @@
|
|
1
|
+
import copy
|
1
2
|
import logging
|
2
|
-
from typing import
|
3
|
+
from typing import Any, Dict, List, Optional, Union
|
4
|
+
|
3
5
|
from PIL import Image
|
4
|
-
import copy
|
5
6
|
|
6
|
-
from natural_pdf.elements.region import Region
|
7
7
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
8
|
-
from natural_pdf.analyzers.layout.layout_options import
|
8
|
+
from natural_pdf.analyzers.layout.layout_options import (
|
9
|
+
BaseLayoutOptions,
|
10
|
+
LayoutOptions,
|
11
|
+
TATRLayoutOptions,
|
12
|
+
)
|
13
|
+
from natural_pdf.elements.region import Region
|
9
14
|
|
10
15
|
logger = logging.getLogger(__name__)
|
11
16
|
|
17
|
+
|
12
18
|
class LayoutAnalyzer:
|
13
19
|
"""
|
14
20
|
Handles layout analysis for PDF pages, including image rendering,
|
15
21
|
coordinate scaling, region creation, and result storage.
|
16
22
|
"""
|
17
|
-
|
23
|
+
|
18
24
|
def __init__(self, page, layout_manager: Optional[LayoutManager] = None):
|
19
25
|
"""
|
20
26
|
Initialize the layout analyzer.
|
21
|
-
|
27
|
+
|
22
28
|
Args:
|
23
29
|
page: The Page object to analyze
|
24
30
|
layout_manager: Optional LayoutManager instance. If None, will try to get from page's parent.
|
25
31
|
"""
|
26
32
|
self._page = page
|
27
|
-
self._layout_manager = layout_manager or getattr(page._parent,
|
28
|
-
|
33
|
+
self._layout_manager = layout_manager or getattr(page._parent, "_layout_manager", None)
|
34
|
+
|
29
35
|
if not self._layout_manager:
|
30
|
-
logger.warning(
|
31
|
-
|
36
|
+
logger.warning(
|
37
|
+
f"LayoutManager not available for page {page.number}. Layout analysis will fail."
|
38
|
+
)
|
39
|
+
|
32
40
|
def analyze_layout(
|
33
41
|
self,
|
34
42
|
engine: Optional[str] = None,
|
@@ -38,14 +46,14 @@ class LayoutAnalyzer:
|
|
38
46
|
exclude_classes: Optional[List[str]] = None,
|
39
47
|
device: Optional[str] = None,
|
40
48
|
existing: str = "replace",
|
41
|
-
**kwargs
|
49
|
+
**kwargs,
|
42
50
|
) -> List[Region]:
|
43
51
|
"""
|
44
52
|
Analyze the page layout using the configured LayoutManager.
|
45
|
-
|
46
|
-
This method constructs the final options object, including internal context,
|
53
|
+
|
54
|
+
This method constructs the final options object, including internal context,
|
47
55
|
and passes it to the LayoutManager.
|
48
|
-
|
56
|
+
|
49
57
|
Args:
|
50
58
|
engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None and no options object given.
|
51
59
|
options: Specific LayoutOptions object for advanced configuration. If provided, simple args (confidence, etc.) are ignored.
|
@@ -60,122 +68,149 @@ class LayoutAnalyzer:
|
|
60
68
|
List of created Region objects.
|
61
69
|
"""
|
62
70
|
if not self._layout_manager:
|
63
|
-
logger.error(
|
71
|
+
logger.error(
|
72
|
+
f"Page {self._page.number}: LayoutManager not available. Cannot analyze layout."
|
73
|
+
)
|
64
74
|
return []
|
65
75
|
|
66
|
-
logger.info(
|
76
|
+
logger.info(
|
77
|
+
f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options provided: {options is not None})..."
|
78
|
+
)
|
67
79
|
|
68
80
|
# --- Render Page Image (Standard Resolution) ---
|
69
|
-
logger.debug(
|
81
|
+
logger.debug(
|
82
|
+
f" Rendering page {self._page.number} to image for initial layout detection..."
|
83
|
+
)
|
70
84
|
try:
|
71
|
-
layout_scale = getattr(self._page._parent,
|
85
|
+
layout_scale = getattr(self._page._parent, "_config", {}).get("layout_image_scale", 1.5)
|
72
86
|
layout_resolution = layout_scale * 72
|
73
|
-
std_res_page_image = self._page.to_image(
|
87
|
+
std_res_page_image = self._page.to_image(
|
88
|
+
resolution=layout_resolution, include_highlights=False
|
89
|
+
)
|
74
90
|
if not std_res_page_image:
|
75
91
|
raise ValueError("Initial page rendering returned None")
|
76
|
-
logger.debug(
|
92
|
+
logger.debug(
|
93
|
+
f" Initial rendered image size: {std_res_page_image.width}x{std_res_page_image.height}"
|
94
|
+
)
|
77
95
|
except Exception as e:
|
78
96
|
logger.error(f" Failed to render initial page image: {e}", exc_info=True)
|
79
97
|
return []
|
80
|
-
|
98
|
+
|
81
99
|
# --- Calculate Scaling Factors (Standard Res Image <-> PDF) ---
|
82
100
|
if std_res_page_image.width == 0 or std_res_page_image.height == 0:
|
83
|
-
logger.error(
|
101
|
+
logger.error(
|
102
|
+
f"Page {self._page.number}: Invalid initial rendered image dimensions. Cannot scale results."
|
103
|
+
)
|
84
104
|
return []
|
85
105
|
img_scale_x = self._page.width / std_res_page_image.width
|
86
106
|
img_scale_y = self._page.height / std_res_page_image.height
|
87
107
|
logger.debug(f" StdRes Image -> PDF Scaling: x={img_scale_x:.4f}, y={img_scale_y:.4f}")
|
88
108
|
|
89
|
-
# --- Construct Final Options Object ---
|
109
|
+
# --- Construct Final Options Object ---
|
90
110
|
final_options: BaseLayoutOptions
|
91
|
-
|
111
|
+
|
92
112
|
if options is not None:
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
113
|
+
# User provided a complete options object, use it directly
|
114
|
+
logger.debug("Using user-provided options object.")
|
115
|
+
final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
|
116
|
+
if kwargs:
|
117
|
+
logger.warning(
|
118
|
+
f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided."
|
119
|
+
)
|
120
|
+
# Infer engine from options type if engine arg wasn't provided
|
121
|
+
if engine is None:
|
122
|
+
for name, registry_entry in self._layout_manager.ENGINE_REGISTRY.items():
|
123
|
+
if isinstance(final_options, registry_entry["options_class"]):
|
124
|
+
engine = name
|
125
|
+
logger.debug(f"Inferred engine '{engine}' from options type.")
|
126
|
+
break
|
127
|
+
if engine is None:
|
128
|
+
logger.warning("Could not infer engine from provided options object.")
|
107
129
|
else:
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
130
|
+
# Construct options from simple args (engine, confidence, classes, etc.)
|
131
|
+
logger.debug("Constructing options from simple arguments.")
|
132
|
+
selected_engine = (
|
133
|
+
engine or self._layout_manager.get_available_engines()[0]
|
134
|
+
) # Use provided or first available
|
135
|
+
engine_lower = selected_engine.lower()
|
136
|
+
registry = self._layout_manager.ENGINE_REGISTRY
|
137
|
+
|
138
|
+
if engine_lower not in registry:
|
139
|
+
raise ValueError(
|
140
|
+
f"Unknown or unavailable engine: '{selected_engine}'. Available: {list(registry.keys())}"
|
141
|
+
)
|
142
|
+
|
143
|
+
options_class = registry[engine_lower]["options_class"]
|
144
|
+
|
145
|
+
# Get base defaults
|
146
|
+
base_defaults = BaseLayoutOptions()
|
147
|
+
|
148
|
+
# Prepare args for constructor, prioritizing explicit args over defaults
|
149
|
+
constructor_args = {
|
150
|
+
"confidence": confidence if confidence is not None else base_defaults.confidence,
|
151
|
+
"classes": classes, # Pass None if not provided
|
152
|
+
"exclude_classes": exclude_classes, # Pass None if not provided
|
153
|
+
"device": device if device is not None else base_defaults.device,
|
154
|
+
"extra_args": kwargs, # Pass other kwargs here
|
155
|
+
}
|
156
|
+
# Remove None values unless they are valid defaults (like classes=None)
|
157
|
+
# We can pass all to the dataclass constructor; it handles defaults
|
158
|
+
|
159
|
+
try:
|
160
|
+
final_options = options_class(**constructor_args)
|
161
|
+
logger.debug(f"Constructed options: {final_options}")
|
162
|
+
except TypeError as e:
|
163
|
+
logger.error(
|
164
|
+
f"Failed to construct options object {options_class.__name__} with args {constructor_args}: {e}"
|
165
|
+
)
|
166
|
+
# Filter kwargs to only include fields defined in the specific options class? Complex.
|
167
|
+
# Re-raise for now, indicates programming error or invalid kwarg.
|
168
|
+
raise e
|
169
|
+
|
170
|
+
# --- Add Internal Context to extra_args (ALWAYS) ---
|
171
|
+
if not hasattr(final_options, "extra_args") or final_options.extra_args is None:
|
172
|
+
final_options.extra_args = {}
|
173
|
+
final_options.extra_args["_page_ref"] = self._page
|
174
|
+
final_options.extra_args["_img_scale_x"] = img_scale_x
|
175
|
+
final_options.extra_args["_img_scale_y"] = img_scale_y
|
176
|
+
logger.debug(
|
177
|
+
f"Added internal context to final_options.extra_args: {final_options.extra_args}"
|
178
|
+
)
|
179
|
+
|
180
|
+
# --- Call Layout Manager with the Final Options ---
|
151
181
|
logger.debug(f"Calling Layout Manager with final options object.")
|
152
182
|
try:
|
153
183
|
# Pass only image and the constructed options object
|
154
184
|
detections = self._layout_manager.analyze_layout(
|
155
|
-
image=std_res_page_image,
|
156
|
-
options=final_options
|
185
|
+
image=std_res_page_image,
|
186
|
+
options=final_options,
|
157
187
|
# No engine, confidence, classes etc. passed here directly
|
158
188
|
)
|
159
189
|
logger.info(f" Layout Manager returned {len(detections)} detections.")
|
190
|
+
# Specifically let errors about unknown/unavailable engines propagate
|
191
|
+
except (ValueError, RuntimeError) as engine_error:
|
192
|
+
logger.error(f"Layout analysis failed: {engine_error}")
|
193
|
+
raise engine_error # Re-raise the specific error
|
160
194
|
except Exception as e:
|
161
|
-
|
162
|
-
|
195
|
+
# Catch other unexpected errors during analysis execution
|
196
|
+
logger.error(f" Layout analysis failed with unexpected error: {e}", exc_info=True)
|
197
|
+
return [] # Return empty list for other runtime errors
|
163
198
|
|
164
199
|
# --- Process Detections (Convert to Regions, Scale Coords from Image to PDF) ---
|
165
200
|
layout_regions = []
|
166
|
-
docling_id_to_region = {}
|
201
|
+
docling_id_to_region = {} # For hierarchy if using Docling
|
167
202
|
|
168
203
|
for detection in detections:
|
169
204
|
try:
|
170
205
|
# bbox is relative to std_res_page_image
|
171
|
-
x_min, y_min, x_max, y_max = detection[
|
206
|
+
x_min, y_min, x_max, y_max = detection["bbox"]
|
172
207
|
|
173
208
|
# Convert coordinates from image to PDF space
|
174
209
|
pdf_x0 = x_min * img_scale_x
|
175
210
|
pdf_y0 = y_min * img_scale_y
|
176
211
|
pdf_x1 = x_max * img_scale_x
|
177
212
|
pdf_y1 = y_max * img_scale_y
|
178
|
-
|
213
|
+
|
179
214
|
# Ensure PDF coords are valid
|
180
215
|
pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
|
181
216
|
pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
|
@@ -186,21 +221,24 @@ class LayoutAnalyzer:
|
|
186
221
|
|
187
222
|
# Create a Region object with PDF coordinates
|
188
223
|
region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
|
189
|
-
region.region_type = detection.get(
|
190
|
-
region.normalized_type = detection.get(
|
191
|
-
region.confidence = detection.get(
|
192
|
-
region.model = detection.get(
|
193
|
-
region.source =
|
194
|
-
|
224
|
+
region.region_type = detection.get("class", "unknown")
|
225
|
+
region.normalized_type = detection.get("normalized_class", "unknown")
|
226
|
+
region.confidence = detection.get("confidence", 0.0)
|
227
|
+
region.model = detection.get("model", engine or "unknown")
|
228
|
+
region.source = "detected"
|
229
|
+
|
195
230
|
# Add extra info if available
|
196
|
-
if
|
197
|
-
|
198
|
-
if
|
231
|
+
if "text" in detection:
|
232
|
+
region.text_content = detection["text"]
|
233
|
+
if "docling_id" in detection:
|
234
|
+
region.docling_id = detection["docling_id"]
|
235
|
+
if "parent_id" in detection:
|
236
|
+
region.parent_id = detection["parent_id"]
|
199
237
|
|
200
238
|
layout_regions.append(region)
|
201
239
|
|
202
240
|
# Track Docling IDs for hierarchy
|
203
|
-
if hasattr(region,
|
241
|
+
if hasattr(region, "docling_id") and region.docling_id:
|
204
242
|
docling_id_to_region[region.docling_id] = region
|
205
243
|
|
206
244
|
except (KeyError, IndexError, TypeError, ValueError) as e:
|
@@ -211,10 +249,10 @@ class LayoutAnalyzer:
|
|
211
249
|
if docling_id_to_region:
|
212
250
|
logger.debug("Building Docling region hierarchy...")
|
213
251
|
for region in layout_regions:
|
214
|
-
if hasattr(region,
|
252
|
+
if hasattr(region, "parent_id") and region.parent_id:
|
215
253
|
parent_region = docling_id_to_region.get(region.parent_id)
|
216
254
|
if parent_region:
|
217
|
-
if hasattr(parent_region,
|
255
|
+
if hasattr(parent_region, "add_child"):
|
218
256
|
parent_region.add_child(region)
|
219
257
|
else:
|
220
258
|
logger.warning("Region object missing add_child method for hierarchy.")
|
@@ -222,34 +260,39 @@ class LayoutAnalyzer:
|
|
222
260
|
# --- Store Results ---
|
223
261
|
logger.debug(f"Storing {len(layout_regions)} processed layout regions (mode: {existing}).")
|
224
262
|
# Handle existing regions based on mode
|
225
|
-
if existing.lower() ==
|
226
|
-
if
|
227
|
-
|
228
|
-
|
229
|
-
|
263
|
+
if existing.lower() == "append":
|
264
|
+
if "detected" not in self._page._regions:
|
265
|
+
self._page._regions["detected"] = []
|
266
|
+
self._page._regions["detected"].extend(layout_regions)
|
267
|
+
else: # Default is 'replace'
|
268
|
+
self._page._regions["detected"] = layout_regions
|
230
269
|
|
231
270
|
# Add regions to the element manager
|
232
271
|
for region in layout_regions:
|
233
272
|
self._page._element_mgr.add_region(region)
|
234
273
|
|
235
274
|
# Store layout regions in a dedicated attribute for easier access
|
236
|
-
self._page.detected_layout_regions = self._page._regions[
|
275
|
+
self._page.detected_layout_regions = self._page._regions["detected"]
|
237
276
|
logger.info(f"Layout analysis complete for page {self._page.number}.")
|
238
|
-
|
277
|
+
|
239
278
|
# --- Auto-create cells if requested by TATR options ---
|
240
279
|
if isinstance(final_options, TATRLayoutOptions) and final_options.create_cells:
|
241
|
-
logger.info(
|
280
|
+
logger.info(
|
281
|
+
f" Option create_cells=True detected for TATR. Attempting cell creation..."
|
282
|
+
)
|
242
283
|
created_cell_count = 0
|
243
284
|
for region in layout_regions:
|
244
285
|
# Only attempt on regions identified as tables by the TATR model
|
245
|
-
if region.model ==
|
286
|
+
if region.model == "tatr" and region.region_type == "table":
|
246
287
|
try:
|
247
288
|
# create_cells now modifies the page elements directly and returns self
|
248
289
|
region.create_cells()
|
249
|
-
# We could potentially count cells created here if needed,
|
290
|
+
# We could potentially count cells created here if needed,
|
250
291
|
# but the method logs its own count.
|
251
292
|
except Exception as cell_error:
|
252
|
-
logger.warning(
|
293
|
+
logger.warning(
|
294
|
+
f" Error calling create_cells for table region {region.bbox}: {cell_error}"
|
295
|
+
)
|
253
296
|
logger.info(f" Finished cell creation process triggered by options.")
|
254
|
-
|
255
|
-
return layout_regions
|
297
|
+
|
298
|
+
return layout_regions
|