natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ import os
|
|
5
5
|
import tempfile
|
6
6
|
from typing import Any, Dict, List, Optional
|
7
7
|
|
8
|
+
import numpy as np
|
8
9
|
from PIL import Image
|
9
10
|
|
10
11
|
# Assuming base class and options are importable
|
@@ -40,12 +41,11 @@ logger = logging.getLogger(__name__)
|
|
40
41
|
# Check for dependencies
|
41
42
|
paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
|
42
43
|
paddleocr_spec = importlib.util.find_spec("paddleocr")
|
43
|
-
|
44
|
-
PaddleOCR = None # For optional text detection
|
44
|
+
PPStructureV3 = None
|
45
45
|
|
46
46
|
if paddle_spec and paddleocr_spec:
|
47
47
|
try:
|
48
|
-
from paddleocr import
|
48
|
+
from paddleocr import PPStructureV3
|
49
49
|
except ImportError as e:
|
50
50
|
logger.warning(f"Could not import Paddle dependencies: {e}")
|
51
51
|
else:
|
@@ -53,85 +53,159 @@ else:
|
|
53
53
|
"paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
|
54
54
|
)
|
55
55
|
|
56
|
+
from .table_structure_utils import group_cells_into_rows_and_columns
|
56
57
|
|
57
58
|
class PaddleLayoutDetector(LayoutDetector):
|
58
|
-
"""Document layout and table structure detector using PaddlePaddle's PP-
|
59
|
+
"""Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
|
59
60
|
|
60
61
|
def __init__(self):
|
61
62
|
super().__init__()
|
62
|
-
# Supported classes by PP-
|
63
|
+
# Supported classes by PP-StructureV3 (based on docs and common usage)
|
63
64
|
self.supported_classes = {
|
64
65
|
"text",
|
65
66
|
"title",
|
66
67
|
"figure",
|
67
|
-
"figure_caption",
|
68
68
|
"table",
|
69
|
-
"table_caption",
|
70
|
-
"table_cell", # Added table_cell
|
71
69
|
"header",
|
72
70
|
"footer",
|
73
71
|
"reference",
|
74
72
|
"equation",
|
75
|
-
#
|
73
|
+
# New labels from V3
|
74
|
+
"image",
|
75
|
+
"paragraph_title",
|
76
|
+
"doc_title",
|
77
|
+
"figure_title",
|
78
|
+
"table_cell",
|
76
79
|
}
|
77
80
|
# Models are loaded via _get_model
|
78
81
|
|
79
82
|
def is_available(self) -> bool:
|
80
83
|
"""Check if dependencies are installed."""
|
81
|
-
return
|
84
|
+
return PPStructureV3 is not None
|
82
85
|
|
83
86
|
def _get_cache_key(self, options: BaseLayoutOptions) -> str:
|
84
|
-
"""Generate cache key based on
|
87
|
+
"""Generate cache key based on model configuration."""
|
85
88
|
if not isinstance(options, PaddleLayoutOptions):
|
86
|
-
options = PaddleLayoutOptions(device=options.device)
|
89
|
+
options = PaddleLayoutOptions(device=options.device)
|
87
90
|
|
88
91
|
device_key = str(options.device).lower() if options.device else "default_device"
|
89
92
|
lang_key = options.lang
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
+
table_key = str(options.use_table_recognition)
|
94
|
+
orientation_key = str(options.use_textline_orientation)
|
95
|
+
|
96
|
+
return f"{self.__class__.__name__}_{device_key}_{lang_key}_{table_key}_{orientation_key}"
|
93
97
|
|
94
98
|
def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
|
95
|
-
"""Load the
|
99
|
+
"""Load the PPStructureV3 model based on options."""
|
96
100
|
if not self.is_available():
|
97
101
|
raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
|
98
102
|
|
99
103
|
if not isinstance(options, PaddleLayoutOptions):
|
100
104
|
raise TypeError("Incorrect options type provided for Paddle model loading.")
|
101
105
|
|
102
|
-
self.logger.info(
|
103
|
-
|
104
|
-
)
|
106
|
+
self.logger.info(f"Loading PP-StructureV3 model with options: {options}")
|
107
|
+
|
108
|
+
# List of valid PPStructureV3 constructor arguments (from official docs)
|
109
|
+
valid_init_args = {
|
110
|
+
"layout_detection_model_name",
|
111
|
+
"layout_detection_model_dir",
|
112
|
+
"layout_threshold",
|
113
|
+
"layout_nms",
|
114
|
+
"layout_unclip_ratio",
|
115
|
+
"layout_merge_bboxes_mode",
|
116
|
+
"chart_recognition_model_name",
|
117
|
+
"chart_recognition_model_dir",
|
118
|
+
"chart_recognition_batch_size",
|
119
|
+
"region_detection_model_name",
|
120
|
+
"region_detection_model_dir",
|
121
|
+
"doc_orientation_classify_model_name",
|
122
|
+
"doc_orientation_classify_model_dir",
|
123
|
+
"doc_unwarping_model_name",
|
124
|
+
"doc_unwarping_model_dir",
|
125
|
+
"text_detection_model_name",
|
126
|
+
"text_detection_model_dir",
|
127
|
+
"text_det_limit_side_len",
|
128
|
+
"text_det_limit_type",
|
129
|
+
"text_det_thresh",
|
130
|
+
"text_det_box_thresh",
|
131
|
+
"text_det_unclip_ratio",
|
132
|
+
"textline_orientation_model_name",
|
133
|
+
"textline_orientation_model_dir",
|
134
|
+
"textline_orientation_batch_size",
|
135
|
+
"text_recognition_model_name",
|
136
|
+
"text_recognition_model_dir",
|
137
|
+
"text_recognition_batch_size",
|
138
|
+
"text_rec_score_thresh",
|
139
|
+
"table_classification_model_name",
|
140
|
+
"table_classification_model_dir",
|
141
|
+
"wired_table_structure_recognition_model_name",
|
142
|
+
"wired_table_structure_recognition_model_dir",
|
143
|
+
"wireless_table_structure_recognition_model_name",
|
144
|
+
"wireless_table_structure_recognition_model_dir",
|
145
|
+
"wired_table_cells_detection_model_name",
|
146
|
+
"wired_table_cells_detection_model_dir",
|
147
|
+
"wireless_table_cells_detection_model_name",
|
148
|
+
"wireless_table_cells_detection_model_dir",
|
149
|
+
"seal_text_detection_model_name",
|
150
|
+
"seal_text_detection_model_dir",
|
151
|
+
"seal_det_limit_side_len",
|
152
|
+
"seal_det_limit_type",
|
153
|
+
"seal_det_thresh",
|
154
|
+
"seal_det_box_thresh",
|
155
|
+
"seal_det_unclip_ratio",
|
156
|
+
"seal_text_recognition_model_name",
|
157
|
+
"seal_text_recognition_model_dir",
|
158
|
+
"seal_text_recognition_batch_size",
|
159
|
+
"seal_rec_score_thresh",
|
160
|
+
"formula_recognition_model_name",
|
161
|
+
"formula_recognition_model_dir",
|
162
|
+
"formula_recognition_batch_size",
|
163
|
+
"use_doc_orientation_classify",
|
164
|
+
"use_doc_unwarping",
|
165
|
+
"use_textline_orientation",
|
166
|
+
"use_seal_recognition",
|
167
|
+
"use_table_recognition",
|
168
|
+
"use_formula_recognition",
|
169
|
+
"use_chart_recognition",
|
170
|
+
"use_region_detection",
|
171
|
+
"device",
|
172
|
+
"enable_hpi",
|
173
|
+
"use_tensorrt",
|
174
|
+
"precision",
|
175
|
+
"enable_mkldnn",
|
176
|
+
"cpu_threads",
|
177
|
+
"paddlex_config",
|
178
|
+
}
|
179
|
+
|
180
|
+
# Build init_args from dataclass fields and filtered extra_args
|
181
|
+
init_args = {}
|
182
|
+
# Add all dataclass fields that are in the valid set and not None
|
183
|
+
for field_name in options.__dataclass_fields__:
|
184
|
+
if field_name in valid_init_args:
|
185
|
+
value = getattr(options, field_name)
|
186
|
+
if value is not None:
|
187
|
+
init_args[field_name] = value
|
188
|
+
# Add filtered extra_args (not starting with '_' and in valid set)
|
189
|
+
filtered_extra_args = {
|
190
|
+
k: v for k, v in options.extra_args.items()
|
191
|
+
if not k.startswith('_') and k in valid_init_args
|
192
|
+
}
|
193
|
+
init_args.update(filtered_extra_args)
|
194
|
+
|
195
|
+
# Special handling for English model selection
|
196
|
+
if getattr(options, "lang", None) == "en":
|
197
|
+
init_args["text_recognition_model_name"] = "en_PP-OCRv4_mobile_rec"
|
198
|
+
|
105
199
|
try:
|
106
|
-
|
107
|
-
|
108
|
-
# Note: show_log is a runtime arg, not needed for instance caching key.
|
109
|
-
# Note: `layout=False` disables layout analysis, which we definitely want here.
|
110
|
-
# Note: `ocr=False` might disable text detection needed for table structure? Check PPStructure docs.
|
111
|
-
# It seems best to initialize with core settings and pass others during the call if possible.
|
112
|
-
# However, PPStructure call signature is simple (__call__(self, img, ...))
|
113
|
-
# So, we likely need to initialize with most settings.
|
114
|
-
model_instance = PPStructure(
|
115
|
-
lang=options.lang,
|
116
|
-
use_gpu=(
|
117
|
-
"cuda" in str(options.device).lower() or "gpu" in str(options.device).lower()
|
118
|
-
),
|
119
|
-
use_angle_cls=options.use_angle_cls,
|
120
|
-
show_log=options.show_log,
|
121
|
-
layout=True, # Ensure layout analysis is on
|
122
|
-
table=options.enable_table, # Control table analysis
|
123
|
-
ocr=False, # Usually disable internal OCR if only using for layout/table
|
124
|
-
# Add other PPStructure init args from options.extra_args if needed
|
125
|
-
# **options.extra_args
|
126
|
-
)
|
127
|
-
self.logger.info("PPStructure model loaded.")
|
200
|
+
model_instance = PPStructureV3(**init_args)
|
201
|
+
self.logger.info("PP-StructureV3 model loaded.")
|
128
202
|
return model_instance
|
129
203
|
except Exception as e:
|
130
|
-
self.logger.error(f"Failed to load
|
204
|
+
self.logger.error(f"Failed to load PP-StructureV3 model: {e}", exc_info=True)
|
131
205
|
raise
|
132
206
|
|
133
207
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
134
|
-
"""Detect layout elements in an image using
|
208
|
+
"""Detect layout elements in an image using PP-StructureV3."""
|
135
209
|
if not self.is_available():
|
136
210
|
raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
|
137
211
|
|
@@ -145,52 +219,41 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
145
219
|
exclude_classes=options.exclude_classes,
|
146
220
|
device=options.device,
|
147
221
|
extra_args=options.extra_args,
|
148
|
-
# Other Paddle options will use defaults
|
149
222
|
)
|
150
223
|
|
224
|
+
# --- Backward compatibility for renamed options passed via extra_args ---
|
225
|
+
if "use_angle_cls" in options.extra_args:
|
226
|
+
self.logger.warning(
|
227
|
+
"Parameter 'use_angle_cls' is deprecated for Paddle. Use 'use_textline_orientation' instead."
|
228
|
+
)
|
229
|
+
options.use_textline_orientation = options.extra_args.pop("use_angle_cls")
|
230
|
+
if "enable_table" in options.extra_args:
|
231
|
+
self.logger.warning(
|
232
|
+
"Parameter 'enable_table' is deprecated for Paddle. Use 'use_table_recognition' instead."
|
233
|
+
)
|
234
|
+
options.use_table_recognition = options.extra_args.pop("enable_table")
|
235
|
+
|
151
236
|
self.validate_classes(options.classes or [])
|
152
237
|
if options.exclude_classes:
|
153
238
|
self.validate_classes(options.exclude_classes)
|
154
239
|
|
155
|
-
# Get the cached/loaded
|
240
|
+
# Get the cached/loaded PP-StructureV3 instance
|
156
241
|
ppstructure_instance = self._get_model(options)
|
157
242
|
|
158
|
-
#
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
# Process image with PP-Structure instance
|
169
|
-
# The instance was configured during _load_model_from_options
|
170
|
-
self.logger.debug("Running PPStructure analysis...")
|
171
|
-
result = ppstructure_instance(temp_image_path)
|
172
|
-
self.logger.debug(f"PPStructure returned {len(result)} regions.")
|
173
|
-
|
174
|
-
except Exception as e:
|
175
|
-
self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
|
176
|
-
# Clean up temp file before raising or returning
|
177
|
-
if os.path.exists(temp_image_path):
|
178
|
-
try:
|
179
|
-
os.remove(temp_image_path)
|
180
|
-
except OSError as e_rm:
|
181
|
-
self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
|
182
|
-
raise # Re-raise error
|
183
|
-
|
184
|
-
finally:
|
185
|
-
# Ensure cleanup even if analysis worked
|
186
|
-
if os.path.exists(temp_image_path):
|
187
|
-
try:
|
188
|
-
os.remove(temp_image_path)
|
189
|
-
except OSError as e_rm:
|
190
|
-
self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
|
243
|
+
# Convert PIL image to numpy array for prediction
|
244
|
+
img_np = np.array(image.convert("RGB"))
|
245
|
+
self.logger.debug("Running PP-StructureV3 analysis...")
|
246
|
+
try:
|
247
|
+
results = ppstructure_instance.predict(img_np)
|
248
|
+
except Exception as e:
|
249
|
+
self.logger.error(f"Error during PP-StructureV3 analysis: {e}", exc_info=True)
|
250
|
+
raise
|
251
|
+
|
252
|
+
self.logger.debug(f"PP-StructureV3 returned {len(results)} result objects.")
|
191
253
|
|
192
254
|
# --- Process Results ---
|
193
|
-
|
255
|
+
detections = []
|
256
|
+
if not results:
|
194
257
|
self.logger.warning("PaddleLayout returned empty results")
|
195
258
|
return []
|
196
259
|
|
@@ -203,95 +266,185 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
203
266
|
if options.exclude_classes
|
204
267
|
else set()
|
205
268
|
)
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
# PP-Structure results don't always have confidence, use threshold or default
|
224
|
-
confidence_score = region.get("score", 1.0) # Default to 1.0 if missing
|
225
|
-
if confidence_score < options.confidence:
|
226
|
-
continue
|
227
|
-
|
228
|
-
bbox = region.get("bbox")
|
229
|
-
if not bbox or len(bbox) != 4:
|
230
|
-
self.logger.warning(f"Skipping region with invalid bbox: {region}")
|
231
|
-
continue
|
232
|
-
x_min, y_min, x_max, y_max = map(float, bbox)
|
233
|
-
|
234
|
-
# Add detection
|
235
|
-
detection_data = {
|
236
|
-
"bbox": (x_min, y_min, x_max, y_max),
|
237
|
-
"class": region_type_orig, # Keep original case if needed
|
238
|
-
"confidence": confidence_score,
|
239
|
-
"normalized_class": normalized_class,
|
240
|
-
"source": "layout",
|
241
|
-
"model": "paddle",
|
242
|
-
}
|
243
|
-
detections.append(detection_data)
|
244
|
-
|
245
|
-
# --- Process Table Cells (if enabled and present) ---
|
246
|
-
if region_type == "table" and options.enable_table and "res" in region:
|
247
|
-
process_cells = (
|
248
|
-
normalized_classes_req is None or "table-cell" in normalized_classes_req
|
249
|
-
) and ("table-cell" not in normalized_classes_excl)
|
250
|
-
|
251
|
-
if process_cells and isinstance(region["res"], list): # V2 structure
|
252
|
-
for cell in region["res"]:
|
253
|
-
if "box" not in cell or len(cell["box"]) != 4:
|
254
|
-
continue
|
255
|
-
cell_bbox = cell["box"]
|
256
|
-
cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
|
257
|
-
# Add cell detection (confidence often not available per cell)
|
258
|
-
detections.append(
|
259
|
-
{
|
260
|
-
"bbox": (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
|
261
|
-
"class": "table cell", # Standardize name
|
262
|
-
"confidence": confidence_score
|
263
|
-
* 0.95, # Inherit table confidence (slightly reduced)
|
264
|
-
"normalized_class": "table-cell",
|
265
|
-
"text": cell.get("text", ""), # Include text if available
|
266
|
-
"source": "layout",
|
267
|
-
"model": "paddle",
|
268
|
-
}
|
269
|
-
)
|
270
|
-
elif (
|
271
|
-
process_cells
|
272
|
-
and isinstance(region["res"], dict)
|
273
|
-
and "cells" in region["res"]
|
274
|
-
): # Older structure
|
275
|
-
# Handle older 'cells' list if needed (logic from original file)
|
276
|
-
pass # Add logic based on original paddle.txt if supporting older PP-Structure
|
277
|
-
|
278
|
-
except (TypeError, KeyError, IndexError, ValueError) as e:
|
279
|
-
self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
|
269
|
+
|
270
|
+
# Debug counters
|
271
|
+
table_count = 0
|
272
|
+
cell_count = 0
|
273
|
+
row_count = 0
|
274
|
+
col_count = 0
|
275
|
+
matched_table_structures = 0
|
276
|
+
|
277
|
+
# A single image input returns a list with one result object
|
278
|
+
for res in results:
|
279
|
+
# Handle both possible result structures (with or without 'res' key)
|
280
|
+
if isinstance(res, dict) and "res" in res:
|
281
|
+
result_data = res["res"]
|
282
|
+
elif isinstance(res, dict):
|
283
|
+
result_data = res
|
284
|
+
else:
|
285
|
+
self.logger.warning(f"Skipping result with unexpected structure: {res}")
|
280
286
|
continue
|
281
287
|
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
#
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
288
|
+
# --- Process Layout Regions ---
|
289
|
+
layout_res = result_data.get("layout_det_res", {})
|
290
|
+
table_res_list = result_data.get("table_res_list", [])
|
291
|
+
# Build a map of table_region_id to structure info for fast lookup
|
292
|
+
table_structures_by_id = {}
|
293
|
+
for t in table_res_list:
|
294
|
+
if "table_region_id" in t:
|
295
|
+
table_structures_by_id[t["table_region_id"]] = t
|
296
|
+
table_structures = table_res_list or []
|
297
|
+
table_idx = 0 # fallback index if no region_id
|
298
|
+
if table_res_list:
|
299
|
+
self.logger.debug(f"Found {len(table_res_list)} table structure(s) in table_res_list.")
|
300
|
+
|
301
|
+
if not layout_res or "boxes" not in layout_res:
|
302
|
+
self.logger.debug("No layout detection boxes found in result.")
|
303
|
+
else:
|
304
|
+
for region in layout_res["boxes"]:
|
305
|
+
try:
|
306
|
+
region_type_orig = region.get("label", "unknown")
|
307
|
+
region_type = region_type_orig.lower()
|
308
|
+
normalized_class = self._normalize_class_name(region_type)
|
309
|
+
|
310
|
+
# Apply class filtering
|
311
|
+
if (
|
312
|
+
normalized_classes_req
|
313
|
+
and normalized_class not in normalized_classes_req
|
314
|
+
):
|
315
|
+
continue
|
316
|
+
if normalized_class in normalized_classes_excl:
|
317
|
+
continue
|
318
|
+
|
319
|
+
confidence_score = region.get("score", 1.0)
|
320
|
+
if confidence_score < options.confidence:
|
321
|
+
continue
|
322
|
+
|
323
|
+
bbox = region.get("coordinate")
|
324
|
+
if not bbox or len(bbox) != 4:
|
325
|
+
self.logger.warning(
|
326
|
+
f"Skipping region with invalid bbox: {region}"
|
327
|
+
)
|
328
|
+
continue
|
329
|
+
x_min, y_min, x_max, y_max = map(float, bbox)
|
330
|
+
|
331
|
+
detection_data = {
|
332
|
+
"bbox": (x_min, y_min, x_max, y_max),
|
333
|
+
"class": region_type_orig,
|
334
|
+
"confidence": confidence_score,
|
335
|
+
"normalized_class": normalized_class,
|
336
|
+
"source": "layout",
|
337
|
+
"model": "paddle_v3",
|
338
|
+
}
|
339
|
+
|
340
|
+
# --- Table structure parsing ---
|
341
|
+
if normalized_class == "table" and options.create_cells:
|
342
|
+
table_count += 1
|
343
|
+
# Try to match by region_id, else by order
|
344
|
+
table_struct = None
|
345
|
+
region_id = region.get("table_region_id")
|
346
|
+
if region_id is not None and region_id in table_structures_by_id:
|
347
|
+
table_struct = table_structures_by_id[region_id]
|
348
|
+
elif table_idx < len(table_structures):
|
349
|
+
table_struct = table_structures[table_idx]
|
350
|
+
table_idx += 1
|
351
|
+
|
352
|
+
if table_struct:
|
353
|
+
matched_table_structures += 1
|
354
|
+
self.logger.debug(f"Matched table structure for table_region_id {region_id} or index {table_idx-1}.")
|
355
|
+
# Attach structure info as metadata
|
356
|
+
detection_data["metadata"] = {
|
357
|
+
k: v for k, v in table_struct.items() if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
|
358
|
+
}
|
359
|
+
detection_data["html"] = table_struct.get("pred_html")
|
360
|
+
# Add cell regions
|
361
|
+
cell_boxes = []
|
362
|
+
for cell_bbox in table_struct.get("cell_box_list", []):
|
363
|
+
if cell_bbox is None or len(cell_bbox) != 4:
|
364
|
+
continue
|
365
|
+
sx0, sy0, sx1, sy1 = map(float, cell_bbox)
|
366
|
+
cell_boxes.append((sx0, sy0, sx1, sy1))
|
367
|
+
detections.append({
|
368
|
+
"bbox": (sx0, sy0, sx1, sy1),
|
369
|
+
"class": "table_cell",
|
370
|
+
"confidence": confidence_score,
|
371
|
+
"normalized_class": self._normalize_class_name("table_cell"),
|
372
|
+
"source": "layout",
|
373
|
+
"model": "paddle_v3",
|
374
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
375
|
+
})
|
376
|
+
cell_count += 1
|
377
|
+
self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
|
378
|
+
# Add row/col regions if not present in Paddle output
|
379
|
+
if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
|
380
|
+
row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
|
381
|
+
for row_bbox in row_boxes:
|
382
|
+
rx0, ry0, rx1, ry1 = row_bbox
|
383
|
+
detections.append({
|
384
|
+
"bbox": (rx0, ry0, rx1, ry1),
|
385
|
+
"class": "table_row",
|
386
|
+
"confidence": confidence_score,
|
387
|
+
"normalized_class": self._normalize_class_name("table_row"),
|
388
|
+
"source": "layout",
|
389
|
+
"model": "paddle_v3",
|
390
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
391
|
+
})
|
392
|
+
row_count += 1
|
393
|
+
self.logger.debug(f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
|
394
|
+
for col_bbox in col_boxes:
|
395
|
+
cx0, cy0, cx1, cy1 = col_bbox
|
396
|
+
detections.append({
|
397
|
+
"bbox": (cx0, cy0, cx1, cy1),
|
398
|
+
"class": "table_column",
|
399
|
+
"confidence": confidence_score,
|
400
|
+
"normalized_class": self._normalize_class_name("table_column"),
|
401
|
+
"source": "layout",
|
402
|
+
"model": "paddle_v3",
|
403
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
404
|
+
})
|
405
|
+
col_count += 1
|
406
|
+
self.logger.debug(f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
|
407
|
+
else:
|
408
|
+
# Add row regions from Paddle output if present
|
409
|
+
for row_bbox in table_struct.get("row_box_list", []):
|
410
|
+
if row_bbox is None or len(row_bbox) != 4:
|
411
|
+
continue
|
412
|
+
rx0, ry0, rx1, ry1 = map(float, row_bbox)
|
413
|
+
detections.append({
|
414
|
+
"bbox": (rx0, ry0, rx1, ry1),
|
415
|
+
"class": "table_row",
|
416
|
+
"confidence": confidence_score,
|
417
|
+
"normalized_class": self._normalize_class_name("table_row"),
|
418
|
+
"source": "layout",
|
419
|
+
"model": "paddle_v3",
|
420
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
421
|
+
})
|
422
|
+
row_count += 1
|
423
|
+
self.logger.debug(f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
|
424
|
+
# Add column regions from Paddle output if present
|
425
|
+
for col_bbox in table_struct.get("col_box_list", []):
|
426
|
+
if col_bbox is None or len(col_bbox) != 4:
|
427
|
+
continue
|
428
|
+
cx0, cy0, cx1, cy1 = map(float, col_bbox)
|
429
|
+
detections.append({
|
430
|
+
"bbox": (cx0, cy0, cx1, cy1),
|
431
|
+
"class": "table_column",
|
432
|
+
"confidence": confidence_score,
|
433
|
+
"normalized_class": self._normalize_class_name("table_column"),
|
434
|
+
"source": "layout",
|
435
|
+
"model": "paddle_v3",
|
436
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
437
|
+
})
|
438
|
+
col_count += 1
|
439
|
+
self.logger.debug(f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
|
440
|
+
detections.append(detection_data)
|
441
|
+
except (TypeError, KeyError, IndexError, ValueError) as e:
|
442
|
+
self.logger.warning(
|
443
|
+
f"Error processing Paddle region: {region}. Error: {e}"
|
444
|
+
)
|
445
|
+
continue
|
293
446
|
|
294
447
|
self.logger.info(
|
295
|
-
f"PaddleLayout detected {len(detections)} layout elements matching criteria."
|
448
|
+
f"PaddleLayout detected {len(detections)} layout elements matching criteria. Tables: {table_count}, matched structures: {matched_table_structures}, cells: {cell_count}, rows: {row_count}, columns: {col_count}."
|
296
449
|
)
|
297
450
|
return detections
|
@@ -0,0 +1,78 @@
|
|
1
|
+
from typing import List, Optional, Tuple
|
2
|
+
import numpy as np
|
3
|
+
|
4
|
+
def _cluster_indices_by_center(centers: np.ndarray, tol: float) -> List[List[int]]:
    """Greedily cluster item indices by their 1-D center coordinate.

    Items are visited in input order. Each item joins the first existing
    cluster whose current mean center is within ``tol`` of the item's center;
    otherwise it starts a new cluster. Clusters are returned in order of
    first appearance.
    """
    clusters: List[List[int]] = []
    for idx, center in enumerate(centers):
        for members in clusters:
            # Compare against the running mean of the cluster so far.
            if abs(center - np.mean(centers[members])) <= tol:
                members.append(idx)
                break
        else:
            # No existing cluster was close enough.
            clusters.append([idx])
    return clusters


def _enclosing_boxes(
    boxes: np.ndarray, clusters: List[List[int]]
) -> List[Tuple[float, float, float, float]]:
    """Return the (x0, y0, x1, y1) bounding box enclosing each cluster's boxes."""
    return [
        (
            float(np.min(boxes[members, 0])),
            float(np.min(boxes[members, 1])),
            float(np.max(boxes[members, 2])),
            float(np.max(boxes[members, 3])),
        )
        for members in clusters
    ]


def group_cells_into_rows_and_columns(
    cell_boxes: List[Tuple[float, float, float, float]],
    row_tol: Optional[float] = None,
    col_tol: Optional[float] = None,
) -> Tuple[List[Tuple[float, float, float, float]], List[Tuple[float, float, float, float]]]:
    """
    Groups cell bounding boxes into rows and columns using spatial proximity.

    Cells are clustered by their vertical centers (rows) and, independently,
    by their horizontal centers (columns). Each row/column box is the
    bounding box of the cells in its cluster.

    Args:
        cell_boxes: Sequence (or (n, 4) array) of (x0, y0, x1, y1) for each cell.
        row_tol: Vertical tolerance for grouping rows. Defaults to
            ``max(2.0, 10% of the median cell height)``.
        col_tol: Horizontal tolerance for grouping columns. Defaults to
            ``max(2.0, 10% of the median cell width)``.

    Returns:
        (row_boxes, col_boxes): Lists of float bounding boxes for rows and
        columns, each in order of first appearance in ``cell_boxes``.
        Both lists are empty when ``cell_boxes`` is empty or None.
    """
    # len() instead of truthiness so that numpy arrays are accepted as well.
    if cell_boxes is None or len(cell_boxes) == 0:
        return [], []

    boxes = np.asarray(cell_boxes, dtype=float)
    y_centers = (boxes[:, 1] + boxes[:, 3]) / 2
    x_centers = (boxes[:, 0] + boxes[:, 2]) / 2

    # Default tolerances scale with the typical cell size, with a 2.0 floor
    # so that very small cells still tolerate slight misalignment.
    if row_tol is None:
        median_height = float(np.median(boxes[:, 3] - boxes[:, 1]))
        row_tol = max(2.0, 0.1 * median_height)
    if col_tol is None:
        median_width = float(np.median(boxes[:, 2] - boxes[:, 0]))
        col_tol = max(2.0, 0.1 * median_width)

    row_groups = _cluster_indices_by_center(y_centers, row_tol)
    col_groups = _cluster_indices_by_center(x_centers, col_tol)

    return _enclosing_boxes(boxes, row_groups), _enclosing_boxes(boxes, col_groups)
|