natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ import os
5
5
  import tempfile
6
6
  from typing import Any, Dict, List, Optional
7
7
 
8
+ import numpy as np
8
9
  from PIL import Image
9
10
 
10
11
  # Assuming base class and options are importable
@@ -40,12 +41,11 @@ logger = logging.getLogger(__name__)
40
41
  # Check for dependencies
41
42
  paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
42
43
  paddleocr_spec = importlib.util.find_spec("paddleocr")
43
- PPStructure = None
44
- PaddleOCR = None # For optional text detection
44
+ PPStructureV3 = None
45
45
 
46
46
  if paddle_spec and paddleocr_spec:
47
47
  try:
48
- from paddleocr import PaddleOCR, PPStructure
48
+ from paddleocr import PPStructureV3
49
49
  except ImportError as e:
50
50
  logger.warning(f"Could not import Paddle dependencies: {e}")
51
51
  else:
@@ -53,85 +53,159 @@ else:
53
53
  "paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
54
54
  )
55
55
 
56
+ from .table_structure_utils import group_cells_into_rows_and_columns
56
57
 
57
58
  class PaddleLayoutDetector(LayoutDetector):
58
- """Document layout and table structure detector using PaddlePaddle's PP-Structure."""
59
+ """Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
59
60
 
60
61
  def __init__(self):
61
62
  super().__init__()
62
- # Supported classes by PP-Structure (adjust based on model version/capabilities)
63
+ # Supported classes by PP-StructureV3 (based on docs and common usage)
63
64
  self.supported_classes = {
64
65
  "text",
65
66
  "title",
66
67
  "figure",
67
- "figure_caption",
68
68
  "table",
69
- "table_caption",
70
- "table_cell", # Added table_cell
71
69
  "header",
72
70
  "footer",
73
71
  "reference",
74
72
  "equation",
75
- # PP-StructureV2 might add others like list, pub_number etc.
73
+ # New labels from V3
74
+ "image",
75
+ "paragraph_title",
76
+ "doc_title",
77
+ "figure_title",
78
+ "table_cell",
76
79
  }
77
80
  # Models are loaded via _get_model
78
81
 
79
82
  def is_available(self) -> bool:
80
83
  """Check if dependencies are installed."""
81
- return PPStructure is not None and PaddleOCR is not None
84
+ return PPStructureV3 is not None
82
85
 
83
86
  def _get_cache_key(self, options: BaseLayoutOptions) -> str:
84
- """Generate cache key based on language and device."""
87
+ """Generate cache key based on model configuration."""
85
88
  if not isinstance(options, PaddleLayoutOptions):
86
- options = PaddleLayoutOptions(device=options.device) # Use base device
89
+ options = PaddleLayoutOptions(device=options.device)
87
90
 
88
91
  device_key = str(options.device).lower() if options.device else "default_device"
89
92
  lang_key = options.lang
90
- # Key could also include enable_table, use_angle_cls if these affect model loading fundamentally
91
- # For PPStructure, they are primarily runtime flags, so lang/device might suffice for caching the *instance*.
92
- return f"{self.__class__.__name__}_{device_key}_{lang_key}"
93
+ table_key = str(options.use_table_recognition)
94
+ orientation_key = str(options.use_textline_orientation)
95
+
96
+ return f"{self.__class__.__name__}_{device_key}_{lang_key}_{table_key}_{orientation_key}"
93
97
 
94
98
  def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
95
- """Load the PPStructure model based on options."""
99
+ """Load the PPStructureV3 model based on options."""
96
100
  if not self.is_available():
97
101
  raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
98
102
 
99
103
  if not isinstance(options, PaddleLayoutOptions):
100
104
  raise TypeError("Incorrect options type provided for Paddle model loading.")
101
105
 
102
- self.logger.info(
103
- f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})..."
104
- )
106
+ self.logger.info(f"Loading PP-StructureV3 model with options: {options}")
107
+
108
+ # List of valid PPStructureV3 constructor arguments (from official docs)
109
+ valid_init_args = {
110
+ "layout_detection_model_name",
111
+ "layout_detection_model_dir",
112
+ "layout_threshold",
113
+ "layout_nms",
114
+ "layout_unclip_ratio",
115
+ "layout_merge_bboxes_mode",
116
+ "chart_recognition_model_name",
117
+ "chart_recognition_model_dir",
118
+ "chart_recognition_batch_size",
119
+ "region_detection_model_name",
120
+ "region_detection_model_dir",
121
+ "doc_orientation_classify_model_name",
122
+ "doc_orientation_classify_model_dir",
123
+ "doc_unwarping_model_name",
124
+ "doc_unwarping_model_dir",
125
+ "text_detection_model_name",
126
+ "text_detection_model_dir",
127
+ "text_det_limit_side_len",
128
+ "text_det_limit_type",
129
+ "text_det_thresh",
130
+ "text_det_box_thresh",
131
+ "text_det_unclip_ratio",
132
+ "textline_orientation_model_name",
133
+ "textline_orientation_model_dir",
134
+ "textline_orientation_batch_size",
135
+ "text_recognition_model_name",
136
+ "text_recognition_model_dir",
137
+ "text_recognition_batch_size",
138
+ "text_rec_score_thresh",
139
+ "table_classification_model_name",
140
+ "table_classification_model_dir",
141
+ "wired_table_structure_recognition_model_name",
142
+ "wired_table_structure_recognition_model_dir",
143
+ "wireless_table_structure_recognition_model_name",
144
+ "wireless_table_structure_recognition_model_dir",
145
+ "wired_table_cells_detection_model_name",
146
+ "wired_table_cells_detection_model_dir",
147
+ "wireless_table_cells_detection_model_name",
148
+ "wireless_table_cells_detection_model_dir",
149
+ "seal_text_detection_model_name",
150
+ "seal_text_detection_model_dir",
151
+ "seal_det_limit_side_len",
152
+ "seal_det_limit_type",
153
+ "seal_det_thresh",
154
+ "seal_det_box_thresh",
155
+ "seal_det_unclip_ratio",
156
+ "seal_text_recognition_model_name",
157
+ "seal_text_recognition_model_dir",
158
+ "seal_text_recognition_batch_size",
159
+ "seal_rec_score_thresh",
160
+ "formula_recognition_model_name",
161
+ "formula_recognition_model_dir",
162
+ "formula_recognition_batch_size",
163
+ "use_doc_orientation_classify",
164
+ "use_doc_unwarping",
165
+ "use_textline_orientation",
166
+ "use_seal_recognition",
167
+ "use_table_recognition",
168
+ "use_formula_recognition",
169
+ "use_chart_recognition",
170
+ "use_region_detection",
171
+ "device",
172
+ "enable_hpi",
173
+ "use_tensorrt",
174
+ "precision",
175
+ "enable_mkldnn",
176
+ "cpu_threads",
177
+ "paddlex_config",
178
+ }
179
+
180
+ # Build init_args from dataclass fields and filtered extra_args
181
+ init_args = {}
182
+ # Add all dataclass fields that are in the valid set and not None
183
+ for field_name in options.__dataclass_fields__:
184
+ if field_name in valid_init_args:
185
+ value = getattr(options, field_name)
186
+ if value is not None:
187
+ init_args[field_name] = value
188
+ # Add filtered extra_args (not starting with '_' and in valid set)
189
+ filtered_extra_args = {
190
+ k: v for k, v in options.extra_args.items()
191
+ if not k.startswith('_') and k in valid_init_args
192
+ }
193
+ init_args.update(filtered_extra_args)
194
+
195
+ # Special handling for English model selection
196
+ if getattr(options, "lang", None) == "en":
197
+ init_args["text_recognition_model_name"] = "en_PP-OCRv4_mobile_rec"
198
+
105
199
  try:
106
- # PPStructure init takes several arguments that control runtime behavior
107
- # We cache the instance based on lang/device, assuming other flags don't require reloading.
108
- # Note: show_log is a runtime arg, not needed for instance caching key.
109
- # Note: `layout=False` disables layout analysis, which we definitely want here.
110
- # Note: `ocr=False` might disable text detection needed for table structure? Check PPStructure docs.
111
- # It seems best to initialize with core settings and pass others during the call if possible.
112
- # However, PPStructure call signature is simple (__call__(self, img, ...))
113
- # So, we likely need to initialize with most settings.
114
- model_instance = PPStructure(
115
- lang=options.lang,
116
- use_gpu=(
117
- "cuda" in str(options.device).lower() or "gpu" in str(options.device).lower()
118
- ),
119
- use_angle_cls=options.use_angle_cls,
120
- show_log=options.show_log,
121
- layout=True, # Ensure layout analysis is on
122
- table=options.enable_table, # Control table analysis
123
- ocr=False, # Usually disable internal OCR if only using for layout/table
124
- # Add other PPStructure init args from options.extra_args if needed
125
- # **options.extra_args
126
- )
127
- self.logger.info("PPStructure model loaded.")
200
+ model_instance = PPStructureV3(**init_args)
201
+ self.logger.info("PP-StructureV3 model loaded.")
128
202
  return model_instance
129
203
  except Exception as e:
130
- self.logger.error(f"Failed to load PPStructure model: {e}", exc_info=True)
204
+ self.logger.error(f"Failed to load PP-StructureV3 model: {e}", exc_info=True)
131
205
  raise
132
206
 
133
207
  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
134
- """Detect layout elements in an image using PaddlePaddle."""
208
+ """Detect layout elements in an image using PP-StructureV3."""
135
209
  if not self.is_available():
136
210
  raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
137
211
 
@@ -145,52 +219,41 @@ class PaddleLayoutDetector(LayoutDetector):
145
219
  exclude_classes=options.exclude_classes,
146
220
  device=options.device,
147
221
  extra_args=options.extra_args,
148
- # Other Paddle options will use defaults
149
222
  )
150
223
 
224
+ # --- Backward compatibility for renamed options passed via extra_args ---
225
+ if "use_angle_cls" in options.extra_args:
226
+ self.logger.warning(
227
+ "Parameter 'use_angle_cls' is deprecated for Paddle. Use 'use_textline_orientation' instead."
228
+ )
229
+ options.use_textline_orientation = options.extra_args.pop("use_angle_cls")
230
+ if "enable_table" in options.extra_args:
231
+ self.logger.warning(
232
+ "Parameter 'enable_table' is deprecated for Paddle. Use 'use_table_recognition' instead."
233
+ )
234
+ options.use_table_recognition = options.extra_args.pop("enable_table")
235
+
151
236
  self.validate_classes(options.classes or [])
152
237
  if options.exclude_classes:
153
238
  self.validate_classes(options.exclude_classes)
154
239
 
155
- # Get the cached/loaded PPStructure instance
240
+ # Get the cached/loaded PP-StructureV3 instance
156
241
  ppstructure_instance = self._get_model(options)
157
242
 
158
- # PPStructure call requires an image path. Save temp file.
159
- detections = []
160
- with tempfile.TemporaryDirectory() as temp_dir:
161
- temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
162
- try:
163
- self.logger.debug(
164
- f"Saving temporary image for Paddle detector to: {temp_image_path}"
165
- )
166
- image.convert("RGB").save(temp_image_path) # Ensure RGB
167
-
168
- # Process image with PP-Structure instance
169
- # The instance was configured during _load_model_from_options
170
- self.logger.debug("Running PPStructure analysis...")
171
- result = ppstructure_instance(temp_image_path)
172
- self.logger.debug(f"PPStructure returned {len(result)} regions.")
173
-
174
- except Exception as e:
175
- self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
176
- # Clean up temp file before raising or returning
177
- if os.path.exists(temp_image_path):
178
- try:
179
- os.remove(temp_image_path)
180
- except OSError as e_rm:
181
- self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
182
- raise # Re-raise error
183
-
184
- finally:
185
- # Ensure cleanup even if analysis worked
186
- if os.path.exists(temp_image_path):
187
- try:
188
- os.remove(temp_image_path)
189
- except OSError as e_rm:
190
- self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
243
+ # Convert PIL image to numpy array for prediction
244
+ img_np = np.array(image.convert("RGB"))
245
+ self.logger.debug("Running PP-StructureV3 analysis...")
246
+ try:
247
+ results = ppstructure_instance.predict(img_np)
248
+ except Exception as e:
249
+ self.logger.error(f"Error during PP-StructureV3 analysis: {e}", exc_info=True)
250
+ raise
251
+
252
+ self.logger.debug(f"PP-StructureV3 returned {len(results)} result objects.")
191
253
 
192
254
  # --- Process Results ---
193
- if not result:
255
+ detections = []
256
+ if not results:
194
257
  self.logger.warning("PaddleLayout returned empty results")
195
258
  return []
196
259
 
@@ -203,95 +266,185 @@ class PaddleLayoutDetector(LayoutDetector):
203
266
  if options.exclude_classes
204
267
  else set()
205
268
  )
206
-
207
- for region in result:
208
- try:
209
- region_type_orig = region.get("type", "unknown")
210
- # Handle potential list returns for type (seen in some versions)
211
- if isinstance(region_type_orig, list):
212
- region_type_orig = region_type_orig[0] if region_type_orig else "unknown"
213
-
214
- region_type = region_type_orig.lower()
215
- normalized_class = self._normalize_class_name(region_type)
216
-
217
- # Apply class filtering
218
- if normalized_classes_req and normalized_class not in normalized_classes_req:
219
- continue
220
- if normalized_class in normalized_classes_excl:
221
- continue
222
-
223
- # PP-Structure results don't always have confidence, use threshold or default
224
- confidence_score = region.get("score", 1.0) # Default to 1.0 if missing
225
- if confidence_score < options.confidence:
226
- continue
227
-
228
- bbox = region.get("bbox")
229
- if not bbox or len(bbox) != 4:
230
- self.logger.warning(f"Skipping region with invalid bbox: {region}")
231
- continue
232
- x_min, y_min, x_max, y_max = map(float, bbox)
233
-
234
- # Add detection
235
- detection_data = {
236
- "bbox": (x_min, y_min, x_max, y_max),
237
- "class": region_type_orig, # Keep original case if needed
238
- "confidence": confidence_score,
239
- "normalized_class": normalized_class,
240
- "source": "layout",
241
- "model": "paddle",
242
- }
243
- detections.append(detection_data)
244
-
245
- # --- Process Table Cells (if enabled and present) ---
246
- if region_type == "table" and options.enable_table and "res" in region:
247
- process_cells = (
248
- normalized_classes_req is None or "table-cell" in normalized_classes_req
249
- ) and ("table-cell" not in normalized_classes_excl)
250
-
251
- if process_cells and isinstance(region["res"], list): # V2 structure
252
- for cell in region["res"]:
253
- if "box" not in cell or len(cell["box"]) != 4:
254
- continue
255
- cell_bbox = cell["box"]
256
- cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
257
- # Add cell detection (confidence often not available per cell)
258
- detections.append(
259
- {
260
- "bbox": (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
261
- "class": "table cell", # Standardize name
262
- "confidence": confidence_score
263
- * 0.95, # Inherit table confidence (slightly reduced)
264
- "normalized_class": "table-cell",
265
- "text": cell.get("text", ""), # Include text if available
266
- "source": "layout",
267
- "model": "paddle",
268
- }
269
- )
270
- elif (
271
- process_cells
272
- and isinstance(region["res"], dict)
273
- and "cells" in region["res"]
274
- ): # Older structure
275
- # Handle older 'cells' list if needed (logic from original file)
276
- pass # Add logic based on original paddle.txt if supporting older PP-Structure
277
-
278
- except (TypeError, KeyError, IndexError, ValueError) as e:
279
- self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
269
+
270
+ # Debug counters
271
+ table_count = 0
272
+ cell_count = 0
273
+ row_count = 0
274
+ col_count = 0
275
+ matched_table_structures = 0
276
+
277
+ # A single image input returns a list with one result object
278
+ for res in results:
279
+ # Handle both possible result structures (with or without 'res' key)
280
+ if isinstance(res, dict) and "res" in res:
281
+ result_data = res["res"]
282
+ elif isinstance(res, dict):
283
+ result_data = res
284
+ else:
285
+ self.logger.warning(f"Skipping result with unexpected structure: {res}")
280
286
  continue
281
287
 
282
- # --- Optional: Add Text Boxes from separate OCR run ---
283
- if options.detect_text:
284
- # This requires another model instance (PaddleOCR) and adds complexity.
285
- # Consider if this is truly needed or if layout regions are sufficient.
286
- # If needed, implement similar to original paddle.txt:
287
- # - Instantiate PaddleOCR (potentially cache separately)
288
- # - Run ocr(img_path, det=True, rec=False)
289
- # - Process results, adding 'text' class detections
290
- self.logger.info("Paddle detect_text=True: Running separate OCR text detection...")
291
- # (Implementation omitted for brevity - requires PaddleOCR instance)
292
- pass
288
+ # --- Process Layout Regions ---
289
+ layout_res = result_data.get("layout_det_res", {})
290
+ table_res_list = result_data.get("table_res_list", [])
291
+ # Build a map of table_region_id to structure info for fast lookup
292
+ table_structures_by_id = {}
293
+ for t in table_res_list:
294
+ if "table_region_id" in t:
295
+ table_structures_by_id[t["table_region_id"]] = t
296
+ table_structures = table_res_list or []
297
+ table_idx = 0 # fallback index if no region_id
298
+ if table_res_list:
299
+ self.logger.debug(f"Found {len(table_res_list)} table structure(s) in table_res_list.")
300
+
301
+ if not layout_res or "boxes" not in layout_res:
302
+ self.logger.debug("No layout detection boxes found in result.")
303
+ else:
304
+ for region in layout_res["boxes"]:
305
+ try:
306
+ region_type_orig = region.get("label", "unknown")
307
+ region_type = region_type_orig.lower()
308
+ normalized_class = self._normalize_class_name(region_type)
309
+
310
+ # Apply class filtering
311
+ if (
312
+ normalized_classes_req
313
+ and normalized_class not in normalized_classes_req
314
+ ):
315
+ continue
316
+ if normalized_class in normalized_classes_excl:
317
+ continue
318
+
319
+ confidence_score = region.get("score", 1.0)
320
+ if confidence_score < options.confidence:
321
+ continue
322
+
323
+ bbox = region.get("coordinate")
324
+ if not bbox or len(bbox) != 4:
325
+ self.logger.warning(
326
+ f"Skipping region with invalid bbox: {region}"
327
+ )
328
+ continue
329
+ x_min, y_min, x_max, y_max = map(float, bbox)
330
+
331
+ detection_data = {
332
+ "bbox": (x_min, y_min, x_max, y_max),
333
+ "class": region_type_orig,
334
+ "confidence": confidence_score,
335
+ "normalized_class": normalized_class,
336
+ "source": "layout",
337
+ "model": "paddle_v3",
338
+ }
339
+
340
+ # --- Table structure parsing ---
341
+ if normalized_class == "table" and options.create_cells:
342
+ table_count += 1
343
+ # Try to match by region_id, else by order
344
+ table_struct = None
345
+ region_id = region.get("table_region_id")
346
+ if region_id is not None and region_id in table_structures_by_id:
347
+ table_struct = table_structures_by_id[region_id]
348
+ elif table_idx < len(table_structures):
349
+ table_struct = table_structures[table_idx]
350
+ table_idx += 1
351
+
352
+ if table_struct:
353
+ matched_table_structures += 1
354
+ self.logger.debug(f"Matched table structure for table_region_id {region_id} or index {table_idx-1}.")
355
+ # Attach structure info as metadata
356
+ detection_data["metadata"] = {
357
+ k: v for k, v in table_struct.items() if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
358
+ }
359
+ detection_data["html"] = table_struct.get("pred_html")
360
+ # Add cell regions
361
+ cell_boxes = []
362
+ for cell_bbox in table_struct.get("cell_box_list", []):
363
+ if cell_bbox is None or len(cell_bbox) != 4:
364
+ continue
365
+ sx0, sy0, sx1, sy1 = map(float, cell_bbox)
366
+ cell_boxes.append((sx0, sy0, sx1, sy1))
367
+ detections.append({
368
+ "bbox": (sx0, sy0, sx1, sy1),
369
+ "class": "table_cell",
370
+ "confidence": confidence_score,
371
+ "normalized_class": self._normalize_class_name("table_cell"),
372
+ "source": "layout",
373
+ "model": "paddle_v3",
374
+ "parent_bbox": (x_min, y_min, x_max, y_max),
375
+ })
376
+ cell_count += 1
377
+ self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
378
+ # Add row/col regions if not present in Paddle output
379
+ if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
380
+ row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
381
+ for row_bbox in row_boxes:
382
+ rx0, ry0, rx1, ry1 = row_bbox
383
+ detections.append({
384
+ "bbox": (rx0, ry0, rx1, ry1),
385
+ "class": "table_row",
386
+ "confidence": confidence_score,
387
+ "normalized_class": self._normalize_class_name("table_row"),
388
+ "source": "layout",
389
+ "model": "paddle_v3",
390
+ "parent_bbox": (x_min, y_min, x_max, y_max),
391
+ })
392
+ row_count += 1
393
+ self.logger.debug(f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
394
+ for col_bbox in col_boxes:
395
+ cx0, cy0, cx1, cy1 = col_bbox
396
+ detections.append({
397
+ "bbox": (cx0, cy0, cx1, cy1),
398
+ "class": "table_column",
399
+ "confidence": confidence_score,
400
+ "normalized_class": self._normalize_class_name("table_column"),
401
+ "source": "layout",
402
+ "model": "paddle_v3",
403
+ "parent_bbox": (x_min, y_min, x_max, y_max),
404
+ })
405
+ col_count += 1
406
+ self.logger.debug(f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
407
+ else:
408
+ # Add row regions from Paddle output if present
409
+ for row_bbox in table_struct.get("row_box_list", []):
410
+ if row_bbox is None or len(row_bbox) != 4:
411
+ continue
412
+ rx0, ry0, rx1, ry1 = map(float, row_bbox)
413
+ detections.append({
414
+ "bbox": (rx0, ry0, rx1, ry1),
415
+ "class": "table_row",
416
+ "confidence": confidence_score,
417
+ "normalized_class": self._normalize_class_name("table_row"),
418
+ "source": "layout",
419
+ "model": "paddle_v3",
420
+ "parent_bbox": (x_min, y_min, x_max, y_max),
421
+ })
422
+ row_count += 1
423
+ self.logger.debug(f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}.")
424
+ # Add column regions from Paddle output if present
425
+ for col_bbox in table_struct.get("col_box_list", []):
426
+ if col_bbox is None or len(col_bbox) != 4:
427
+ continue
428
+ cx0, cy0, cx1, cy1 = map(float, col_bbox)
429
+ detections.append({
430
+ "bbox": (cx0, cy0, cx1, cy1),
431
+ "class": "table_column",
432
+ "confidence": confidence_score,
433
+ "normalized_class": self._normalize_class_name("table_column"),
434
+ "source": "layout",
435
+ "model": "paddle_v3",
436
+ "parent_bbox": (x_min, y_min, x_max, y_max),
437
+ })
438
+ col_count += 1
439
+ self.logger.debug(f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}.")
440
+ detections.append(detection_data)
441
+ except (TypeError, KeyError, IndexError, ValueError) as e:
442
+ self.logger.warning(
443
+ f"Error processing Paddle region: {region}. Error: {e}"
444
+ )
445
+ continue
293
446
 
294
447
  self.logger.info(
295
- f"PaddleLayout detected {len(detections)} layout elements matching criteria."
448
+ f"PaddleLayout detected {len(detections)} layout elements matching criteria. Tables: {table_count}, matched structures: {matched_table_structures}, cells: {cell_count}, rows: {row_count}, columns: {col_count}."
296
449
  )
297
450
  return detections
@@ -0,0 +1,78 @@
1
+ from typing import List, Tuple
2
+ import numpy as np
3
+
4
def group_cells_into_rows_and_columns(
    cell_boxes: List[Tuple[float, float, float, float]],
    row_tol: float = None,
    col_tol: float = None,
) -> Tuple[List[Tuple[float, float, float, float]], List[Tuple[float, float, float, float]]]:
    """
    Group cell bounding boxes into rows and columns by center proximity.

    Cells are clustered independently along each axis: cells whose vertical
    centers lie within ``row_tol`` of a group's mean center form one row, and
    cells whose horizontal centers lie within ``col_tol`` of a group's mean
    center form one column. Each group is then reduced to the bounding box
    that encloses all of its cells.

    The result does not depend on the order in which cells are listed: rows
    are returned top-to-bottom (ascending y center) and columns left-to-right
    (ascending x center).

    Args:
        cell_boxes: Sequence of (x0, y0, x1, y1) boxes, one per cell. A list
            of tuples or an (N, 4) array are both accepted.
        row_tol: Vertical tolerance for grouping rows. When None, defaults to
            10% of the median cell height, but never less than 2.0.
        col_tol: Horizontal tolerance for grouping columns. When None,
            defaults to 10% of the median cell width, but never less than 2.0.

    Returns:
        (row_boxes, col_boxes): Lists of (x0, y0, x1, y1) float tuples for the
        rows and columns. Both lists are empty when no cells are given.
    """
    # len() rather than truthiness so that NumPy arrays are accepted too.
    if cell_boxes is None or len(cell_boxes) == 0:
        return [], []

    boxes = np.asarray(cell_boxes)
    y_centers = (boxes[:, 1] + boxes[:, 3]) / 2
    x_centers = (boxes[:, 0] + boxes[:, 2]) / 2

    # Derive tolerances from typical cell size; the 2.0 floor keeps very
    # small cells from producing a near-zero tolerance.
    if row_tol is None:
        row_tol = max(2.0, 0.1 * float(np.median(boxes[:, 3] - boxes[:, 1])))
    if col_tol is None:
        col_tol = max(2.0, 0.1 * float(np.median(boxes[:, 2] - boxes[:, 0])))

    row_groups = _cluster_indices_by_center(y_centers, row_tol)
    col_groups = _cluster_indices_by_center(x_centers, col_tol)

    return _enclosing_boxes(boxes, row_groups), _enclosing_boxes(boxes, col_groups)


def _cluster_indices_by_center(centers, tol):
    """Split indices of *centers* into groups lying within *tol* of the group mean."""
    groups = []
    running_sum = 0.0
    # Sweeping in ascending order makes the clustering independent of input
    # order. It also means only the most recent group has to be checked: every
    # earlier group was already more than `tol` below a smaller center.
    for idx in np.argsort(centers, kind="stable"):
        center = float(centers[idx])
        if groups and abs(center - running_sum / len(groups[-1])) <= tol:
            groups[-1].append(int(idx))
            running_sum += center
        else:
            groups.append([int(idx)])
            running_sum = center
    return groups


def _enclosing_boxes(boxes, groups):
    """Return the (x0, y0, x1, y1) box enclosing the cells of each index group."""
    return [
        (
            float(np.min(boxes[group, 0])),
            float(np.min(boxes[group, 1])),
            float(np.max(boxes[group, 2])),
            float(np.max(boxes[group, 3])),
        )
        for group in groups
    ]