natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -18,12 +18,59 @@ logger = logging.getLogger(__name__)
|
|
18
18
|
|
19
19
|
|
20
20
|
class LayoutDetector(ABC):
|
21
|
-
"""
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
"""Abstract base class for layout detection engines.
|
22
|
+
|
23
|
+
This class defines the standard interface that all layout detection engines
|
24
|
+
must implement in natural-pdf. Layout detectors analyze document images to
|
25
|
+
identify structural elements like tables, figures, headers, paragraphs, etc.
|
26
|
+
|
27
|
+
The base class provides common functionality including model caching, result
|
28
|
+
standardization, and configuration management, while concrete implementations
|
29
|
+
handle engine-specific detection logic for different models (YOLO, TATR, Surya, etc.).
|
30
|
+
|
31
|
+
Subclasses must implement:
|
32
|
+
- detect(): Core layout detection for a single image
|
33
|
+
- is_available(): Check if engine dependencies are installed
|
34
|
+
- _load_model_from_options(): Load and configure the detection model
|
35
|
+
- _get_cache_key(): Generate cache keys for model instances
|
36
|
+
|
37
|
+
Subclasses should also populate the 'supported_classes' set with the document
|
38
|
+
element types they can detect (e.g., 'table', 'figure', 'text', 'title').
|
39
|
+
|
40
|
+
Attributes:
|
41
|
+
logger: Logger instance for the specific detector.
|
42
|
+
supported_classes: Set of document element types this detector can identify.
|
43
|
+
_model_cache: Dictionary cache for loaded model instances.
|
44
|
+
|
45
|
+
Example:
|
46
|
+
Implementing a custom layout detector:
|
47
|
+
```python
|
48
|
+
class MyLayoutDetector(LayoutDetector):
|
49
|
+
def __init__(self):
|
50
|
+
super().__init__()
|
51
|
+
self.supported_classes = {'table', 'figure', 'text'}
|
52
|
+
|
53
|
+
@classmethod
|
54
|
+
def is_available(cls) -> bool:
|
55
|
+
try:
|
56
|
+
import my_layout_library
|
57
|
+
return True
|
58
|
+
except ImportError:
|
59
|
+
return False
|
60
|
+
|
61
|
+
def detect(self, image, options):
|
62
|
+
# Implement layout detection
|
63
|
+
return detection_results
|
64
|
+
```
|
65
|
+
|
66
|
+
Using a layout detector:
|
67
|
+
```python
|
68
|
+
if YOLODetector.is_available():
|
69
|
+
detector = YOLODetector()
|
70
|
+
results = detector.detect(page_image, options)
|
71
|
+
for result in results:
|
72
|
+
print(f"Found {result['class']} at {result['bbox']}")
|
73
|
+
```
|
27
74
|
"""
|
28
75
|
|
29
76
|
def __init__(self):
|
@@ -83,7 +83,9 @@ class LayoutAnalyzer:
|
|
83
83
|
f" Rendering page {self._page.number} to image for initial layout detection..."
|
84
84
|
)
|
85
85
|
try:
|
86
|
-
layout_resolution = getattr(self._page._parent, "_config", {}).get(
|
86
|
+
layout_resolution = getattr(self._page._parent, "_config", {}).get(
|
87
|
+
"layout_image_resolution", 72
|
88
|
+
)
|
87
89
|
std_res_page_image = self._page.to_image(
|
88
90
|
resolution=layout_resolution, include_highlights=False
|
89
91
|
)
|
@@ -5,10 +5,6 @@ from typing import Any, Dict, List, Optional, Type, Union
|
|
5
5
|
|
6
6
|
from PIL import Image
|
7
7
|
|
8
|
-
# --- Import lightweight components only ---
|
9
|
-
# Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
|
10
|
-
# Instead, we provide tiny helper functions that import them lazily **only when needed**.
|
11
|
-
|
12
8
|
from .base import LayoutDetector # Lightweight base class
|
13
9
|
from .layout_options import (
|
14
10
|
BaseLayoutOptions,
|
@@ -21,6 +17,11 @@ from .layout_options import (
|
|
21
17
|
YOLOLayoutOptions,
|
22
18
|
)
|
23
19
|
|
20
|
+
# --- Import lightweight components only ---
|
21
|
+
# Heavy detector implementations (paddle, yolo, etc.) are **not** imported at module load.
|
22
|
+
# Instead, we provide tiny helper functions that import them lazily **only when needed**.
|
23
|
+
|
24
|
+
|
24
25
|
# ------------------ Lazy import helpers ------------------ #
|
25
26
|
|
26
27
|
|
@@ -60,6 +61,7 @@ def _lazy_import_gemini_detector():
|
|
60
61
|
|
61
62
|
return GeminiLayoutDetector
|
62
63
|
|
64
|
+
|
63
65
|
# --------------------------------------------------------- #
|
64
66
|
|
65
67
|
logger = logging.getLogger(__name__)
|
@@ -205,7 +207,9 @@ class LayoutManager:
|
|
205
207
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
206
208
|
try:
|
207
209
|
engine_class_or_factory = registry_entry["class"]
|
208
|
-
if callable(engine_class_or_factory) and not isinstance(
|
210
|
+
if callable(engine_class_or_factory) and not isinstance(
|
211
|
+
engine_class_or_factory, type
|
212
|
+
):
|
209
213
|
# Lazy factory – call it to obtain real class
|
210
214
|
engine_class = engine_class_or_factory()
|
211
215
|
else:
|
@@ -224,43 +228,43 @@ class LayoutManager:
|
|
224
228
|
def cleanup_detector(self, detector_name: Optional[str] = None) -> int:
|
225
229
|
"""
|
226
230
|
Cleanup layout detector instances to free memory.
|
227
|
-
|
231
|
+
|
228
232
|
Args:
|
229
233
|
detector_name: Specific detector to cleanup, or None to cleanup all detectors
|
230
|
-
|
234
|
+
|
231
235
|
Returns:
|
232
236
|
Number of detectors cleaned up
|
233
237
|
"""
|
234
238
|
cleaned_count = 0
|
235
|
-
|
239
|
+
|
236
240
|
if detector_name:
|
237
241
|
# Cleanup specific detector
|
238
242
|
detector_name = detector_name.lower()
|
239
243
|
if detector_name in self._detector_instances:
|
240
244
|
detector = self._detector_instances.pop(detector_name)
|
241
|
-
if hasattr(detector,
|
245
|
+
if hasattr(detector, "cleanup"):
|
242
246
|
try:
|
243
247
|
detector.cleanup()
|
244
248
|
except Exception as e:
|
245
249
|
logger.debug(f"Detector {detector_name} cleanup method failed: {e}")
|
246
|
-
|
250
|
+
|
247
251
|
logger.info(f"Cleaned up layout detector: {detector_name}")
|
248
252
|
cleaned_count = 1
|
249
253
|
else:
|
250
254
|
# Cleanup all detectors
|
251
255
|
for name, detector in list(self._detector_instances.items()):
|
252
|
-
if hasattr(detector,
|
256
|
+
if hasattr(detector, "cleanup"):
|
253
257
|
try:
|
254
258
|
detector.cleanup()
|
255
259
|
except Exception as e:
|
256
260
|
logger.debug(f"Detector {name} cleanup method failed: {e}")
|
257
|
-
|
261
|
+
|
258
262
|
# Clear all caches
|
259
263
|
detector_count = len(self._detector_instances)
|
260
264
|
self._detector_instances.clear()
|
261
|
-
|
265
|
+
|
262
266
|
if detector_count > 0:
|
263
267
|
logger.info(f"Cleaned up {detector_count} layout detectors")
|
264
268
|
cleaned_count = detector_count
|
265
|
-
|
269
|
+
|
266
270
|
return cleaned_count
|
@@ -58,6 +58,7 @@ class PaddleLayoutOptions(BaseLayoutOptions):
|
|
58
58
|
Options specific to PaddlePaddle PP-StructureV3 layout detection.
|
59
59
|
See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html
|
60
60
|
"""
|
61
|
+
|
61
62
|
# Model paths and names
|
62
63
|
layout_detection_model_name: Optional[str] = None
|
63
64
|
layout_detection_model_dir: Optional[str] = None
|
@@ -55,6 +55,7 @@ else:
|
|
55
55
|
|
56
56
|
from .table_structure_utils import group_cells_into_rows_and_columns
|
57
57
|
|
58
|
+
|
58
59
|
class PaddleLayoutDetector(LayoutDetector):
|
59
60
|
"""Document layout and table structure detector using PaddlePaddle's PP-StructureV3."""
|
60
61
|
|
@@ -187,8 +188,9 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
187
188
|
init_args[field_name] = value
|
188
189
|
# Add filtered extra_args (not starting with '_' and in valid set)
|
189
190
|
filtered_extra_args = {
|
190
|
-
k: v
|
191
|
-
|
191
|
+
k: v
|
192
|
+
for k, v in options.extra_args.items()
|
193
|
+
if not k.startswith("_") and k in valid_init_args
|
192
194
|
}
|
193
195
|
init_args.update(filtered_extra_args)
|
194
196
|
|
@@ -266,7 +268,7 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
266
268
|
if options.exclude_classes
|
267
269
|
else set()
|
268
270
|
)
|
269
|
-
|
271
|
+
|
270
272
|
# Debug counters
|
271
273
|
table_count = 0
|
272
274
|
cell_count = 0
|
@@ -296,7 +298,9 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
296
298
|
table_structures = table_res_list or []
|
297
299
|
table_idx = 0 # fallback index if no region_id
|
298
300
|
if table_res_list:
|
299
|
-
self.logger.debug(
|
301
|
+
self.logger.debug(
|
302
|
+
f"Found {len(table_res_list)} table structure(s) in table_res_list."
|
303
|
+
)
|
300
304
|
|
301
305
|
if not layout_res or "boxes" not in layout_res:
|
302
306
|
self.logger.debug("No layout detection boxes found in result.")
|
@@ -322,9 +326,7 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
322
326
|
|
323
327
|
bbox = region.get("coordinate")
|
324
328
|
if not bbox or len(bbox) != 4:
|
325
|
-
self.logger.warning(
|
326
|
-
f"Skipping region with invalid bbox: {region}"
|
327
|
-
)
|
329
|
+
self.logger.warning(f"Skipping region with invalid bbox: {region}")
|
328
330
|
continue
|
329
331
|
x_min, y_min, x_max, y_max = map(float, bbox)
|
330
332
|
|
@@ -351,10 +353,14 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
351
353
|
|
352
354
|
if table_struct:
|
353
355
|
matched_table_structures += 1
|
354
|
-
self.logger.debug(
|
356
|
+
self.logger.debug(
|
357
|
+
f"Matched table structure for table_region_id {region_id} or index {table_idx-1}."
|
358
|
+
)
|
355
359
|
# Attach structure info as metadata
|
356
360
|
detection_data["metadata"] = {
|
357
|
-
k: v
|
361
|
+
k: v
|
362
|
+
for k, v in table_struct.items()
|
363
|
+
if k not in ("cell_box_list", "table_ocr_pred", "pred_html")
|
358
364
|
}
|
359
365
|
detection_data["html"] = table_struct.get("pred_html")
|
360
366
|
# Add cell regions
|
@@ -364,84 +370,116 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
364
370
|
continue
|
365
371
|
sx0, sy0, sx1, sy1 = map(float, cell_bbox)
|
366
372
|
cell_boxes.append((sx0, sy0, sx1, sy1))
|
367
|
-
detections.append(
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
"normalized_class": self._normalize_class_name("table_cell"),
|
372
|
-
"source": "layout",
|
373
|
-
"model": "paddle_v3",
|
374
|
-
"parent_bbox": (x_min, y_min, x_max, y_max),
|
375
|
-
})
|
376
|
-
cell_count += 1
|
377
|
-
self.logger.debug(f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}.")
|
378
|
-
# Add row/col regions if not present in Paddle output
|
379
|
-
if not table_struct.get("row_box_list") and not table_struct.get("col_box_list"):
|
380
|
-
row_boxes, col_boxes = group_cells_into_rows_and_columns(cell_boxes)
|
381
|
-
for row_bbox in row_boxes:
|
382
|
-
rx0, ry0, rx1, ry1 = row_bbox
|
383
|
-
detections.append({
|
384
|
-
"bbox": (rx0, ry0, rx1, ry1),
|
385
|
-
"class": "table_row",
|
373
|
+
detections.append(
|
374
|
+
{
|
375
|
+
"bbox": (sx0, sy0, sx1, sy1),
|
376
|
+
"class": "table_cell",
|
386
377
|
"confidence": confidence_score,
|
387
|
-
"normalized_class": self._normalize_class_name(
|
378
|
+
"normalized_class": self._normalize_class_name(
|
379
|
+
"table_cell"
|
380
|
+
),
|
388
381
|
"source": "layout",
|
389
382
|
"model": "paddle_v3",
|
390
383
|
"parent_bbox": (x_min, y_min, x_max, y_max),
|
391
|
-
}
|
384
|
+
}
|
385
|
+
)
|
386
|
+
cell_count += 1
|
387
|
+
self.logger.debug(
|
388
|
+
f"Created table_cell region for bbox {(sx0, sy0, sx1, sy1)}."
|
389
|
+
)
|
390
|
+
# Add row/col regions if not present in Paddle output
|
391
|
+
if not table_struct.get("row_box_list") and not table_struct.get(
|
392
|
+
"col_box_list"
|
393
|
+
):
|
394
|
+
row_boxes, col_boxes = group_cells_into_rows_and_columns(
|
395
|
+
cell_boxes
|
396
|
+
)
|
397
|
+
for row_bbox in row_boxes:
|
398
|
+
rx0, ry0, rx1, ry1 = row_bbox
|
399
|
+
detections.append(
|
400
|
+
{
|
401
|
+
"bbox": (rx0, ry0, rx1, ry1),
|
402
|
+
"class": "table_row",
|
403
|
+
"confidence": confidence_score,
|
404
|
+
"normalized_class": self._normalize_class_name(
|
405
|
+
"table_row"
|
406
|
+
),
|
407
|
+
"source": "layout",
|
408
|
+
"model": "paddle_v3",
|
409
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
410
|
+
}
|
411
|
+
)
|
392
412
|
row_count += 1
|
393
|
-
self.logger.debug(
|
413
|
+
self.logger.debug(
|
414
|
+
f"[UTIL] Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
|
415
|
+
)
|
394
416
|
for col_bbox in col_boxes:
|
395
417
|
cx0, cy0, cx1, cy1 = col_bbox
|
396
|
-
detections.append(
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
418
|
+
detections.append(
|
419
|
+
{
|
420
|
+
"bbox": (cx0, cy0, cx1, cy1),
|
421
|
+
"class": "table_column",
|
422
|
+
"confidence": confidence_score,
|
423
|
+
"normalized_class": self._normalize_class_name(
|
424
|
+
"table_column"
|
425
|
+
),
|
426
|
+
"source": "layout",
|
427
|
+
"model": "paddle_v3",
|
428
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
429
|
+
}
|
430
|
+
)
|
405
431
|
col_count += 1
|
406
|
-
self.logger.debug(
|
432
|
+
self.logger.debug(
|
433
|
+
f"[UTIL] Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
|
434
|
+
)
|
407
435
|
else:
|
408
436
|
# Add row regions from Paddle output if present
|
409
437
|
for row_bbox in table_struct.get("row_box_list", []):
|
410
438
|
if row_bbox is None or len(row_bbox) != 4:
|
411
439
|
continue
|
412
440
|
rx0, ry0, rx1, ry1 = map(float, row_bbox)
|
413
|
-
detections.append(
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
441
|
+
detections.append(
|
442
|
+
{
|
443
|
+
"bbox": (rx0, ry0, rx1, ry1),
|
444
|
+
"class": "table_row",
|
445
|
+
"confidence": confidence_score,
|
446
|
+
"normalized_class": self._normalize_class_name(
|
447
|
+
"table_row"
|
448
|
+
),
|
449
|
+
"source": "layout",
|
450
|
+
"model": "paddle_v3",
|
451
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
452
|
+
}
|
453
|
+
)
|
422
454
|
row_count += 1
|
423
|
-
self.logger.debug(
|
455
|
+
self.logger.debug(
|
456
|
+
f"Created table_row region for bbox {(rx0, ry0, rx1, ry1)}."
|
457
|
+
)
|
424
458
|
# Add column regions from Paddle output if present
|
425
459
|
for col_bbox in table_struct.get("col_box_list", []):
|
426
460
|
if col_bbox is None or len(col_bbox) != 4:
|
427
461
|
continue
|
428
462
|
cx0, cy0, cx1, cy1 = map(float, col_bbox)
|
429
|
-
detections.append(
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
463
|
+
detections.append(
|
464
|
+
{
|
465
|
+
"bbox": (cx0, cy0, cx1, cy1),
|
466
|
+
"class": "table_column",
|
467
|
+
"confidence": confidence_score,
|
468
|
+
"normalized_class": self._normalize_class_name(
|
469
|
+
"table_column"
|
470
|
+
),
|
471
|
+
"source": "layout",
|
472
|
+
"model": "paddle_v3",
|
473
|
+
"parent_bbox": (x_min, y_min, x_max, y_max),
|
474
|
+
}
|
475
|
+
)
|
438
476
|
col_count += 1
|
439
|
-
self.logger.debug(
|
477
|
+
self.logger.debug(
|
478
|
+
f"Created table_column region for bbox {(cx0, cy0, cx1, cy1)}."
|
479
|
+
)
|
440
480
|
detections.append(detection_data)
|
441
481
|
except (TypeError, KeyError, IndexError, ValueError) as e:
|
442
|
-
self.logger.warning(
|
443
|
-
f"Error processing Paddle region: {region}. Error: {e}"
|
444
|
-
)
|
482
|
+
self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
|
445
483
|
continue
|
446
484
|
|
447
485
|
self.logger.info(
|
@@ -1,6 +1,8 @@
|
|
1
1
|
from typing import List, Tuple
|
2
|
+
|
2
3
|
import numpy as np
|
3
4
|
|
5
|
+
|
4
6
|
def group_cells_into_rows_and_columns(
|
5
7
|
cell_boxes: List[Tuple[float, float, float, float]],
|
6
8
|
row_tol: float = None,
|
@@ -75,4 +77,4 @@ def group_cells_into_rows_and_columns(
|
|
75
77
|
y1 = float(np.max(boxes[group, 3]))
|
76
78
|
col_boxes.append((x0, y0, x1, y1))
|
77
79
|
|
78
|
-
return row_boxes, col_boxes
|
80
|
+
return row_boxes, col_boxes
|
@@ -91,9 +91,7 @@ class YOLODocLayoutDetector(LayoutDetector):
|
|
91
91
|
def _load_model_from_options(self, options: YOLOLayoutOptions) -> Any:
|
92
92
|
"""Load the YOLOv10 model based on options."""
|
93
93
|
if not self.is_available():
|
94
|
-
raise RuntimeError(
|
95
|
-
"YOLO dependencies not installed. Please run: npdf install yolo"
|
96
|
-
)
|
94
|
+
raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
|
97
95
|
self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
|
98
96
|
try:
|
99
97
|
model_path = hf_hub_download(repo_id=options.model_repo, filename=options.model_file)
|
@@ -107,9 +105,7 @@ class YOLODocLayoutDetector(LayoutDetector):
|
|
107
105
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
108
106
|
"""Detect layout elements in an image using YOLO."""
|
109
107
|
if not self.is_available():
|
110
|
-
raise RuntimeError(
|
111
|
-
"YOLO dependencies not installed. Please run: npdf install yolo"
|
112
|
-
)
|
108
|
+
raise RuntimeError("YOLO dependencies not installed. Please run: npdf install yolo")
|
113
109
|
|
114
110
|
# Ensure options are the correct type, falling back to defaults if base type passed
|
115
111
|
if not isinstance(options, YOLOLayoutOptions):
|
@@ -3,10 +3,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
|
3
3
|
|
4
4
|
import numpy as np
|
5
5
|
from PIL import Image, ImageDraw
|
6
|
-
from scipy.ndimage import binary_closing, binary_opening, gaussian_filter1d
|
6
|
+
from scipy.ndimage import binary_closing, binary_opening, find_objects, gaussian_filter1d
|
7
|
+
from scipy.ndimage import label as nd_label
|
7
8
|
from scipy.signal import find_peaks
|
8
9
|
from sklearn.cluster import MiniBatchKMeans
|
9
|
-
from scipy.ndimage import label as nd_label, find_objects
|
10
10
|
|
11
11
|
if TYPE_CHECKING:
|
12
12
|
from natural_pdf.core.page import Page
|
@@ -1160,10 +1160,13 @@ class ShapeDetectionMixin:
|
|
1160
1160
|
masking so large painted areas are not cut by text boxes.
|
1161
1161
|
"""
|
1162
1162
|
import numpy as np
|
1163
|
-
from scipy.ndimage import
|
1163
|
+
from scipy.ndimage import find_objects
|
1164
|
+
from scipy.ndimage import label as nd_label
|
1164
1165
|
|
1165
1166
|
# Acquire raster image & scale info
|
1166
|
-
cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(
|
1167
|
+
cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(
|
1168
|
+
resolution
|
1169
|
+
)
|
1167
1170
|
if cv_image is None or page_obj is None:
|
1168
1171
|
return self # nothing to do
|
1169
1172
|
img_arr = cv_image.reshape(-1, 3).astype(np.float32) / 255.0 # normalised
|
@@ -1246,7 +1249,12 @@ class ShapeDetectionMixin:
|
|
1246
1249
|
|
1247
1250
|
# ── optional purge ──
|
1248
1251
|
if replace and hasattr(page_obj, "_element_mgr"):
|
1249
|
-
old_blobs = [
|
1252
|
+
old_blobs = [
|
1253
|
+
r
|
1254
|
+
for r in page_obj._element_mgr.regions
|
1255
|
+
if getattr(r, "region_type", None) == "blob"
|
1256
|
+
and getattr(r, "source", None) == source_label
|
1257
|
+
]
|
1250
1258
|
for r in old_blobs:
|
1251
1259
|
try:
|
1252
1260
|
page_obj._element_mgr.regions.remove(r)
|
@@ -1273,7 +1281,7 @@ class ShapeDetectionMixin:
|
|
1273
1281
|
x0, x1 = sl[1].start, sl[1].stop
|
1274
1282
|
# bbox area in pixels → in pts²
|
1275
1283
|
area_pixels = (y1 - y0) * (x1 - x0)
|
1276
|
-
area_pts = area_pixels * (scale_factor
|
1284
|
+
area_pts = area_pixels * (scale_factor**2)
|
1277
1285
|
|
1278
1286
|
# Skip tiny regions
|
1279
1287
|
if area_pts < min_area_pts:
|
@@ -1331,6 +1339,7 @@ class ShapeDetectionMixin:
|
|
1331
1339
|
pdf_x0, pdf_top, pdf_x1, pdf_bottom = region_bbox_pdf
|
1332
1340
|
|
1333
1341
|
from natural_pdf.elements.region import Region
|
1342
|
+
|
1334
1343
|
region = Region(page_obj, (pdf_x0, pdf_top, pdf_x1, pdf_bottom))
|
1335
1344
|
region.region_type = "blob"
|
1336
1345
|
region.normalized_type = "blob"
|