natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +113 -22
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -32,11 +32,11 @@ class TextElement(Element):
|
|
32
32
|
obj["object_type"] = "text"
|
33
33
|
|
34
34
|
super().__init__(obj, page)
|
35
|
-
|
35
|
+
|
36
36
|
# Memory optimization: Store character indices instead of full dictionaries
|
37
37
|
# This reduces memory usage by ~50% by avoiding character data duplication
|
38
38
|
self._char_indices = obj.pop("_char_indices", [])
|
39
|
-
|
39
|
+
|
40
40
|
# Backward compatibility: Keep _char_dicts for existing code
|
41
41
|
# But prefer _char_indices when available to save memory
|
42
42
|
self._char_dicts = obj.pop("_char_dicts", [])
|
@@ -44,20 +44,20 @@ class TextElement(Element):
|
|
44
44
|
@property
|
45
45
|
def chars(self):
|
46
46
|
"""Get constituent character elements efficiently.
|
47
|
-
|
47
|
+
|
48
48
|
Uses character indices when available to avoid memory duplication,
|
49
49
|
falls back to _char_dicts for backward compatibility.
|
50
50
|
"""
|
51
51
|
if self._char_indices:
|
52
52
|
# Memory-efficient approach: access characters by index
|
53
|
-
if hasattr(self.page,
|
54
|
-
char_elements = self.page._element_mgr.get_elements(
|
53
|
+
if hasattr(self.page, "_element_mgr"):
|
54
|
+
char_elements = self.page._element_mgr.get_elements("chars")
|
55
55
|
return [char_elements[i] for i in self._char_indices if i < len(char_elements)]
|
56
|
-
|
56
|
+
|
57
57
|
# Backward compatibility: convert _char_dicts to TextElement objects
|
58
58
|
if self._char_dicts:
|
59
59
|
return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]
|
60
|
-
|
60
|
+
|
61
61
|
return []
|
62
62
|
|
63
63
|
@property
|
@@ -75,12 +75,12 @@ class TextElement(Element):
|
|
75
75
|
try:
|
76
76
|
# If using memory-efficient character indices, update the referenced chars
|
77
77
|
if hasattr(self, "_char_indices") and self._char_indices:
|
78
|
-
if hasattr(self.page,
|
79
|
-
char_elements = self.page._element_mgr.get_elements(
|
78
|
+
if hasattr(self.page, "_element_mgr"):
|
79
|
+
char_elements = self.page._element_mgr.get_elements("chars")
|
80
80
|
for idx, char_idx in enumerate(self._char_indices):
|
81
81
|
if char_idx < len(char_elements) and idx < len(value):
|
82
82
|
char_elements[char_idx].text = value[idx]
|
83
|
-
|
83
|
+
|
84
84
|
# Legacy _char_dicts synchronization for backward compatibility
|
85
85
|
elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
|
86
86
|
if not self._char_dicts:
|
@@ -121,6 +121,7 @@ class TextElement(Element):
|
|
121
121
|
except Exception as sync_err: # pragma: no cover
|
122
122
|
# Keep failures silent but logged; better to have outdated chars than crash.
|
123
123
|
import logging
|
124
|
+
|
124
125
|
logger = logging.getLogger(__name__)
|
125
126
|
logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")
|
126
127
|
|
@@ -379,7 +380,9 @@ class TextElement(Element):
|
|
379
380
|
@property
|
380
381
|
def underline(self) -> bool:
|
381
382
|
"""True if element is underlined."""
|
382
|
-
return bool(
|
383
|
+
return bool(
|
384
|
+
self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline")
|
385
|
+
)
|
383
386
|
|
384
387
|
# -----------------------------
|
385
388
|
# Highlight decoration
|
@@ -397,7 +400,9 @@ class TextElement(Element):
|
|
397
400
|
@property
|
398
401
|
def highlight_color(self):
|
399
402
|
"""Return RGB(A) tuple of highlight colour if stored."""
|
400
|
-
return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
|
403
|
+
return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
|
404
|
+
"highlight_color"
|
405
|
+
)
|
401
406
|
|
402
407
|
def __repr__(self) -> str:
|
403
408
|
"""String representation of the text element."""
|
@@ -489,6 +494,7 @@ class TextElement(Element):
|
|
489
494
|
|
490
495
|
try:
|
491
496
|
from bidi.algorithm import get_display # type: ignore
|
497
|
+
|
492
498
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
493
499
|
|
494
500
|
# Convert from logical order to visual order
|
natural_pdf/exporters/__init__.py
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
from .base import FinetuneExporter
|
2
2
|
|
3
|
+
|
3
4
|
# Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
|
4
5
|
def _get_paddleocr_exporter():
|
5
6
|
"""Lazy import for PaddleOCRRecognitionExporter."""
|
6
7
|
from .paddleocr import PaddleOCRRecognitionExporter
|
7
|
-
|
8
|
+
|
8
9
|
return PaddleOCRRecognitionExporter
|
9
10
|
|
11
|
+
|
10
12
|
# Make PaddleOCRRecognitionExporter available through attribute access
|
11
13
|
def __getattr__(name):
|
12
14
|
if name == "PaddleOCRRecognitionExporter":
|
13
15
|
return _get_paddleocr_exporter()
|
14
16
|
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
15
17
|
|
18
|
+
|
16
19
|
__all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
|
natural_pdf/exporters/original_pdf.py
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
Module for exporting original PDF pages without modification.
|
3
3
|
"""
|
4
4
|
|
5
|
+
import io
|
5
6
|
import logging
|
6
7
|
import os
|
7
|
-
import io
|
8
8
|
import urllib.request
|
9
9
|
from pathlib import Path
|
10
10
|
from typing import TYPE_CHECKING, List, Set, Union
|
@@ -103,11 +103,17 @@ def create_original_pdf(
|
|
103
103
|
source_handle = pikepdf.Pdf.open(first_page_pdf_path)
|
104
104
|
else:
|
105
105
|
# Fallback: attempt to open from in-memory bytes stored on PDF object
|
106
|
-
if
|
106
|
+
if (
|
107
|
+
first_page_pdf_obj is not None
|
108
|
+
and hasattr(first_page_pdf_obj, "_original_bytes")
|
109
|
+
and first_page_pdf_obj._original_bytes
|
110
|
+
):
|
107
111
|
source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
|
108
112
|
else:
|
109
113
|
# Attempt to download bytes directly if path looks like URL
|
110
|
-
if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(
|
114
|
+
if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(
|
115
|
+
("http://", "https://")
|
116
|
+
):
|
111
117
|
try:
|
112
118
|
with urllib.request.urlopen(first_page_pdf_path) as resp:
|
113
119
|
data = resp.read()
|
@@ -117,7 +123,9 @@ def create_original_pdf(
|
|
117
123
|
f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
|
118
124
|
)
|
119
125
|
else:
|
120
|
-
raise FileNotFoundError(
|
126
|
+
raise FileNotFoundError(
|
127
|
+
f"Source PDF bytes not available for {first_page_pdf_path}"
|
128
|
+
)
|
121
129
|
|
122
130
|
with source_handle as source_pikepdf_doc:
|
123
131
|
target_pikepdf_doc = pikepdf.Pdf.new()
|
natural_pdf/extraction/mixin.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
from abc import ABC, abstractmethod
|
3
|
-
from typing import TYPE_CHECKING, Any, Optional,
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
|
4
4
|
|
5
5
|
from pydantic import BaseModel, Field, create_model
|
6
6
|
|
@@ -16,9 +16,54 @@ DEFAULT_STRUCTURED_KEY = "structured" # Define default key
|
|
16
16
|
|
17
17
|
|
18
18
|
class ExtractionMixin(ABC):
|
19
|
-
"""
|
20
|
-
|
21
|
-
|
19
|
+
"""Mixin class providing structured data extraction capabilities to elements.
|
20
|
+
|
21
|
+
This mixin adds AI-powered structured data extraction functionality to pages,
|
22
|
+
regions, and elements, enabling extraction of specific data fields using
|
23
|
+
Pydantic schemas and large language models. It supports both text-based and
|
24
|
+
vision-based extraction modes.
|
25
|
+
|
26
|
+
The mixin integrates with the StructuredDataManager to handle LLM interactions
|
27
|
+
and provides schema validation using Pydantic models. Extracted data is
|
28
|
+
automatically validated against the provided schema and stored with
|
29
|
+
confidence metrics and metadata.
|
30
|
+
|
31
|
+
Extraction modes:
|
32
|
+
- Text-based: Uses extracted text content for LLM processing
|
33
|
+
- Vision-based: Uses rendered images for multimodal LLM analysis
|
34
|
+
- Automatic: Selects best mode based on content and model capabilities
|
35
|
+
|
36
|
+
Host class requirements:
|
37
|
+
- Must implement extract_text(**kwargs) -> str
|
38
|
+
- Must implement to_image(**kwargs) -> PIL.Image
|
39
|
+
- Must have access to StructuredDataManager (usually via parent PDF)
|
40
|
+
|
41
|
+
Example:
|
42
|
+
```python
|
43
|
+
from pydantic import BaseModel
|
44
|
+
|
45
|
+
class InvoiceData(BaseModel):
|
46
|
+
invoice_number: str
|
47
|
+
total_amount: float
|
48
|
+
due_date: str
|
49
|
+
vendor_name: str
|
50
|
+
|
51
|
+
pdf = npdf.PDF("invoice.pdf")
|
52
|
+
page = pdf.pages[0]
|
53
|
+
|
54
|
+
# Extract structured data
|
55
|
+
invoice = page.extract_structured_data(InvoiceData)
|
56
|
+
print(f"Invoice {invoice.data.invoice_number}: ${invoice.data.total_amount}")
|
57
|
+
|
58
|
+
# Region-specific extraction
|
59
|
+
header_region = page.find('text:contains("Invoice")').above()
|
60
|
+
header_data = header_region.extract_structured_data(InvoiceData)
|
61
|
+
```
|
62
|
+
|
63
|
+
Note:
|
64
|
+
Structured extraction requires a compatible LLM to be configured in the
|
65
|
+
StructuredDataManager. Results include confidence scores and validation
|
66
|
+
metadata for quality assessment.
|
22
67
|
"""
|
23
68
|
|
24
69
|
def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
|
@@ -386,10 +431,13 @@ class ExtractionMixin(ABC):
|
|
386
431
|
question_map = question_map or {}
|
387
432
|
|
388
433
|
try:
|
389
|
-
from natural_pdf.qa.document_qa import get_qa_engine
|
390
|
-
from natural_pdf.extraction.result import StructuredDataResult
|
391
|
-
from pydantic import Field as _Field, create_model
|
392
434
|
import re
|
435
|
+
|
436
|
+
from pydantic import Field as _Field
|
437
|
+
from pydantic import create_model
|
438
|
+
|
439
|
+
from natural_pdf.extraction.result import StructuredDataResult
|
440
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
393
441
|
except ImportError as exc:
|
394
442
|
raise RuntimeError(
|
395
443
|
"Document-QA dependencies missing. Install with `pip install natural-pdf[ai]`."
|
@@ -424,7 +472,9 @@ class ExtractionMixin(ABC):
|
|
424
472
|
question = question_map[display_name]
|
425
473
|
else:
|
426
474
|
description = None
|
427
|
-
if hasattr(field_obj, "field_info") and hasattr(
|
475
|
+
if hasattr(field_obj, "field_info") and hasattr(
|
476
|
+
field_obj.field_info, "description"
|
477
|
+
):
|
428
478
|
description = field_obj.field_info.description
|
429
479
|
elif hasattr(field_obj, "description"):
|
430
480
|
description = field_obj.description
|
@@ -529,7 +579,11 @@ class ExtractionMixin(ABC):
|
|
529
579
|
pdf_instance = self
|
530
580
|
elif hasattr(self, "pdf") and hasattr(self.pdf, "get_manager"):
|
531
581
|
pdf_instance = self.pdf
|
532
|
-
elif
|
582
|
+
elif (
|
583
|
+
hasattr(self, "page")
|
584
|
+
and hasattr(self.page, "pdf")
|
585
|
+
and hasattr(self.page.pdf, "get_manager")
|
586
|
+
):
|
533
587
|
pdf_instance = self.page.pdf
|
534
588
|
else:
|
535
589
|
raise RuntimeError("Cannot access PDF manager to perform LLM extraction.")
|
@@ -542,7 +596,9 @@ class ExtractionMixin(ABC):
|
|
542
596
|
layout_for_text = kwargs.pop("layout", True)
|
543
597
|
content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
|
544
598
|
|
545
|
-
if content is None or (
|
599
|
+
if content is None or (
|
600
|
+
using == "text" and isinstance(content, str) and not content.strip()
|
601
|
+
):
|
546
602
|
result = StructuredDataResult(
|
547
603
|
data=None,
|
548
604
|
success=False,
|
natural_pdf/extraction/result.py
CHANGED
natural_pdf/flows/flow.py
CHANGED
@@ -14,10 +14,69 @@ logger = logging.getLogger(__name__)
|
|
14
14
|
|
15
15
|
|
16
16
|
class Flow:
|
17
|
-
"""
|
18
|
-
|
19
|
-
|
20
|
-
|
17
|
+
"""Defines a logical flow or sequence of physical Page or Region objects.
|
18
|
+
|
19
|
+
A Flow represents a continuous logical document structure that spans across
|
20
|
+
multiple pages or regions, enabling operations on content that flows across
|
21
|
+
boundaries. This is essential for handling multi-page tables, articles that
|
22
|
+
span columns, or any content that requires reading order across segments.
|
23
|
+
|
24
|
+
Flows specify arrangement (vertical/horizontal) and alignment rules to create
|
25
|
+
a unified coordinate system for element extraction and text processing. They
|
26
|
+
enable natural-pdf to treat fragmented content as a single continuous area
|
27
|
+
for analysis and extraction operations.
|
28
|
+
|
29
|
+
The Flow system is particularly useful for:
|
30
|
+
- Multi-page tables that break across page boundaries
|
31
|
+
- Multi-column articles with complex reading order
|
32
|
+
- Forms that span multiple pages
|
33
|
+
- Any content requiring logical continuation across segments
|
34
|
+
|
35
|
+
Attributes:
|
36
|
+
segments: List of Page or Region objects in flow order.
|
37
|
+
arrangement: Primary flow direction ('vertical' or 'horizontal').
|
38
|
+
alignment: Cross-axis alignment for segments of different sizes.
|
39
|
+
segment_gap: Virtual gap between segments in PDF points.
|
40
|
+
|
41
|
+
Example:
|
42
|
+
Multi-page table flow:
|
43
|
+
```python
|
44
|
+
pdf = npdf.PDF("multi_page_table.pdf")
|
45
|
+
|
46
|
+
# Create flow for table spanning pages 2-4
|
47
|
+
table_flow = Flow(
|
48
|
+
segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
|
49
|
+
arrangement='vertical',
|
50
|
+
alignment='left',
|
51
|
+
segment_gap=10.0
|
52
|
+
)
|
53
|
+
|
54
|
+
# Extract table as if it were continuous
|
55
|
+
table_data = table_flow.extract_table()
|
56
|
+
text_content = table_flow.get_text()
|
57
|
+
```
|
58
|
+
|
59
|
+
Multi-column article flow:
|
60
|
+
```python
|
61
|
+
page = pdf.pages[0]
|
62
|
+
left_column = page.region(0, 0, 300, page.height)
|
63
|
+
right_column = page.region(320, 0, page.width, page.height)
|
64
|
+
|
65
|
+
# Create horizontal flow for columns
|
66
|
+
article_flow = Flow(
|
67
|
+
segments=[left_column, right_column],
|
68
|
+
arrangement='horizontal',
|
69
|
+
alignment='top'
|
70
|
+
)
|
71
|
+
|
72
|
+
# Read in proper order
|
73
|
+
article_text = article_flow.get_text()
|
74
|
+
```
|
75
|
+
|
76
|
+
Note:
|
77
|
+
Flows create virtual coordinate systems that map element positions across
|
78
|
+
segments, enabling spatial navigation and element selection to work
|
79
|
+
seamlessly across boundaries.
|
21
80
|
"""
|
22
81
|
|
23
82
|
def __init__(
|
natural_pdf/flows/region.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
3
3
|
|
4
4
|
from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
|
5
5
|
|
@@ -133,7 +133,7 @@ class FlowRegion:
|
|
133
133
|
# This is a simplification; true layout-aware joining would be more complex.
|
134
134
|
joiner = (
|
135
135
|
"\n" if self.flow.arrangement == "vertical" else " "
|
136
|
-
) # TODO:
|
136
|
+
) # TODO: Consider flow.segment_gap for proportional spacing between segments
|
137
137
|
extracted = joiner.join(t for t in texts if t)
|
138
138
|
|
139
139
|
if apply_exclusions: # Only cache if standard exclusion behavior
|
@@ -258,7 +258,7 @@ class FlowRegion:
|
|
258
258
|
"""
|
259
259
|
Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
|
260
260
|
If multiple pages are involved, they are stacked into a single image.
|
261
|
-
|
261
|
+
|
262
262
|
Args:
|
263
263
|
resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
|
264
264
|
labels: Whether to include a legend for highlights.
|
@@ -270,7 +270,7 @@ class FlowRegion:
|
|
270
270
|
stack_gap: Gap in pixels between stacked pages.
|
271
271
|
stack_background_color: RGB background color for the stacked image.
|
272
272
|
**kwargs: Additional arguments passed to the underlying rendering methods.
|
273
|
-
|
273
|
+
|
274
274
|
Returns:
|
275
275
|
PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
|
276
276
|
"""
|
natural_pdf/ocr/engine.py
CHANGED
@@ -12,7 +12,38 @@ logger = logging.getLogger(__name__)
|
|
12
12
|
|
13
13
|
|
14
14
|
class TextRegion:
|
15
|
-
"""Standard representation of an OCR text region.
|
15
|
+
"""Standard representation of an OCR text region.
|
16
|
+
|
17
|
+
TextRegion provides a standardized format for representing text detected by
|
18
|
+
OCR engines, regardless of the underlying engine implementation. This ensures
|
19
|
+
consistent interfaces across different OCR backends (EasyOCR, Surya, PaddleOCR, etc.).
|
20
|
+
|
21
|
+
The class handles coordinate normalization and provides utilities for converting
|
22
|
+
between different coordinate formats (bounding boxes vs. polygons).
|
23
|
+
|
24
|
+
Attributes:
|
25
|
+
bbox: Bounding box coordinates as (x0, y0, x1, y1) tuple.
|
26
|
+
text: The recognized text content.
|
27
|
+
confidence: Confidence score from 0.0 (low) to 1.0 (high).
|
28
|
+
source: Source identifier, typically "ocr" or engine name.
|
29
|
+
|
30
|
+
Example:
|
31
|
+
```python
|
32
|
+
# Create from bounding box
|
33
|
+
region = TextRegion(
|
34
|
+
bbox=(100, 200, 300, 250),
|
35
|
+
text="Hello World",
|
36
|
+
confidence=0.95
|
37
|
+
)
|
38
|
+
|
39
|
+
# Create from polygon coordinates
|
40
|
+
polygon = [[100, 200], [300, 200], [300, 250], [100, 250]]
|
41
|
+
region = TextRegion.from_polygon(polygon, "Hello World", 0.95)
|
42
|
+
|
43
|
+
# Convert to dictionary for processing
|
44
|
+
data = region.to_dict()
|
45
|
+
```
|
46
|
+
"""
|
16
47
|
|
17
48
|
def __init__(
|
18
49
|
self,
|
@@ -54,7 +85,57 @@ class TextRegion:
|
|
54
85
|
|
55
86
|
|
56
87
|
class OCREngine(ABC):
|
57
|
-
"""Abstract
|
88
|
+
"""Abstract base class for OCR engines.
|
89
|
+
|
90
|
+
This class defines the standard interface that all OCR engines must implement
|
91
|
+
in natural-pdf. It provides a consistent API for text recognition regardless
|
92
|
+
of the underlying OCR technology (EasyOCR, Surya, PaddleOCR, DocTR, etc.).
|
93
|
+
|
94
|
+
The base class handles common functionality like model caching, parameter
|
95
|
+
validation, and result standardization, while concrete implementations
|
96
|
+
provide engine-specific processing logic.
|
97
|
+
|
98
|
+
Subclasses must implement:
|
99
|
+
- process_single_image(): Core OCR processing for a single image
|
100
|
+
- is_available(): Check if the engine dependencies are installed
|
101
|
+
- get_supported_languages(): Return list of supported language codes
|
102
|
+
|
103
|
+
Class Attributes:
|
104
|
+
DEFAULT_MIN_CONFIDENCE: Default confidence threshold (0.2).
|
105
|
+
DEFAULT_LANGUAGES: Default language list (["en"]).
|
106
|
+
DEFAULT_DEVICE: Default processing device ("cpu").
|
107
|
+
|
108
|
+
Attributes:
|
109
|
+
logger: Logger instance for the specific engine.
|
110
|
+
_model: Cached model instance for the engine.
|
111
|
+
_initialized: Whether the engine has been initialized.
|
112
|
+
_reader_cache: Cache for initialized models/readers.
|
113
|
+
|
114
|
+
Example:
|
115
|
+
Implementing a custom OCR engine:
|
116
|
+
```python
|
117
|
+
class MyOCREngine(OCREngine):
|
118
|
+
@classmethod
|
119
|
+
def is_available(cls) -> bool:
|
120
|
+
try:
|
121
|
+
import my_ocr_library
|
122
|
+
return True
|
123
|
+
except ImportError:
|
124
|
+
return False
|
125
|
+
|
126
|
+
def process_single_image(self, image, languages, min_confidence,
|
127
|
+
device, detect_only, options):
|
128
|
+
# Implement OCR processing
|
129
|
+
return text_regions
|
130
|
+
```
|
131
|
+
|
132
|
+
Using an OCR engine:
|
133
|
+
```python
|
134
|
+
if EasyOCREngine.is_available():
|
135
|
+
engine = EasyOCREngine()
|
136
|
+
results = engine.process_image(image, languages=['en', 'es'])
|
137
|
+
```
|
138
|
+
"""
|
58
139
|
|
59
140
|
# Default values as class constants
|
60
141
|
DEFAULT_MIN_CONFIDENCE = 0.2
|
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -11,6 +11,7 @@ from .ocr_options import BaseOCROptions, PaddleOCROptions
|
|
11
11
|
|
12
12
|
logger = logging.getLogger(__name__)
|
13
13
|
|
14
|
+
|
14
15
|
class PaddleOCREngine(OCREngine):
|
15
16
|
"""PaddleOCR engine implementation."""
|
16
17
|
|
@@ -147,8 +148,8 @@ class PaddleOCREngine(OCREngine):
|
|
147
148
|
|
148
149
|
# --- RESTORE: Language/version support check logic ---
|
149
150
|
user_specified_model = (
|
150
|
-
getattr(paddle_options, "text_recognition_model_name", None) is not None
|
151
|
-
getattr(paddle_options, "text_detection_model_name", None) is not None
|
151
|
+
getattr(paddle_options, "text_recognition_model_name", None) is not None
|
152
|
+
or getattr(paddle_options, "text_detection_model_name", None) is not None
|
152
153
|
)
|
153
154
|
if user_specified_model and user_ocr_version:
|
154
155
|
if primary_lang not in self.SUPPORT_MATRIX.get(user_ocr_version, set()):
|
@@ -169,7 +170,7 @@ class PaddleOCREngine(OCREngine):
|
|
169
170
|
user_ocr_version,
|
170
171
|
)
|
171
172
|
final_ocr_version = None # Reset to find a compatible version
|
172
|
-
|
173
|
+
|
173
174
|
# If no version was specified or the specified one was incompatible, find the best fit.
|
174
175
|
if not final_ocr_version:
|
175
176
|
found_compatible = False
|
@@ -269,7 +270,6 @@ class PaddleOCREngine(OCREngine):
|
|
269
270
|
if value is not None:
|
270
271
|
ocr_config[arg] = value
|
271
272
|
|
272
|
-
|
273
273
|
try:
|
274
274
|
# The new API uses PaddleOCR as a pipeline object.
|
275
275
|
self._model = paddleocr.PaddleOCR(**ocr_config)
|
@@ -350,7 +350,7 @@ class PaddleOCREngine(OCREngine):
|
|
350
350
|
# This code converts any numpy array to a list before passing to _standardize_bbox,
|
351
351
|
# which handles both rectangle and polygon formats robustly.
|
352
352
|
box = rec_boxes[i]
|
353
|
-
if hasattr(box,
|
353
|
+
if hasattr(box, "tolist"):
|
354
354
|
box = box.tolist()
|
355
355
|
bbox = self._standardize_bbox(box)
|
356
356
|
if detect_only:
|
natural_pdf/ocr/ocr_factory.py
CHANGED
@@ -32,7 +32,8 @@ class OCRFactory:
|
|
32
32
|
return SuryaOCREngine(**kwargs)
|
33
33
|
except ImportError:
|
34
34
|
raise ImportError(
|
35
|
-
"Surya engine requires additional dependencies. "
|
35
|
+
"Surya engine requires additional dependencies. "
|
36
|
+
"Install with: npdf install surya"
|
36
37
|
)
|
37
38
|
elif engine_type == "easyocr":
|
38
39
|
try:
|
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -11,6 +11,7 @@ from PIL import Image
|
|
11
11
|
from .engine import OCREngine
|
12
12
|
from .engine_doctr import DoctrOCREngine
|
13
13
|
from .engine_easyocr import EasyOCREngine
|
14
|
+
|
14
15
|
# Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
|
15
16
|
# from .engine_paddle import PaddleOCREngine
|
16
17
|
from .engine_surya import SuryaOCREngine
|
@@ -33,12 +34,16 @@ class OCRManager:
|
|
33
34
|
def _get_paddle_engine_class():
|
34
35
|
"""Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
|
35
36
|
from .engine_paddle import PaddleOCREngine
|
37
|
+
|
36
38
|
return PaddleOCREngine
|
37
39
|
|
38
40
|
# Registry mapping engine names to classes and default options
|
39
41
|
ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
|
40
42
|
"easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
|
41
|
-
"paddle": {
|
43
|
+
"paddle": {
|
44
|
+
"class": lambda: OCRManager._get_paddle_engine_class(),
|
45
|
+
"options_class": PaddleOCROptions,
|
46
|
+
},
|
42
47
|
"surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
|
43
48
|
"doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
|
44
49
|
# Add other engines here
|
@@ -85,7 +90,10 @@ class OCRManager:
|
|
85
90
|
)
|
86
91
|
engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
|
87
92
|
# Handle lazy loading - if it's a lambda function, call it to get the actual class
|
88
|
-
if
|
93
|
+
if (
|
94
|
+
callable(engine_class_or_factory)
|
95
|
+
and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
|
96
|
+
):
|
89
97
|
engine_class = engine_class_or_factory()
|
90
98
|
else:
|
91
99
|
engine_class = engine_class_or_factory
|
@@ -283,7 +291,10 @@ class OCRManager:
|
|
283
291
|
# Temporarily instantiate to check availability without caching
|
284
292
|
engine_class_or_factory = registry_entry["class"]
|
285
293
|
# Handle lazy loading - if it's a lambda function, call it to get the actual class
|
286
|
-
if
|
294
|
+
if (
|
295
|
+
callable(engine_class_or_factory)
|
296
|
+
and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
|
297
|
+
):
|
287
298
|
engine_class = engine_class_or_factory()
|
288
299
|
else:
|
289
300
|
engine_class = engine_class_or_factory
|
@@ -299,49 +310,49 @@ class OCRManager:
|
|
299
310
|
def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
|
300
311
|
"""
|
301
312
|
Cleanup OCR engine instances to free memory.
|
302
|
-
|
313
|
+
|
303
314
|
Args:
|
304
315
|
engine_name: Specific engine to cleanup, or None to cleanup all engines
|
305
|
-
|
316
|
+
|
306
317
|
Returns:
|
307
318
|
Number of engines cleaned up
|
308
319
|
"""
|
309
320
|
cleaned_count = 0
|
310
|
-
|
321
|
+
|
311
322
|
if engine_name:
|
312
323
|
# Cleanup specific engine
|
313
324
|
engine_name = engine_name.lower()
|
314
325
|
if engine_name in self._engine_instances:
|
315
326
|
engine = self._engine_instances.pop(engine_name)
|
316
|
-
if hasattr(engine,
|
327
|
+
if hasattr(engine, "cleanup"):
|
317
328
|
try:
|
318
329
|
engine.cleanup()
|
319
330
|
except Exception as e:
|
320
331
|
logger.debug(f"Engine {engine_name} cleanup method failed: {e}")
|
321
|
-
|
332
|
+
|
322
333
|
# Clear associated locks
|
323
334
|
self._engine_locks.pop(engine_name, None)
|
324
335
|
self._engine_inference_locks.pop(engine_name, None)
|
325
|
-
|
336
|
+
|
326
337
|
logger.info(f"Cleaned up OCR engine: {engine_name}")
|
327
338
|
cleaned_count = 1
|
328
339
|
else:
|
329
340
|
# Cleanup all engines
|
330
341
|
for name, engine in list(self._engine_instances.items()):
|
331
|
-
if hasattr(engine,
|
342
|
+
if hasattr(engine, "cleanup"):
|
332
343
|
try:
|
333
344
|
engine.cleanup()
|
334
345
|
except Exception as e:
|
335
346
|
logger.debug(f"Engine {name} cleanup method failed: {e}")
|
336
|
-
|
347
|
+
|
337
348
|
# Clear all caches
|
338
349
|
engine_count = len(self._engine_instances)
|
339
350
|
self._engine_instances.clear()
|
340
351
|
self._engine_locks.clear()
|
341
352
|
self._engine_inference_locks.clear()
|
342
|
-
|
353
|
+
|
343
354
|
if engine_count > 0:
|
344
355
|
logger.info(f"Cleaned up {engine_count} OCR engines")
|
345
356
|
cleaned_count = engine_count
|
346
|
-
|
357
|
+
|
347
358
|
return cleaned_count
|