natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,22 +1,24 @@
|
|
1
1
|
# ocr_engine_easyocr.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
4
|
-
|
3
|
+
import inspect # Used for dynamic parameter passing
|
4
|
+
import logging
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
6
|
+
|
5
7
|
import numpy as np
|
6
8
|
from PIL import Image
|
7
|
-
import inspect # Used for dynamic parameter passing
|
8
9
|
|
9
10
|
from .engine import OCREngine
|
10
|
-
from .ocr_options import
|
11
|
+
from .ocr_options import BaseOCROptions, EasyOCROptions
|
11
12
|
|
12
13
|
logger = logging.getLogger(__name__)
|
13
14
|
|
15
|
+
|
14
16
|
class EasyOCREngine(OCREngine):
|
15
17
|
"""EasyOCR engine implementation."""
|
16
18
|
|
17
19
|
def __init__(self):
|
18
20
|
super().__init__()
|
19
|
-
self._easyocr = None
|
21
|
+
self._easyocr = None # Lazy load easyocr module
|
20
22
|
|
21
23
|
def _lazy_import_easyocr(self):
|
22
24
|
"""Imports easyocr only when needed."""
|
@@ -25,6 +27,7 @@ class EasyOCREngine(OCREngine):
|
|
25
27
|
raise ImportError("EasyOCR is not installed or available.")
|
26
28
|
try:
|
27
29
|
import easyocr
|
30
|
+
|
28
31
|
self._easyocr = easyocr
|
29
32
|
logger.info("EasyOCR module imported successfully.")
|
30
33
|
except ImportError as e:
|
@@ -56,15 +59,18 @@ class EasyOCREngine(OCREngine):
|
|
56
59
|
|
57
60
|
constructor_sig = inspect.signature(easyocr.Reader.__init__)
|
58
61
|
constructor_args = {}
|
59
|
-
constructor_args[
|
60
|
-
constructor_args[
|
62
|
+
constructor_args["lang_list"] = options.languages
|
63
|
+
constructor_args["gpu"] = (
|
64
|
+
"cuda" in str(options.device).lower() or "mps" in str(options.device).lower()
|
65
|
+
)
|
61
66
|
|
62
67
|
for field_name, param in constructor_sig.parameters.items():
|
63
|
-
if field_name in [
|
68
|
+
if field_name in ["self", "lang_list", "gpu"]:
|
69
|
+
continue
|
64
70
|
if hasattr(options, field_name):
|
65
|
-
|
71
|
+
constructor_args[field_name] = getattr(options, field_name)
|
66
72
|
elif field_name in options.extra_args:
|
67
|
-
|
73
|
+
constructor_args[field_name] = options.extra_args[field_name]
|
68
74
|
|
69
75
|
logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
|
70
76
|
try:
|
@@ -81,22 +87,29 @@ class EasyOCREngine(OCREngine):
|
|
81
87
|
readtext_sig = inspect.signature(reader.readtext)
|
82
88
|
readtext_args = {}
|
83
89
|
for field_name, param in readtext_sig.parameters.items():
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
90
|
+
if field_name == "image":
|
91
|
+
continue
|
92
|
+
if hasattr(options, field_name):
|
93
|
+
readtext_args[field_name] = getattr(options, field_name)
|
94
|
+
elif field_name in options.extra_args:
|
95
|
+
readtext_args[field_name] = options.extra_args[field_name]
|
89
96
|
logger.debug(f"EasyOCR readtext args: {readtext_args}")
|
90
97
|
return readtext_args
|
91
98
|
|
92
|
-
def _standardize_results(
|
99
|
+
def _standardize_results(
|
100
|
+
self, raw_results: List[Any], options: EasyOCROptions
|
101
|
+
) -> List[Dict[str, Any]]:
|
93
102
|
"""Standardizes raw results from EasyOCR's readtext."""
|
94
103
|
standardized_results = []
|
95
104
|
min_confidence = options.min_confidence
|
96
105
|
|
97
106
|
for detection in raw_results:
|
98
107
|
try:
|
99
|
-
if
|
108
|
+
if (
|
109
|
+
options.detail == 1
|
110
|
+
and isinstance(detection, (list, tuple))
|
111
|
+
and len(detection) >= 3
|
112
|
+
):
|
100
113
|
bbox_raw = detection[0]
|
101
114
|
text = str(detection[1])
|
102
115
|
confidence = float(detection[2])
|
@@ -104,38 +117,40 @@ class EasyOCREngine(OCREngine):
|
|
104
117
|
if confidence >= min_confidence:
|
105
118
|
bbox = self._standardize_bbox(bbox_raw)
|
106
119
|
if bbox:
|
107
|
-
standardized_results.append(
|
108
|
-
|
109
|
-
|
120
|
+
standardized_results.append(
|
121
|
+
{
|
122
|
+
"bbox": bbox,
|
123
|
+
"text": text,
|
124
|
+
"confidence": confidence,
|
125
|
+
"source": "ocr",
|
126
|
+
}
|
127
|
+
)
|
110
128
|
else:
|
111
|
-
|
129
|
+
logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
|
112
130
|
|
113
131
|
elif options.detail == 0 and isinstance(detection, str):
|
114
|
-
|
115
|
-
|
116
|
-
|
132
|
+
standardized_results.append(
|
133
|
+
{"bbox": None, "text": detection, "confidence": 1.0, "source": "ocr"}
|
134
|
+
)
|
117
135
|
except (IndexError, ValueError, TypeError) as e:
|
118
|
-
|
119
|
-
|
136
|
+
logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
|
137
|
+
continue
|
120
138
|
return standardized_results
|
121
139
|
|
122
|
-
|
123
140
|
def process_image(
|
124
|
-
self,
|
125
|
-
images: Union[Image.Image, List[Image.Image]],
|
126
|
-
options: BaseOCROptions
|
141
|
+
self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
|
127
142
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
128
143
|
"""Processes a single image or a batch of images with EasyOCR."""
|
129
144
|
|
130
145
|
if not isinstance(options, EasyOCROptions):
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
146
|
+
logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
|
147
|
+
# Create default EasyOCR options if base was passed, preserving base settings
|
148
|
+
options = EasyOCROptions(
|
149
|
+
languages=options.languages,
|
150
|
+
min_confidence=options.min_confidence,
|
151
|
+
device=options.device,
|
152
|
+
extra_args=options.extra_args, # Pass along any extra args
|
153
|
+
)
|
139
154
|
|
140
155
|
reader = self._get_reader(options)
|
141
156
|
readtext_args = self._prepare_readtext_args(options, reader)
|
@@ -147,9 +162,9 @@ class EasyOCREngine(OCREngine):
|
|
147
162
|
logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
|
148
163
|
for i, img in enumerate(images):
|
149
164
|
if not isinstance(img, Image.Image):
|
150
|
-
|
151
|
-
|
152
|
-
|
165
|
+
logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
|
166
|
+
all_results.append([])
|
167
|
+
continue
|
153
168
|
img_array = np.array(img)
|
154
169
|
try:
|
155
170
|
logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
|
@@ -157,10 +172,12 @@ class EasyOCREngine(OCREngine):
|
|
157
172
|
standardized = self._standardize_results(raw_results, options)
|
158
173
|
all_results.append(standardized)
|
159
174
|
except Exception as e:
|
160
|
-
logger.error(
|
161
|
-
|
175
|
+
logger.error(
|
176
|
+
f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True
|
177
|
+
)
|
178
|
+
all_results.append([]) # Append empty list for failed image
|
162
179
|
logger.info(f"Finished processing batch with EasyOCR.")
|
163
|
-
return all_results
|
180
|
+
return all_results # Return List[List[Dict]]
|
164
181
|
|
165
182
|
elif isinstance(images, Image.Image):
|
166
183
|
# --- Single Image Processing ---
|
@@ -170,10 +187,9 @@ class EasyOCREngine(OCREngine):
|
|
170
187
|
raw_results = reader.readtext(img_array, **readtext_args)
|
171
188
|
standardized = self._standardize_results(raw_results, options)
|
172
189
|
logger.info(f"Finished processing single image. Found {len(standardized)} results.")
|
173
|
-
return standardized
|
190
|
+
return standardized # Return List[Dict]
|
174
191
|
except Exception as e:
|
175
192
|
logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
|
176
|
-
return []
|
193
|
+
return [] # Return empty list on failure
|
177
194
|
else:
|
178
195
|
raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
179
|
-
|
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -1,27 +1,49 @@
|
|
1
1
|
# ocr_engine_paddleocr.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
4
|
-
|
3
|
+
import inspect # Used for dynamic parameter passing
|
4
|
+
import logging
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
6
|
+
|
5
7
|
import numpy as np
|
6
8
|
from PIL import Image
|
7
|
-
import inspect # Used for dynamic parameter passing
|
8
9
|
|
9
10
|
from .engine import OCREngine
|
10
|
-
from .ocr_options import
|
11
|
+
from .ocr_options import BaseOCROptions, PaddleOCROptions
|
11
12
|
|
12
13
|
logger = logging.getLogger(__name__)
|
13
14
|
|
15
|
+
|
14
16
|
class PaddleOCREngine(OCREngine):
|
15
17
|
"""PaddleOCR engine implementation."""
|
16
18
|
|
17
|
-
LANGUAGE_MAP = {
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
19
|
+
LANGUAGE_MAP = {
|
20
|
+
"en": "en",
|
21
|
+
"zh": "ch",
|
22
|
+
"zh-cn": "ch",
|
23
|
+
"zh-tw": "chinese_cht",
|
24
|
+
"ja": "japan",
|
25
|
+
"ko": "korean",
|
26
|
+
"th": "thai",
|
27
|
+
"fr": "french",
|
28
|
+
"de": "german",
|
29
|
+
"ru": "russian",
|
30
|
+
"ar": "arabic",
|
31
|
+
"hi": "hindi",
|
32
|
+
"vi": "vietnam",
|
33
|
+
"fa": "cyrillic",
|
34
|
+
"ur": "cyrillic",
|
35
|
+
"rs": "serbian",
|
36
|
+
"oc": "latin",
|
37
|
+
"rsc": "cyrillic",
|
38
|
+
"bg": "bulgarian",
|
39
|
+
"uk": "cyrillic",
|
40
|
+
"be": "cyrillic",
|
41
|
+
"te": "telugu",
|
42
|
+
"kn": "kannada",
|
43
|
+
"ta": "tamil",
|
44
|
+
"latin": "latin",
|
45
|
+
"cyrillic": "cyrillic",
|
46
|
+
"devanagari": "devanagari",
|
25
47
|
}
|
26
48
|
|
27
49
|
def __init__(self):
|
@@ -36,6 +58,7 @@ class PaddleOCREngine(OCREngine):
|
|
36
58
|
try:
|
37
59
|
import paddle
|
38
60
|
import paddleocr
|
61
|
+
|
39
62
|
self._paddleocr = paddleocr
|
40
63
|
logger.info("PaddleOCR module imported successfully.")
|
41
64
|
except ImportError as e:
|
@@ -45,19 +68,21 @@ class PaddleOCREngine(OCREngine):
|
|
45
68
|
|
46
69
|
def is_available(self) -> bool:
|
47
70
|
"""Check if PaddleOCR and paddlepaddle are installed."""
|
48
|
-
paddle_installed =
|
49
|
-
|
71
|
+
paddle_installed = (
|
72
|
+
importlib.util.find_spec("paddle") is not None
|
73
|
+
or importlib.util.find_spec("paddlepaddle") is not None
|
74
|
+
)
|
50
75
|
paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
|
51
76
|
return paddle_installed and paddleocr_installed
|
52
77
|
|
53
78
|
def _map_language(self, iso_lang: str) -> str:
|
54
79
|
"""Map ISO language code to PaddleOCR language code."""
|
55
|
-
return self.LANGUAGE_MAP.get(iso_lang.lower(),
|
80
|
+
return self.LANGUAGE_MAP.get(iso_lang.lower(), "en")
|
56
81
|
|
57
82
|
def _get_cache_key(self, options: PaddleOCROptions) -> str:
|
58
83
|
"""Generate a more specific cache key for PaddleOCR."""
|
59
84
|
base_key = super()._get_cache_key(options)
|
60
|
-
primary_lang = self._map_language(options.languages[0]) if options.languages else
|
85
|
+
primary_lang = self._map_language(options.languages[0]) if options.languages else "en"
|
61
86
|
angle_cls_key = str(options.use_angle_cls)
|
62
87
|
precision_key = options.precision
|
63
88
|
return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"
|
@@ -74,31 +99,34 @@ class PaddleOCREngine(OCREngine):
|
|
74
99
|
|
75
100
|
constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
|
76
101
|
constructor_args = {}
|
77
|
-
constructor_args[
|
102
|
+
constructor_args["lang"] = (
|
103
|
+
self._map_language(options.languages[0]) if options.languages else "en"
|
104
|
+
)
|
78
105
|
|
79
106
|
for field_name, param in constructor_sig.parameters.items():
|
80
|
-
if field_name in [
|
81
|
-
|
82
|
-
|
83
|
-
|
107
|
+
if field_name in ["self", "lang"]:
|
108
|
+
continue
|
109
|
+
if field_name == "use_gpu":
|
110
|
+
constructor_args["use_gpu"] = options.use_gpu
|
111
|
+
continue
|
84
112
|
if hasattr(options, field_name):
|
85
|
-
|
113
|
+
constructor_args[field_name] = getattr(options, field_name)
|
86
114
|
elif field_name in options.extra_args:
|
87
|
-
|
115
|
+
constructor_args[field_name] = options.extra_args[field_name]
|
88
116
|
|
89
|
-
constructor_args.pop(
|
117
|
+
constructor_args.pop("device", None)
|
90
118
|
logger.debug(f"PaddleOCR constructor args: {constructor_args}")
|
91
119
|
|
92
120
|
try:
|
93
|
-
show_log = constructor_args.get(
|
94
|
-
original_log_level = logging.getLogger(
|
121
|
+
show_log = constructor_args.get("show_log", False)
|
122
|
+
original_log_level = logging.getLogger("ppocr").level
|
95
123
|
if not show_log:
|
96
|
-
|
124
|
+
logging.getLogger("ppocr").setLevel(logging.ERROR)
|
97
125
|
|
98
126
|
reader = paddleocr.PaddleOCR(**constructor_args)
|
99
127
|
|
100
128
|
if not show_log:
|
101
|
-
|
129
|
+
logging.getLogger("ppocr").setLevel(original_log_level)
|
102
130
|
|
103
131
|
self._reader_cache[cache_key] = reader
|
104
132
|
logger.info("PaddleOCR reader created successfully.")
|
@@ -108,32 +136,36 @@ class PaddleOCREngine(OCREngine):
|
|
108
136
|
raise
|
109
137
|
|
110
138
|
def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
def _standardize_results(
|
139
|
+
"""Helper to prepare arguments for the ocr method (excluding image)."""
|
140
|
+
ocr_args = {}
|
141
|
+
# Determine 'cls' value based on options precedence
|
142
|
+
ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
|
143
|
+
ocr_args["det"] = options.det
|
144
|
+
ocr_args["rec"] = options.rec
|
145
|
+
# Add extra args if needed (less common for ocr method itself)
|
146
|
+
# for field_name in options.extra_args:
|
147
|
+
# if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
|
148
|
+
# ocr_args[field_name] = options.extra_args[field_name]
|
149
|
+
logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
|
150
|
+
return ocr_args
|
151
|
+
|
152
|
+
def _standardize_results(
|
153
|
+
self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions
|
154
|
+
) -> List[Dict[str, Any]]:
|
125
155
|
"""Standardizes raw results from a single page/image from PaddleOCR."""
|
126
156
|
standardized_page = []
|
127
|
-
if not raw_page_results:
|
157
|
+
if not raw_page_results: # Handle None or empty list
|
128
158
|
return standardized_page
|
129
159
|
|
130
160
|
min_confidence = options.min_confidence
|
131
161
|
for detection in raw_page_results:
|
132
162
|
try:
|
133
|
-
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
163
|
+
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
164
|
+
continue
|
134
165
|
bbox_raw = detection[0]
|
135
166
|
text_confidence = detection[1]
|
136
|
-
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
|
167
|
+
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
|
168
|
+
continue
|
137
169
|
|
138
170
|
text = str(text_confidence[0])
|
139
171
|
confidence = float(text_confidence[1])
|
@@ -141,55 +173,52 @@ class PaddleOCREngine(OCREngine):
|
|
141
173
|
if confidence >= min_confidence:
|
142
174
|
bbox = self._standardize_bbox(bbox_raw)
|
143
175
|
if bbox:
|
144
|
-
standardized_page.append(
|
145
|
-
|
146
|
-
|
176
|
+
standardized_page.append(
|
177
|
+
{"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
|
178
|
+
)
|
147
179
|
else:
|
148
180
|
logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
|
149
181
|
except (IndexError, ValueError, TypeError) as e:
|
150
|
-
|
151
|
-
|
182
|
+
logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
|
183
|
+
continue
|
152
184
|
return standardized_page
|
153
185
|
|
154
186
|
def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
|
155
187
|
"""Converts PIL Image to BGR numpy array."""
|
156
|
-
if image.mode ==
|
157
|
-
|
158
|
-
img_rgb = image.convert(
|
188
|
+
if image.mode == "BGR": # Already BGR
|
189
|
+
return np.array(image)
|
190
|
+
img_rgb = image.convert("RGB")
|
159
191
|
img_array_rgb = np.array(img_rgb)
|
160
|
-
img_array_bgr = img_array_rgb[:, :, ::-1]
|
192
|
+
img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
|
161
193
|
return img_array_bgr
|
162
194
|
|
163
|
-
|
164
195
|
def process_image(
|
165
|
-
self,
|
166
|
-
images: Union[Image.Image, List[Image.Image]],
|
167
|
-
options: BaseOCROptions
|
196
|
+
self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
|
168
197
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
169
198
|
"""Processes a single image or a batch of images with PaddleOCR."""
|
170
199
|
|
171
200
|
if not isinstance(options, PaddleOCROptions):
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
201
|
+
logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
|
202
|
+
options = PaddleOCROptions(
|
203
|
+
languages=options.languages,
|
204
|
+
min_confidence=options.min_confidence,
|
205
|
+
device=options.device,
|
206
|
+
extra_args=options.extra_args,
|
207
|
+
)
|
179
208
|
|
180
209
|
reader = self._get_reader(options)
|
181
210
|
ocr_args = self._prepare_ocr_args(options)
|
182
|
-
|
211
|
+
|
183
212
|
# Helper function to process one image
|
184
213
|
def process_one(img):
|
185
214
|
try:
|
186
215
|
img_array_bgr = self._pil_to_bgr(img)
|
187
216
|
raw_results = reader.ocr(img_array_bgr, **ocr_args)
|
188
|
-
|
217
|
+
|
189
218
|
page_results = []
|
190
219
|
if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
|
191
220
|
page_results = raw_results[0]
|
192
|
-
|
221
|
+
|
193
222
|
return self._standardize_results(page_results, options)
|
194
223
|
except Exception as e:
|
195
224
|
logger.error(f"Error processing image with PaddleOCR: {e}")
|