natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -1,204 +1,147 @@
|
|
1
1
|
# ocr_engine_paddleocr.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
4
|
-
|
3
|
+
import logging
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
|
5
6
|
import numpy as np
|
6
7
|
from PIL import Image
|
7
|
-
import inspect # Used for dynamic parameter passing
|
8
8
|
|
9
|
-
from .engine import OCREngine
|
10
|
-
from .ocr_options import
|
9
|
+
from .engine import OCREngine, TextRegion
|
10
|
+
from .ocr_options import BaseOCROptions, PaddleOCROptions
|
11
11
|
|
12
12
|
logger = logging.getLogger(__name__)
|
13
13
|
|
14
|
+
|
14
15
|
class PaddleOCREngine(OCREngine):
|
15
16
|
"""PaddleOCR engine implementation."""
|
16
17
|
|
17
|
-
LANGUAGE_MAP = {
|
18
|
-
'en': 'en', 'zh': 'ch', 'zh-cn': 'ch', 'zh-tw': 'chinese_cht',
|
19
|
-
'ja': 'japan', 'ko': 'korean', 'th': 'thai', 'fr': 'french',
|
20
|
-
'de': 'german', 'ru': 'russian', 'ar': 'arabic', 'hi': 'hindi',
|
21
|
-
'vi': 'vietnam', 'fa': 'cyrillic', 'ur': 'cyrillic', 'rs': 'serbian',
|
22
|
-
'oc': 'latin', 'rsc': 'cyrillic', 'bg': 'bulgarian', 'uk': 'cyrillic',
|
23
|
-
'be': 'cyrillic', 'te': 'telugu', 'kn': 'kannada', 'ta': 'tamil',
|
24
|
-
'latin': 'latin', 'cyrillic': 'cyrillic', 'devanagari': 'devanagari',
|
25
|
-
}
|
26
|
-
|
27
18
|
def __init__(self):
|
28
19
|
super().__init__()
|
29
|
-
self._paddleocr = None
|
30
|
-
|
31
|
-
def _lazy_import_paddleocr(self):
|
32
|
-
"""Imports paddleocr only when needed."""
|
33
|
-
if self._paddleocr is None:
|
34
|
-
if not self.is_available():
|
35
|
-
raise ImportError("PaddleOCR or PaddlePaddle is not installed or available.")
|
36
|
-
try:
|
37
|
-
import paddle
|
38
|
-
import paddleocr
|
39
|
-
self._paddleocr = paddleocr
|
40
|
-
logger.info("PaddleOCR module imported successfully.")
|
41
|
-
except ImportError as e:
|
42
|
-
logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
|
43
|
-
raise
|
44
|
-
return self._paddleocr
|
45
20
|
|
46
21
|
def is_available(self) -> bool:
|
47
22
|
"""Check if PaddleOCR and paddlepaddle are installed."""
|
48
|
-
paddle_installed =
|
49
|
-
|
23
|
+
paddle_installed = (
|
24
|
+
importlib.util.find_spec("paddle") is not None
|
25
|
+
or importlib.util.find_spec("paddlepaddle") is not None
|
26
|
+
)
|
50
27
|
paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
|
51
28
|
return paddle_installed and paddleocr_installed
|
52
29
|
|
53
|
-
def
|
54
|
-
"""
|
55
|
-
return self.LANGUAGE_MAP.get(iso_lang.lower(), 'en')
|
56
|
-
|
57
|
-
def _get_cache_key(self, options: PaddleOCROptions) -> str:
|
58
|
-
"""Generate a more specific cache key for PaddleOCR."""
|
59
|
-
base_key = super()._get_cache_key(options)
|
60
|
-
primary_lang = self._map_language(options.languages[0]) if options.languages else 'en'
|
61
|
-
angle_cls_key = str(options.use_angle_cls)
|
62
|
-
precision_key = options.precision
|
63
|
-
return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"
|
64
|
-
|
65
|
-
def _get_reader(self, options: PaddleOCROptions):
|
66
|
-
"""Get or initialize a PaddleOCR reader based on options."""
|
67
|
-
cache_key = self._get_cache_key(options)
|
68
|
-
if cache_key in self._reader_cache:
|
69
|
-
logger.debug(f"Using cached PaddleOCR reader for key: {cache_key}")
|
70
|
-
return self._reader_cache[cache_key]
|
71
|
-
|
72
|
-
logger.info(f"Creating new PaddleOCR reader for key: {cache_key}")
|
73
|
-
paddleocr = self._lazy_import_paddleocr()
|
74
|
-
|
75
|
-
constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
|
76
|
-
constructor_args = {}
|
77
|
-
constructor_args['lang'] = self._map_language(options.languages[0]) if options.languages else 'en'
|
78
|
-
|
79
|
-
for field_name, param in constructor_sig.parameters.items():
|
80
|
-
if field_name in ['self', 'lang']: continue
|
81
|
-
if field_name == 'use_gpu':
|
82
|
-
constructor_args['use_gpu'] = options.use_gpu
|
83
|
-
continue
|
84
|
-
if hasattr(options, field_name):
|
85
|
-
constructor_args[field_name] = getattr(options, field_name)
|
86
|
-
elif field_name in options.extra_args:
|
87
|
-
constructor_args[field_name] = options.extra_args[field_name]
|
88
|
-
|
89
|
-
constructor_args.pop('device', None)
|
90
|
-
logger.debug(f"PaddleOCR constructor args: {constructor_args}")
|
91
|
-
|
30
|
+
def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
|
31
|
+
"""Initialize the PaddleOCR model."""
|
92
32
|
try:
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
33
|
+
import paddleocr
|
34
|
+
self.logger.info("PaddleOCR module imported successfully.")
|
35
|
+
except ImportError as e:
|
36
|
+
self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
|
37
|
+
raise
|
38
|
+
|
39
|
+
# Cast to PaddleOCROptions if possible
|
40
|
+
paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
|
41
|
+
|
42
|
+
# Determine parameters
|
43
|
+
primary_lang = languages[0] if languages else "en"
|
44
|
+
use_gpu = "cuda" in str(device).lower()
|
45
|
+
|
46
|
+
# Create constructor arguments
|
47
|
+
constructor_args = {
|
48
|
+
"lang": primary_lang,
|
49
|
+
"use_gpu": use_gpu,
|
50
|
+
"use_angle_cls": paddle_options.use_angle_cls,
|
51
|
+
"det": True,
|
52
|
+
"rec": True # We'll control recognition at process time
|
53
|
+
}
|
54
|
+
|
55
|
+
# Add optional parameters if available
|
56
|
+
for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
|
57
|
+
if hasattr(paddle_options, param):
|
58
|
+
val = getattr(paddle_options, param)
|
59
|
+
if val is not None:
|
60
|
+
constructor_args[param] = val
|
61
|
+
|
62
|
+
self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
|
63
|
+
|
64
|
+
# Create the model
|
65
|
+
try:
|
66
|
+
self._model = paddleocr.PaddleOCR(**constructor_args)
|
67
|
+
self.logger.info("PaddleOCR model created successfully")
|
106
68
|
except Exception as e:
|
107
|
-
logger.error(f"Failed to create PaddleOCR
|
69
|
+
self.logger.error(f"Failed to create PaddleOCR model: {e}")
|
108
70
|
raise
|
109
71
|
|
110
|
-
def
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
ocr_args['det'] = options.det
|
116
|
-
ocr_args['rec'] = options.rec
|
117
|
-
# Add extra args if needed (less common for ocr method itself)
|
118
|
-
# for field_name in options.extra_args:
|
119
|
-
# if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
|
120
|
-
# ocr_args[field_name] = options.extra_args[field_name]
|
121
|
-
logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
|
122
|
-
return ocr_args
|
123
|
-
|
124
|
-
def _standardize_results(self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions) -> List[Dict[str, Any]]:
|
125
|
-
"""Standardizes raw results from a single page/image from PaddleOCR."""
|
126
|
-
standardized_page = []
|
127
|
-
if not raw_page_results: # Handle None or empty list
|
128
|
-
return standardized_page
|
129
|
-
|
130
|
-
min_confidence = options.min_confidence
|
131
|
-
for detection in raw_page_results:
|
132
|
-
try:
|
133
|
-
if not isinstance(detection, (list, tuple)) or len(detection) < 2: continue
|
134
|
-
bbox_raw = detection[0]
|
135
|
-
text_confidence = detection[1]
|
136
|
-
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2: continue
|
137
|
-
|
138
|
-
text = str(text_confidence[0])
|
139
|
-
confidence = float(text_confidence[1])
|
140
|
-
|
141
|
-
if confidence >= min_confidence:
|
142
|
-
bbox = self._standardize_bbox(bbox_raw)
|
143
|
-
if bbox:
|
144
|
-
standardized_page.append({
|
145
|
-
'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
|
146
|
-
})
|
147
|
-
else:
|
148
|
-
logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
|
149
|
-
except (IndexError, ValueError, TypeError) as e:
|
150
|
-
logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
|
151
|
-
continue
|
152
|
-
return standardized_page
|
153
|
-
|
154
|
-
def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
|
155
|
-
"""Converts PIL Image to BGR numpy array."""
|
156
|
-
if image.mode == 'BGR': # Already BGR
|
157
|
-
return np.array(image)
|
158
|
-
img_rgb = image.convert('RGB')
|
72
|
+
def _preprocess_image(self, image: Image.Image) -> np.ndarray:
|
73
|
+
"""Convert PIL Image to BGR numpy array for PaddleOCR."""
|
74
|
+
if image.mode == "BGR":
|
75
|
+
return np.array(image)
|
76
|
+
img_rgb = image.convert("RGB")
|
159
77
|
img_array_rgb = np.array(img_rgb)
|
160
|
-
img_array_bgr = img_array_rgb[:, :, ::-1]
|
78
|
+
img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
|
161
79
|
return img_array_bgr
|
162
80
|
|
81
|
+
def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
|
82
|
+
"""Process a single image with PaddleOCR."""
|
83
|
+
if self._model is None:
|
84
|
+
raise RuntimeError("PaddleOCR model not initialized")
|
85
|
+
|
86
|
+
# Prepare OCR arguments
|
87
|
+
ocr_args = {}
|
88
|
+
if options and isinstance(options, PaddleOCROptions):
|
89
|
+
ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
|
90
|
+
ocr_args["det"] = options.det
|
91
|
+
ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
|
92
|
+
|
93
|
+
# Run OCR
|
94
|
+
raw_results = self._model.ocr(image, **ocr_args)
|
95
|
+
return raw_results
|
163
96
|
|
164
|
-
def
|
165
|
-
|
166
|
-
|
167
|
-
options: BaseOCROptions
|
168
|
-
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
169
|
-
"""Processes a single image or a batch of images with PaddleOCR."""
|
170
|
-
|
171
|
-
if not isinstance(options, PaddleOCROptions):
|
172
|
-
logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
|
173
|
-
options = PaddleOCROptions(
|
174
|
-
languages=options.languages,
|
175
|
-
min_confidence=options.min_confidence,
|
176
|
-
device=options.device,
|
177
|
-
extra_args=options.extra_args
|
178
|
-
)
|
179
|
-
|
180
|
-
reader = self._get_reader(options)
|
181
|
-
ocr_args = self._prepare_ocr_args(options)
|
97
|
+
def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
|
98
|
+
"""Convert PaddleOCR results to standardized TextRegion objects."""
|
99
|
+
standardized_regions = []
|
182
100
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
101
|
+
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
102
|
+
return standardized_regions
|
103
|
+
|
104
|
+
page_results = raw_results[0] if raw_results[0] is not None else []
|
105
|
+
|
106
|
+
for detection in page_results:
|
107
|
+
# Initialize text and confidence
|
108
|
+
text = None
|
109
|
+
confidence = None
|
110
|
+
bbox_raw = None
|
111
|
+
|
112
|
+
# Paddle always seems to return the tuple structure [bbox, (text, conf)]
|
113
|
+
# even if rec=False. We need to parse this structure regardless.
|
114
|
+
if len(detection) == 4: # Handle potential alternative format?
|
115
|
+
detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
|
116
|
+
|
117
|
+
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
118
|
+
raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
|
119
|
+
|
120
|
+
bbox_raw = detection[0]
|
121
|
+
text_confidence = detection[1]
|
122
|
+
|
123
|
+
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
|
124
|
+
# Even if detect_only, we expect the (text, conf) structure,
|
125
|
+
# it might just contain dummy values.
|
126
|
+
raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
|
127
|
+
|
128
|
+
# Extract text/conf only if not detect_only
|
129
|
+
if not detect_only:
|
130
|
+
text = str(text_confidence[0])
|
131
|
+
confidence = float(text_confidence[1])
|
197
132
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
133
|
+
# Standardize the bbox (always needed)
|
134
|
+
try:
|
135
|
+
bbox = self._standardize_bbox(bbox_raw)
|
136
|
+
except ValueError as e:
|
137
|
+
raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
|
138
|
+
|
139
|
+
# Append based on mode
|
140
|
+
if detect_only:
|
141
|
+
# Append regardless of dummy confidence value, set text/conf to None
|
142
|
+
standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
|
143
|
+
elif confidence >= min_confidence:
|
144
|
+
# Only append if confidence meets threshold in full OCR mode
|
145
|
+
standardized_regions.append(TextRegion(bbox, text, confidence))
|
146
|
+
|
147
|
+
return standardized_regions
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
# ocr_engine_surya.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
4
|
-
|
3
|
+
import logging
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
|
5
6
|
import numpy as np
|
6
7
|
from PIL import Image
|
7
8
|
|
8
|
-
from .engine import OCREngine
|
9
|
-
from .ocr_options import
|
9
|
+
from .engine import OCREngine, TextRegion
|
10
|
+
from .ocr_options import BaseOCROptions, SuryaOCROptions
|
10
11
|
|
11
|
-
logger = logging.getLogger(__name__)
|
12
12
|
|
13
13
|
class SuryaOCREngine(OCREngine):
|
14
14
|
"""Surya OCR engine implementation."""
|
@@ -19,153 +19,90 @@ class SuryaOCREngine(OCREngine):
|
|
19
19
|
self._detection_predictor = None
|
20
20
|
self._surya_recognition = None
|
21
21
|
self._surya_detection = None
|
22
|
-
self._initialized = False
|
23
|
-
|
24
|
-
def _lazy_load_predictors(self, options: SuryaOCROptions):
|
25
|
-
"""Initializes Surya predictors when first needed."""
|
26
|
-
if self._initialized:
|
27
|
-
return
|
28
22
|
|
23
|
+
def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
|
24
|
+
"""Initialize Surya predictors."""
|
29
25
|
if not self.is_available():
|
30
26
|
raise ImportError("Surya OCR library is not installed or available.")
|
31
27
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
logger.info("Surya modules imported successfully.")
|
38
|
-
|
39
|
-
# --- Instantiate Predictors ---
|
40
|
-
# Add arguments from options if Surya supports them
|
41
|
-
# Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
|
42
|
-
# predictor_args = {'device': options.device} # If applicable
|
43
|
-
predictor_args = {} # Assuming parameterless init based on example
|
44
|
-
|
45
|
-
logger.info("Instantiating Surya DetectionPredictor...")
|
46
|
-
self._detection_predictor = self._surya_detection(**predictor_args)
|
47
|
-
logger.info("Instantiating Surya RecognitionPredictor...")
|
48
|
-
self._recognition_predictor = self._surya_recognition(**predictor_args)
|
49
|
-
|
50
|
-
self._initialized = True
|
51
|
-
logger.info("Surya predictors initialized.")
|
52
|
-
|
53
|
-
except ImportError as e:
|
54
|
-
logger.error(f"Failed to import Surya modules: {e}")
|
55
|
-
raise
|
56
|
-
except Exception as e:
|
57
|
-
logger.error(f"Failed to initialize Surya predictors: {e}", exc_info=True)
|
58
|
-
raise
|
28
|
+
# Store languages for use in _process_single_image
|
29
|
+
self._langs = languages
|
30
|
+
|
31
|
+
from surya.detection import DetectionPredictor
|
32
|
+
from surya.recognition import RecognitionPredictor
|
59
33
|
|
60
|
-
|
61
|
-
|
62
|
-
|
34
|
+
self._surya_recognition = RecognitionPredictor
|
35
|
+
self._surya_detection = DetectionPredictor
|
36
|
+
self.logger.info("Surya modules imported successfully.")
|
63
37
|
|
64
|
-
|
65
|
-
"""Standardizes raw results from a single image from Surya."""
|
66
|
-
standardized_page = []
|
67
|
-
min_confidence = options.min_confidence
|
38
|
+
predictor_args = {} # Configure if needed
|
68
39
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
40
|
+
self.logger.info("Instantiating Surya DetectionPredictor...")
|
41
|
+
self._detection_predictor = self._surya_detection(**predictor_args)
|
42
|
+
self.logger.info("Instantiating Surya RecognitionPredictor...")
|
43
|
+
self._recognition_predictor = self._surya_recognition(**predictor_args)
|
44
|
+
|
45
|
+
self.logger.info("Surya predictors initialized.")
|
73
46
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
text = line.text
|
78
|
-
confidence = line.confidence
|
79
|
-
# Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
|
80
|
-
bbox_raw = line.bbox # Use bbox directly if available and correct format
|
47
|
+
def _preprocess_image(self, image: Image.Image) -> Image.Image:
|
48
|
+
"""Surya uses PIL images directly, so just return the image."""
|
49
|
+
return image
|
81
50
|
|
82
|
-
|
83
|
-
|
84
|
-
if bbox:
|
85
|
-
standardized_page.append({
|
86
|
-
'bbox': bbox,
|
87
|
-
'text': text,
|
88
|
-
'confidence': confidence,
|
89
|
-
'source': 'ocr'
|
90
|
-
})
|
91
|
-
else:
|
92
|
-
# Try polygon if bbox failed standardization
|
93
|
-
bbox_poly = self._standardize_bbox(line.polygon)
|
94
|
-
if bbox_poly:
|
95
|
-
standardized_page.append({
|
96
|
-
'bbox': bbox_poly, 'text': text, 'confidence': confidence, 'source': 'ocr'
|
97
|
-
})
|
98
|
-
else:
|
99
|
-
logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")
|
100
|
-
|
101
|
-
except (AttributeError, ValueError, TypeError) as e:
|
102
|
-
logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
|
103
|
-
continue
|
104
|
-
return standardized_page
|
105
|
-
|
106
|
-
def process_image(
|
107
|
-
self,
|
108
|
-
images: Union[Image.Image, List[Image.Image]],
|
109
|
-
options: BaseOCROptions
|
110
|
-
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
111
|
-
"""Processes a single image or a batch of images with Surya OCR."""
|
112
|
-
|
113
|
-
if not isinstance(options, SuryaOCROptions):
|
114
|
-
logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
|
115
|
-
options = SuryaOCROptions(
|
116
|
-
languages=options.languages,
|
117
|
-
min_confidence=options.min_confidence,
|
118
|
-
device=options.device,
|
119
|
-
extra_args=options.extra_args
|
120
|
-
)
|
121
|
-
|
122
|
-
# Ensure predictors are loaded/initialized
|
123
|
-
self._lazy_load_predictors(options)
|
51
|
+
def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
|
52
|
+
"""Process a single image with Surya OCR."""
|
124
53
|
if not self._recognition_predictor or not self._detection_predictor:
|
125
|
-
|
126
|
-
|
127
|
-
#
|
128
|
-
|
129
|
-
|
130
|
-
# Surya expects
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
# --- Run Surya Prediction ---
|
138
|
-
try:
|
139
|
-
processing_mode = "batch" if is_batch else "single image"
|
140
|
-
logger.info(f"Processing {processing_mode} ({len(input_images)} images) with Surya...")
|
141
|
-
# Call Surya's predictor
|
142
|
-
# It returns a list of OCRResult objects, one per input image
|
143
|
-
predictions = self._recognition_predictor(
|
144
|
-
images=input_images,
|
145
|
-
langs=input_langs,
|
54
|
+
raise RuntimeError("Surya predictors are not initialized.")
|
55
|
+
|
56
|
+
# Store languages instance variable during initialization to use here
|
57
|
+
langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
|
58
|
+
|
59
|
+
# Surya expects lists of images, so we need to wrap our single image
|
60
|
+
if detect_only:
|
61
|
+
results = self._detection_predictor(images=[image])
|
62
|
+
else:
|
63
|
+
results = self._recognition_predictor(
|
64
|
+
images=[image],
|
65
|
+
langs=langs, # Use the languages set during initialization
|
146
66
|
det_predictor=self._detection_predictor
|
147
67
|
)
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
68
|
+
|
69
|
+
# Surya may return a list with one result per image or a single result object
|
70
|
+
# Return the result as-is and handle the extraction in _standardize_results
|
71
|
+
return results
|
72
|
+
|
73
|
+
def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
|
74
|
+
"""Convert Surya results to standardized TextRegion objects."""
|
75
|
+
standardized_regions = []
|
76
|
+
|
77
|
+
raw_result = raw_results
|
78
|
+
if isinstance(raw_results, list) and len(raw_results) > 0:
|
79
|
+
raw_result = raw_results[0]
|
80
|
+
|
81
|
+
results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
|
82
|
+
|
83
|
+
for line in results:
|
84
|
+
# Always extract bbox first
|
85
|
+
try:
|
86
|
+
# Prioritize line.bbox, fallback to line.polygon
|
87
|
+
bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
|
88
|
+
if bbox_raw is None:
|
89
|
+
raise ValueError("Missing bbox/polygon data")
|
90
|
+
bbox = self._standardize_bbox(bbox_raw)
|
91
|
+
except ValueError as e:
|
92
|
+
raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
|
93
|
+
|
94
|
+
if detect_only:
|
95
|
+
# For detect_only, text and confidence are None
|
96
|
+
standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
|
160
97
|
else:
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
# Note: Caching is handled differently for Surya as predictors are stateful
|
169
|
-
# and initialized once. The base class _reader_cache is not used here.
|
170
|
-
# If predictors could be configured per-run, caching would need rethinking.
|
98
|
+
# For full OCR, extract text and confidence, then filter
|
99
|
+
text = line.text if hasattr(line, "text") else ""
|
100
|
+
confidence = line.confidence
|
101
|
+
if confidence >= min_confidence:
|
102
|
+
standardized_regions.append(TextRegion(bbox, text, confidence))
|
103
|
+
|
104
|
+
return standardized_regions
|
171
105
|
|
106
|
+
def is_available(self) -> bool:
|
107
|
+
"""Check if the surya library is installed."""
|
108
|
+
return importlib.util.find_spec("surya") is not None
|