xfmr-zem 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfmr_zem/cli.py +32 -3
- xfmr_zem/client.py +59 -8
- xfmr_zem/server.py +21 -4
- xfmr_zem/servers/data_juicer/server.py +1 -1
- xfmr_zem/servers/instruction_gen/server.py +1 -1
- xfmr_zem/servers/io/server.py +1 -1
- xfmr_zem/servers/llm/parameters.yml +10 -0
- xfmr_zem/servers/nemo_curator/server.py +1 -1
- xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
- xfmr_zem/servers/ocr/engines.py +242 -0
- xfmr_zem/servers/ocr/install_models.py +63 -0
- xfmr_zem/servers/ocr/parameters.yml +4 -0
- xfmr_zem/servers/ocr/server.py +44 -0
- xfmr_zem/servers/profiler/parameters.yml +4 -0
- xfmr_zem/servers/sinks/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/server.py +62 -0
- xfmr_zem/zenml_wrapper.py +20 -7
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/METADATA +19 -1
- xfmr_zem-0.2.5.dist-info/RECORD +58 -0
- xfmr_zem-0.2.2.dist-info/RECORD +0 -23
- /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/WHEEL +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/entry_points.txt +0 -0
- {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1286 @@
"""
Concrete implementations of the abstract phases.

This file contains the actual implementation of each phase in the pipeline.
Each implementation can be swapped independently for another one.
"""

import os
import logging
from typing import List, Dict, Tuple, Optional, Any
from PIL import Image
import numpy as np
import cv2

from .phases import (
    LayoutAnalysisPhase,
    TextDetectionPhase,
    TextRecognitionPhase,
    PostProcessingPhase,
    DocumentReconstructionPhase,
)

# Import existing modules
from . import LayoutRecognizer
from .ocr import TextDetector, TextRecognizer, get_project_base_directory


# ============================================================================
# Layout Analysis Implementations
# ============================================================================

class DocLayoutYOLOAnalyzer(LayoutAnalysisPhase):
    """
    Layout analysis using the DocLayout-YOLO ONNX model.

    Features:
    - Uses an ONNX-optimized model for CPU performance
    - Detects: text, title, figure, table, caption, header, footer, equation
    """

    def __init__(self, model_name: str = "layout"):
        """
        Args:
            model_name: Model name (default: "layout" for DocLayout-YOLO)
        """
        self.recognizer = LayoutRecognizer(model_name)
        logging.info("✓ DocLayoutYOLOAnalyzer initialized with ONNX model")

    def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Analyze the page layout using DocLayout-YOLO.

        Returns:
            List of regions with structure:
            {
                "bbox": [x0, y0, x1, y1],
                "type": str (text/title/figure/table/etc.),
                "score": float,
                "label": str (detailed label)
            }
        """
        # Call forward on a batch of one image
        layouts = self.recognizer.forward([image], thr=float(threshold))[0]

        # Normalize output format
        results = []
        for region in layouts:
            bbox = self._extract_bbox(region)
            label = region.get("type", "").lower()
            score = region.get("score", 1.0)

            if score < threshold:
                continue

            results.append({
                "bbox": bbox,
                "type": label,
                "score": score,
                "label": label,
                "raw": region  # Keep original for backward compatibility
            })

        logging.info(f"DocLayoutYOLO detected {len(results)} regions")
        return results

    def _extract_bbox(self, region: Dict) -> List[int]:
        """Extract and normalize the bbox from a region dict."""
        if "bbox" in region:
            return list(map(int, region["bbox"]))
        return list(map(int, [
            region.get("x0", 0),
            region.get("top", 0),
            region.get("x1", 0),
            region.get("bottom", 0)
        ]))


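# Illustrative usage sketch for the analyzer above; "page.png" is a
# hypothetical path and this helper is not part of the packaged module.
def _example_consume_layout_regions():  # pragma: no cover - illustration only
    analyzer = DocLayoutYOLOAnalyzer()
    page = Image.open("page.png")
    for region in analyzer.analyze(page, threshold=0.5):
        # Each region is normalized to {"bbox", "type", "score", "label", "raw"}
        print(region["type"], region["bbox"], f"{region['score']:.2f}")

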
class PaddleStructureV3Analyzer(LayoutAnalysisPhase):
    """
    Layout & table analysis using PaddleOCR's PP-StructureV3.

    Features:
    - Unified layout analysis + table recognition + OCR
    - PP-StructureV3 model (SOTA for document structure)
    - Supports 'layout', 'table', 'ocr' modes (default: structure=True)
    - Better handling of complex tables and multi-column layouts

    Note: Requires paddleocr>=2.7 (best with 3.x)
    """

    def __init__(self, lang: str = 'en', show_log: bool = True, **kwargs):
        """
        Args:
            lang: Language code (default: 'en' for layout compatibility)
            show_log: Whether to show PaddleOCR logs
            **kwargs: Additional arguments passed to PPStructureV3 (e.g., use_gpu, enable_mkldnn)
        """
        try:
            from paddleocr import PPStructureV3
        except ImportError as e:
            try:
                # Fallback for older versions where the class is named PPStructure
                from paddleocr import PPStructure as PPStructureV3
            except ImportError:
                raise ImportError(f"paddleocr must be installed to use PPStructure/V3: {e}")

        self.engine = PPStructureV3(
            lang=lang,
            # show_log is not supported in PPStructureV3 init args
            **kwargs
        )
        logging.info(f"✓ PaddleStructureV3Analyzer initialized (lang={lang})")

    def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Analyze document structure.

        Returns:
            List of regions including Text, Title, Figure, Table, etc.
            For 'table' regions, it may include structural info if available.
        """
        # Convert PIL to numpy
        if isinstance(image, Image.Image):
            image = np.array(image)

        # Ensure RGB
        if len(image.shape) == 2:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        # Run PP-Structure
        if hasattr(self.engine, 'predict'):
            predict_gen = self.engine.predict(image)
        else:
            # Fallback for older versions
            predict_gen = self.engine(image)

        # Each result is a dict containing 'doc_layout_result' (list of regions)
        raw_regions = []
        for res in predict_gen:
            logging.info(f"DEBUG: Processing result item type: {type(res)}")
            try:
                logging.info(f"DEBUG: res.keys(): {list(res.keys())}")
            except Exception:
                logging.info("DEBUG: res has no keys()")
            logging.info(f"DEBUG: str(res): {str(res)}")

            # Handle structure extraction from various result types
            success = False

            # Strategy 1: key access 'parsing_res_list' (V2 result)
            try:
                if 'parsing_res_list' in res:
                    raw_regions.extend(res['parsing_res_list'])
                    logging.info("DEBUG: Extracted via res['parsing_res_list']")
                    success = True
            except Exception as e:
                logging.info(f"DEBUG: Strategy 1 failed: {e}")

            if not success:
                # Strategy 2: key access 'doc_layout_result' (legacy/V3 result)
                try:
                    if 'doc_layout_result' in res:
                        raw_regions.extend(res['doc_layout_result'])
                        logging.info("DEBUG: Extracted via res['doc_layout_result']")
                        success = True
                except Exception:
                    pass

            if not success:
                # Strategy 3: direct attribute access
                if hasattr(res, 'parsing_res_list'):
                    val = res.parsing_res_list
                    if val:
                        raw_regions.extend(val)
                        success = True

            if not success and isinstance(res, list):
                raw_regions.extend(res)
                logging.info("DEBUG: Extracted via list extension")

        # Normalize results
        normalized_results = []
        for region in raw_regions:
            # Handle dictionary or object attributes
            if isinstance(region, dict):
                region_type = region.get('type', region.get('label', '')).lower()
                bbox = region.get('bbox', [0, 0, 0, 0])
                score = region.get('score', 1.0)
                # The content key varies across versions
                if 'res' in region:  # Legacy format
                    content = ""
                    res = region['res']
                    if isinstance(res, list):
                        texts = []
                        for line in res:
                            if isinstance(line, dict) and 'text' in line:
                                texts.append(line['text'])
                            elif isinstance(line, (list, tuple)) and len(line) > 0:
                                texts.append(str(line[0]))
                        content = " ".join(texts)
                    elif isinstance(res, dict) and 'html' in res:
                        content = res['html']
                else:
                    content = region.get('content', region.get('text', ''))
            else:
                # Handle parsed object (LayoutBlock)
                # Attributes based on logs: label, bbox, content
                try:
                    region_type = getattr(region, 'label', '').lower()
                    bbox = getattr(region, 'bbox', [0, 0, 0, 0])
                    score = getattr(region, 'score', 1.0)
                    content = getattr(region, 'content', getattr(region, 'text', ''))
                except Exception:
                    logging.warning(f"Could not parse region object: {dir(region)}")
                    continue

            # Filter by threshold if a score is available
            if score < threshold:
                continue

            normalized_results.append({
                "bbox": bbox,
                "type": region_type,
                "score": score,
                "label": region_type,
                "content": content,
                "raw": str(region)
            })

        logging.info(f"PaddleStructureV3 detected {len(normalized_results)} regions")
        return normalized_results


# ============================================================================
# Text Detection Implementations
# ============================================================================

class PaddleOCRTextDetector(TextDetectionPhase):
    """
    Text detection using the PaddleOCR ONNX detection model.

    Features:
    - ONNX optimized for CPU
    - DBNet architecture
    - Detects text boxes with four corner coordinates
    """

    def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None):
        """
        Args:
            model_dir: Directory containing the ONNX models (default: auto-download)
            device_id: Device ID for CUDA (None = CPU only)
        """
        if model_dir is None:
            model_dir = os.path.join(get_project_base_directory(), "onnx")

        self.detector = TextDetector(model_dir, device_id)
        logging.info("✓ PaddleOCRTextDetector initialized")

    def detect(self, image: np.ndarray) -> Tuple[Optional[np.ndarray], Any]:
        """
        Detect text boxes in the image.

        Returns:
            (dt_boxes, elapsed_time) where dt_boxes shape is (N, 4, 2)
        """
        return self.detector(image)


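# Illustrative usage sketch for the detection phase; assumes an RGB ndarray
# input. This helper is not part of the packaged module.
def _example_detect_text_boxes(image_rgb: np.ndarray) -> List[List[List[int]]]:  # pragma: no cover
    detector = PaddleOCRTextDetector()
    dt_boxes, _elapsed = detector.detect(image_rgb)
    if dt_boxes is None or len(dt_boxes) == 0:
        return []
    # dt_boxes has shape (N, 4, 2): four (x, y) corners per detected text box
    return [np.asarray(box).astype(int).tolist() for box in dt_boxes]

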
# ============================================================================
# Text Recognition Implementations
# ============================================================================

class VietOCRRecognizer(TextRecognitionPhase):
    """
    Text recognition using VietOCR (vgg-seq2seq).

    Features:
    - Model specialized for Vietnamese
    - Accuracy: ~75-80%
    - CPU optimized with quantization

    Limitations:
    - Older model (2019)
    - Slower inference
    - May produce character noise
    """

    def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None):
        """
        Args:
            model_dir: Directory containing models (unused by VietOCR)
            device_id: Device ID (unused; VietOCR always runs on CPU)
        """
        self.recognizer = TextRecognizer(model_dir, device_id)
        logging.info("✓ VietOCRRecognizer initialized (vgg-seq2seq)")

    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """
        Recognize text using VietOCR.

        Args:
            image_crops: List of image crops (numpy array or PIL Image)

        Returns:
            (results, elapsed_time) where results = [(text, confidence), ...]
        """
        return self.recognizer(image_crops)


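# Illustrative sketch of the TextRecognitionPhase contract: every recognizer
# in this file returns ([(text, confidence), ...], elapsed_seconds), so they
# can be swapped behind the same interface. The crop list is assumed to come
# from a prior detection step; this helper is not part of the packaged module.
def _example_recognize_crops(crops: List[Any]) -> List[str]:  # pragma: no cover
    recognizer: TextRecognitionPhase = VietOCRRecognizer()  # or SVTRv2Recognizer()
    results, _elapsed = recognizer.recognize(crops)
    # Keep only confident, non-empty lines
    return [text for text, confidence in results if text.strip() and confidence >= 0.5]

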
class LandingAIRecognizer(TextRecognitionPhase):
    """
    Text recognition using the LandingAI OCR API.

    Features:
    - Cloud-based OCR with high accuracy
    - Supports many languages
    - Suited to high-volume production workloads

    Limitations:
    - Requires an API key
    - Requires an internet connection
    - Incurs a cost per request

    Note: This is a placeholder implementation. Actual API calls still need to be implemented.
    """

    def __init__(self, api_key: Optional[str] = None, model_dir: Optional[str] = None, device_id: Optional[int] = None):
        """
        Args:
            api_key: LandingAI API key
            model_dir: Unused (placeholder for interface consistency)
            device_id: Unused (cloud-based)
        """
        self.api_key = api_key or os.environ.get("LANDINGAI_API_KEY")
        if not self.api_key:
            logging.warning("LandingAI API key not provided. Recognition will fail.")

        logging.info("✓ LandingAIRecognizer initialized (placeholder)")

    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """
        Recognize text using the LandingAI API.

        TODO: Implement actual API calls
        """
        # Placeholder: return empty results
        logging.warning("LandingAIRecognizer not fully implemented yet")
        return [("", 0.0) for _ in image_crops], 0.0


class SVTRv2Recognizer(TextRecognitionPhase):
    """
    Text recognition using PP-OCRv3 with SVTRv2-LCNet.

    ⚠️ IMPORTANT: Only works with PaddleOCR 2.7.x (NOT compatible with 3.x)

    PaddleOCR 2.7.x exposes a `.rec()` method for recognition-only.
    PaddleOCR 3.x removed the `.rec()` method and only supports the full pipeline.

    To use this class, downgrade:
        pip uninstall paddleocr paddlepaddle
        pip install paddleocr==2.7.3 paddlepaddle==2.5.2

    Features:
    - SVTRv2-LCNet architecture (Transformer-based, no RNN)
    - Accuracy: 92-95% (vs 75-80% for VietOCR)
    - Speed: ~150ms per crop on CPU (vs ~500ms for VietOCR)
    - Model size: 50-80 MB
    - Vietnamese support: excellent
    - Noticeably less character noise than VietOCR

    Advantages over VietOCR:
    - 3x faster inference
    - +15-20% accuracy improvement
    - Better handling of Vietnamese diacritics
    - Less character noise
    - Production-ready and stable

    Architecture:
    - Backbone: SVTRv2-HGNet (Hierarchical Grouped Network)
    - Neck: LCNet (Lightweight Convolutional Network)
    - Head: CTC/Attention decoder

    Note: Requires PaddleOCR 2.7.x (not 3.x)
    """

    def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None, lang: str = 'vi'):
        """
        Args:
            model_dir: Directory containing models (auto-download if None)
            device_id: Device ID for CUDA (None = CPU)
            lang: Language code ('vi' for Vietnamese, 'en' for English)
        """
        try:
            from paddleocr import PaddleOCR
        except ImportError:
            raise ImportError(
                "PaddleOCR not installed. Install with:\n"
                "  pip install paddleocr>=2.7.0 paddlepaddle>=2.5.0"
            )

        # Initialize PaddleOCR with PP-OCRv3 (includes SVTRv2)
        # Following the same pattern as TextRecognizerPaddleOCR in ocr.py:176

        # Use simple initialization like in ocr.py - this is the stable API
        self.ocr = PaddleOCR(lang=lang)

        self.device_id = device_id
        self.lang = lang

        device_str = f"GPU:{device_id}" if device_id is not None else "CPU"
        logging.info(f"✓ SVTRv2Recognizer initialized (PP-OCRv3 via PaddleOCR, lang={lang}, device={device_str})")

    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """
        Recognize text using SVTRv2.

        Args:
            image_crops: List of image crops (numpy array or PIL Image)

        Returns:
            Tuple of (results, elapsed_time):
            - results: List of (text, confidence) tuples
            - elapsed_time: Total processing time (seconds)
        """
        import time
        start_time = time.time()

        results = []

        for img_crop in image_crops:
            # Convert to numpy if PIL Image
            if isinstance(img_crop, Image.Image):
                img_crop = np.array(img_crop)

            # Ensure correct format (RGB)
            if len(img_crop.shape) == 2:
                # Grayscale -> RGB
                img_crop = cv2.cvtColor(img_crop, cv2.COLOR_GRAY2RGB)
            elif img_crop.shape[2] == 4:
                # RGBA -> RGB
                img_crop = cv2.cvtColor(img_crop, cv2.COLOR_RGBA2RGB)

            # Ensure minimum height for SVTRv2 (32px)
            if img_crop.shape[0] < 32:
                scale = 32.0 / img_crop.shape[0]
                new_width = int(img_crop.shape[1] * scale)
                img_crop = cv2.resize(img_crop, (new_width, 32), interpolation=cv2.INTER_CUBIC)

            try:
                # Call PaddleOCR recognition (SVTRv2)
                # Use the .rec() method for recognition-only (same as ocr.py:200)
                rec_result = self.ocr.rec(img_crop)

                # Parse result - format: [(text, confidence), ...]
                # Following the pattern from ocr.py:203-213
                if rec_result and len(rec_result) > 0:
                    # Extract first result
                    if isinstance(rec_result[0], (tuple, list)) and len(rec_result[0]) >= 2:
                        text = str(rec_result[0][0])
                        confidence = float(rec_result[0][1])
                    else:
                        text = str(rec_result[0]) if rec_result[0] else ""
                        confidence = 1.0
                else:
                    text = ""
                    confidence = 0.0

                results.append((text, confidence))

            except Exception as e:
                logging.warning(f"SVTRv2 recognition failed: {e}")
                results.append(("", 0.0))

        elapsed = time.time() - start_time
        return results, elapsed


class PPOCRv5Recognizer(TextRecognitionPhase):
    """
    Text recognition using PP-OCRv5 (PaddleOCR 3.x) with the FULL PIPELINE.

    ✅ Works with PaddleOCR 3.x (latest version)

    Important note:
    ----------------
    PP-OCRv5 in PaddleOCR 3.x does NOT support a recognition-only mode.
    This class runs the FULL PIPELINE (detection + recognition) over the ENTIRE image.

    Consequently, this class does NOT integrate into the current phase-based architecture.
    Instead, it replaces both the Detection and Recognition phases.

    API changes from PaddleOCR 2.7.x → 3.x:
    - REMOVED: `.rec()` method (recognition-only)
    - NEW: `.predict()` method (full pipeline only)
    - NEW: result objects with `.print()`, `.save_to_img()`, `.save_to_json()`

    Features:
    - PP-OCRv5_server model (latest, 2025)
    - +13% accuracy improvement vs PP-OCRv4
    - Vietnamese support: excellent
    - Detection: PP-OCRv5 DBNet
    - Recognition: PP-OCRv5 SVTRv2

    Architecture:
    - Detection: PP-OCRv5_server_det (DBNet-based)
    - Recognition: PP-OCRv5_server_rec (SVTRv2-based)
    - Full pipeline: 5 modules (doc orientation, unwarping, text orientation, detection, recognition)

    Usage:
    ```python
    # Use PP-OCRv5 on its own, without the phase-based pipeline
    recognizer = PPOCRv5Recognizer(lang='vi')
    result = recognizer.recognize_full_image(image)
    ```

    Note: Requires PaddleOCR >= 3.0 (tested with 3.3.3)
    """

    def __init__(
        self,
        model_dir: Optional[str] = None,
        device_id: Optional[int] = None,
        lang: str = 'vi',
        use_doc_orientation: bool = False,
        use_doc_unwarping: bool = False,
        use_textline_orientation: bool = False,
        text_detection_model_name: Optional[str] = None,
        text_recognition_model_name: Optional[str] = None,
        **kwargs
    ):
        """
        Args:
            model_dir: Directory containing models (unused in PaddleOCR 3.x)
            device_id: Device ID for CUDA (None = CPU)
            lang: Language code ('vi' for Vietnamese, 'en' for English, 'ch' for Chinese)
            use_doc_orientation: Enable doc orientation classification (default: False)
            use_doc_unwarping: Enable doc unwarping (default: False)
            use_textline_orientation: Enable textline orientation (default: False)
            text_detection_model_name: Model name for detection (e.g. "PP-OCRv5_mobile_det")
            text_recognition_model_name: Model name for recognition (e.g. "PP-OCRv5_mobile_rec")
            **kwargs: Additional arguments for PaddleOCR (e.g., enable_mkldnn)
        """
        try:
            from paddleocr import PaddleOCR
        except ImportError:
            raise ImportError(
                "PaddleOCR not installed. Install with:\n"
                "  pip install paddleocr>=3.0.0 paddlepaddle>=2.5.0"
            )

        # Initialize PaddleOCR with PP-OCRv5 (PaddleOCR 3.x API)
        # Based on documentation: https://www.paddleocr.ai/latest/version3.x/pipeline_usage/OCR.html
        init_params = {
            'lang': lang,
            'use_doc_orientation_classify': use_doc_orientation,
            'use_doc_unwarping': use_doc_unwarping,
            'use_textline_orientation': use_textline_orientation,
        }

        # Add custom model names if provided
        if text_detection_model_name:
            init_params['text_detection_model_name'] = text_detection_model_name
        if text_recognition_model_name:
            init_params['text_recognition_model_name'] = text_recognition_model_name

        # Note: PaddleOCR 3.x has no use_gpu parameter.
        # The GPU is auto-detected or selected via the device parameter.
        if device_id is not None:
            init_params['device'] = f"gpu:{device_id}"

        # Merge additional kwargs (e.g., enable_mkldnn)
        init_params.update(kwargs)

        self.ocr = PaddleOCR(**init_params)
        self.device_id = device_id
        self.lang = lang

        device_str = f"GPU:{device_id}" if device_id is not None else "CPU"
        logging.info(
            f"✓ PPOCRv5Recognizer initialized "
            f"(PP-OCRv5 full pipeline, lang={lang}, device={device_str})"
        )

    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """
        ⚠️ WARNING: This method should NOT be used with image crops!

        PP-OCRv5 in PaddleOCR 3.x does not support a recognition-only mode.
        This method runs the FULL PIPELINE (detection + recognition) on EACH crop,
        which leads to incorrect results and poor performance.

        Use `recognize_full_image()` with the whole image instead.

        Args:
            image_crops: List of image crops (not recommended)

        Returns:
            (results, elapsed_time) - but the results may be inaccurate
        """
        import time
        logging.warning(
            "PPOCRv5Recognizer.recognize() is called with image crops. "
            "This is NOT recommended! Use recognize_full_image() instead."
        )

        start_time = time.time()
        results = []

        for img_crop in image_crops:
            # Convert to numpy if PIL Image
            if isinstance(img_crop, Image.Image):
                img_crop = np.array(img_crop)

            # Ensure correct format (RGB)
            if len(img_crop.shape) == 2:
                img_crop = cv2.cvtColor(img_crop, cv2.COLOR_GRAY2RGB)
            elif img_crop.shape[2] == 4:
                img_crop = cv2.cvtColor(img_crop, cv2.COLOR_RGBA2RGB)

            try:
                # Call the PaddleOCR full pipeline (not recognition-only)
                # API: result = ocr.predict(input)
                predict_result = self.ocr.predict(img_crop)

                # Parse result - format: iterable of result objects
                # Each result has: dt_polys, rec_texts, etc.
                text = ""
                confidence = 0.0

                if predict_result:
                    for res in predict_result:
                        # Access the rec_texts field
                        if hasattr(res, 'rec_texts'):
                            rec_texts = res.rec_texts
                        elif isinstance(res, dict) and 'rec_texts' in res:
                            rec_texts = res['rec_texts']
                        else:
                            rec_texts = []

                        # Concatenate all recognized texts
                        if rec_texts:
                            text = " ".join([str(t) for t in rec_texts if t])
                            confidence = 1.0  # PP-OCRv5 doesn't return confidence per text

                results.append((text, confidence))

            except Exception as e:
                logging.error(f"PP-OCRv5 recognition failed: {e}", exc_info=True)
                results.append(("", 0.0))

        elapsed = time.time() - start_time
        return results, elapsed

    def recognize_full_image(
        self,
        image: np.ndarray,
        return_visualization: bool = False
    ) -> Dict[str, Any]:
        """
        Recognize text on the ENTIRE image using the PP-OCRv5 full pipeline.

        This is the CORRECT way to use PP-OCRv5 in PaddleOCR 3.x.

        Args:
            image: Full image (numpy array or PIL Image)
            return_visualization: Return a visualization image (default: False)

        Returns:
            Dictionary containing:
            {
                'texts': List[str],           # Recognized texts
                'boxes': List[np.ndarray],    # Detection boxes (N, 4, 2)
                'scores': List[float],        # Confidence scores (if available)
                'elapsed_time': float,        # Processing time (seconds)
                'visualization': np.ndarray,  # Visualization image (if requested)
            }

        Example:
        ```python
        recognizer = PPOCRv5Recognizer(lang='vi')
        result = recognizer.recognize_full_image(image)

        for text, box in zip(result['texts'], result['boxes']):
            print(f"Text: {text}, Box: {box}")
        ```
        """
        import time
        start_time = time.time()

        # Convert to numpy if PIL Image
        if isinstance(image, Image.Image):
            image = np.array(image)

        # Ensure correct format (RGB)
        if len(image.shape) == 2:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        try:
            # Call the PaddleOCR full pipeline
            # API from documentation: result = ocr.predict(input)
            predict_result = self.ocr.predict(image)

            texts = []
            boxes = []
            scores = []

            # Parse result - format: iterable of result objects
            for res in predict_result:
                # Extract data from the result object
                # Based on documentation, a result has fields: dt_polys, rec_texts, etc.
                if hasattr(res, 'dt_polys'):
                    dt_polys = res.dt_polys
                elif isinstance(res, dict) and 'dt_polys' in res:
                    dt_polys = res['dt_polys']
                else:
                    dt_polys = []

                if hasattr(res, 'rec_texts'):
                    rec_texts = res.rec_texts
                elif isinstance(res, dict) and 'rec_texts' in res:
                    rec_texts = res['rec_texts']
                else:
                    rec_texts = []

                # Append results
                texts.extend([str(t) for t in rec_texts if t])
                boxes.extend(dt_polys if isinstance(dt_polys, list) else [dt_polys])

                # PP-OCRv5 may not return per-text scores, default to 1.0
                scores.extend([1.0] * len(rec_texts))

            elapsed = time.time() - start_time

            result = {
                'texts': texts,
                'boxes': boxes,
                'scores': scores,
                'elapsed_time': elapsed,
            }

            # Generate visualization if requested
            if return_visualization:
                vis_image = self._draw_boxes_and_texts(image.copy(), boxes, texts)
                result['visualization'] = vis_image

            logging.info(
                f"PP-OCRv5 processed image: {len(texts)} texts detected "
                f"in {elapsed:.2f}s"
            )

            return result

        except Exception as e:
            logging.error(f"PP-OCRv5 full image recognition failed: {e}", exc_info=True)
            return {
                'texts': [],
                'boxes': [],
                'scores': [],
                'elapsed_time': time.time() - start_time,
            }

    def _draw_boxes_and_texts(
        self,
        image: np.ndarray,
        boxes: List[np.ndarray],
        texts: List[str]
    ) -> np.ndarray:
        """
        Draw boxes and texts onto the image for visualization.

        Args:
            image: Image to draw on
            boxes: List of boxes (N, 4, 2)
            texts: List of texts

        Returns:
            Image with boxes and texts drawn
        """
        for box, text in zip(boxes, texts):
            if isinstance(box, np.ndarray) and box.shape == (4, 2):
                # Draw the box
                pts = box.astype(np.int32)
                cv2.polylines(image, [pts], True, (0, 255, 0), 2)

                # Draw the text
                cv2.putText(
                    image,
                    text,
                    tuple(pts[0]),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 0, 0),
                    1
                )

        return image


class AdvancedPaddleOCR(PPOCRv5Recognizer):
    """
    Advanced PaddleOCR configuration for difficult documents.

    Enables:
    - Document orientation classification (auto-rotate)
    - Document unwarping (UVDoc) for bent/curved pages
    - Textline orientation correction

    Ideal for:
    - Scanned legal documents
    - Photos of documents taken with mobile phones
    - Tilted/skewed scans
    """

    def __init__(self, lang: str = 'vi', device_id: Optional[int] = None, **kwargs):
        super().__init__(
            lang=lang,
            device_id=device_id,
            use_doc_orientation=True,
            use_doc_unwarping=True,
            use_textline_orientation=True,
            **kwargs
        )
        logging.info("✓ AdvancedPaddleOCR initialized with unwarping & orientation enabled")


# ============================================================================
# Post-Processing Implementations
# ============================================================================

class VietnameseTextPostProcessor(PostProcessingPhase):
    """
    Post-processing specialized for Vietnamese.

    Features:
    - Fixes common OCR errors (I -> l, 0 -> O, etc.)
    - Removes noise characters
    - Normalizes Vietnamese diacritics

    Note: This is a placeholder. It can be extended with dictionary-based correction.
    """

    def __init__(self):
        logging.info("✓ VietnameseTextPostProcessor initialized")

    def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
        """
        Clean up text to reduce noise and improve quality.
        """
        if not text or not text.strip():
            return text

        # Basic cleaning
        cleaned = text.strip()

        # TODO: Implement Vietnamese-specific corrections
        # - Fix common OCR errors (I/l, 0/O, etc.)
        # - Remove excessive whitespace
        # - Normalize diacritics

        return cleaned


# ============================================================================
# Document Reconstruction Implementations
# ============================================================================

class SmartMarkdownReconstruction(DocumentReconstructionPhase):
    """
    Smart Markdown reconstruction with reading-order intelligence.

    Features:
    - Smart sorting: Y-first, then X for same-line regions
    - Handles multi-column layouts
    - Preserves document structure
    """

    def __init__(self, y_threshold: int = 30):
        """
        Args:
            y_threshold: Threshold (pixels) for deciding whether regions sit on the same line
        """
        self.y_threshold = y_threshold
        logging.info(f"✓ SmartMarkdownReconstruction initialized (y_threshold={y_threshold}px)")

    def reconstruct(
        self,
        regions: List[Tuple[int, str, Any]],
        output_format: str = "markdown"
    ) -> str:
        """
        Join regions using smart sorting.

        Args:
            regions: List of (y_position, content, bbox) tuples
            output_format: Only "markdown" is supported

        Returns:
            Markdown string
        """
        if output_format != "markdown":
            raise NotImplementedError(f"Format {output_format} not supported")

        # Smart sort regions (Y-first, X-second for same line)
        sorted_regions = self._smart_sort_regions(regions)

        # Concatenate with double newlines
        markdown = "\n\n".join([item[1] for item in sorted_regions])
        return markdown

    def _smart_sort_regions(self, regions: List[Tuple[int, str, Any]]) -> List[Tuple[int, str, Any]]:
        """
        Sort regions into a sensible reading order.

        Algorithm:
        1. Group regions by Y coordinate (within a threshold)
        2. Sort each group by X coordinate
        3. Flatten the results
        """
        if not regions:
            return regions

        # Convert to dict format for sorting
        regions_dict = []
        for item in regions:
            y_pos = item[0]
            content = item[1]

            # Extract x0 from bbox if available
            x0 = 0
            if len(item) > 2 and isinstance(item[2], (list, tuple)):
                bbox = item[2]
                x0 = bbox[0] if len(bbox) > 0 else 0

            regions_dict.append({
                "top": y_pos,
                "x0": x0,
                "content": content,
                "original": item
            })

        # Use LayoutRecognizer.sort_Y_firstly for smart Y+X sorting
        sorted_dict = LayoutRecognizer.sort_Y_firstly(regions_dict, self.y_threshold)

        # Reconstruct the original format
        return [r["original"] for r in sorted_dict]


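# Illustrative sketch of the (y_position, content, bbox) input contract for
# reconstruct() above; the coordinates are made up. Assuming sort_Y_firstly
# groups regions whose tops differ by less than y_threshold and orders them
# by x0 within a group, the two regions near y≈100 come out left-to-right.
# This helper is not part of the packaged module.
def _example_reconstruct_markdown() -> str:  # pragma: no cover - illustration only
    recon = SmartMarkdownReconstruction(y_threshold=30)
    return recon.reconstruct([
        (105, "right column", [300, 105, 500, 135]),
        (100, "left column", [10, 100, 200, 130]),
        (220, "next paragraph", [10, 220, 500, 250]),
    ])

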
# ============================================================================
# Factory Functions
# ============================================================================

def create_default_pipeline() -> Dict[str, Any]:
    """
    Create the default pipeline with the current implementations.

    Returns:
        Dictionary containing the phase instances:
        {
            "layout_analyzer": LayoutAnalysisPhase,
            "text_detector": TextDetectionPhase,
            "text_recognizer": TextRecognitionPhase,
            "post_processor": PostProcessingPhase,
            "reconstructor": DocumentReconstructionPhase
        }
    """
    return {
        "layout_analyzer": DocLayoutYOLOAnalyzer(),
        "text_detector": PaddleOCRTextDetector(),
        "text_recognizer": VietOCRRecognizer(),
        "post_processor": VietnameseTextPostProcessor(),
        "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
    }


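# Illustrative sketch: wiring the default factory into DocumentPipeline with
# the same call pattern shown in the create_svtrv2_pipeline() docstring below.
# The relative import assumes DocumentPipeline is exported by this package's
# __init__; this helper is not part of the packaged module.
def _example_run_default_pipeline(image):  # pragma: no cover - illustration only
    from . import DocumentPipeline  # assumption: exported as in the docstring example

    config = create_default_pipeline()
    pipeline = DocumentPipeline(**config, threshold=0.5, max_workers=2)
    return pipeline.process(image, img_name='example', figure_save_dir='./output')

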
def create_svtrv2_pipeline(device_id: Optional[int] = None, lang: str = 'vi') -> Dict[str, Any]:
    """
    Create a pipeline with the SVTRv2 recognizer (recommended for production).

    SVTRv2 benefits:
    - 3x faster than VietOCR (150ms vs 500ms per crop)
    - +15-20% accuracy improvement (92-95% vs 75-80%)
    - Better Vietnamese diacritic handling
    - Less character noise
    - Production-ready and stable

    Args:
        device_id: Device ID for CUDA (None = CPU)
        lang: Language code ('vi' for Vietnamese, 'en' for English)

    Returns:
        Dictionary containing the phase instances, with SVTRv2 as the recognizer

    Example:
    ```python
    from deepdoc_vietocr import DocumentPipeline
    from deepdoc_vietocr.implementations import create_svtrv2_pipeline

    # Create SVTRv2 pipeline
    config = create_svtrv2_pipeline()
    pipeline = DocumentPipeline(**config, threshold=0.5, max_workers=2)

    # Process image
    result = pipeline.process(image, img_name='test', figure_save_dir='./output')
    ```
    """
    return {
        "layout_analyzer": DocLayoutYOLOAnalyzer(),
        "text_detector": PaddleOCRTextDetector(),
        "text_recognizer": SVTRv2Recognizer(device_id=device_id, lang=lang),  # ← SVTRv2
        "post_processor": VietnameseTextPostProcessor(),
        "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
    }


def create_experimental_pipeline() -> Dict[str, Any]:
    """
    Create an experimental pipeline with the newer implementations.

    Example: swap VietOCR for the LandingAI recognizer.
    """
    return {
        "layout_analyzer": DocLayoutYOLOAnalyzer(),
        "text_detector": PaddleOCRTextDetector(),
        "text_recognizer": LandingAIRecognizer(),  # Experimental
        "post_processor": VietnameseTextPostProcessor(),
        "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
    }


# ============================================================================
# Hybrid Pipeline Implementations (Fusion)
# ============================================================================

def ocr_region_worker(args):
    """
    Worker function for parallel processing.

    Args:
        args: (crop, detector_instance, recognizer_instance)
    Note: detector/recognizer must be picklable or re-initialized.
    However, re-initializing models in each worker is expensive.

    Optimized approach:
    Keep the cropping logic here; models may need global init, or can be passed
    in if lightweight. VietOCR is lightweight enough on CPU, and the Paddle
    detector (ONNX) is also fine.

    Alternative: Initialize models inside the worker (using a global singleton pattern).
    """
    crop, detector_instance, recognizer_instance = args

    if crop is None:
        return "", 1.0

    # 1. Line detection (split a paragraph into lines)
    # Convert PIL to numpy for the Paddle detector
    crop_np = np.array(crop)
    # Ensure RGB
    if len(crop_np.shape) == 2:
        crop_np = cv2.cvtColor(crop_np, cv2.COLOR_GRAY2RGB)
    elif crop_np.shape[2] == 4:
        crop_np = cv2.cvtColor(crop_np, cv2.COLOR_RGBA2RGB)

    dt_boxes, _ = detector_instance.detect(crop_np)

    if dt_boxes is None or len(dt_boxes) == 0:
        # Fallback: treat the whole crop as a single line (or empty).
        # If it was a paragraph, VietOCR might fail, but the crop may already
        # be a single line, so try OCR on the whole crop when detection fails.
        lines = [crop]
    else:
        # Sort lines to match reading order (top to bottom)
        # dt_boxes shape: [N, 4, 2]
        # Sort by the Y coordinate of the top-left corner
        dt_boxes = sorted(dt_boxes, key=lambda b: b[0][1])

        lines = []
        for box in dt_boxes:
            # Crop each line.
            # The box might be rotated; for now assume mostly horizontal or a slight tilt.
            # Take the min/max x, y.
            h, w, _ = crop_np.shape
            box_int = box.astype(np.int64)  # np.int0 was removed in NumPy 2.0
            x_min = max(0, np.min(box_int[:, 0]))
            x_max = min(w, np.max(box_int[:, 0]))
            y_min = max(0, np.min(box_int[:, 1]))
            y_max = min(h, np.max(box_int[:, 1]))

            if x_max > x_min and y_max > y_min:
                line_crop = crop.crop((x_min, y_min, x_max, y_max))
                lines.append(line_crop)

    # 2. Line recognition (VietOCR)
    if not lines:
        return "", 0.0

    results, _ = recognizer_instance.recognize(lines)

    texts = []
    confs = []
    for text, conf in results:
        if text.strip():
            texts.append(text.strip())
            confs.append(conf)

    final_text = " ".join(texts)
    avg_conf = sum(confs) / len(confs) if confs else 0.0

    return final_text, avg_conf


class HybridStructureVietOCRAnalyzer(LayoutAnalysisPhase):
    """
    Hybrid pipeline combining PP-StructureV3 (layout) + VietOCR (text).

    Strategy:
    1. Use PP-StructureV3 for layout analysis (top-down reading order, table detection).
    2. For 'Text'/'Title' regions: crop the image and use VietOCR for high-accuracy recognition.
    3. For 'Table' regions: keep PP-StructureV3's HTML/Markdown output (since VietOCR can't handle tables).

    Pros:
    - Best of both worlds: accurate layout + accurate Vietnamese characters.
    - Handles complex tables (borderless, merged cells).
    - Solves the 'bottom-up' reading-order issue of YOLOv10.

    Cons:
    - Slower than pure YOLO+VietOCR (due to the heavy layout model).
    - Slower than pure PP-StructureV3 (due to the extra VietOCR calls).
    """

    def __init__(self, lang: str = 'vi'):
        import multiprocessing

        # 1. Initialize the text recognition engine (VietOCR) - EARLY INIT to avoid conflicts.
        # VietOCR is CPU optimized and very accurate for Vietnamese.
        self.text_engine = VietOCRRecognizer()
        logging.info("✓ HybridPipeline: Text Engine (VietOCR) ready")

        # 2. Initialize the line detector (PaddleOCR det).
        # Needed for splitting paragraphs into lines.
        self.line_detector = PaddleOCRTextDetector()
        logging.info("✓ HybridPipeline: Line Detector (PaddleOCR) ready")

        # 3. Initialize the layout engine (PP-StructureV3).
        # Note: we use the existing wrapper to safely handle import errors.
        self.layout_engine = PaddleStructureV3Analyzer(lang=lang, show_log=False)
        logging.info("✓ HybridPipeline: Layout Engine (PP-StructureV3) ready")

        # CPU count for parallel processing
        self.num_workers = min(4, multiprocessing.cpu_count())

    def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """
        Run the hybrid analysis pipeline.
        """
        # Ensure the image is PIL for cropping (VietOCR expects PIL)
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        else:
            image_pil = image

        # Step 1: Run layout analysis (PP-StructureV3).
        # This returns regions with 'bbox', 'type', and raw 'content' (from PaddleOCR).
        logging.info("HybridPipeline: Running Layout Analysis...")
        layout_results = self.layout_engine.analyze(image_pil, threshold=threshold)

        # Step 2: Refine text content with VietOCR.
        logging.info(f"HybridPipeline: Refining {len(layout_results)} regions with VietOCR...")

        # Parallelization notes:
        # - multiprocessing would require pickling (or re-initializing) the models
        #   in each worker, which is expensive; self.text_engine and
        #   self.line_detector are not easily picklable.
        # - A ThreadPoolExecutor works with unpicklable objects. The work is CPU
        #   bound, so the GIL limits pure-Python speedup, but both ONNX Runtime
        #   (Paddle detector) and PyTorch (VietOCR) release the GIL during inference.
        # We therefore iterate regions (line splitting + OCR) and parallelize
        # them with threads.
        from concurrent.futures import ThreadPoolExecutor

        def process_single_region(idx_region_tuple):
            idx, region = idx_region_tuple
            rtype = region.get('type', '').lower()

            # Note: PPStructureV3 uses 'paragraph_title'
            if rtype in ['text', 'title', 'header', 'footer', 'paragraph_title', 'reference', 'list']:
                bbox = region.get('bbox')
                if bbox:
                    # Crop image: [x0, y0, x1, y1]
                    x0, y0, x1, y1 = map(int, bbox)
                    w, h = image_pil.size
                    x0 = max(0, x0); y0 = max(0, y0)
                    x1 = min(w, x1); y1 = min(h, y1)

                    if x1 > x0 and y1 > y0:
                        crop = image_pil.crop((x0, y0, x1, y1))

                        # Call the worker logic directly (no pickling issues);
                        # pass self.line_detector and self.text_engine.
                        text, conf = ocr_region_worker((crop, self.line_detector, self.text_engine))

                        region['content'] = text
                        region['score'] = conf
                        region['source'] = 'VietOCR+LineDet'
            return region

        # Collect the regions to process
        tasks = []
        for idx, region in enumerate(layout_results):
            tasks.append((idx, region))

        # Run in a ThreadPool (lighter than a ProcessPool and works with unpicklable objects).
        # If CPU bound, the GIL limits speedup, but ONNX Runtime releases the GIL.
        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            results = list(executor.map(process_single_region, tasks))

        return results


def create_hybrid_pipeline() -> Dict[str, Any]:
    """
    Create the hybrid pipeline (fusion strategy).

    Used like the other pipelines, but 'layout_analyzer' does almost all of the work.
    """
    return {
        "layout_analyzer": HybridStructureVietOCRAnalyzer(),
        # The other components are placeholders since HybridAnalyzer handles everything internally,
        # but we keep them for interface consistency if needed.
        "text_detector": None,
        "text_recognizer": None,
        "post_processor": VietnameseTextPostProcessor(),
        "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
    }
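

# Illustrative sketch: the hybrid analyzer performs detection and recognition
# itself, so it can be driven directly; the input is assumed to be a PIL
# image. This helper is not part of the packaged module.
def _example_run_hybrid(image):  # pragma: no cover - illustration only
    config = create_hybrid_pipeline()
    regions = config["layout_analyzer"].analyze(image, threshold=0.5)
    post = config["post_processor"]
    # Regions already carry 'content' refined by VietOCR + line detection
    return [post.process(r.get("content", ""), r.get("score", 0.0)) for r in regions]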