xfmr_zem-0.2.2-py3-none-any.whl → xfmr_zem-0.2.5-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry, and is provided for informational purposes only.
Files changed (53)
  1. xfmr_zem/cli.py +32 -3
  2. xfmr_zem/client.py +59 -8
  3. xfmr_zem/server.py +21 -4
  4. xfmr_zem/servers/data_juicer/server.py +1 -1
  5. xfmr_zem/servers/instruction_gen/server.py +1 -1
  6. xfmr_zem/servers/io/server.py +1 -1
  7. xfmr_zem/servers/llm/parameters.yml +10 -0
  8. xfmr_zem/servers/nemo_curator/server.py +1 -1
  9. xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  10. xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  11. xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  12. xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  13. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  14. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  15. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  16. xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  17. xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  18. xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  19. xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  20. xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  21. xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  22. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  23. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  24. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  25. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  26. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  27. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  28. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  29. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  30. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  31. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  32. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  33. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  34. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  35. xfmr_zem/servers/ocr/engines.py +242 -0
  36. xfmr_zem/servers/ocr/install_models.py +63 -0
  37. xfmr_zem/servers/ocr/parameters.yml +4 -0
  38. xfmr_zem/servers/ocr/server.py +44 -0
  39. xfmr_zem/servers/profiler/parameters.yml +4 -0
  40. xfmr_zem/servers/sinks/parameters.yml +6 -0
  41. xfmr_zem/servers/unstructured/parameters.yml +6 -0
  42. xfmr_zem/servers/unstructured/server.py +62 -0
  43. xfmr_zem/zenml_wrapper.py +20 -7
  44. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/METADATA +19 -1
  45. xfmr_zem-0.2.5.dist-info/RECORD +58 -0
  46. xfmr_zem-0.2.2.dist-info/RECORD +0 -23
  47. /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
  48. /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
  49. /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
  50. /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
  51. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/WHEEL +0 -0
  52. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/entry_points.txt +0 -0
  53. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/licenses/LICENSE +0 -0
xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py
@@ -0,0 +1,1286 @@
+ """
+ Concrete implementations of the abstract phases.
+ 
+ This file contains the concrete implementations for each phase in the pipeline.
+ Each implementation can be swapped independently for another one.
+ """
+ 
+ import os
+ import logging
+ from typing import List, Dict, Tuple, Optional, Any
+ from PIL import Image
+ import numpy as np
+ import cv2
+ 
+ from .phases import (
+     LayoutAnalysisPhase,
+     TextDetectionPhase,
+     TextRecognitionPhase,
+     PostProcessingPhase,
+     DocumentReconstructionPhase,
+ )
+ 
+ # Import existing modules
+ from . import LayoutRecognizer
+ from .ocr import TextDetector, TextRecognizer, get_project_base_directory
+ 
+ 
+ # ============================================================================
+ # Layout Analysis Implementations
+ # ============================================================================
+ 
+ class DocLayoutYOLOAnalyzer(LayoutAnalysisPhase):
+     """
+     Layout analysis using the DocLayout-YOLO ONNX model.
+ 
+     Features:
+     - Uses an ONNX-optimized model for CPU performance
+     - Detects: text, title, figure, table, caption, header, footer, equation
+     """
+ 
+     def __init__(self, model_name: str = "layout"):
+         """
+         Args:
+             model_name: Model name (default: "layout" for DocLayout-YOLO)
+         """
+         self.recognizer = LayoutRecognizer(model_name)
+         logging.info("✓ DocLayoutYOLOAnalyzer initialized with ONNX model")
+ 
+     def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
+         """
+         Analyze the layout using DocLayout-YOLO.
+ 
+         Returns:
+             List of regions with structure:
+             {
+                 "bbox": [x0, y0, x1, y1],
+                 "type": str (text/title/figure/table/etc.),
+                 "score": float,
+                 "label": str (detailed label)
+             }
+         """
+         # Call forward on a batch of one image
+         layouts = self.recognizer.forward([image], thr=float(threshold))[0]
+ 
+         # Normalize the output format
+         results = []
+         for region in layouts:
+             bbox = self._extract_bbox(region)
+             label = region.get("type", "").lower()
+             score = region.get("score", 1.0)
+ 
+             if score < threshold:
+                 continue
+ 
+             results.append({
+                 "bbox": bbox,
+                 "type": label,
+                 "score": score,
+                 "label": label,
+                 "raw": region  # Keep the original for backward compatibility
+             })
+ 
+         logging.info(f"DocLayoutYOLO detected {len(results)} regions")
+         return results
+ 
+     def _extract_bbox(self, region: Dict) -> List[int]:
+         """Extract and normalize the bbox from a region dict"""
+         if "bbox" in region:
+             return list(map(int, region["bbox"]))
+         return list(map(int, [
+             region.get("x0", 0),
+             region.get("top", 0),
+             region.get("x1", 0),
+             region.get("bottom", 0)
+         ]))
+ 
+ 
+ 
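A minimal usage sketch for this analyzer, following the import style used in the factory docstrings further down; the page image path and threshold are illustrative assumptions:

```python
from PIL import Image
from deepdoc_vietocr.implementations import DocLayoutYOLOAnalyzer

analyzer = DocLayoutYOLOAnalyzer()            # loads the ONNX layout model
page = Image.open("page.png").convert("RGB")  # hypothetical input page

for region in analyzer.analyze(page, threshold=0.5):
    x0, y0, x1, y1 = region["bbox"]
    print(f"{region['type']:>8}  score={region['score']:.2f}  box=({x0},{y0},{x1},{y1})")
```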
+ class PaddleStructureV3Analyzer(LayoutAnalysisPhase):
+     """
+     Layout & table analysis using PaddleOCR's PP-StructureV3.
+ 
+     Features:
+     - Unified layout analysis + table recognition + OCR
+     - PP-StructureV3 model (SOTA for document structure)
+     - Supports 'layout', 'table', and 'ocr' modes (default: structure=True)
+     - Better handling of complex tables and multi-column layouts
+ 
+     Note: Requires paddleocr>=2.7 (best with 3.x)
+     """
+ 
+     def __init__(self, lang: str = 'en', show_log: bool = True, **kwargs):
+         """
+         Args:
+             lang: Language code (default: 'en' for layout compatibility)
+             show_log: Whether to show PaddleOCR logs
+             **kwargs: Additional arguments passed to PPStructureV3 (e.g., use_gpu, enable_mkldnn)
+         """
+         try:
+             from paddleocr import PPStructureV3
+         except ImportError as e:
+             try:
+                 # Fallback for older versions where the class is named PPStructure
+                 from paddleocr import PPStructure as PPStructureV3
+             except ImportError:
+                 raise ImportError(f"paddleocr must be installed to use PPStructure/V3: {e}")
+ 
+         self.engine = PPStructureV3(
+             lang=lang,
+             # show_log is not supported in the PPStructureV3 init args
+             **kwargs
+         )
+         logging.info(f"✓ PaddleStructureV3Analyzer initialized (lang={lang})")
+ 
+     def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
+         """
+         Analyze the document structure.
+ 
+         Returns:
+             List of regions including text, title, figure, table, etc.
+             'table' regions may include structural info when available.
+         """
+         # Convert PIL to numpy
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+ 
+         # Ensure RGB
+         if len(image.shape) == 2:
+             image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+         elif image.shape[2] == 4:
+             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+ 
+         # Run PP-Structure
+         if hasattr(self.engine, 'predict'):
+             predict_gen = self.engine.predict(image)
+         else:
+             # Fallback for older versions
+             predict_gen = self.engine(image)
+ 
+         # Each result is a dict containing the list of regions; the key name
+         # differs across PaddleOCR versions, so try several strategies.
+         raw_regions = []
+         for res in predict_gen:
+             logging.debug(f"Processing result item type: {type(res)}")
+             try:
+                 logging.debug(f"res.keys(): {list(res.keys())}")
+             except Exception:
+                 logging.debug("res has no keys()")
+ 
+             success = False
+ 
+             # Strategy 1: key access 'parsing_res_list' (V2 result)
+             try:
+                 if 'parsing_res_list' in res:
+                     raw_regions.extend(res['parsing_res_list'])
+                     logging.debug("Extracted via res['parsing_res_list']")
+                     success = True
+             except Exception as e:
+                 logging.debug(f"Strategy 1 failed: {e}")
+ 
+             if not success:
+                 # Strategy 2: key access 'doc_layout_result' (legacy/V3 result)
+                 try:
+                     if 'doc_layout_result' in res:
+                         raw_regions.extend(res['doc_layout_result'])
+                         logging.debug("Extracted via res['doc_layout_result']")
+                         success = True
+                 except Exception:
+                     pass
+ 
+             if not success:
+                 # Strategy 3: direct attribute access
+                 if hasattr(res, 'parsing_res_list'):
+                     val = res.parsing_res_list
+                     if val:
+                         raw_regions.extend(val)
+                         success = True
+ 
+             if not success and isinstance(res, list):
+                 raw_regions.extend(res)
+                 logging.debug("Extracted via list extension")
+ 
+         # Normalize the results
+         normalized_results = []
+         for region in raw_regions:
+             # Handle dictionary or object attributes
+             if isinstance(region, dict):
+                 region_type = region.get('type', region.get('label', '')).lower()
+                 bbox = region.get('bbox', [0, 0, 0, 0])
+                 score = region.get('score', 1.0)
+                 # The content key varies across versions
+                 if 'res' in region:  # Legacy format
+                     content = ""
+                     res = region['res']
+                     if isinstance(res, list):
+                         texts = []
+                         for line in res:
+                             if isinstance(line, dict) and 'text' in line:
+                                 texts.append(line['text'])
+                             elif isinstance(line, (list, tuple)) and len(line) > 0:
+                                 texts.append(str(line[0]))
+                         content = " ".join(texts)
+                     elif isinstance(res, dict) and 'html' in res:
+                         content = res['html']
+                 else:
+                     content = region.get('content', region.get('text', ''))
+             else:
+                 # Handle a parsed object (LayoutBlock).
+                 # Attributes observed in the logs: label, bbox, content
+                 try:
+                     region_type = getattr(region, 'label', '').lower()
+                     bbox = getattr(region, 'bbox', [0, 0, 0, 0])
+                     score = getattr(region, 'score', 1.0)
+                     content = getattr(region, 'content', getattr(region, 'text', ''))
+                 except Exception:
+                     logging.warning(f"Could not parse region object: {dir(region)}")
+                     continue
+ 
+             # Filter by threshold if a score is available
+             if score < threshold:
+                 continue
+ 
+             normalized_results.append({
+                 "bbox": bbox,
+                 "type": region_type,
+                 "score": score,
+                 "label": region_type,
+                 "content": content,
+                 "raw": str(region)
+             })
+ 
+         logging.info(f"PaddleStructureV3 detected {len(normalized_results)} regions")
+         return normalized_results
+ 
+ 
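Both analyzers normalize to the same region schema, so downstream code can stay engine-agnostic. A sketch of a consumer, where the analyzer choice and the loaded `page_image` are assumptions:

```python
analyzer = PaddleStructureV3Analyzer(lang='en')  # or DocLayoutYOLOAnalyzer()
regions = analyzer.analyze(page_image)           # page_image: a PIL.Image, assumed loaded

tables = [r for r in regions if r["type"] == "table"]
texts = [r for r in regions if r["type"] in ("text", "title")]
# PP-StructureV3 table regions may carry HTML in r["content"]; DocLayout-YOLO
# regions carry no content and must go through detection + recognition instead.
```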
+ # ============================================================================
+ # Text Detection Implementations
+ # ============================================================================
+ 
+ class PaddleOCRTextDetector(TextDetectionPhase):
+     """
+     Text detection using the PaddleOCR ONNX detection model.
+ 
+     Features:
+     - ONNX optimized for CPU
+     - DBNet architecture
+     - Detects text boxes with four corner coordinates
+     """
+ 
+     def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None):
+         """
+         Args:
+             model_dir: Directory containing the ONNX models (default: auto-download)
+             device_id: Device ID for CUDA (None = CPU only)
+         """
+         if model_dir is None:
+             model_dir = os.path.join(get_project_base_directory(), "onnx")
+ 
+         self.detector = TextDetector(model_dir, device_id)
+         logging.info("✓ PaddleOCRTextDetector initialized")
+ 
+     def detect(self, image: np.ndarray) -> Tuple[Optional[np.ndarray], Any]:
+         """
+         Detect text boxes in an image.
+ 
+         Returns:
+             (dt_boxes, elapsed_time) where dt_boxes has shape (N, 4, 2)
+         """
+         return self.detector(image)
+ 
+ 
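The `(N, 4, 2)` box format is four `(x, y)` corners per detected line. A sketch of axis-aligned cropping around each box, mirroring what `ocr_region_worker` does further down; `page_np` is an assumed RGB numpy image:

```python
import numpy as np

detector = PaddleOCRTextDetector()
dt_boxes, _ = detector.detect(page_np)  # page_np: HxWx3 RGB array, assumed loaded

line_crops = []
for box in (dt_boxes if dt_boxes is not None else []):
    xs, ys = box[:, 0], box[:, 1]       # box: (4, 2) corner coordinates
    line_crops.append(page_np[int(ys.min()):int(ys.max()), int(xs.min()):int(xs.max())])
```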
+ # ============================================================================
+ # Text Recognition Implementations
+ # ============================================================================
+ 
+ class VietOCRRecognizer(TextRecognitionPhase):
+     """
+     Text recognition using VietOCR (vgg-seq2seq).
+ 
+     Features:
+     - Model specialized for Vietnamese
+     - Accuracy: ~75-80%
+     - CPU optimized with quantization
+ 
+     Limitations:
+     - Older model (2019)
+     - Slower inference
+     - May produce character noise
+     """
+ 
+     def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None):
+         """
+         Args:
+             model_dir: Directory containing the models (unused with VietOCR)
+             device_id: Device ID (unused; VietOCR always runs on CPU)
+         """
+         self.recognizer = TextRecognizer(model_dir, device_id)
+         logging.info("✓ VietOCRRecognizer initialized (vgg-seq2seq)")
+ 
+     def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
+         """
+         Recognize text using VietOCR.
+ 
+         Args:
+             image_crops: List of image crops (numpy array or PIL Image)
+ 
+         Returns:
+             (results, elapsed_time) where results = [(text, confidence), ...]
+         """
+         return self.recognizer(image_crops)
+ 
+ 
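Detection and recognition compose naturally: detect lines, crop them, then batch the crops through a recognizer. A sketch assuming the `line_crops` list built in the detector example above; the 0.5 acceptance threshold is hypothetical:

```python
recognizer = VietOCRRecognizer()
results, elapsed = recognizer.recognize(line_crops)  # line_crops: list of numpy crops

for text, confidence in results:
    if confidence > 0.5:  # hypothetical acceptance threshold
        print(f"{confidence:.2f}  {text}")
```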
+ class LandingAIRecognizer(TextRecognitionPhase):
+     """
+     Text recognition using the LandingAI OCR API.
+ 
+     Features:
+     - Cloud-based OCR with high accuracy
+     - Supports many languages
+     - Suitable for high-volume production use
+ 
+     Limitations:
+     - Requires an API key
+     - Requires an internet connection
+     - Has a per-request cost
+ 
+     Note: This is a placeholder implementation. The actual API calls still need to be implemented.
+     """
+ 
+     def __init__(self, api_key: Optional[str] = None, model_dir: Optional[str] = None, device_id: Optional[int] = None):
+         """
+         Args:
+             api_key: LandingAI API key
+             model_dir: Unused (placeholder for interface consistency)
+             device_id: Unused (cloud-based)
+         """
+         self.api_key = api_key or os.environ.get("LANDINGAI_API_KEY")
+         if not self.api_key:
+             logging.warning("LandingAI API key not provided. Recognition will fail.")
+ 
+         logging.info("✓ LandingAIRecognizer initialized (placeholder)")
+ 
+     def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
+         """
+         Recognize text using the LandingAI API.
+ 
+         TODO: Implement actual API calls
+         """
+         # Placeholder: return empty results
+         logging.warning("LandingAIRecognizer not fully implemented yet")
+         return [("", 0.0) for _ in image_crops], 0.0
+ 
+ 
+ class SVTRv2Recognizer(TextRecognitionPhase):
+     """
+     Text recognition using PP-OCRv3 with SVTRv2-LCNet.
+ 
+     ⚠️ IMPORTANT: Only works with PaddleOCR 2.7.x (NOT compatible with 3.x).
+ 
+     PaddleOCR 2.7.x has a `.rec()` method for recognition-only inference.
+     PaddleOCR 3.x removed the `.rec()` method and only supports the full pipeline.
+ 
+     To use this class, downgrade:
+         pip uninstall paddleocr paddlepaddle
+         pip install paddleocr==2.7.3 paddlepaddle==2.5.2
+ 
+     Features:
+     - SVTRv2-LCNet architecture (Transformer-based, no RNN)
+     - Accuracy: 92-95% (vs 75-80% for VietOCR)
+     - Speed: ~150ms per crop on CPU (vs ~500ms for VietOCR)
+     - Model size: 50-80 MB
+     - Vietnamese support: excellent
+     - Noticeably less character noise than VietOCR
+ 
+     Advantages over VietOCR:
+     - 3x faster inference
+     - +15-20% accuracy improvement
+     - Better handling of Vietnamese diacritics
+     - Less character noise
+     - Production-ready and stable
+ 
+     Architecture:
+     - Backbone: SVTRv2-HGNet (Hierarchical Grouped Network)
+     - Neck: LCNet (Lightweight Convolutional Network)
+     - Head: CTC/Attention decoder
+ 
+     Note: Requires PaddleOCR 2.7.x (not 3.x).
+     """
+ 
+     def __init__(self, model_dir: Optional[str] = None, device_id: Optional[int] = None, lang: str = 'vi'):
+         """
+         Args:
+             model_dir: Directory containing the models (auto-download if None)
+             device_id: Device ID for CUDA (None = CPU)
+             lang: Language code ('vi' for Vietnamese, 'en' for English)
+         """
+         try:
+             from paddleocr import PaddleOCR
+         except ImportError:
+             raise ImportError(
+                 "PaddleOCR not installed. Install with:\n"
+                 "  pip install paddleocr>=2.7.0 paddlepaddle>=2.5.0"
+             )
+ 
+         # Initialize PaddleOCR with PP-OCRv3 (includes SVTRv2),
+         # following the same pattern as TextRecognizerPaddleOCR in ocr.py:176.
+         # Simple initialization as in ocr.py -- this is the stable API.
+         self.ocr = PaddleOCR(lang=lang)
+ 
+         self.device_id = device_id
+         self.lang = lang
+ 
+         device_str = f"GPU:{device_id}" if device_id is not None else "CPU"
+         logging.info(f"✓ SVTRv2Recognizer initialized (PP-OCRv3 via PaddleOCR, lang={lang}, device={device_str})")
+ 
+     def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
+         """
+         Recognize text using SVTRv2.
+ 
+         Args:
+             image_crops: List of image crops (numpy array or PIL Image)
+ 
+         Returns:
+             Tuple of (results, elapsed_time):
+             - results: List of (text, confidence) tuples
+             - elapsed_time: Total processing time (seconds)
+         """
+         import time
+         start_time = time.time()
+ 
+         results = []
+ 
+         for img_crop in image_crops:
+             # Convert to numpy if PIL Image
+             if isinstance(img_crop, Image.Image):
+                 img_crop = np.array(img_crop)
+ 
+             # Ensure the correct format (RGB)
+             if len(img_crop.shape) == 2:
+                 # Grayscale -> RGB
+                 img_crop = cv2.cvtColor(img_crop, cv2.COLOR_GRAY2RGB)
+             elif img_crop.shape[2] == 4:
+                 # RGBA -> RGB
+                 img_crop = cv2.cvtColor(img_crop, cv2.COLOR_RGBA2RGB)
+ 
+             # Ensure the minimum height for SVTRv2 (32px)
+             if img_crop.shape[0] < 32:
+                 scale = 32.0 / img_crop.shape[0]
+                 new_width = int(img_crop.shape[1] * scale)
+                 img_crop = cv2.resize(img_crop, (new_width, 32), interpolation=cv2.INTER_CUBIC)
+ 
+             try:
+                 # Call PaddleOCR recognition (SVTRv2).
+                 # Use the .rec() method for recognition-only (same as ocr.py:200).
+                 rec_result = self.ocr.rec(img_crop)
+ 
+                 # Parse the result - format: [(text, confidence), ...],
+                 # following the pattern from ocr.py:203-213
+                 if rec_result and len(rec_result) > 0:
+                     # Extract the first result
+                     if isinstance(rec_result[0], (tuple, list)) and len(rec_result[0]) >= 2:
+                         text = str(rec_result[0][0])
+                         confidence = float(rec_result[0][1])
+                     else:
+                         text = str(rec_result[0]) if rec_result[0] else ""
+                         confidence = 1.0
+                 else:
+                     text = ""
+                     confidence = 0.0
+ 
+                 results.append((text, confidence))
+ 
+             except Exception as e:
+                 logging.warning(f"SVTRv2 recognition failed: {e}")
+                 results.append(("", 0.0))
+ 
+         elapsed = time.time() - start_time
+         return results, elapsed
+ 
+ 
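Because the 2.7.x-only `.rec()` method is the hard requirement here, a defensive caller can probe for it before committing to this recognizer. A sketch; the fallback choice and the class-level `hasattr` probe are assumptions, not package behavior:

```python
from paddleocr import PaddleOCR

def pick_recognizer(lang: str = 'vi'):
    # PaddleOCR 2.7.x still exposes recognition-only .rec(); 3.x removed it,
    # so only the full-pipeline PPOCRv5Recognizer (below) applies there.
    if hasattr(PaddleOCR, 'rec'):
        return SVTRv2Recognizer(lang=lang)
    return PPOCRv5Recognizer(lang=lang)
```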
+ class PPOCRv5Recognizer(TextRecognitionPhase):
+     """
+     Text recognition using PP-OCRv5 (PaddleOCR 3.x) with the FULL PIPELINE.
+ 
+     ✅ Works with PaddleOCR 3.x (latest version).
+ 
+     Important note:
+     ----------------
+     PP-OCRv5 in PaddleOCR 3.x does NOT support a recognition-only mode.
+     This class runs the FULL PIPELINE (detection + recognition) on the ENTIRE image.
+ 
+     It therefore does NOT integrate into the current phase-based architecture;
+     instead, it replaces both the detection and recognition phases.
+ 
+     API changes from PaddleOCR 2.7.x → 3.x:
+     - REMOVED: `.rec()` method (recognition-only)
+     - NEW: `.predict()` method (full pipeline only)
+     - NEW: result objects with `.print()`, `.save_to_img()`, `.save_to_json()`
+ 
+     Features:
+     - PP-OCRv5_server model (latest, 2025)
+     - +13% accuracy improvement vs PP-OCRv4
+     - Vietnamese support: excellent
+     - Detection: PP-OCRv5 DBNet
+     - Recognition: PP-OCRv5 SVTRv2
+ 
+     Architecture:
+     - Detection: PP-OCRv5_server_det (DBNet-based)
+     - Recognition: PP-OCRv5_server_rec (SVTRv2-based)
+     - Full pipeline: 5 modules (doc orientation, unwarping, text orientation, detection, recognition)
+ 
+     Usage:
+     ```python
+     # Use PP-OCRv5 standalone; no phase-based pipeline needed
+     recognizer = PPOCRv5Recognizer(lang='vi')
+     result = recognizer.recognize_full_image(image)
+     ```
+ 
+     Note: Requires PaddleOCR >= 3.0 (tested with 3.3.3)
+     """
+ 
+     def __init__(
+         self,
+         model_dir: Optional[str] = None,
+         device_id: Optional[int] = None,
+         lang: str = 'vi',
+         use_doc_orientation: bool = False,
+         use_doc_unwarping: bool = False,
+         use_textline_orientation: bool = False,
+         text_detection_model_name: Optional[str] = None,
+         text_recognition_model_name: Optional[str] = None,
+         **kwargs
+     ):
+         """
+         Args:
+             model_dir: Directory containing the models (unused in PaddleOCR 3.x)
+             device_id: Device ID for CUDA (None = CPU)
+             lang: Language code ('vi' for Vietnamese, 'en' for English, 'ch' for Chinese)
+             use_doc_orientation: Use document orientation classification (default: False)
+             use_doc_unwarping: Use document unwarping (default: False)
+             use_textline_orientation: Use textline orientation (default: False)
+             text_detection_model_name: Model name for detection (e.g. "PP-OCRv5_mobile_det")
+             text_recognition_model_name: Model name for recognition (e.g. "PP-OCRv5_mobile_rec")
+             **kwargs: Additional arguments for PaddleOCR (e.g., enable_mkldnn)
+         """
+         try:
+             from paddleocr import PaddleOCR
+         except ImportError:
+             raise ImportError(
+                 "PaddleOCR not installed. Install with:\n"
+                 "  pip install paddleocr>=3.0.0 paddlepaddle>=2.5.0"
+             )
+ 
+         # Initialize PaddleOCR with PP-OCRv5 (PaddleOCR 3.x API).
+         # Based on the documentation: https://www.paddleocr.ai/latest/version3.x/pipeline_usage/OCR.html
+         init_params = {
+             'lang': lang,
+             'use_doc_orientation_classify': use_doc_orientation,
+             'use_doc_unwarping': use_doc_unwarping,
+             'use_textline_orientation': use_textline_orientation,
+         }
+ 
+         # Add custom model names if provided
+         if text_detection_model_name:
+             init_params['text_detection_model_name'] = text_detection_model_name
+         if text_recognition_model_name:
+             init_params['text_recognition_model_name'] = text_recognition_model_name
+ 
+         # Note: PaddleOCR 3.x has no use_gpu parameter; the device is either
+         # auto-detected or selected via the `device` parameter.
+         if device_id is not None:
+             init_params['device'] = f'gpu:{device_id}'
+ 
+         # Merge additional kwargs (e.g., enable_mkldnn)
+         init_params.update(kwargs)
+ 
+         self.ocr = PaddleOCR(**init_params)
+         self.device_id = device_id
+         self.lang = lang
+ 
+         device_str = f"GPU:{device_id}" if device_id is not None else "CPU"
+         logging.info(
+             f"✓ PPOCRv5Recognizer initialized "
+             f"(PP-OCRv5 full pipeline, lang={lang}, device={device_str})"
+         )
+ 
+     def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
+         """
+         ⚠️ WARNING: This method should NOT be used with image crops!
+ 
+         PP-OCRv5 in PaddleOCR 3.x does not support a recognition-only mode.
+         This method runs the FULL PIPELINE (detection + recognition) on EACH crop,
+         which leads to wrong results and poor performance.
+ 
+         Use `recognize_full_image()` with the whole image instead.
+ 
+         Args:
+             image_crops: List of image crops (not recommended)
+ 
+         Returns:
+             (results, elapsed_time) - but the results may be inaccurate
+         """
+         import time
+         logging.warning(
+             "PPOCRv5Recognizer.recognize() is called with image crops. "
+             "This is NOT recommended! Use recognize_full_image() instead."
+         )
+ 
+         start_time = time.time()
+         results = []
+ 
+         for img_crop in image_crops:
+             # Convert to numpy if PIL Image
+             if isinstance(img_crop, Image.Image):
+                 img_crop = np.array(img_crop)
+ 
+             # Ensure the correct format (RGB)
+             if len(img_crop.shape) == 2:
+                 img_crop = cv2.cvtColor(img_crop, cv2.COLOR_GRAY2RGB)
+             elif img_crop.shape[2] == 4:
+                 img_crop = cv2.cvtColor(img_crop, cv2.COLOR_RGBA2RGB)
+ 
+             try:
+                 # Call the PaddleOCR full pipeline (not recognition-only).
+                 # API: result = ocr.predict(input)
+                 predict_result = self.ocr.predict(img_crop)
+ 
+                 # Parse the result - format: iterable of result objects.
+                 # Each result has: dt_polys, rec_texts, etc.
+                 text = ""
+                 confidence = 0.0
+ 
+                 if predict_result:
+                     for res in predict_result:
+                         # Access the rec_texts field
+                         if hasattr(res, 'rec_texts'):
+                             rec_texts = res.rec_texts
+                         elif isinstance(res, dict) and 'rec_texts' in res:
+                             rec_texts = res['rec_texts']
+                         else:
+                             rec_texts = []
+ 
+                         # Concatenate all recognized texts
+                         if rec_texts:
+                             text = " ".join([str(t) for t in rec_texts if t])
+                             confidence = 1.0  # PP-OCRv5 doesn't return confidence per text
+ 
+                 results.append((text, confidence))
+ 
+             except Exception as e:
+                 logging.error(f"PP-OCRv5 recognition failed: {e}", exc_info=True)
+                 results.append(("", 0.0))
+ 
+         elapsed = time.time() - start_time
+         return results, elapsed
+ 
+     def recognize_full_image(
+         self,
+         image: np.ndarray,
+         return_visualization: bool = False
+     ) -> Dict[str, Any]:
+         """
+         Recognize text on the ENTIRE image using the PP-OCRv5 full pipeline.
+ 
+         This is the CORRECT way to use PP-OCRv5 in PaddleOCR 3.x.
+ 
+         Args:
+             image: Full image (numpy array or PIL Image)
+             return_visualization: Return a visualization image (default: False)
+ 
+         Returns:
+             Dictionary containing:
+             {
+                 'texts': List[str],          # Recognized texts
+                 'boxes': List[np.ndarray],   # Detection boxes (N, 4, 2)
+                 'scores': List[float],       # Confidence scores (if available)
+                 'elapsed_time': float,       # Processing time (seconds)
+                 'visualization': np.ndarray, # Visualization image (if requested)
+             }
+ 
+         Example:
+         ```python
+         recognizer = PPOCRv5Recognizer(lang='vi')
+         result = recognizer.recognize_full_image(image)
+ 
+         for text, box in zip(result['texts'], result['boxes']):
+             print(f"Text: {text}, Box: {box}")
+         ```
+         """
+         import time
+         start_time = time.time()
+ 
+         # Convert to numpy if PIL Image
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+ 
+         # Ensure the correct format (RGB)
+         if len(image.shape) == 2:
+             image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+         elif image.shape[2] == 4:
+             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+ 
+         try:
+             # Call the PaddleOCR full pipeline.
+             # API from the documentation: result = ocr.predict(input)
+             predict_result = self.ocr.predict(image)
+ 
+             texts = []
+             boxes = []
+             scores = []
+ 
+             # Parse the result - format: iterable of result objects
+             for res in predict_result:
+                 # Extract data from the result object.
+                 # Per the documentation, results have fields: dt_polys, rec_texts, etc.
+                 if hasattr(res, 'dt_polys'):
+                     dt_polys = res.dt_polys
+                 elif isinstance(res, dict) and 'dt_polys' in res:
+                     dt_polys = res['dt_polys']
+                 else:
+                     dt_polys = []
+ 
+                 if hasattr(res, 'rec_texts'):
+                     rec_texts = res.rec_texts
+                 elif isinstance(res, dict) and 'rec_texts' in res:
+                     rec_texts = res['rec_texts']
+                 else:
+                     rec_texts = []
+ 
+                 # Append results
+                 texts.extend([str(t) for t in rec_texts if t])
+                 boxes.extend(dt_polys if isinstance(dt_polys, list) else [dt_polys])
+ 
+                 # PP-OCRv5 may not return per-text scores; default to 1.0
+                 scores.extend([1.0] * len(rec_texts))
+ 
+             elapsed = time.time() - start_time
+ 
+             result = {
+                 'texts': texts,
+                 'boxes': boxes,
+                 'scores': scores,
+                 'elapsed_time': elapsed,
+             }
+ 
+             # Generate a visualization if requested
+             if return_visualization:
+                 vis_image = self._draw_boxes_and_texts(image.copy(), boxes, texts)
+                 result['visualization'] = vis_image
+ 
+             logging.info(
+                 f"PP-OCRv5 processed image: {len(texts)} texts detected "
+                 f"in {elapsed:.2f}s"
+             )
+ 
+             return result
+ 
+         except Exception as e:
+             logging.error(f"PP-OCRv5 full image recognition failed: {e}", exc_info=True)
+             return {
+                 'texts': [],
+                 'boxes': [],
+                 'scores': [],
+                 'elapsed_time': time.time() - start_time,
+             }
+ 
+     def _draw_boxes_and_texts(
+         self,
+         image: np.ndarray,
+         boxes: List[np.ndarray],
+         texts: List[str]
+     ) -> np.ndarray:
+         """
+         Draw boxes and texts onto the image for visualization.
+ 
+         Args:
+             image: Image to draw on
+             boxes: List of boxes (N, 4, 2)
+             texts: List of texts
+ 
+         Returns:
+             Image with boxes and texts drawn
+         """
+         for box, text in zip(boxes, texts):
+             if isinstance(box, np.ndarray) and box.shape == (4, 2):
+                 # Draw the box
+                 pts = box.astype(np.int32)
+                 cv2.polylines(image, [pts], True, (0, 255, 0), 2)
+ 
+                 # Draw the text
+                 cv2.putText(
+                     image,
+                     text,
+                     tuple(pts[0]),
+                     cv2.FONT_HERSHEY_SIMPLEX,
+                     0.5,
+                     (255, 0, 0),
+                     1
+                 )
+ 
+         return image
+ 
+ 
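A sketch exercising the visualization path of `recognize_full_image()`; the input array and output filename are illustrative:

```python
import cv2

recognizer = PPOCRv5Recognizer(lang='vi')
out = recognizer.recognize_full_image(page_np, return_visualization=True)  # page_np assumed

print(f"{len(out['texts'])} lines in {out['elapsed_time']:.2f}s")
if 'visualization' in out:
    # _draw_boxes_and_texts drew green box outlines and blue labels on a copy
    cv2.imwrite("page_ocr_vis.png", cv2.cvtColor(out['visualization'], cv2.COLOR_RGB2BGR))
```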
+ class AdvancedPaddleOCR(PPOCRv5Recognizer):
+     """
+     Advanced PaddleOCR configuration for difficult documents.
+ 
+     Enables:
+     - Document orientation classification (auto-rotate)
+     - Document unwarping (UVDoc) for bent/curved pages
+     - Textline orientation correction
+ 
+     Ideal for:
+     - Scanned legal documents
+     - Photos of documents taken with mobile phones
+     - Tilted/skewed scans
+     """
+ 
+     def __init__(self, lang: str = 'vi', device_id: Optional[int] = None, **kwargs):
+         super().__init__(
+             lang=lang,
+             device_id=device_id,
+             use_doc_orientation=True,
+             use_doc_unwarping=True,
+             use_textline_orientation=True,
+             **kwargs
+         )
+         logging.info("✓ AdvancedPaddleOCR initialized with Unwarping & Orientation enabled")
+ 
+ 
+ # ============================================================================
+ # Post-Processing Implementations
+ # ============================================================================
+ 
+ class VietnameseTextPostProcessor(PostProcessingPhase):
+     """
+     Post-processing specialized for Vietnamese.
+ 
+     Features:
+     - Fixes common OCR errors (I -> l, 0 -> O, etc.)
+     - Removes noisy characters
+     - Normalizes Vietnamese diacritics
+ 
+     Note: This is a placeholder. It can be extended with dictionary-based correction.
+     """
+ 
+     def __init__(self):
+         logging.info("✓ VietnameseTextPostProcessor initialized")
+ 
+     def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
+         """
+         Clean up text to reduce noise and improve quality.
+         """
+         if not text or not text.strip():
+             return text
+ 
+         # Basic cleaning
+         cleaned = text.strip()
+ 
+         # TODO: Implement Vietnamese-specific corrections
+         # - Fix common OCR errors (I/l, 0/O, etc.)
+         # - Remove excessive whitespace
+         # - Normalize diacritics
+ 
+         return cleaned
+ 
+ 
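One way to grow this placeholder into the dictionary-based correction its docstring mentions is a confusion map applied only below a confidence cutoff. A self-contained sketch; the map contents and the 0.6 cutoff are illustrative assumptions:

```python
import re
import unicodedata

# Hypothetical confusion pairs for low-confidence lines
_CONFUSIONS = {"0": "O", "|": "l"}

def correct_low_confidence(text: str, confidence: float, cutoff: float = 0.6) -> str:
    text = unicodedata.normalize("NFC", text)  # compose Vietnamese diacritics
    text = re.sub(r"\s+", " ", text).strip()   # collapse excessive whitespace
    if confidence < cutoff:
        for bad, good in _CONFUSIONS.items():
            text = text.replace(bad, good)
    return text
```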
+ # ============================================================================
+ # Document Reconstruction Implementations
+ # ============================================================================
+ 
+ class SmartMarkdownReconstruction(DocumentReconstructionPhase):
+     """
+     Smart Markdown reconstruction with reading-order intelligence.
+ 
+     Features:
+     - Smart sorting: Y first, then X for same-line regions
+     - Handles multi-column layouts
+     - Preserves document structure
+     """
+ 
+     def __init__(self, y_threshold: int = 30):
+         """
+         Args:
+             y_threshold: Threshold (pixels) for deciding whether regions are on the same line
+         """
+         self.y_threshold = y_threshold
+         logging.info(f"✓ SmartMarkdownReconstruction initialized (y_threshold={y_threshold}px)")
+ 
+     def reconstruct(
+         self,
+         regions: List[Tuple[int, str, Any]],
+         output_format: str = "markdown"
+     ) -> str:
+         """
+         Join regions together using smart sorting.
+ 
+         Args:
+             regions: List of (y_position, content, bbox) tuples
+             output_format: Only "markdown" is supported
+ 
+         Returns:
+             Markdown string
+         """
+         if output_format != "markdown":
+             raise NotImplementedError(f"Format {output_format} not supported")
+ 
+         # Smart sort the regions (Y first, X second for the same line)
+         sorted_regions = self._smart_sort_regions(regions)
+ 
+         # Concatenate with double newlines
+         markdown = "\n\n".join([item[1] for item in sorted_regions])
+         return markdown
+ 
+     def _smart_sort_regions(self, regions: List[Tuple[int, str, Any]]) -> List[Tuple[int, str, Any]]:
+         """
+         Sort regions into a sensible reading order.
+ 
+         Algorithm:
+         1. Group regions by Y coordinate (within a threshold)
+         2. Sort each group by X coordinate
+         3. Flatten the results
+         """
+         if not regions:
+             return regions
+ 
+         # Convert to dict format for sorting
+         regions_dict = []
+         for item in regions:
+             y_pos = item[0]
+             content = item[1]
+ 
+             # Extract x0 from the bbox if available
+             x0 = 0
+             if len(item) > 2 and isinstance(item[2], (list, tuple)):
+                 bbox = item[2]
+                 x0 = bbox[0] if len(bbox) > 0 else 0
+ 
+             regions_dict.append({
+                 "top": y_pos,
+                 "x0": x0,
+                 "content": content,
+                 "original": item
+             })
+ 
+         # Use LayoutRecognizer.sort_Y_firstly for smart Y+X sorting
+         sorted_dict = LayoutRecognizer.sort_Y_firstly(regions_dict, self.y_threshold)
+ 
+         # Reconstruct the original format
+         return [r["original"] for r in sorted_dict]
+ 
+ 
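To see the Y-then-X ordering concretely: with a 30px threshold, two regions whose tops differ by less than 30px count as one visual line and read left to right. A small sketch with made-up coordinates:

```python
reconstructor = SmartMarkdownReconstruction(y_threshold=30)

regions = [
    (112, "right column cell", [300, 112, 500, 140]),  # same visual line as the next one
    (100, "left column cell", [10, 100, 200, 140]),
    (200, "next paragraph", [10, 200, 500, 260]),
]
# Expected reading order: left cell, right cell, then the paragraph.
print(reconstructor.reconstruct(regions))
```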
+ # ============================================================================
+ # Factory Functions
+ # ============================================================================
+ 
+ def create_default_pipeline() -> Dict[str, Any]:
+     """
+     Create the default pipeline from the current implementations.
+ 
+     Returns:
+         Dictionary containing the phase instances:
+         {
+             "layout_analyzer": LayoutAnalysisPhase,
+             "text_detector": TextDetectionPhase,
+             "text_recognizer": TextRecognitionPhase,
+             "post_processor": PostProcessingPhase,
+             "reconstructor": DocumentReconstructionPhase
+         }
+     """
+     return {
+         "layout_analyzer": DocLayoutYOLOAnalyzer(),
+         "text_detector": PaddleOCRTextDetector(),
+         "text_recognizer": VietOCRRecognizer(),
+         "post_processor": VietnameseTextPostProcessor(),
+         "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
+     }
+ 
+ 
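Wiring this factory into a pipeline looks the same as the example in the `create_svtrv2_pipeline` docstring below; a sketch, with the `DocumentPipeline` keyword arguments taken from that docstring and the `image` assumed loaded:

```python
from deepdoc_vietocr import DocumentPipeline
from deepdoc_vietocr.implementations import create_default_pipeline

config = create_default_pipeline()
pipeline = DocumentPipeline(**config, threshold=0.5, max_workers=2)
result = pipeline.process(image, img_name='test', figure_save_dir='./output')
```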
+ def create_svtrv2_pipeline(device_id: Optional[int] = None, lang: str = 'vi') -> Dict[str, Any]:
+     """
+     Create a pipeline with the SVTRv2 recognizer (recommended for production).
+ 
+     SVTRv2 benefits:
+     - 3x faster than VietOCR (150ms vs 500ms per crop)
+     - +15-20% accuracy improvement (92-95% vs 75-80%)
+     - Better Vietnamese diacritic handling
+     - Less character noise
+     - Production-ready and stable
+ 
+     Args:
+         device_id: Device ID for CUDA (None = CPU)
+         lang: Language code ('vi' for Vietnamese, 'en' for English)
+ 
+     Returns:
+         Dictionary containing the phase instances with SVTRv2
+ 
+     Example:
+     ```python
+     from deepdoc_vietocr import DocumentPipeline
+     from deepdoc_vietocr.implementations import create_svtrv2_pipeline
+ 
+     # Create the SVTRv2 pipeline
+     config = create_svtrv2_pipeline()
+     pipeline = DocumentPipeline(**config, threshold=0.5, max_workers=2)
+ 
+     # Process an image
+     result = pipeline.process(image, img_name='test', figure_save_dir='./output')
+     ```
+     """
+     return {
+         "layout_analyzer": DocLayoutYOLOAnalyzer(),
+         "text_detector": PaddleOCRTextDetector(),
+         "text_recognizer": SVTRv2Recognizer(device_id=device_id, lang=lang),  # ← SVTRv2
+         "post_processor": VietnameseTextPostProcessor(),
+         "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
+     }
+ 
+ 
+ def create_experimental_pipeline() -> Dict[str, Any]:
+     """
+     Create an experimental pipeline with the newer implementations.
+ 
+     Example: swap VietOCR for the LandingAI recognizer.
+     """
+     return {
+         "layout_analyzer": DocLayoutYOLOAnalyzer(),
+         "text_detector": PaddleOCRTextDetector(),
+         "text_recognizer": LandingAIRecognizer(),  # Experimental
+         "post_processor": VietnameseTextPostProcessor(),
+         "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
+     }
+ 
+ 
+ # ============================================================================
+ # Hybrid Pipeline Implementations (Fusion)
+ # ============================================================================
+ 
+ def ocr_region_worker(args):
+     """
+     Worker function for parallel processing.
+ 
+     Args:
+         args: (crop, detector_instance, recognizer_instance)
+ 
+     Note: For process-based parallelism the detector/recognizer would have to be
+     picklable or re-initialized per worker, and re-initializing models in each
+     worker is expensive. VietOCR is lightweight enough on CPU and the Paddle
+     detector (ONNX) is also fine, so this worker is instead called directly
+     from threads (see HybridStructureVietOCRAnalyzer below). An alternative is
+     to initialize the models inside each worker via a global singleton pattern.
+     """
+     crop, detector_instance, recognizer_instance = args
+ 
+     if crop is None:
+         return "", 1.0
+ 
+     # 1. Line detection (split a paragraph into lines).
+     # Convert PIL to numpy for the Paddle detector.
+     crop_np = np.array(crop)
+     # Ensure RGB
+     if len(crop_np.shape) == 2:
+         crop_np = cv2.cvtColor(crop_np, cv2.COLOR_GRAY2RGB)
+     elif crop_np.shape[2] == 4:
+         crop_np = cv2.cvtColor(crop_np, cv2.COLOR_RGBA2RGB)
+ 
+     dt_boxes, _ = detector_instance.detect(crop_np)
+ 
+     if dt_boxes is None or len(dt_boxes) == 0:
+         # Fallback: treat the whole crop as a single line; it may already be
+         # one, though VietOCR can fail if it was a multi-line paragraph.
+         lines = [crop]
+     else:
+         # Sort lines into reading order (top to bottom).
+         # dt_boxes shape: [N, 4, 2]; sort by the Y coordinate of the top-left corner.
+         dt_boxes = sorted(dt_boxes, key=lambda b: b[0][1])
+ 
+         lines = []
+         for box in dt_boxes:
+             # Crop each line. The box might be rotated, but assume it is mostly
+             # horizontal (or only slightly tilted) and take the min/max x, y.
+             h, w, _ = crop_np.shape
+             box_int = box.astype(np.int32)
+             x_min = max(0, np.min(box_int[:, 0]))
+             x_max = min(w, np.max(box_int[:, 0]))
+             y_min = max(0, np.min(box_int[:, 1]))
+             y_max = min(h, np.max(box_int[:, 1]))
+ 
+             if x_max > x_min and y_max > y_min:
+                 line_crop = crop.crop((x_min, y_min, x_max, y_max))
+                 lines.append(line_crop)
+ 
+     # 2. Line recognition (VietOCR)
+     if not lines:
+         return "", 0.0
+ 
+     results, _ = recognizer_instance.recognize(lines)
+ 
+     texts = []
+     confs = []
+     for text, conf in results:
+         if text.strip():
+             texts.append(text.strip())
+             confs.append(conf)
+ 
+     final_text = " ".join(texts)
+     avg_conf = sum(confs) / len(confs) if confs else 0.0
+ 
+     return final_text, avg_conf
+ 
+ class HybridStructureVietOCRAnalyzer(LayoutAnalysisPhase):
+     """
+     Hybrid pipeline combining PP-StructureV3 (layout) + VietOCR (text).
+ 
+     Strategy:
+     1. Use PP-StructureV3 for layout analysis (top-down reading order, table detection).
+     2. For 'text'/'title' regions: crop the image and use VietOCR for high-accuracy recognition.
+     3. For 'table' regions: keep PP-StructureV3's HTML/Markdown output (VietOCR cannot handle tables).
+ 
+     Pros:
+     - Best of both worlds: correct layout + correct Vietnamese characters.
+     - Handles complex tables (borderless, merged cells).
+     - Solves the 'bottom-up' reading-order issue of YOLOv10.
+ 
+     Cons:
+     - Slower than pure YOLO+VietOCR (heavier layout model).
+     - Slower than pure PP-StructureV3 (extra VietOCR calls).
+     """
+ 
+     def __init__(self, lang: str = 'vi'):
+         import multiprocessing
+ 
+         # 1. Initialize the text recognition engine (VietOCR) EARLY to avoid conflicts.
+         # VietOCR is CPU optimized and very accurate for Vietnamese.
+         self.text_engine = VietOCRRecognizer()
+         logging.info("✓ HybridPipeline: Text Engine (VietOCR) ready")
+ 
+         # 2. Initialize the line detector (PaddleOCR det),
+         # needed for splitting paragraphs into lines.
+         self.line_detector = PaddleOCRTextDetector()
+         logging.info("✓ HybridPipeline: Line Detector (PaddleOCR) ready")
+ 
+         # 3. Initialize the layout engine (PP-StructureV3).
+         # Note: reuse the existing wrapper so import errors are handled safely.
+         self.layout_engine = PaddleStructureV3Analyzer(lang=lang, show_log=False)
+         logging.info("✓ HybridPipeline: Layout Engine (PP-StructureV3) ready")
+ 
+         # CPU count for parallel region processing
+         self.num_workers = min(4, multiprocessing.cpu_count())
+ 
+     def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
+         """
+         Run the hybrid analysis pipeline.
+         """
+         # Ensure the image is PIL for cropping (VietOCR prefers PIL)
+         if isinstance(image, np.ndarray):
+             image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+         else:
+             image_pil = image
+ 
+         # Step 1: Run layout analysis (PP-StructureV3).
+         # This returns regions with 'bbox', 'type', and raw 'content' (from PaddleOCR).
+         logging.info("HybridPipeline: Running Layout Analysis...")
+         layout_results = self.layout_engine.analyze(image_pil, threshold=threshold)
+ 
+         # Step 2: Refine text content with VietOCR.
+         logging.info(f"HybridPipeline: Refining {len(layout_results)} regions with VietOCR...")
+ 
+         # Parallelization note: multiprocessing would require pickling (or
+         # re-initializing) the models in every worker, which is expensive, so
+         # regions are processed in a ThreadPoolExecutor instead. The workload is
+         # CPU bound and the GIL limits pure-Python speedup, but ONNX Runtime
+         # releases the GIL and PyTorch inference is thread-safe, so threads
+         # still help while keeping the non-picklable model objects shared.
+ 
+         from concurrent.futures import ThreadPoolExecutor
+ 
+         def process_single_region(idx_region_tuple):
+             idx, region = idx_region_tuple
+             rtype = region.get('type', '').lower()
+ 
+             # Note: PPStructureV3 uses 'paragraph_title'
+             if rtype in ['text', 'title', 'header', 'footer', 'paragraph_title', 'reference', 'list']:
+                 bbox = region.get('bbox')
+                 if bbox:
+                     # Crop the image: [x0, y0, x1, y1]
+                     x0, y0, x1, y1 = map(int, bbox)
+                     w, h = image_pil.size
+                     x0 = max(0, x0); y0 = max(0, y0)
+                     x1 = min(w, x1); y1 = min(h, y1)
+ 
+                     if x1 > x0 and y1 > y0:
+                         crop = image_pil.crop((x0, y0, x1, y1))
+ 
+                         # Call the worker logic directly (no pickling issues),
+                         # passing self.line_detector and self.text_engine.
+                         text, conf = ocr_region_worker((crop, self.line_detector, self.text_engine))
+ 
+                         region['content'] = text
+                         region['score'] = conf
+                         region['source'] = 'VietOCR+LineDet'
+             return region
+ 
+         # Collect the regions to process
+         tasks = list(enumerate(layout_results))
+ 
+         # Run in a thread pool (lighter than a process pool, and it works with
+         # non-picklable objects; ONNX Runtime releases the GIL, so threads help).
+         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+             results = list(executor.map(process_single_region, tasks))
+ 
+         return results
+ 
+ def create_hybrid_pipeline() -> Dict[str, Any]:
+     """
+     Create the hybrid pipeline (fusion strategy).
+ 
+     Usage is similar to the other pipelines, but the 'layout_analyzer' does
+     almost everything itself.
+     """
+     return {
+         "layout_analyzer": HybridStructureVietOCRAnalyzer(),
+         # The other components are placeholders, since the hybrid analyzer
+         # handles everything internally, but they are kept for interface consistency.
+         "text_detector": None,
+         "text_recognizer": None,
+         "post_processor": VietnameseTextPostProcessor(),
+         "reconstructor": SmartMarkdownReconstruction(y_threshold=30)
+     }
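A closing sketch of driving the hybrid analyzer directly, since this factory intentionally leaves the other phases as `None`; the input file is illustrative:

```python
from PIL import Image

config = create_hybrid_pipeline()
analyzer = config["layout_analyzer"]

page = Image.open("scan.png").convert("RGB")  # hypothetical input
for region in analyzer.analyze(page, threshold=0.5):
    if region.get("source") == "VietOCR+LineDet":
        print(region["type"], region["content"])
```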