xfmr-zem 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfmr_zem/cli.py +32 -3
- xfmr_zem/client.py +59 -8
- xfmr_zem/server.py +21 -4
- xfmr_zem/servers/data_juicer/server.py +1 -1
- xfmr_zem/servers/instruction_gen/server.py +1 -1
- xfmr_zem/servers/io/server.py +1 -1
- xfmr_zem/servers/llm/parameters.yml +10 -0
- xfmr_zem/servers/nemo_curator/server.py +1 -1
- xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
- xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
- xfmr_zem/servers/ocr/engines.py +242 -0
- xfmr_zem/servers/ocr/install_models.py +63 -0
- xfmr_zem/servers/ocr/parameters.yml +4 -0
- xfmr_zem/servers/ocr/server.py +102 -0
- xfmr_zem/servers/profiler/parameters.yml +4 -0
- xfmr_zem/servers/sinks/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/parameters.yml +6 -0
- xfmr_zem/servers/unstructured/server.py +62 -0
- xfmr_zem/zenml_wrapper.py +20 -7
- {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/METADATA +20 -1
- xfmr_zem-0.2.6.dist-info/RECORD +58 -0
- xfmr_zem-0.2.4.dist-info/RECORD +0 -23
- /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
- /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
- {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/WHEEL +0 -0
- {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/entry_points.txt +0 -0
- {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Abstract Phase Architecture for Document Processing Pipeline
|
|
3
|
+
|
|
4
|
+
Kiến trúc này tách biệt các giai đoạn xử lý thành các abstract base classes,
|
|
5
|
+
cho phép dễ dàng thử nghiệm và thay thế các implementations khác nhau mà
|
|
6
|
+
không cần thay đổi code ở các phase khác.
|
|
7
|
+
|
|
8
|
+
Pipeline Flow:
|
|
9
|
+
1. Layout Analysis Phase: Phát hiện vùng layout (text, table, figure, etc.)
|
|
10
|
+
2. Text Detection Phase: Phát hiện text boxes trong mỗi region
|
|
11
|
+
3. Text Recognition Phase: Nhận dạng text từ các text boxes
|
|
12
|
+
4. Post-Processing Phase: Làm sạch và cải thiện text output
|
|
13
|
+
5. Document Reconstruction Phase: Ghép nối các regions thành markdown
|
|
14
|
+
|
|
15
|
+
Mỗi phase có thể được thay thế độc lập bằng implementations khác.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from abc import ABC, abstractmethod
|
|
19
|
+
from typing import List, Dict, Tuple, Optional, Any
|
|
20
|
+
from PIL import Image
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ============================================================================
|
|
25
|
+
# Phase 1: Layout Analysis
|
|
26
|
+
# ============================================================================
|
|
27
|
+
|
|
28
|
+
class LayoutAnalysisPhase(ABC):
    """Abstract base class for the layout-analysis phase.

    Detects and classifies the regions of a document
    (text, table, figure, etc.).
    """

    @abstractmethod
    def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """Analyze the layout of a document image.

        Args:
            image: PIL image to analyze.
            threshold: Confidence threshold a detection must reach to be kept.

        Returns:
            A list of regions, each region being a dict of the form::

                {
                    "bbox": [x0, y0, x1, y1],  # Bounding box
                    "type": str,               # "text", "table", "figure", "title", etc.
                    "score": float,            # Confidence score (0-1)
                    "label": str,              # More detailed label (optional)
                }
        """
        pass
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ============================================================================
|
|
56
|
+
# Phase 2: Text Detection
|
|
57
|
+
# ============================================================================
|
|
58
|
+
|
|
59
|
+
class TextDetectionPhase(ABC):
    """Abstract base class for the text-detection phase.

    Locates the text boxes inside an image region.
    """

    @abstractmethod
    def detect(self, image: np.ndarray) -> Tuple[Optional[np.ndarray], Any]:
        """Detect text boxes in an image.

        Args:
            image: Numpy array holding the image (BGR or RGB format).

        Returns:
            A tuple ``(dt_boxes, elapsed_time)``:

            - dt_boxes: numpy array of shape (N, 4, 2), or None when nothing
              was detected. Each box is its four corner points
              [top-left, top-right, bottom-right, bottom-left].
            - elapsed_time: Processing time (may be ignored; return 0).
        """
        pass
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ============================================================================
|
|
83
|
+
# Phase 3: Text Recognition
|
|
84
|
+
# ============================================================================
|
|
85
|
+
|
|
86
|
+
class TextRecognitionPhase(ABC):
    """Abstract base class for the text-recognition phase.

    Recognizes text from image crops that have already been detected.
    """

    @abstractmethod
    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """Recognize text from a list of image crops.

        Args:
            image_crops: List of image crops (numpy arrays or PIL images).

        Returns:
            A tuple ``(results, elapsed_time)``:

            - results: List of ``(text, confidence)`` tuples, where ``text``
              is the recognized string and ``confidence`` is a score (0-1).
            - elapsed_time: Processing time (may be ignored; return 0).
        """
        pass
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ============================================================================
|
|
111
|
+
# Phase 4: Post-Processing
|
|
112
|
+
# ============================================================================
|
|
113
|
+
|
|
114
|
+
class PostProcessingPhase(ABC):
    """Abstract base class for the post-processing phase.

    Cleans up and improves recognized text (noise removal, spell checking,
    etc.).
    """

    @abstractmethod
    def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
        """Post-process a piece of recognized text.

        Args:
            text: The recognized text.
            confidence: Confidence score of the recognition.
            metadata: Additional information (region type, bbox, etc.).

        Returns:
            The cleaned/improved text.
        """
        pass
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ============================================================================
|
|
137
|
+
# Phase 5: Document Reconstruction
|
|
138
|
+
# ============================================================================
|
|
139
|
+
|
|
140
|
+
class DocumentReconstructionPhase(ABC):
    """Abstract base class for the document-reconstruction phase.

    Orders and assembles the regions into the final document
    (markdown, html, etc.).
    """

    @abstractmethod
    def reconstruct(
        self,
        regions: List[Tuple[int, str, Any]],
        output_format: str = "markdown",
    ) -> str:
        """Assemble the processed regions into a document.

        Args:
            regions: List of ``(y_position, content, bbox)`` tuples.
            output_format: "markdown", "html", "plain", etc.

        Returns:
            The final document as a string.
        """
        pass
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ============================================================================
|
|
166
|
+
# Default Implementations (No-op)
|
|
167
|
+
# ============================================================================
|
|
168
|
+
|
|
169
|
+
class NoOpPostProcessing(PostProcessingPhase):
    """Default implementation that does nothing and returns the text as-is.

    Use it when no post-processing is needed.
    """

    def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
        """Return ``text`` unchanged; ``confidence`` and ``metadata`` are ignored."""
        return text
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class SimpleMarkdownReconstruction(DocumentReconstructionPhase):
    """Default implementation that joins the regions with a double newline."""

    def reconstruct(
        self,
        regions: List[Tuple[int, str, Any]],
        output_format: str = "markdown",
    ) -> str:
        """Concatenate the content of every region, separated by a blank line.

        Regions are joined in the order given; this method does not sort them.

        Args:
            regions: List of tuples whose second element (index 1) is the
                content string to emit.
            output_format: Only "markdown" is supported.

        Returns:
            The region contents joined with ``"\\n\\n"``.

        Raises:
            NotImplementedError: If ``output_format`` is not "markdown".
        """
        if output_format != "markdown":
            raise NotImplementedError(f"Format {output_format} not supported")

        # Only the content element of each tuple is used.
        return "\n\n".join(region[1] for region in regions)
|