xfmr-zem 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. xfmr_zem/cli.py +32 -3
  2. xfmr_zem/client.py +59 -8
  3. xfmr_zem/server.py +21 -4
  4. xfmr_zem/servers/data_juicer/server.py +1 -1
  5. xfmr_zem/servers/instruction_gen/server.py +1 -1
  6. xfmr_zem/servers/io/server.py +1 -1
  7. xfmr_zem/servers/llm/parameters.yml +10 -0
  8. xfmr_zem/servers/nemo_curator/server.py +1 -1
  9. xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  10. xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  11. xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  12. xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  13. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  14. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  15. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  16. xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  17. xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  18. xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  19. xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  20. xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  21. xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  22. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  23. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  24. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  25. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  26. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  27. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  28. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  29. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  30. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  31. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  32. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  33. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  34. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  35. xfmr_zem/servers/ocr/engines.py +242 -0
  36. xfmr_zem/servers/ocr/install_models.py +63 -0
  37. xfmr_zem/servers/ocr/parameters.yml +4 -0
  38. xfmr_zem/servers/ocr/server.py +102 -0
  39. xfmr_zem/servers/profiler/parameters.yml +4 -0
  40. xfmr_zem/servers/sinks/parameters.yml +6 -0
  41. xfmr_zem/servers/unstructured/parameters.yml +6 -0
  42. xfmr_zem/servers/unstructured/server.py +62 -0
  43. xfmr_zem/zenml_wrapper.py +20 -7
  44. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/METADATA +20 -1
  45. xfmr_zem-0.2.6.dist-info/RECORD +58 -0
  46. xfmr_zem-0.2.4.dist-info/RECORD +0 -23
  47. /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
  48. /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
  49. /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
  50. /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
  51. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/WHEEL +0 -0
  52. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/entry_points.txt +0 -0
  53. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,191 @@
1
+ """
2
+ Abstract Phase Architecture for Document Processing Pipeline
3
+
4
+ Kiến trúc này tách biệt các giai đoạn xử lý thành các abstract base classes,
5
+ cho phép dễ dàng thử nghiệm và thay thế các implementations khác nhau mà
6
+ không cần thay đổi code ở các phase khác.
7
+
8
+ Pipeline Flow:
9
+ 1. Layout Analysis Phase: Phát hiện vùng layout (text, table, figure, etc.)
10
+ 2. Text Detection Phase: Phát hiện text boxes trong mỗi region
11
+ 3. Text Recognition Phase: Nhận dạng text từ các text boxes
12
+ 4. Post-Processing Phase: Làm sạch và cải thiện text output
13
+ 5. Document Reconstruction Phase: Ghép nối các regions thành markdown
14
+
15
+ Mỗi phase có thể được thay thế độc lập bằng implementations khác.
16
+ """
17
+
18
+ from abc import ABC, abstractmethod
19
+ from typing import List, Dict, Tuple, Optional, Any
20
+ from PIL import Image
21
+ import numpy as np
22
+
23
+
24
+ # ============================================================================
25
+ # Phase 1: Layout Analysis
26
+ # ============================================================================
27
+
28
class LayoutAnalysisPhase(ABC):
    """Interface for the layout-analysis stage of the pipeline.

    Implementations detect and classify the regions of a document
    (text, table, figure, etc.).
    """

    @abstractmethod
    def analyze(self, image: Image.Image, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """Analyze the layout of a document image.

        Args:
            image: PIL image to analyze.
            threshold: Confidence threshold for keeping a detection.

        Returns:
            A list of regions, each region being a dict::

                {
                    "bbox": [x0, y0, x1, y1],  # bounding box
                    "type": str,               # "text", "table", "figure", "title", etc.
                    "score": float,            # confidence score (0-1)
                    "label": str,              # more detailed label (optional)
                }
        """
        ...
53
+
54
+
55
+ # ============================================================================
56
+ # Phase 2: Text Detection
57
+ # ============================================================================
58
+
59
class TextDetectionPhase(ABC):
    """Interface for the text-detection stage of the pipeline.

    Implementations locate the text boxes inside an image region.
    """

    @abstractmethod
    def detect(self, image: np.ndarray) -> Tuple[Optional[np.ndarray], Any]:
        """Detect text boxes in an image.

        Args:
            image: Image as a numpy array (BGR or RGB format).

        Returns:
            A ``(dt_boxes, elapsed_time)`` tuple:

            - ``dt_boxes``: numpy array of shape (N, 4, 2), or ``None`` when
              nothing could be detected. Each box is its four corner points
              ``[top-left, top-right, bottom-right, bottom-left]``.
            - ``elapsed_time``: processing time (may be ignored; return 0).
        """
        ...
80
+
81
+
82
+ # ============================================================================
83
+ # Phase 3: Text Recognition
84
+ # ============================================================================
85
+
86
class TextRecognitionPhase(ABC):
    """Interface for the text-recognition stage of the pipeline.

    Implementations recognize text from image crops that were already detected.
    """

    @abstractmethod
    def recognize(self, image_crops: List[Any]) -> Tuple[List[Tuple[str, float]], float]:
        """Recognize text from a list of image crops.

        Args:
            image_crops: List of image crops (numpy arrays or PIL images).

        Returns:
            A ``(results, elapsed_time)`` tuple:

            - ``results``: list of ``(text, confidence)`` tuples, where
              ``text`` is the recognized string and ``confidence`` is a
              confidence score (0-1).
            - ``elapsed_time``: processing time (may be ignored; return 0).
        """
        ...
108
+
109
+
110
+ # ============================================================================
111
+ # Phase 4: Post-Processing
112
+ # ============================================================================
113
+
114
class PostProcessingPhase(ABC):
    """Interface for the post-processing stage of the pipeline.

    Implementations clean up and improve recognized text
    (remove noise, spell check, etc.).
    """

    @abstractmethod
    def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
        """Post-process a piece of recognized text.

        Args:
            text: The recognized text.
            confidence: Confidence score of the recognition.
            metadata: Additional information (region type, bbox, etc.).

        Returns:
            The cleaned/improved text.
        """
        ...
134
+
135
+
136
+ # ============================================================================
137
+ # Phase 5: Document Reconstruction
138
+ # ============================================================================
139
+
140
class DocumentReconstructionPhase(ABC):
    """Interface for the document-reconstruction stage of the pipeline.

    Implementations order and join the regions into the final document
    (markdown, html, etc.).
    """

    @abstractmethod
    def reconstruct(
        self,
        regions: List[Tuple[int, str, Any]],
        output_format: str = "markdown",
    ) -> str:
        """Join the processed regions into a document.

        Args:
            regions: List of ``(y_position, content, bbox)`` tuples.
            output_format: "markdown", "html", "plain", etc.

        Returns:
            The final document as a string.
        """
        ...
163
+
164
+
165
+ # ============================================================================
166
+ # Default Implementations (No-op)
167
+ # ============================================================================
168
+
169
class NoOpPostProcessing(PostProcessingPhase):
    """Default post-processor that performs no processing.

    Use it when no post-processing is needed.
    """

    def process(self, text: str, confidence: float, metadata: Optional[Dict] = None) -> str:
        """Return ``text`` unchanged; ``confidence`` and ``metadata`` are ignored."""
        return text
176
+
177
+
178
class SimpleMarkdownReconstruction(DocumentReconstructionPhase):
    """Default reconstruction: join the regions with a double newline."""

    def reconstruct(
        self,
        regions: List[Tuple[int, str, Any]],
        output_format: str = "markdown",
    ) -> str:
        """Concatenate the content of every region, separated by a blank line.

        Args:
            regions: Sequence of entries whose element at index 1 is the
                content string; entries are joined in the order given.
            output_format: Only "markdown" is accepted.

        Returns:
            The region contents joined by ``"\\n\\n"``.

        Raises:
            NotImplementedError: If ``output_format`` is not "markdown".
        """
        if output_format == "markdown":
            # Only the content slot (index 1) of each entry is used.
            return "\n\n".join(entry[1] for entry in regions)

        raise NotImplementedError(f"Format {output_format} not supported")