xfmr-zem 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/.gitignore +5 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/AGENTS.md +12 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/CHANGELOG.md +6 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/PKG-INFO +19 -1
- xfmr_zem-0.2.5/data/ocr_test.png +0 -0
- xfmr_zem-0.2.5/data/vietnamese_ocr.png +0 -0
- xfmr_zem-0.2.5/parameters.yml +7 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/pyproject.toml +20 -1
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/cli.py +32 -3
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/client.py +59 -8
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/server.py +21 -4
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/data_juicer/server.py +1 -1
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/instruction_gen/server.py +1 -1
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/io/server.py +1 -1
- xfmr_zem-0.2.5/src/xfmr_zem/servers/llm/parameters.yml +10 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/nemo_curator/server.py +1 -1
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/engines.py +242 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/install_models.py +63 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/parameters.yml +4 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/server.py +44 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/profiler/parameters.yml +4 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/sinks/parameters.yml +6 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/unstructured/parameters.yml +6 -0
- xfmr_zem-0.2.5/src/xfmr_zem/servers/unstructured/server.py +62 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/zenml_wrapper.py +20 -7
- xfmr_zem-0.2.5/tests/manual/hf_ocr_test.yaml +14 -0
- xfmr_zem-0.2.5/tests/manual/multimodal_test.yaml +12 -0
- xfmr_zem-0.2.5/tests/manual/ocr_test.yaml +19 -0
- xfmr_zem-0.2.5/tests/manual/viet_ocr_test.yaml +16 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/uv.lock +2025 -105
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/.github/workflows/deploy.yml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/.github/workflows/pypi-publish.yml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/LICENSE +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/README.md +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/big_data_output.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/big_data_sim.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/dup_cleaned.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/dup_data.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/dup_data_large.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/nemo_full_stack_result.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/nemo_real_result.parquet +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/output_result.jsonl +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/data/sample.jsonl +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/__init__.py +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/orchestrators/parallel_local.py +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/schemas.py +0 -0
- /xfmr_zem-0.2.4/src/xfmr_zem/servers/data_juicer/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/data_juicer/parameters.yml +0 -0
- /xfmr_zem-0.2.4/src/xfmr_zem/servers/instruction_gen/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/instruction_gen/parameters.yml +0 -0
- /xfmr_zem-0.2.4/src/xfmr_zem/servers/io/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/io/parameters.yml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/llm/server.py +0 -0
- /xfmr_zem-0.2.4/src/xfmr_zem/servers/nemo_curator/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/nemo_curator/parameters.yml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/profiler/server.py +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/sinks/server.py +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/caching_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/llm_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/parallel_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/parquet_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/phase4_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/profiler_test.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/tests/manual/standard_data_pipeline.yaml +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/website/docs/docs.css +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/website/docs/index.html +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/website/index.html +0 -0
- {xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/website/style.css +0 -0
|
@@ -55,3 +55,8 @@ Thumbs.db
|
|
|
55
55
|
# Project specific
|
|
56
56
|
outputs/
|
|
57
57
|
cache/
|
|
58
|
+
|
|
59
|
+
# Binary Models (OCR) - Should be downloaded separately
|
|
60
|
+
src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/*.onnx
|
|
61
|
+
src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.pth
|
|
62
|
+
src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.onnx
|
|
@@ -42,6 +42,18 @@
|
|
|
42
42
|
- Providers: **Ollama** (local default), **OpenAI**.
|
|
43
43
|
- **Sinks Server** (`src/xfmr_zem/servers/sinks/server.py`):
|
|
44
44
|
- Tools: `to_huggingface`, `to_vector_db`.
|
|
45
|
+
### 5. Unstructured Server (`servers/unstructured`)
|
|
46
|
+
Advanced document parsing for multimodal data ingestion.
|
|
47
|
+
- `parse_document`: Convert PDF, DOCX, HTML to structured text.
|
|
48
|
+
- `extract_tables`: Specifically isolate and extract table data from documents.
|
|
49
|
+
|
|
50
|
+
### 6. OCR Server (`servers/ocr`)
|
|
51
|
+
Unified OCR processing with multiple engine support (SOLID Strategy Pattern).
|
|
52
|
+
- `extract_text`: Extract text from images using different engines:
|
|
53
|
+
- `tesseract`: Lightweight and fast.
|
|
54
|
+
- `paddle`: Medium weight, high accuracy.
|
|
55
|
+
- `qwen`: Heavy Vision-Language Model (Qwen3-VL-8B) for state-of-the-art OCR.
|
|
56
|
+
- `viet`: Specialized Vietnamese OCR using built-in `deepdoc_vietocr` pipeline. Optimized for Vietnamese diacritics and document layout reconstruction.
|
|
45
57
|
|
|
46
58
|
## Orchestration & Concurrency
|
|
47
59
|
|
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.2.5] - 2026-02-03
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **Vietnamese OCR**: Integrated specialized Deep-ocr DocumentPipeline for high-accuracy Vietnamese text extraction with layout analysis.
|
|
9
|
+
- **Improved OCR Dependencies**: Added `pdfplumber`, `ruamel.yaml`, and `cachetools` to resolve OCR server tool errors.
|
|
10
|
+
|
|
5
11
|
## [0.2.4] - 2026-02-02
|
|
6
12
|
|
|
7
13
|
### Fixed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xfmr-zem
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
|
|
5
5
|
Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
|
|
6
6
|
Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
|
|
@@ -22,8 +22,10 @@ Requires-Dist: numpy>=1.24.0
|
|
|
22
22
|
Requires-Dist: pandas>=2.0.0
|
|
23
23
|
Requires-Dist: pyarrow>=15.0.0
|
|
24
24
|
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Requires-Dist: python-magic>=0.4.27
|
|
25
26
|
Requires-Dist: pyyaml>=6.0
|
|
26
27
|
Requires-Dist: rich>=13.0.0
|
|
28
|
+
Requires-Dist: unstructured[all-docs]>=0.16.0
|
|
27
29
|
Requires-Dist: zenml[local,server]>=0.75.0
|
|
28
30
|
Provides-Extra: all
|
|
29
31
|
Requires-Dist: nemo-curator>=0.6.0; extra == 'all'
|
|
@@ -39,6 +41,22 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
|
39
41
|
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
40
42
|
Provides-Extra: nemo
|
|
41
43
|
Requires-Dist: nemo-curator>=0.6.0; extra == 'nemo'
|
|
44
|
+
Provides-Extra: ocr
|
|
45
|
+
Requires-Dist: cachetools>=5.0.0; extra == 'ocr'
|
|
46
|
+
Requires-Dist: einops; extra == 'ocr'
|
|
47
|
+
Requires-Dist: onnxruntime>=1.16.0; extra == 'ocr'
|
|
48
|
+
Requires-Dist: opencv-python>=4.8.0; extra == 'ocr'
|
|
49
|
+
Requires-Dist: paddleocr>=2.7.0; extra == 'ocr'
|
|
50
|
+
Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
|
|
51
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
|
|
52
|
+
Requires-Dist: pillow>=10.0.0; extra == 'ocr'
|
|
53
|
+
Requires-Dist: pyclipper; extra == 'ocr'
|
|
54
|
+
Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
|
|
55
|
+
Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
|
|
56
|
+
Requires-Dist: shapely; extra == 'ocr'
|
|
57
|
+
Requires-Dist: torch==2.5.1; extra == 'ocr'
|
|
58
|
+
Requires-Dist: torchvision==0.20.1; extra == 'ocr'
|
|
59
|
+
Requires-Dist: transformers>=4.40.0; extra == 'ocr'
|
|
42
60
|
Provides-Extra: zenml
|
|
43
61
|
Requires-Dist: zenml>=0.75.0; extra == 'zenml'
|
|
44
62
|
Description-Content-Type: text/markdown
|
|
Binary file
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "xfmr-zem"
|
|
3
|
-
version = "0.2.4"
|
|
3
|
+
version = "0.2.5"
|
|
4
4
|
description = "Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10,<3.13"
|
|
@@ -25,6 +25,8 @@ dependencies = [
|
|
|
25
25
|
"dask-cuda>=24.0.0",
|
|
26
26
|
"ftfy>=6.3.1",
|
|
27
27
|
"pydantic>=2.0.0",
|
|
28
|
+
"unstructured[all-docs]>=0.16.0",
|
|
29
|
+
"python-magic>=0.4.27",
|
|
28
30
|
]
|
|
29
31
|
|
|
30
32
|
[project.urls]
|
|
@@ -48,6 +50,23 @@ all = [
|
|
|
48
50
|
"nemo-curator>=0.6.0",
|
|
49
51
|
"py-data-juicer>=1.0.0",
|
|
50
52
|
]
|
|
53
|
+
ocr = [
|
|
54
|
+
"pytesseract>=0.3.10",
|
|
55
|
+
"paddleocr>=2.7.0",
|
|
56
|
+
"paddlepaddle>=2.6.0",
|
|
57
|
+
"transformers>=4.40.0",
|
|
58
|
+
"torch==2.5.1",
|
|
59
|
+
"torchvision==0.20.1",
|
|
60
|
+
"pillow>=10.0.0",
|
|
61
|
+
"onnxruntime>=1.16.0",
|
|
62
|
+
"opencv-python>=4.8.0",
|
|
63
|
+
"shapely",
|
|
64
|
+
"pyclipper",
|
|
65
|
+
"einops",
|
|
66
|
+
"pdfplumber>=0.11.0",
|
|
67
|
+
"ruamel.yaml>=0.17.0",
|
|
68
|
+
"cachetools>=5.0.0",
|
|
69
|
+
]
|
|
51
70
|
dev = [
|
|
52
71
|
"pytest>=7.0.0",
|
|
53
72
|
"pytest-cov>=4.0.0",
|
|
@@ -173,8 +173,19 @@ def _print_static_operators():
|
|
|
173
173
|
@main.command()
|
|
174
174
|
@click.argument("config_file", type=click.Path(exists=True))
|
|
175
175
|
@click.option("--params", "-p", type=click.Path(exists=True), help="Path to custom parameters.yml")
|
|
176
|
-
|
|
176
|
+
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose/debug logging")
|
|
177
|
+
def run(config_file, params, verbose):
|
|
177
178
|
"""Run a pipeline from a YAML configuration file"""
|
|
179
|
+
# Configure logging based on verbosity
|
|
180
|
+
if verbose:
|
|
181
|
+
os.environ["ZEM_VERBOSE"] = "1"
|
|
182
|
+
logger.remove()
|
|
183
|
+
logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
|
|
184
|
+
console.print("[bold yellow]Verbose mode enabled - DEBUG logging active[/bold yellow]")
|
|
185
|
+
else:
|
|
186
|
+
logger.remove()
|
|
187
|
+
logger.add(sys.stderr, level="INFO", format="<level>{message}</level>")
|
|
188
|
+
|
|
178
189
|
abs_config = os.path.abspath(config_file)
|
|
179
190
|
console.print(f"[bold green]Starting Pipeline:[/bold green] {abs_config}")
|
|
180
191
|
if params:
|
|
@@ -239,12 +250,18 @@ def preview(artifact_id, id2, limit, sample):
|
|
|
239
250
|
return pd.DataFrame(lines)
|
|
240
251
|
elif isinstance(d, list): return pd.DataFrame(d)
|
|
241
252
|
elif isinstance(d, pd.DataFrame): return d
|
|
242
|
-
return
|
|
253
|
+
elif isinstance(d, dict): return pd.DataFrame([d])
|
|
254
|
+
return d
|
|
243
255
|
|
|
244
256
|
try:
|
|
245
257
|
df1 = load_art_df(artifact_id)
|
|
246
258
|
if df1 is None:
|
|
247
|
-
console.print("[bold red]Error:[/bold red]
|
|
259
|
+
console.print("[bold red]Error:[/bold red] Artifact is empty or could not be loaded.")
|
|
260
|
+
return
|
|
261
|
+
|
|
262
|
+
if not isinstance(df1, pd.DataFrame):
|
|
263
|
+
console.print(f"[bold blue]Artifact Preview (Type: {type(df1).__name__}):[/bold blue]")
|
|
264
|
+
console.print(str(df1))
|
|
248
265
|
return
|
|
249
266
|
|
|
250
267
|
if id2:
|
|
@@ -290,6 +307,18 @@ def preview(artifact_id, id2, limit, sample):
|
|
|
290
307
|
console.print(f"[bold red]Error previewing artifact:[/bold red] {e}")
|
|
291
308
|
|
|
292
309
|
|
|
310
|
+
@main.group()
|
|
311
|
+
def ocr():
|
|
312
|
+
"""OCR related commands (Installation, etc.)"""
|
|
313
|
+
pass
|
|
314
|
+
|
|
315
|
+
@ocr.command(name="install")
|
|
316
|
+
def ocr_install():
|
|
317
|
+
"""Install OCR model weights (ONNX/PTH)"""
|
|
318
|
+
from xfmr_zem.servers.ocr.install_models import main as install_main
|
|
319
|
+
install_main()
|
|
320
|
+
|
|
321
|
+
|
|
293
322
|
if __name__ == "__main__":
|
|
294
323
|
main()
|
|
295
324
|
|
|
@@ -4,6 +4,7 @@ import yaml
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from zenml import pipeline
|
|
6
6
|
from .zenml_wrapper import mcp_generic_step
|
|
7
|
+
from loguru import logger
|
|
7
8
|
import os
|
|
8
9
|
import sys
|
|
9
10
|
|
|
@@ -39,6 +40,17 @@ class PipelineClient:
|
|
|
39
40
|
params.update(yaml.safe_load(f) or {})
|
|
40
41
|
return params
|
|
41
42
|
|
|
43
|
+
def _flatten_params(self, d: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
|
|
44
|
+
"""Flatten nested dictionary into dot-notation keys."""
|
|
45
|
+
items = []
|
|
46
|
+
for k, v in d.items():
|
|
47
|
+
new_key = f"{prefix}.{k}" if prefix else k
|
|
48
|
+
if isinstance(v, dict):
|
|
49
|
+
items.extend(self._flatten_params(v, new_key).items())
|
|
50
|
+
else:
|
|
51
|
+
items.append((new_key, v))
|
|
52
|
+
return dict(items)
|
|
53
|
+
|
|
42
54
|
def _load_config_dict(self, path: Path) -> Dict[str, Any]:
|
|
43
55
|
"""Load YAML config and perform substitution."""
|
|
44
56
|
with open(path, "r") as f:
|
|
@@ -54,8 +66,13 @@ class PipelineClient:
|
|
|
54
66
|
custom_params = self._load_params(self.params_path)
|
|
55
67
|
self.params.update(custom_params)
|
|
56
68
|
|
|
69
|
+
# Flatten params for template substitution
|
|
70
|
+
flat_params = self._flatten_params(self.params)
|
|
71
|
+
|
|
57
72
|
content = raw_content
|
|
58
|
-
|
|
73
|
+
# Use reversed sorted keys to avoid partial replacements (e.g. ocr before ocr.engine)
|
|
74
|
+
for key in sorted(flat_params.keys(), key=len, reverse=True):
|
|
75
|
+
value = flat_params[key]
|
|
59
76
|
content = content.replace(f"{{{{ {key} }}}}", str(value))
|
|
60
77
|
content = content.replace(f"{{{{{key}}}}}", str(value))
|
|
61
78
|
|
|
@@ -96,6 +113,11 @@ class PipelineClient:
|
|
|
96
113
|
server_specific_params[key] = value
|
|
97
114
|
|
|
98
115
|
env["ZEM_PARAMETERS"] = yaml.dump(server_specific_params)
|
|
116
|
+
|
|
117
|
+
# Pass verbose flag to subprocess
|
|
118
|
+
if os.environ.get("ZEM_VERBOSE"):
|
|
119
|
+
env["ZEM_VERBOSE"] = "1"
|
|
120
|
+
|
|
99
121
|
configs[name] = {
|
|
100
122
|
"command": sys.executable,
|
|
101
123
|
"args": [str(abs_path)],
|
|
@@ -146,11 +168,35 @@ class PipelineClient:
|
|
|
146
168
|
|
|
147
169
|
srv, tool = key.split(".")
|
|
148
170
|
|
|
149
|
-
if not step_alias:
|
|
150
|
-
step_alias = step_def[key].get("name")
|
|
151
|
-
|
|
152
171
|
step_alias = step_alias or f"{srv}.{tool}.{i}"
|
|
153
|
-
|
|
172
|
+
|
|
173
|
+
val = step_def[key]
|
|
174
|
+
if isinstance(val, dict):
|
|
175
|
+
if "input" in val:
|
|
176
|
+
tool_args = val.get("input", {}) or {}
|
|
177
|
+
else:
|
|
178
|
+
# Use everything except 'name' as tool_args
|
|
179
|
+
tool_args = {k: v for k, v in val.items() if k != "name"}
|
|
180
|
+
else:
|
|
181
|
+
tool_args = {}
|
|
182
|
+
|
|
183
|
+
# Standardized Parameter Injection:
|
|
184
|
+
# Merge parameters from the 'parameters' section.
|
|
185
|
+
# Priority: Step-specific args > parameters.<srv>.<tool> > parameters.<srv>
|
|
186
|
+
srv_params = self.params.get(srv, {})
|
|
187
|
+
if isinstance(srv_params, dict):
|
|
188
|
+
# 1. Server-wide defaults
|
|
189
|
+
for k, v in srv_params.items():
|
|
190
|
+
if k != tool and not isinstance(v, dict) and k not in tool_args:
|
|
191
|
+
tool_args[k] = v
|
|
192
|
+
|
|
193
|
+
# 2. Tool-specific overrides
|
|
194
|
+
tool_params = srv_params.get(tool, {})
|
|
195
|
+
if isinstance(tool_params, dict):
|
|
196
|
+
for k, v in tool_params.items():
|
|
197
|
+
if k not in tool_args:
|
|
198
|
+
tool_args[k] = v
|
|
199
|
+
|
|
154
200
|
|
|
155
201
|
# Smart Parallelization & DAG Logic:
|
|
156
202
|
# 1. By default, a step is a root (None) unless it has no 'data' input,
|
|
@@ -161,8 +207,13 @@ class PipelineClient:
|
|
|
161
207
|
has_explicit_data = "data" in tool_args
|
|
162
208
|
|
|
163
209
|
if not has_explicit_data:
|
|
164
|
-
#
|
|
165
|
-
|
|
210
|
+
# Smart Source Detection: If a step has 'file_path', 'url', etc.,
|
|
211
|
+
# it's likely a primary ingestion step and shouldn't inherit 'data' from the previous step.
|
|
212
|
+
source_keys = {"file_path", "url", "uri", "path"}
|
|
213
|
+
is_source = any(k in tool_args for k in source_keys)
|
|
214
|
+
|
|
215
|
+
if not is_source:
|
|
216
|
+
current_prev_output = last_output
|
|
166
217
|
else:
|
|
167
218
|
# Data provided? Check if it's a reference or raw data
|
|
168
219
|
for k, v in list(tool_args.items()):
|
|
@@ -174,7 +225,7 @@ class PipelineClient:
|
|
|
174
225
|
del tool_args[k]
|
|
175
226
|
else:
|
|
176
227
|
# Limitation: ZenML doesn't materialize artifacts nested in dicts
|
|
177
|
-
|
|
228
|
+
logger.warning(f" Tool argument '{k}' uses a step reference '{v}'. "
|
|
178
229
|
"Currently, only the 'data' field supports cross-step dependencies. "
|
|
179
230
|
"This value will be passed as a raw string.")
|
|
180
231
|
else:
|
|
@@ -3,7 +3,9 @@ from typing import Any, Callable, Dict, List, Optional, Union
|
|
|
3
3
|
import yaml
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from fastmcp import FastMCP
|
|
6
|
+
from loguru import logger
|
|
6
7
|
import inspect
|
|
8
|
+
import functools
|
|
7
9
|
|
|
8
10
|
class ZemServer(FastMCP):
|
|
9
11
|
"""
|
|
@@ -22,9 +24,24 @@ class ZemServer(FastMCP):
|
|
|
22
24
|
self.parameter_file = parameter_file
|
|
23
25
|
self.parameters = {}
|
|
24
26
|
|
|
25
|
-
#
|
|
27
|
+
# Configure logging based on ZEM_VERBOSE
|
|
28
|
+
import os
|
|
29
|
+
import sys
|
|
30
|
+
if os.environ.get("ZEM_VERBOSE"):
|
|
31
|
+
logger.remove()
|
|
32
|
+
logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
|
|
33
|
+
|
|
34
|
+
# 1. Load from file or auto-detect in server directory
|
|
26
35
|
if parameter_file:
|
|
27
36
|
self.load_parameters(parameter_file)
|
|
37
|
+
else:
|
|
38
|
+
# Auto-detect parameters.yml in the same directory as the server script
|
|
39
|
+
import inspect
|
|
40
|
+
caller_frame = inspect.stack()[1]
|
|
41
|
+
caller_file = caller_frame.filename
|
|
42
|
+
auto_path = Path(caller_file).parent / "parameters.yml"
|
|
43
|
+
if auto_path.exists():
|
|
44
|
+
self.load_parameters(str(auto_path))
|
|
28
45
|
|
|
29
46
|
# 2. Override with env params (from PipelineClient)
|
|
30
47
|
import os
|
|
@@ -35,7 +52,7 @@ class ZemServer(FastMCP):
|
|
|
35
52
|
if isinstance(env_params, dict):
|
|
36
53
|
self._merge_parameters(env_params)
|
|
37
54
|
except Exception as e:
|
|
38
|
-
|
|
55
|
+
logger.error(f"Error loading ZEM_PARAMETERS: {e}")
|
|
39
56
|
|
|
40
57
|
def load_parameters(self, file_path: str) -> Dict[str, Any]:
|
|
41
58
|
"""Load parameters from YAML file and merge them."""
|
|
@@ -79,8 +96,8 @@ class ZemServer(FastMCP):
|
|
|
79
96
|
else:
|
|
80
97
|
target[k] = v
|
|
81
98
|
|
|
82
|
-
#
|
|
83
|
-
#
|
|
99
|
+
# NOTE: Parameter injection is handled by PipelineClient._merge_parameters
|
|
100
|
+
# and the tool decorator is inherited from FastMCP
|
|
84
101
|
|
|
85
102
|
def get_data(self, data: Any) -> List[Dict[str, Any]]:
|
|
86
103
|
"""
|
|
@@ -9,7 +9,7 @@ from loguru import logger
|
|
|
9
9
|
logger.remove()
|
|
10
10
|
logger.add(sys.stderr, level="INFO")
|
|
11
11
|
|
|
12
|
-
server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
|
|
12
|
+
server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
|
|
13
13
|
|
|
14
14
|
@server.tool()
|
|
15
15
|
def clean_content(
|
|
@@ -10,7 +10,7 @@ from loguru import logger
|
|
|
10
10
|
logger.remove()
|
|
11
11
|
logger.add(sys.stderr, level="INFO")
|
|
12
12
|
|
|
13
|
-
server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
|
|
13
|
+
server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
|
|
14
14
|
|
|
15
15
|
@server.tool()
|
|
16
16
|
def generate_qa_pairs(
|
|
@@ -7,7 +7,7 @@ from loguru import logger
|
|
|
7
7
|
import sys
|
|
8
8
|
|
|
9
9
|
# Initialize server
|
|
10
|
-
server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
|
|
10
|
+
server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
|
|
11
11
|
|
|
12
12
|
@server.tool()
|
|
13
13
|
def load_jsonl(path: str, return_reference: bool = False) -> Any:
|
|
@@ -10,7 +10,7 @@ from loguru import logger
|
|
|
10
10
|
logger.remove()
|
|
11
11
|
logger.add(sys.stderr, level="INFO")
|
|
12
12
|
|
|
13
|
-
server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
|
|
13
|
+
server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
|
|
14
14
|
|
|
15
15
|
@server.tool()
|
|
16
16
|
def normalize(
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import io
|
|
17
|
+
import sys
|
|
18
|
+
import threading
|
|
19
|
+
import pdfplumber
|
|
20
|
+
|
|
21
|
+
from .ocr import OCR
|
|
22
|
+
from .recognizer import Recognizer
|
|
23
|
+
from .layout_recognizer import LayoutRecognizerDocLayoutYOLO as LayoutRecognizer
|
|
24
|
+
from .table_structure_recognizer import TableStructureRecognizer
|
|
25
|
+
# from .engine import VietDocEngine
|
|
26
|
+
|
|
27
|
+
# New Phase-Based Architecture
|
|
28
|
+
from .pipeline import DocumentPipeline
|
|
29
|
+
from .phases import (
|
|
30
|
+
LayoutAnalysisPhase,
|
|
31
|
+
TextDetectionPhase,
|
|
32
|
+
TextRecognitionPhase,
|
|
33
|
+
PostProcessingPhase,
|
|
34
|
+
DocumentReconstructionPhase,
|
|
35
|
+
)
|
|
36
|
+
from .implementations import (
|
|
37
|
+
DocLayoutYOLOAnalyzer,
|
|
38
|
+
PaddleOCRTextDetector,
|
|
39
|
+
VietOCRRecognizer,
|
|
40
|
+
SVTRv2Recognizer,
|
|
41
|
+
LandingAIRecognizer,
|
|
42
|
+
VietnameseTextPostProcessor,
|
|
43
|
+
SmartMarkdownReconstruction,
|
|
44
|
+
create_default_pipeline,
|
|
45
|
+
create_svtrv2_pipeline,
|
|
46
|
+
create_experimental_pipeline,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
|
51
|
+
if LOCK_KEY_pdfplumber not in sys.modules:
|
|
52
|
+
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Removed init_in_out
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
__all__ = [
|
|
59
|
+
# Legacy API (backward compatibility)
|
|
60
|
+
"OCR",
|
|
61
|
+
"Recognizer",
|
|
62
|
+
"LayoutRecognizer",
|
|
63
|
+
"TableStructureRecognizer",
|
|
64
|
+
# "VietDocEngine",
|
|
65
|
+
# "init_in_out",
|
|
66
|
+
|
|
67
|
+
# New Phase-Based Architecture
|
|
68
|
+
"DocumentPipeline",
|
|
69
|
+
|
|
70
|
+
# Abstract Phase Interfaces
|
|
71
|
+
"LayoutAnalysisPhase",
|
|
72
|
+
"TextDetectionPhase",
|
|
73
|
+
"TextRecognitionPhase",
|
|
74
|
+
"PostProcessingPhase",
|
|
75
|
+
"DocumentReconstructionPhase",
|
|
76
|
+
|
|
77
|
+
# Concrete Implementations
|
|
78
|
+
"DocLayoutYOLOAnalyzer",
|
|
79
|
+
"PaddleOCRTextDetector",
|
|
80
|
+
"VietOCRRecognizer",
|
|
81
|
+
"SVTRv2Recognizer",
|
|
82
|
+
"LandingAIRecognizer",
|
|
83
|
+
"VietnameseTextPostProcessor",
|
|
84
|
+
"SmartMarkdownReconstruction",
|
|
85
|
+
|
|
86
|
+
# Factory Functions
|
|
87
|
+
"create_default_pipeline",
|
|
88
|
+
"create_svtrv2_pipeline",
|
|
89
|
+
"create_experimental_pipeline",
|
|
90
|
+
]
|