xfmr-zem 0.2.2__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/.github/workflows/pypi-publish.yml +10 -1
  2. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/.gitignore +5 -0
  3. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/AGENTS.md +12 -0
  4. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/CHANGELOG.md +16 -0
  5. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/PKG-INFO +19 -1
  6. xfmr_zem-0.2.5/data/ocr_test.png +0 -0
  7. xfmr_zem-0.2.5/data/vietnamese_ocr.png +0 -0
  8. xfmr_zem-0.2.5/parameters.yml +7 -0
  9. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/pyproject.toml +20 -1
  10. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/cli.py +32 -3
  11. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/client.py +59 -8
  12. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/server.py +21 -4
  13. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/data_juicer/server.py +1 -1
  14. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/instruction_gen/server.py +1 -1
  15. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/io/server.py +1 -1
  16. xfmr_zem-0.2.5/src/xfmr_zem/servers/llm/parameters.yml +10 -0
  17. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/nemo_curator/server.py +1 -1
  18. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  19. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  20. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  21. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  22. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  23. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  24. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  25. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  26. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  27. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  28. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  29. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  30. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  31. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  32. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  33. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  34. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  35. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  36. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  37. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  38. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  39. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  40. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  41. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  42. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  43. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  44. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/engines.py +242 -0
  45. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/install_models.py +63 -0
  46. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/parameters.yml +4 -0
  47. xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/server.py +44 -0
  48. xfmr_zem-0.2.5/src/xfmr_zem/servers/profiler/parameters.yml +4 -0
  49. xfmr_zem-0.2.5/src/xfmr_zem/servers/sinks/parameters.yml +6 -0
  50. xfmr_zem-0.2.5/src/xfmr_zem/servers/unstructured/parameters.yml +6 -0
  51. xfmr_zem-0.2.5/src/xfmr_zem/servers/unstructured/server.py +62 -0
  52. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/zenml_wrapper.py +20 -7
  53. xfmr_zem-0.2.5/tests/manual/hf_ocr_test.yaml +14 -0
  54. xfmr_zem-0.2.5/tests/manual/multimodal_test.yaml +12 -0
  55. xfmr_zem-0.2.5/tests/manual/ocr_test.yaml +19 -0
  56. xfmr_zem-0.2.5/tests/manual/viet_ocr_test.yaml +16 -0
  57. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/uv.lock +2025 -105
  58. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/.github/workflows/deploy.yml +0 -0
  59. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/LICENSE +0 -0
  60. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/README.md +0 -0
  61. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/big_data_output.parquet +0 -0
  62. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/big_data_sim.parquet +0 -0
  63. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/dup_cleaned.parquet +0 -0
  64. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/dup_data.parquet +0 -0
  65. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/dup_data_large.parquet +0 -0
  66. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/nemo_full_stack_result.parquet +0 -0
  67. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/nemo_real_result.parquet +0 -0
  68. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/output_result.jsonl +0 -0
  69. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/data/sample.jsonl +0 -0
  70. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/__init__.py +0 -0
  71. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/orchestrators/parallel_local.py +0 -0
  72. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/schemas.py +0 -0
  73. /xfmr_zem-0.2.2/src/xfmr_zem/servers/data_juicer/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/data_juicer/parameters.yml +0 -0
  74. /xfmr_zem-0.2.2/src/xfmr_zem/servers/instruction_gen/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/instruction_gen/parameters.yml +0 -0
  75. /xfmr_zem-0.2.2/src/xfmr_zem/servers/io/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/io/parameters.yml +0 -0
  76. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/llm/server.py +0 -0
  77. /xfmr_zem-0.2.2/src/xfmr_zem/servers/nemo_curator/parameter.yaml → /xfmr_zem-0.2.5/src/xfmr_zem/servers/nemo_curator/parameters.yml +0 -0
  78. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/profiler/server.py +0 -0
  79. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/sinks/server.py +0 -0
  80. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/caching_test.yaml +0 -0
  81. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/llm_test.yaml +0 -0
  82. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/parallel_test.yaml +0 -0
  83. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/parquet_test.yaml +0 -0
  84. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/phase4_test.yaml +0 -0
  85. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/profiler_test.yaml +0 -0
  86. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/tests/manual/standard_data_pipeline.yaml +0 -0
  87. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/website/docs/docs.css +0 -0
  88. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/website/docs/index.html +0 -0
  89. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/website/index.html +0 -0
  90. {xfmr_zem-0.2.2 → xfmr_zem-0.2.5}/website/style.css +0 -0
@@ -12,7 +12,8 @@ jobs:
12
12
  permissions:
13
13
  # Required for Trusted Publishing (OIDC)
14
14
  id-token: write
15
- contents: read
15
+ # Required for creating releases and uploading assets
16
+ contents: write
16
17
 
17
18
  steps:
18
19
  - uses: actions/checkout@v4
@@ -25,6 +26,14 @@ jobs:
25
26
  - name: Build package
26
27
  run: uv build
27
28
 
29
+ - name: Create GitHub Release
30
+ uses: softprops/action-gh-release@v2
31
+ with:
32
+ files: dist/*
33
+ generate_release_notes: true
34
+ env:
35
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
36
+
28
37
  - name: Publish to PyPI
29
38
  uses: pypa/gh-action-pypi-publish@release/v1
30
39
  # No password/token needed if OIDC is configured on PyPI side
@@ -55,3 +55,8 @@ Thumbs.db
55
55
  # Project specific
56
56
  outputs/
57
57
  cache/
58
+
59
+ # Binary Models (OCR) - Should be downloaded separately
60
+ src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/*.onnx
61
+ src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.pth
62
+ src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.onnx
@@ -42,6 +42,18 @@
42
42
  - Providers: **Ollama** (local default), **OpenAI**.
43
43
  - **Sinks Server** (`src/xfmr_zem/servers/sinks/server.py`):
44
44
  - Tools: `to_huggingface`, `to_vector_db`.
45
+ ### 5. Unstructured Server (`servers/unstructured`)
46
+ Advanced document parsing for multimodal data ingestion.
47
+ - `parse_document`: Convert PDF, DOCX, HTML to structured text.
48
+ - `extract_tables`: Specifically isolate and extract table data from documents.
49
+
50
+ ### 6. OCR Server (`servers/ocr`)
51
+ Unified OCR processing with multiple engine support (SOLID Strategy Pattern).
52
+ - `extract_text`: Extract text from images using different engines:
53
+ - `tesseract`: Lightweight and fast.
54
+ - `paddle`: Medium weight, high accuracy.
55
+ - `qwen`: Heavy Vision-Language Model (Qwen3-VL-8B) for state-of-the-art OCR.
56
+ - `viet`: Specialized Vietnamese OCR using built-in `deepdoc_vietocr` pipeline. Optimized for Vietnamese diacritics and document layout reconstruction.
45
57
 
46
58
  ## Orchestration & Concurrency
47
59
 
@@ -2,6 +2,22 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [0.2.5] - 2026-02-03
6
+
7
+ ### Added
8
+ - **Vietnamese OCR**: Integrated specialized Deep-ocr DocumentPipeline for high-accuracy Vietnamese text extraction with layout analysis.
9
+ - **Improved OCR Dependencies**: Added `pdfplumber`, `ruamel.yaml`, and `cachetools` to resolve OCR server tool errors.
10
+
11
+ ## [0.2.4] - 2026-02-02
12
+
13
+ ### Fixed
14
+ - **CI Permissions**: Explicitly granted write permissions in workflow to bypass repository UI restrictions.
15
+
16
+ ## [0.2.3] - 2026-02-02
17
+
18
+ ### Added
19
+ - **GitHub Release Integration**: Automated creation of formal GitHub Releases with binary assets.
20
+
5
21
  ## [0.2.2] - 2026-02-02
6
22
 
7
23
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.2
3
+ Version: 0.2.5
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -22,8 +22,10 @@ Requires-Dist: numpy>=1.24.0
22
22
  Requires-Dist: pandas>=2.0.0
23
23
  Requires-Dist: pyarrow>=15.0.0
24
24
  Requires-Dist: pydantic>=2.0.0
25
+ Requires-Dist: python-magic>=0.4.27
25
26
  Requires-Dist: pyyaml>=6.0
26
27
  Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: unstructured[all-docs]>=0.16.0
27
29
  Requires-Dist: zenml[local,server]>=0.75.0
28
30
  Provides-Extra: all
29
31
  Requires-Dist: nemo-curator>=0.6.0; extra == 'all'
@@ -39,6 +41,22 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
39
41
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
40
42
  Provides-Extra: nemo
41
43
  Requires-Dist: nemo-curator>=0.6.0; extra == 'nemo'
44
+ Provides-Extra: ocr
45
+ Requires-Dist: cachetools>=5.0.0; extra == 'ocr'
46
+ Requires-Dist: einops; extra == 'ocr'
47
+ Requires-Dist: onnxruntime>=1.16.0; extra == 'ocr'
48
+ Requires-Dist: opencv-python>=4.8.0; extra == 'ocr'
49
+ Requires-Dist: paddleocr>=2.7.0; extra == 'ocr'
50
+ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
51
+ Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
52
+ Requires-Dist: pillow>=10.0.0; extra == 'ocr'
53
+ Requires-Dist: pyclipper; extra == 'ocr'
54
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
55
+ Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
56
+ Requires-Dist: shapely; extra == 'ocr'
57
+ Requires-Dist: torch==2.5.1; extra == 'ocr'
58
+ Requires-Dist: torchvision==0.20.1; extra == 'ocr'
59
+ Requires-Dist: transformers>=4.40.0; extra == 'ocr'
42
60
  Provides-Extra: zenml
43
61
  Requires-Dist: zenml>=0.75.0; extra == 'zenml'
44
62
  Description-Content-Type: text/markdown
Binary file
Binary file
@@ -0,0 +1,7 @@
1
+ # Parameters for Zem Pipeline
2
+ # This file allows overriding tool arguments at runtime
3
+
4
+ ocr:
5
+ extract_text:
6
+ engine: "huggingface"
7
+ model_id: "Qwen/Qwen2-VL-2B-Instruct"
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "xfmr-zem"
3
- version = "0.2.2"
3
+ version = "0.2.5"
4
4
  description = "Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10,<3.13"
@@ -25,6 +25,8 @@ dependencies = [
25
25
  "dask-cuda>=24.0.0",
26
26
  "ftfy>=6.3.1",
27
27
  "pydantic>=2.0.0",
28
+ "unstructured[all-docs]>=0.16.0",
29
+ "python-magic>=0.4.27",
28
30
  ]
29
31
 
30
32
  [project.urls]
@@ -48,6 +50,23 @@ all = [
48
50
  "nemo-curator>=0.6.0",
49
51
  "py-data-juicer>=1.0.0",
50
52
  ]
53
+ ocr = [
54
+ "pytesseract>=0.3.10",
55
+ "paddleocr>=2.7.0",
56
+ "paddlepaddle>=2.6.0",
57
+ "transformers>=4.40.0",
58
+ "torch==2.5.1",
59
+ "torchvision==0.20.1",
60
+ "pillow>=10.0.0",
61
+ "onnxruntime>=1.16.0",
62
+ "opencv-python>=4.8.0",
63
+ "shapely",
64
+ "pyclipper",
65
+ "einops",
66
+ "pdfplumber>=0.11.0",
67
+ "ruamel.yaml>=0.17.0",
68
+ "cachetools>=5.0.0",
69
+ ]
51
70
  dev = [
52
71
  "pytest>=7.0.0",
53
72
  "pytest-cov>=4.0.0",
@@ -173,8 +173,19 @@ def _print_static_operators():
173
173
  @main.command()
174
174
  @click.argument("config_file", type=click.Path(exists=True))
175
175
  @click.option("--params", "-p", type=click.Path(exists=True), help="Path to custom parameters.yml")
176
- def run(config_file, params):
176
+ @click.option("--verbose", "-v", is_flag=True, help="Enable verbose/debug logging")
177
+ def run(config_file, params, verbose):
177
178
  """Run a pipeline from a YAML configuration file"""
179
+ # Configure logging based on verbosity
180
+ if verbose:
181
+ os.environ["ZEM_VERBOSE"] = "1"
182
+ logger.remove()
183
+ logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
184
+ console.print("[bold yellow]Verbose mode enabled - DEBUG logging active[/bold yellow]")
185
+ else:
186
+ logger.remove()
187
+ logger.add(sys.stderr, level="INFO", format="<level>{message}</level>")
188
+
178
189
  abs_config = os.path.abspath(config_file)
179
190
  console.print(f"[bold green]Starting Pipeline:[/bold green] {abs_config}")
180
191
  if params:
@@ -239,12 +250,18 @@ def preview(artifact_id, id2, limit, sample):
239
250
  return pd.DataFrame(lines)
240
251
  elif isinstance(d, list): return pd.DataFrame(d)
241
252
  elif isinstance(d, pd.DataFrame): return d
242
- return None
253
+ elif isinstance(d, dict): return pd.DataFrame([d])
254
+ return d
243
255
 
244
256
  try:
245
257
  df1 = load_art_df(artifact_id)
246
258
  if df1 is None:
247
- console.print("[bold red]Error:[/bold red] Could not load artifact as tabular data.")
259
+ console.print("[bold red]Error:[/bold red] Artifact is empty or could not be loaded.")
260
+ return
261
+
262
+ if not isinstance(df1, pd.DataFrame):
263
+ console.print(f"[bold blue]Artifact Preview (Type: {type(df1).__name__}):[/bold blue]")
264
+ console.print(str(df1))
248
265
  return
249
266
 
250
267
  if id2:
@@ -290,6 +307,18 @@ def preview(artifact_id, id2, limit, sample):
290
307
  console.print(f"[bold red]Error previewing artifact:[/bold red] {e}")
291
308
 
292
309
 
310
+ @main.group()
311
+ def ocr():
312
+ """OCR related commands (Installation, etc.)"""
313
+ pass
314
+
315
+ @ocr.command(name="install")
316
+ def ocr_install():
317
+ """Install OCR model weights (ONNX/PTH)"""
318
+ from xfmr_zem.servers.ocr.install_models import main as install_main
319
+ install_main()
320
+
321
+
293
322
  if __name__ == "__main__":
294
323
  main()
295
324
 
@@ -4,6 +4,7 @@ import yaml
4
4
  from pathlib import Path
5
5
  from zenml import pipeline
6
6
  from .zenml_wrapper import mcp_generic_step
7
+ from loguru import logger
7
8
  import os
8
9
  import sys
9
10
 
@@ -39,6 +40,17 @@ class PipelineClient:
39
40
  params.update(yaml.safe_load(f) or {})
40
41
  return params
41
42
 
43
+ def _flatten_params(self, d: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
44
+ """Flatten nested dictionary into dot-notation keys."""
45
+ items = []
46
+ for k, v in d.items():
47
+ new_key = f"{prefix}.{k}" if prefix else k
48
+ if isinstance(v, dict):
49
+ items.extend(self._flatten_params(v, new_key).items())
50
+ else:
51
+ items.append((new_key, v))
52
+ return dict(items)
53
+
42
54
  def _load_config_dict(self, path: Path) -> Dict[str, Any]:
43
55
  """Load YAML config and perform substitution."""
44
56
  with open(path, "r") as f:
@@ -54,8 +66,13 @@ class PipelineClient:
54
66
  custom_params = self._load_params(self.params_path)
55
67
  self.params.update(custom_params)
56
68
 
69
+ # Flatten params for template substitution
70
+ flat_params = self._flatten_params(self.params)
71
+
57
72
  content = raw_content
58
- for key, value in self.params.items():
73
+ # Substitute the longest keys first so a short key (e.g. ocr) is never replaced before a longer key that contains it (e.g. ocr.engine)
74
+ for key in sorted(flat_params.keys(), key=len, reverse=True):
75
+ value = flat_params[key]
59
76
  content = content.replace(f"{{{{ {key} }}}}", str(value))
60
77
  content = content.replace(f"{{{{{key}}}}}", str(value))
61
78
 
@@ -96,6 +113,11 @@ class PipelineClient:
96
113
  server_specific_params[key] = value
97
114
 
98
115
  env["ZEM_PARAMETERS"] = yaml.dump(server_specific_params)
116
+
117
+ # Pass verbose flag to subprocess
118
+ if os.environ.get("ZEM_VERBOSE"):
119
+ env["ZEM_VERBOSE"] = "1"
120
+
99
121
  configs[name] = {
100
122
  "command": sys.executable,
101
123
  "args": [str(abs_path)],
@@ -146,11 +168,35 @@ class PipelineClient:
146
168
 
147
169
  srv, tool = key.split(".")
148
170
 
149
- if not step_alias:
150
- step_alias = step_def[key].get("name")
151
-
152
171
  step_alias = step_alias or f"{srv}.{tool}.{i}"
153
- tool_args = step_def[key].get("input", {}) or {}
172
+
173
+ val = step_def[key]
174
+ if isinstance(val, dict):
175
+ if "input" in val:
176
+ tool_args = val.get("input", {}) or {}
177
+ else:
178
+ # Use everything except 'name' as tool_args
179
+ tool_args = {k: v for k, v in val.items() if k != "name"}
180
+ else:
181
+ tool_args = {}
182
+
183
+ # Standardized Parameter Injection:
184
+ # Merge parameters from the 'parameters' section.
185
+ # Priority: Step-specific args > parameters.<srv>.<tool> > parameters.<srv>
186
+ srv_params = self.params.get(srv, {})
187
+ if isinstance(srv_params, dict):
188
+ # 1. Server-wide defaults
189
+ for k, v in srv_params.items():
190
+ if k != tool and not isinstance(v, dict) and k not in tool_args:
191
+ tool_args[k] = v
192
+
193
+ # 2. Tool-specific overrides
194
+ tool_params = srv_params.get(tool, {})
195
+ if isinstance(tool_params, dict):
196
+ for k, v in tool_params.items():
197
+ if k not in tool_args:
198
+ tool_args[k] = v
199
+
154
200
 
155
201
  # Smart Parallelization & DAG Logic:
156
202
  # 1. By default, a step is a root (None) unless it has no 'data' input,
@@ -161,8 +207,13 @@ class PipelineClient:
161
207
  has_explicit_data = "data" in tool_args
162
208
 
163
209
  if not has_explicit_data:
164
- # No data provided? Inherit from the last executed step to keep simple sequences working
165
- current_prev_output = last_output
210
+ # Smart Source Detection: If a step has 'file_path', 'url', etc.,
211
+ # it's likely a primary ingestion step and shouldn't inherit 'data' from the previous step.
212
+ source_keys = {"file_path", "url", "uri", "path"}
213
+ is_source = any(k in tool_args for k in source_keys)
214
+
215
+ if not is_source:
216
+ current_prev_output = last_output
166
217
  else:
167
218
  # Data provided? Check if it's a reference or raw data
168
219
  for k, v in list(tool_args.items()):
@@ -174,7 +225,7 @@ class PipelineClient:
174
225
  del tool_args[k]
175
226
  else:
176
227
  # Limitation: ZenML doesn't materialize artifacts nested in dicts
177
- print(f"[Warning] Tool argument '{k}' uses a step reference '{v}'. "
228
+ logger.warning(f" Tool argument '{k}' uses a step reference '{v}'. "
178
229
  "Currently, only the 'data' field supports cross-step dependencies. "
179
230
  "This value will be passed as a raw string.")
180
231
  else:
@@ -3,7 +3,9 @@ from typing import Any, Callable, Dict, List, Optional, Union
3
3
  import yaml
4
4
  from pathlib import Path
5
5
  from fastmcp import FastMCP
6
+ from loguru import logger
6
7
  import inspect
8
+ import functools
7
9
 
8
10
  class ZemServer(FastMCP):
9
11
  """
@@ -22,9 +24,24 @@ class ZemServer(FastMCP):
22
24
  self.parameter_file = parameter_file
23
25
  self.parameters = {}
24
26
 
25
- # 1. Load from file
27
+ # Configure logging based on ZEM_VERBOSE
28
+ import os
29
+ import sys
30
+ if os.environ.get("ZEM_VERBOSE"):
31
+ logger.remove()
32
+ logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
33
+
34
+ # 1. Load from file or auto-detect in server directory
26
35
  if parameter_file:
27
36
  self.load_parameters(parameter_file)
37
+ else:
38
+ # Auto-detect parameters.yml in the same directory as the server script
39
+ import inspect
40
+ caller_frame = inspect.stack()[1]
41
+ caller_file = caller_frame.filename
42
+ auto_path = Path(caller_file).parent / "parameters.yml"
43
+ if auto_path.exists():
44
+ self.load_parameters(str(auto_path))
28
45
 
29
46
  # 2. Override with env params (from PipelineClient)
30
47
  import os
@@ -35,7 +52,7 @@ class ZemServer(FastMCP):
35
52
  if isinstance(env_params, dict):
36
53
  self._merge_parameters(env_params)
37
54
  except Exception as e:
38
- print(f"Error loading ZEM_PARAMETERS: {e}")
55
+ logger.error(f"Error loading ZEM_PARAMETERS: {e}")
39
56
 
40
57
  def load_parameters(self, file_path: str) -> Dict[str, Any]:
41
58
  """Load parameters from YAML file and merge them."""
@@ -79,8 +96,8 @@ class ZemServer(FastMCP):
79
96
  else:
80
97
  target[k] = v
81
98
 
82
- # Removed custom tool decorator to fix multiple values for argument 'name' error
83
- # Inherit directly from FastMCP.tool
99
+ # NOTE: Parameter injection into tool arguments is handled by PipelineClient (when it builds each step)
100
+ # and the tool decorator is inherited from FastMCP
84
101
 
85
102
  def get_data(self, data: Any) -> List[Dict[str, Any]]:
86
103
  """
@@ -9,7 +9,7 @@ from loguru import logger
9
9
  logger.remove()
10
10
  logger.add(sys.stderr, level="INFO")
11
11
 
12
- server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
12
+ server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
13
13
 
14
14
  @server.tool()
15
15
  def clean_content(
@@ -10,7 +10,7 @@ from loguru import logger
10
10
  logger.remove()
11
11
  logger.add(sys.stderr, level="INFO")
12
12
 
13
- server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
13
+ server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
14
14
 
15
15
  @server.tool()
16
16
  def generate_qa_pairs(
@@ -7,7 +7,7 @@ from loguru import logger
7
7
  import sys
8
8
 
9
9
  # Initialize server
10
- server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
10
+ server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
11
11
 
12
12
  @server.tool()
13
13
  def load_jsonl(path: str, return_reference: bool = False) -> Any:
@@ -0,0 +1,10 @@
1
+ # Default parameters for LLM Server
2
+ mask_pii:
3
+ provider: "ollama"
4
+
5
+ classify_domain:
6
+ provider: "ollama"
7
+ categories:
8
+ - Tech
9
+ - Finance
10
+ - Legal
@@ -10,7 +10,7 @@ from loguru import logger
10
10
  logger.remove()
11
11
  logger.add(sys.stderr, level="INFO")
12
12
 
13
- server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
13
+ server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
14
14
 
15
15
  @server.tool()
16
16
  def normalize(
@@ -0,0 +1,90 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import io
17
+ import sys
18
+ import threading
19
+ import pdfplumber
20
+
21
+ from .ocr import OCR
22
+ from .recognizer import Recognizer
23
+ from .layout_recognizer import LayoutRecognizerDocLayoutYOLO as LayoutRecognizer
24
+ from .table_structure_recognizer import TableStructureRecognizer
25
+ # from .engine import VietDocEngine
26
+
27
+ # New Phase-Based Architecture
28
+ from .pipeline import DocumentPipeline
29
+ from .phases import (
30
+ LayoutAnalysisPhase,
31
+ TextDetectionPhase,
32
+ TextRecognitionPhase,
33
+ PostProcessingPhase,
34
+ DocumentReconstructionPhase,
35
+ )
36
+ from .implementations import (
37
+ DocLayoutYOLOAnalyzer,
38
+ PaddleOCRTextDetector,
39
+ VietOCRRecognizer,
40
+ SVTRv2Recognizer,
41
+ LandingAIRecognizer,
42
+ VietnameseTextPostProcessor,
43
+ SmartMarkdownReconstruction,
44
+ create_default_pipeline,
45
+ create_svtrv2_pipeline,
46
+ create_experimental_pipeline,
47
+ )
48
+
49
+
50
+ LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
51
+ if LOCK_KEY_pdfplumber not in sys.modules:
52
+ sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
53
+
54
+
55
+ # Removed init_in_out
56
+
57
+
58
+ __all__ = [
59
+ # Legacy API (backward compatibility)
60
+ "OCR",
61
+ "Recognizer",
62
+ "LayoutRecognizer",
63
+ "TableStructureRecognizer",
64
+ # "VietDocEngine",
65
+ # "init_in_out",
66
+
67
+ # New Phase-Based Architecture
68
+ "DocumentPipeline",
69
+
70
+ # Abstract Phase Interfaces
71
+ "LayoutAnalysisPhase",
72
+ "TextDetectionPhase",
73
+ "TextRecognitionPhase",
74
+ "PostProcessingPhase",
75
+ "DocumentReconstructionPhase",
76
+
77
+ # Concrete Implementations
78
+ "DocLayoutYOLOAnalyzer",
79
+ "PaddleOCRTextDetector",
80
+ "VietOCRRecognizer",
81
+ "SVTRv2Recognizer",
82
+ "LandingAIRecognizer",
83
+ "VietnameseTextPostProcessor",
84
+ "SmartMarkdownReconstruction",
85
+
86
+ # Factory Functions
87
+ "create_default_pipeline",
88
+ "create_svtrv2_pipeline",
89
+ "create_experimental_pipeline",
90
+ ]