PyPI - xfmr-zem - Versions diffs - 0.2.4__tar.gz → 0.2.5__tar.gz - Mend

xfmr-zem 0.2.4__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/.gitignore RENAMED Viewed

@@ -55,3 +55,8 @@ Thumbs.db
 # Project specific
 outputs/
 cache/
+# Binary Models (OCR) - Should be downloaded separately
+src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/*.onnx
+src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.pth
+src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/weight/*.onnx

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/AGENTS.md RENAMED Viewed

@@ -42,6 +42,18 @@
     - Providers: **Ollama** (local default), **OpenAI**.
 - **Sinks Server** (`src/xfmr_zem/servers/sinks/server.py`):
     - Tools: `to_huggingface`, `to_vector_db`.
+### 5. Unstructured Server (`servers/unstructured`)
+Advanced document parsing for multimodal data ingestion.
+- `parse_document`: Convert PDF, DOCX, HTML to structured text.
+- `extract_tables`: Specifically isolate and extract table data from documents.
+### 6. OCR Server (`servers/ocr`)
+Unified OCR processing with multiple engine support (SOLID Strategy Pattern).
+- `extract_text`: Extract text from images using different engines:
+    - `tesseract`: Lightweight and fast.
+    - `paddle`: Medium weight, high accuracy.
+    - `qwen`: Heavy Vision-Language Model (Qwen3-VL-8B) for state-of-the-art OCR.
+    - `viet`: Specialized Vietnamese OCR using built-in `deepdoc_vietocr` pipeline. Optimized for Vietnamese diacritics and document layout reconstruction.
 ## Orchestration & Concurrency

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/CHANGELOG.md RENAMED Viewed

@@ -2,6 +2,12 @@
 All notable changes to this project will be documented in this file.
+## [0.2.5] - 2026-02-03
+### Added
+- **Vietnamese OCR**: Integrated specialized Deep-ocr DocumentPipeline for high-accuracy Vietnamese text extraction with layout analysis.
+- **Improved OCR Dependencies**: Added `pdfplumber`, `ruamel.yaml`, and `cachetools` to resolve OCR server tool errors.
 ## [0.2.4] - 2026-02-02
 ### Fixed

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xfmr-zem
-Version: 0.2.4
+Version: 0.2.5
 Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
 Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
 Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -22,8 +22,10 @@ Requires-Dist: numpy>=1.24.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: pyarrow>=15.0.0
 Requires-Dist: pydantic>=2.0.0
+Requires-Dist: python-magic>=0.4.27
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: rich>=13.0.0
+Requires-Dist: unstructured[all-docs]>=0.16.0
 Requires-Dist: zenml[local,server]>=0.75.0
 Provides-Extra: all
 Requires-Dist: nemo-curator>=0.6.0; extra == 'all'
@@ -39,6 +41,22 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: nemo
 Requires-Dist: nemo-curator>=0.6.0; extra == 'nemo'
+Provides-Extra: ocr
+Requires-Dist: cachetools>=5.0.0; extra == 'ocr'
+Requires-Dist: einops; extra == 'ocr'
+Requires-Dist: onnxruntime>=1.16.0; extra == 'ocr'
+Requires-Dist: opencv-python>=4.8.0; extra == 'ocr'
+Requires-Dist: paddleocr>=2.7.0; extra == 'ocr'
+Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
+Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
+Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+Requires-Dist: pyclipper; extra == 'ocr'
+Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
+Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
+Requires-Dist: shapely; extra == 'ocr'
+Requires-Dist: torch==2.5.1; extra == 'ocr'
+Requires-Dist: torchvision==0.20.1; extra == 'ocr'
+Requires-Dist: transformers>=4.40.0; extra == 'ocr'
 Provides-Extra: zenml
 Requires-Dist: zenml>=0.75.0; extra == 'zenml'
 Description-Content-Type: text/markdown

xfmr_zem-0.2.5/data/ocr_test.png ADDED Viewed

Binary file

xfmr_zem-0.2.5/data/vietnamese_ocr.png ADDED Viewed

Binary file

xfmr_zem-0.2.5/parameters.yml ADDED Viewed

@@ -0,0 +1,7 @@
+# Parameters for Zem Pipeline
+# This file allows overriding tool arguments at runtime
+ocr:
+  extract_text:
+    engine: "huggingface"
+    model_id: "Qwen/Qwen2-VL-2B-Instruct"

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "xfmr-zem"
-version = "0.2.4"
+version = "0.2.5"
 description = "Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing"
 readme = "README.md"
 requires-python = ">=3.10,<3.13"
@@ -25,6 +25,8 @@ dependencies = [
     "dask-cuda>=24.0.0",
     "ftfy>=6.3.1",
     "pydantic>=2.0.0",
+    "unstructured[all-docs]>=0.16.0",
+    "python-magic>=0.4.27",
 ]
 [project.urls]
@@ -48,6 +50,23 @@ all = [
     "nemo-curator>=0.6.0",
     "py-data-juicer>=1.0.0",
 ]
+ocr = [
+    "pytesseract>=0.3.10",
+    "paddleocr>=2.7.0",
+    "paddlepaddle>=2.6.0",
+    "transformers>=4.40.0",
+    "torch==2.5.1",
+    "torchvision==0.20.1",
+    "pillow>=10.0.0",
+    "onnxruntime>=1.16.0",
+    "opencv-python>=4.8.0",
+    "shapely",
+    "pyclipper",
+    "einops",
+    "pdfplumber>=0.11.0",
+    "ruamel.yaml>=0.17.0",
+    "cachetools>=5.0.0",
+]
 dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/cli.py RENAMED Viewed

@@ -173,8 +173,19 @@ def _print_static_operators():
 @main.command()
 @click.argument("config_file", type=click.Path(exists=True))
 @click.option("--params", "-p", type=click.Path(exists=True), help="Path to custom parameters.yml")
-def run(config_file, params):
+@click.option("--verbose", "-v", is_flag=True, help="Enable verbose/debug logging")
+def run(config_file, params, verbose):
     """Run a pipeline from a YAML configuration file"""
+    # Configure logging based on verbosity
+    if verbose:
+        os.environ["ZEM_VERBOSE"] = "1"
+        logger.remove()
+        logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+        console.print("[bold yellow]Verbose mode enabled - DEBUG logging active[/bold yellow]")
+    else:
+        logger.remove()
+        logger.add(sys.stderr, level="INFO", format="<level>{message}</level>")
     abs_config = os.path.abspath(config_file)
     console.print(f"[bold green]Starting Pipeline:[/bold green] {abs_config}")
     if params:
@@ -239,12 +250,18 @@ def preview(artifact_id, id2, limit, sample):
                 return pd.DataFrame(lines)
         elif isinstance(d, list): return pd.DataFrame(d)
         elif isinstance(d, pd.DataFrame): return d
-        return None
+        elif isinstance(d, dict): return pd.DataFrame([d])
+        return d
     try:
         df1 = load_art_df(artifact_id)
         if df1 is None:
-            console.print("[bold red]Error:[/bold red] Could not load artifact as tabular data.")
+            console.print("[bold red]Error:[/bold red] Artifact is empty or could not be loaded.")
+            return
+        if not isinstance(df1, pd.DataFrame):
+            console.print(f"[bold blue]Artifact Preview (Type: {type(df1).__name__}):[/bold blue]")
+            console.print(str(df1))
             return
         if id2:
@@ -290,6 +307,18 @@ def preview(artifact_id, id2, limit, sample):
         console.print(f"[bold red]Error previewing artifact:[/bold red] {e}")
+@main.group()
+def ocr():
+    """OCR related commands (Installation, etc.)"""
+    pass
+@ocr.command(name="install")
+def ocr_install():
+    """Install OCR model weights (ONNX/PTH)"""
+    from xfmr_zem.servers.ocr.install_models import main as install_main
+    install_main()
 if __name__ == "__main__":
     main()

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/client.py RENAMED Viewed

@@ -4,6 +4,7 @@ import yaml
 from pathlib import Path
 from zenml import pipeline
 from .zenml_wrapper import mcp_generic_step
+from loguru import logger
 import os
 import sys
@@ -39,6 +40,17 @@ class PipelineClient:
                     params.update(yaml.safe_load(f) or {})
         return params
+    def _flatten_params(self, d: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
+        """Flatten nested dictionary into dot-notation keys."""
+        items = []
+        for k, v in d.items():
+            new_key = f"{prefix}.{k}" if prefix else k
+            if isinstance(v, dict):
+                items.extend(self._flatten_params(v, new_key).items())
+            else:
+                items.append((new_key, v))
+        return dict(items)
     def _load_config_dict(self, path: Path) -> Dict[str, Any]:
         """Load YAML config and perform substitution."""
         with open(path, "r") as f:
@@ -54,8 +66,13 @@ class PipelineClient:
             custom_params = self._load_params(self.params_path)
             self.params.update(custom_params)
+        # Flatten params for template substitution
+        flat_params = self._flatten_params(self.params)
         content = raw_content
-        for key, value in self.params.items():
+        # Use reversed sorted keys to avoid partial replacements (e.g. ocr before ocr.engine)
+        for key in sorted(flat_params.keys(), key=len, reverse=True):
+            value = flat_params[key]
             content = content.replace(f"{{{{ {key} }}}}", str(value))
             content = content.replace(f"{{{{{key}}}}}", str(value))
@@ -96,6 +113,11 @@ class PipelineClient:
                     server_specific_params[key] = value
             env["ZEM_PARAMETERS"] = yaml.dump(server_specific_params)
+            # Pass verbose flag to subprocess
+            if os.environ.get("ZEM_VERBOSE"):
+                env["ZEM_VERBOSE"] = "1"
             configs[name] = {
                 "command": sys.executable,
                 "args": [str(abs_path)],
@@ -146,11 +168,35 @@ class PipelineClient:
                     srv, tool = key.split(".")
-                    if not step_alias:
-                        step_alias = step_def[key].get("name")
                     step_alias = step_alias or f"{srv}.{tool}.{i}"
-                    tool_args = step_def[key].get("input", {}) or {}
+                    val = step_def[key]
+                    if isinstance(val, dict):
+                        if "input" in val:
+                            tool_args = val.get("input", {}) or {}
+                        else:
+                            # Use everything except 'name' as tool_args
+                            tool_args = {k: v for k, v in val.items() if k != "name"}
+                    else:
+                        tool_args = {}
+                # Standardized Parameter Injection:
+                # Merge parameters from the 'parameters' section.
+                # Priority: Step-specific args > parameters.<srv>.<tool> > parameters.<srv>
+                srv_params = self.params.get(srv, {})
+                if isinstance(srv_params, dict):
+                    # 1. Server-wide defaults
+                    for k, v in srv_params.items():
+                        if k != tool and not isinstance(v, dict) and k not in tool_args:
+                            tool_args[k] = v
+                    # 2. Tool-specific overrides
+                    tool_params = srv_params.get(tool, {})
+                    if isinstance(tool_params, dict):
+                        for k, v in tool_params.items():
+                            if k not in tool_args:
+                                tool_args[k] = v
                 # Smart Parallelization & DAG Logic:
                 # 1. By default, a step is a root (None) unless it has no 'data' input,
@@ -161,8 +207,13 @@ class PipelineClient:
                 has_explicit_data = "data" in tool_args
                 if not has_explicit_data:
-                    # No data provided? Inherit from the last executed step to keep simple sequences working
-                    current_prev_output = last_output
+                    # Smart Source Detection: If a step has 'file_path', 'url', etc.,
+                    # it's likely a primary ingestion step and shouldn't inherit 'data' from the previous step.
+                    source_keys = {"file_path", "url", "uri", "path"}
+                    is_source = any(k in tool_args for k in source_keys)
+                    if not is_source:
+                        current_prev_output = last_output
                 else:
                     # Data provided? Check if it's a reference or raw data
                     for k, v in list(tool_args.items()):
@@ -174,7 +225,7 @@ class PipelineClient:
                                     del tool_args[k]
                                 else:
                                     # Limitation: ZenML doesn't materialize artifacts nested in dicts
-                                    print(f"[Warning] Tool argument '{k}' uses a step reference '{v}'. "
+                                    logger.warning(f" Tool argument '{k}' uses a step reference '{v}'. "
                                           "Currently, only the 'data' field supports cross-step dependencies. "
                                           "This value will be passed as a raw string.")
                             else:

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/server.py RENAMED Viewed

@@ -3,7 +3,9 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import yaml
 from pathlib import Path
 from fastmcp import FastMCP
+from loguru import logger
 import inspect
+import functools
 class ZemServer(FastMCP):
     """
@@ -22,9 +24,24 @@ class ZemServer(FastMCP):
         self.parameter_file = parameter_file
         self.parameters = {}
-        # 1. Load from file
+        # Configure logging based on ZEM_VERBOSE
+        import os
+        import sys
+        if os.environ.get("ZEM_VERBOSE"):
+            logger.remove()
+            logger.add(sys.stderr, level="DEBUG", format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+        # 1. Load from file or auto-detect in server directory
         if parameter_file:
             self.load_parameters(parameter_file)
+        else:
+            # Auto-detect parameters.yml in the same directory as the server script
+            import inspect
+            caller_frame = inspect.stack()[1]
+            caller_file = caller_frame.filename
+            auto_path = Path(caller_file).parent / "parameters.yml"
+            if auto_path.exists():
+                self.load_parameters(str(auto_path))
         # 2. Override with env params (from PipelineClient)
         import os
@@ -35,7 +52,7 @@ class ZemServer(FastMCP):
                 if isinstance(env_params, dict):
                     self._merge_parameters(env_params)
             except Exception as e:
-                print(f"Error loading ZEM_PARAMETERS: {e}")
+                logger.error(f"Error loading ZEM_PARAMETERS: {e}")
     def load_parameters(self, file_path: str) -> Dict[str, Any]:
         """Load parameters from YAML file and merge them."""
@@ -79,8 +96,8 @@ class ZemServer(FastMCP):
             else:
                 target[k] = v
-    # Removed custom tool decorator to fix multiple values for argument 'name' error
-    # Inherit directly from FastMCP.tool
+    # NOTE: Parameter injection is handled by PipelineClient._merge_parameters
+    # and the tool decorator is inherited from FastMCP
     def get_data(self, data: Any) -> List[Dict[str, Any]]:
         """

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/data_juicer/server.py RENAMED Viewed

@@ -9,7 +9,7 @@ from loguru import logger
 logger.remove()
 logger.add(sys.stderr, level="INFO")
-server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
+server = ZemServer("data_juicer", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
 @server.tool()
 def clean_content(

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/instruction_gen/server.py RENAMED Viewed

@@ -10,7 +10,7 @@ from loguru import logger
 logger.remove()
 logger.add(sys.stderr, level="INFO")
-server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
+server = ZemServer("instruction", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
 @server.tool()
 def generate_qa_pairs(

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/io/server.py RENAMED Viewed

@@ -7,7 +7,7 @@ from loguru import logger
 import sys
 # Initialize server
-server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
+server = ZemServer("io", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
 @server.tool()
 def load_jsonl(path: str, return_reference: bool = False) -> Any:

xfmr_zem-0.2.5/src/xfmr_zem/servers/llm/parameters.yml ADDED Viewed

@@ -0,0 +1,10 @@
+# Default parameters for LLM Server
+mask_pii:
+  provider: "ollama"
+classify_domain:
+  provider: "ollama"
+  categories:
+    - Tech
+    - Finance
+    - Legal

{xfmr_zem-0.2.4 → xfmr_zem-0.2.5}/src/xfmr_zem/servers/nemo_curator/server.py RENAMED Viewed

@@ -10,7 +10,7 @@ from loguru import logger
 logger.remove()
 logger.add(sys.stderr, level="INFO")
-server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))
+server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameters.yml"))
 @server.tool()
 def normalize(

xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py ADDED Viewed

@@ -0,0 +1,90 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+import sys
+import threading
+import pdfplumber
+from .ocr import OCR
+from .recognizer import Recognizer
+from .layout_recognizer import LayoutRecognizerDocLayoutYOLO as LayoutRecognizer
+from .table_structure_recognizer import TableStructureRecognizer
+# from .engine import VietDocEngine
+# New Phase-Based Architecture
+from .pipeline import DocumentPipeline
+from .phases import (
+    LayoutAnalysisPhase,
+    TextDetectionPhase,
+    TextRecognitionPhase,
+    PostProcessingPhase,
+    DocumentReconstructionPhase,
+)
+from .implementations import (
+    DocLayoutYOLOAnalyzer,
+    PaddleOCRTextDetector,
+    VietOCRRecognizer,
+    SVTRv2Recognizer,
+    LandingAIRecognizer,
+    VietnameseTextPostProcessor,
+    SmartMarkdownReconstruction,
+    create_default_pipeline,
+    create_svtrv2_pipeline,
+    create_experimental_pipeline,
+)
+LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
+if LOCK_KEY_pdfplumber not in sys.modules:
+    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
+# Removed init_in_out
+__all__ = [
+    # Legacy API (backward compatibility)
+    "OCR",
+    "Recognizer",
+    "LayoutRecognizer",
+    "TableStructureRecognizer",
+    # "VietDocEngine",
+    # "init_in_out",
+    # New Phase-Based Architecture
+    "DocumentPipeline",
+    # Abstract Phase Interfaces
+    "LayoutAnalysisPhase",
+    "TextDetectionPhase",
+    "TextRecognitionPhase",
+    "PostProcessingPhase",
+    "DocumentReconstructionPhase",
+    # Concrete Implementations
+    "DocLayoutYOLOAnalyzer",
+    "PaddleOCRTextDetector",
+    "VietOCRRecognizer",
+    "SVTRv2Recognizer",
+    "LandingAIRecognizer",
+    "VietnameseTextPostProcessor",
+    "SmartMarkdownReconstruction",
+    # Factory Functions
+    "create_default_pipeline",
+    "create_svtrv2_pipeline",
+    "create_experimental_pipeline",
+]

xfmr-zem 0.2.4__tar.gz → 0.2.5__tar.gz

xfmr-zem 0.2.4__tar.gz → 0.2.5__tar.gz