xfmr-zem 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. xfmr_zem/cli.py +32 -3
  2. xfmr_zem/client.py +59 -8
  3. xfmr_zem/server.py +21 -4
  4. xfmr_zem/servers/data_juicer/server.py +1 -1
  5. xfmr_zem/servers/instruction_gen/server.py +1 -1
  6. xfmr_zem/servers/io/server.py +1 -1
  7. xfmr_zem/servers/llm/parameters.yml +10 -0
  8. xfmr_zem/servers/nemo_curator/server.py +1 -1
  9. xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  10. xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  11. xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  12. xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  13. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  14. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  15. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  16. xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  17. xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  18. xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  19. xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  20. xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  21. xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  22. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  23. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  24. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  25. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  26. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  27. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  28. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  29. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  30. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  31. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  32. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  33. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  34. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  35. xfmr_zem/servers/ocr/engines.py +242 -0
  36. xfmr_zem/servers/ocr/install_models.py +63 -0
  37. xfmr_zem/servers/ocr/parameters.yml +4 -0
  38. xfmr_zem/servers/ocr/server.py +44 -0
  39. xfmr_zem/servers/profiler/parameters.yml +4 -0
  40. xfmr_zem/servers/sinks/parameters.yml +6 -0
  41. xfmr_zem/servers/unstructured/parameters.yml +6 -0
  42. xfmr_zem/servers/unstructured/server.py +62 -0
  43. xfmr_zem/zenml_wrapper.py +20 -7
  44. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/METADATA +19 -1
  45. xfmr_zem-0.2.5.dist-info/RECORD +58 -0
  46. xfmr_zem-0.2.2.dist-info/RECORD +0 -23
  47. /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
  48. /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
  49. /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
  50. /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
  51. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/WHEEL +0 -0
  52. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/entry_points.txt +0 -0
  53. {xfmr_zem-0.2.2.dist-info → xfmr_zem-0.2.5.dist-info}/licenses/LICENSE +0 -0
xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py ADDED
@@ -0,0 +1,37 @@
+ import yaml
+
+ def load_config(config_file):
+     with open(config_file, encoding='utf-8') as f:
+         config = yaml.safe_load(f)
+
+     return config
+
+ class Cfg(dict):
+     def __init__(self, config_dict):
+         super(Cfg, self).__init__(**config_dict)
+         self.__dict__ = self
+
+     @staticmethod
+     def load_config_from_file(fname, base_file=None):
+         from pathlib import Path
+         if base_file is None:
+             base_file = Path(__file__).resolve().parent.parent / 'config' / 'base.yml'
+
+         base_config = load_config(base_file)
+
+         with open(fname, encoding='utf-8') as f:
+             config = yaml.safe_load(f)
+         base_config.update(config)
+
+         return Cfg(base_config)
+
+     @staticmethod
+     def load_config_from_name(name):
+         from pathlib import Path
+         config_dir = Path(__file__).resolve().parent.parent / 'config'
+         return Cfg.load_config_from_file(config_dir / f'{name}.yml')
+
+
+     def save(self, fname):
+         with open(fname, 'w') as outfile:
+             yaml.dump(dict(self), outfile, default_flow_style=False, allow_unicode=True)
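For orientation, a minimal usage sketch of the new Cfg helper. This assumes the wheel is installed and loads the vgg-seq2seq.yml config that ships in this release; the output path is hypothetical.

# Sketch: load a bundled config by name; base.yml values are merged first,
# then overridden by vgg-seq2seq.yml (both ship in this wheel).
from xfmr_zem.servers.ocr.deepdoc_vietocr.vietocr.tool.config import Cfg

config = Cfg.load_config_from_name('vgg-seq2seq')
config['device'] = 'cpu'        # plain dict access works...
print(config.device)            # ...and so does attribute access, via self.__dict__ = self
config.save('/tmp/merged.yml')  # hypothetical output path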
xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py ADDED
@@ -0,0 +1,111 @@
+ import os
+ import torch
+ import numpy as np
+ import cv2
+ from ..model.vocab import Vocab
+ from ..model.transformerocr import VietOCR
+ import math
+ from PIL import Image
+
+
+ def translate(img, model, max_seq_length=128, sos_token=1, eos_token=2):
+     """data: BxCxHxW"""
+     model.eval()
+     device = img.device
+
+     with torch.no_grad():
+         src = model.cnn(img)
+         memory = model.transformer.forward_encoder(src)
+
+         translated_sentence = [[sos_token] * len(img)]
+         max_length = 0
+
+         while max_length <= max_seq_length and not all(np.any(np.asarray(translated_sentence).T == eos_token, axis=1)):
+             tgt_inp = torch.LongTensor(translated_sentence).to(device)
+             output, memory = model.transformer.forward_decoder(tgt_inp, memory)
+             output = output.to('cpu')
+
+             values, indices = torch.topk(output, 1)
+             indices = indices[:, -1, 0]
+             indices = indices.tolist()
+
+             translated_sentence.append(indices)
+             max_length += 1
+
+             del output
+
+         translated_sentence = np.asarray(translated_sentence).T
+
+     return translated_sentence
+
+
+ def build_model(config):
+     vocab = Vocab(config['vocab'])
+     device = config['device']
+
+     model = VietOCR(len(vocab),
+                     config['backbone'],
+                     config['cnn'],
+                     config['transformer'],
+                     config['seq_modeling'])
+
+     model = model.to(device)
+
+     if 'weights' in config and config['weights']:
+         weights = config['weights']
+         if weights.startswith('http'):
+             # Logic for downloading could go here, but we assume local path for now
+             pass
+
+         if os.path.exists(weights):
+             model.load_state_dict(torch.load(weights, map_location=device))
+             model.eval()
+         else:
+             import logging
+             logging.warning(f"Weight file not found: {weights}")
+
+     return model, vocab
+
+ def resize(w, h, expected_height, image_min_width, image_max_width):
+     new_w = int(expected_height * float(w) / float(h))
+     round_to = 10
+     new_w = math.ceil(new_w / round_to) * round_to
+     new_w = max(new_w, image_min_width)
+     new_w = min(new_w, image_max_width)
+
+     return new_w, expected_height
+
+ def process_image(image, image_height, image_min_width, image_max_width):
+     img = image.convert('RGB')
+
+     w, h = img.size
+     new_w, image_height = resize(w, h, image_height, image_min_width, image_max_width)
+
+     img = img.resize((new_w, image_height), Image.LANCZOS)
+
+     img = np.asarray(img).transpose(2, 0, 1)
+     img = img / 255
+     return img
+
+
+ def process_input(image, image_height, image_min_width, image_max_width):
+     img = process_image(image, image_height, image_min_width, image_max_width)
+     img = img[np.newaxis, ...]
+     img = torch.FloatTensor(img)
+     return img
+
+
+ class Predictor:
+     def __init__(self, config):
+         self.model, self.vocab = build_model(config)
+         self.config = config
+
+     def predict(self, img):
+         img_input = process_input(img, self.config['dataset']['image_height'],
+                                   self.config['dataset']['image_min_width'],
+                                   self.config['dataset']['image_max_width'])
+         img_input = img_input.to(self.config['device'])
+         s = translate(img_input, self.model)
+         s = s[0].tolist()
+         s = self.vocab.decode(s)
+         return s
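Predictor ties these pieces together. A minimal driving sketch, assuming the model weights referenced by the config have already been fetched (e.g. by install_models.py below) and that line.png is a cropped text-line image:

# Sketch: end-to-end recognition of a single cropped text line.
from PIL import Image
from xfmr_zem.servers.ocr.deepdoc_vietocr.vietocr.tool.config import Cfg
from xfmr_zem.servers.ocr.deepdoc_vietocr.vietocr.tool.translate import Predictor

config = Cfg.load_config_from_name('vgg-seq2seq')
config['device'] = 'cpu'                           # or 'cuda' if available
predictor = Predictor(config)                      # builds the VietOCR model + vocab
text = predictor.predict(Image.open('line.png'))   # hypothetical input image
print(text)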
xfmr_zem/servers/ocr/engines.py ADDED
@@ -0,0 +1,242 @@
+ import os
+ import abc
+ from typing import Dict, Any, List
+ from PIL import Image
+ from loguru import logger
+
+ class OCREngineBase(abc.ABC):
+     """
+     Abstract Base Class for OCR Engines (Dependency Inversion & Open/Closed).
+     """
+     @abc.abstractmethod
+     def process(self, image_path: str) -> Dict[str, Any]:
+         """Process an image and return extracted text and metadata."""
+         pass
+
+ class TesseractEngine(OCREngineBase):
+     """
+     Lightweight OCR using Tesseract (Fast & Simple).
+     """
+     def __init__(self):
+         logger.debug("TesseractEngine: Initializing...")
+         try:
+             import pytesseract
+             import shutil
+
+             logger.debug("TesseractEngine: Checking for tesseract binary...")
+             # Check if tesseract binary exists
+             if not shutil.which("tesseract"):
+                 raise RuntimeError(
+                     "Tesseract binary not found. To use the 'tesseract' engine, "
+                     "please install it using: sudo apt install tesseract-ocr"
+                 )
+
+             self.pytesseract = pytesseract
+             logger.debug("TesseractEngine: Initialization complete")
+         except ImportError:
+             logger.error("pytesseract not installed. Please install with 'pip install pytesseract'")
+             raise
+
+     def process(self, image_path: str) -> Dict[str, Any]:
+         logger.info(f"Using Tesseract to process: {image_path}")
+         image = Image.open(image_path)
+         text = self.pytesseract.image_to_string(image)
+         return {
+             "text": text,
+             "engine": "tesseract",
+             "metadata": {"format": image.format, "size": image.size}
+         }
+
+ class PaddleEngine(OCREngineBase):
+     """
+     Medium-weight OCR using PaddleOCR (High accuracy for multi-language).
+     """
+     def __init__(self):
+         logger.debug("PaddleEngine: Initializing...")
+         try:
+             logger.debug("PaddleEngine: Importing PaddleOCR...")
+             from paddleocr import PaddleOCR
+             logger.debug("PaddleEngine: Creating PaddleOCR instance (use_angle_cls=True, lang='en')...")
+             self.ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Default to English
+             logger.debug("PaddleEngine: Initialization complete")
+         except ImportError:
+             logger.error("paddleocr not installed. Please install with 'pip install paddleocr paddlepaddle'")
+             raise
+
+     def process(self, image_path: str) -> Dict[str, Any]:
+         logger.info(f"Using PaddleOCR to process: {image_path}")
+         result = self.ocr.ocr(image_path, cls=True)
+
+         full_text = []
+         scores = []
+         for line in result:
+             if line:
+                 for res in line:
+                     full_text.append(res[1][0])
+                     scores.append(float(res[1][1]))
+
+         return {
+             "text": "\n".join(full_text),
+             "engine": "paddleocr",
+             "metadata": {"avg_confidence": sum(scores)/len(scores) if scores else 0}
+         }
+
+ class HuggingFaceVLEngine(OCREngineBase):
+     """
+     Advanced OCR using Hugging Face Vision Language Models (e.g. Qwen2-VL, Molmo).
+     """
+     def __init__(self, model_id: str = "Qwen/Qwen2-VL-2B-Instruct"):
+         self.model_id = model_id or "Qwen/Qwen2-VL-2B-Instruct"
+         self.model = None
+         self.processor = None
+
+     def _lazy_load(self):
+         if self.model is None:
+             try:
+                 logger.debug(f"HuggingFaceVLEngine: Starting lazy load for model: {self.model_id}")
+                 import torch
+                 logger.debug(f"HuggingFaceVLEngine: PyTorch version={torch.__version__}, CUDA available={torch.cuda.is_available()}")
+                 from transformers import AutoModelForVision2Seq, AutoProcessor
+
+                 logger.info(f"Loading Hugging Face VL model: {self.model_id} (this may take a while)...")
+                 logger.debug(f"HuggingFaceVLEngine: Loading processor from {self.model_id}...")
+                 self.processor = AutoProcessor.from_pretrained(self.model_id)
+                 logger.debug("HuggingFaceVLEngine: Processor loaded successfully")
+
+                 # Use GPU if available
+                 device = "cuda" if torch.cuda.is_available() else "cpu"
+
+                 # Default to float32
+                 dtype = torch.float32
+                 if device == "cuda":
+                     # Check compute capability
+                     cc_major = torch.cuda.get_device_properties(0).major
+                     # Pascal (6.1) has poor FP16 performance, so use FP32.
+                     # Volta (7.0) and newer have good FP16 performance.
+                     if cc_major >= 7:
+                         dtype = torch.float16
+
+                 logger.debug(f"HuggingFaceVLEngine: Loading model with dtype={dtype}, device='{device}'...")
+                 # Using AutoModelForVision2Seq for generality
+                 self.model = AutoModelForVision2Seq.from_pretrained(
+                     self.model_id,
+                     torch_dtype=dtype,
+                     device_map=None,
+                     trust_remote_code=True
+                 ).to(device)
+                 self._device = device
+                 logger.debug(f"HuggingFaceVLEngine: Model loaded successfully on {device}")
+             except ImportError:
+                 logger.error("transformers/torch not installed. Required for HuggingFace-VL.")
+                 raise
+             except Exception as e:
+                 logger.error(f"Error loading model {self.model_id}: {e}")
+                 raise
+
+     def process(self, image_path: str) -> Dict[str, Any]:
+         self._lazy_load()
+         logger.info(f"Using {self.model_id} via HuggingFaceVLEngine to process: {image_path}")
+
+         image = Image.open(image_path).convert("RGB")
+
+         # Use proper chat template format for Qwen2-VL
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": image},
+                     {"type": "text", "text": "Extract all text from this image exactly as it appears."}
+                 ]
+             }
+         ]
+
+         # Apply chat template for proper formatting
+         text_input = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = self.processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(self._device)
+
+         generated_ids = self.model.generate(**inputs, max_new_tokens=512)
+         # Only decode new tokens
+         generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         text = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
+
+         return {
+             "text": text,
+             "engine": "huggingface",
+             "metadata": {"model_id": self.model_id}
+         }
+
+ class VietOCREngine(OCREngineBase):
+     """
+     Specialized Vietnamese OCR using built-in Deep-ocr DocumentPipeline (Layout + OCR + MD).
+     """
+     def __init__(self):
+         logger.debug("VietOCREngine: Initializing...")
+         try:
+             logger.debug("VietOCREngine: Importing DocumentPipeline and components...")
+             from xfmr_zem.servers.ocr.deepdoc_vietocr.pipeline import DocumentPipeline
+             from xfmr_zem.servers.ocr.deepdoc_vietocr.implementations import (
+                 PaddleStructureV3Analyzer,
+                 PaddleOCRTextDetector,
+                 VietOCRRecognizer,
+                 VietnameseTextPostProcessor,
+                 SmartMarkdownReconstruction
+             )
+
+             logger.info("Initializing Internal Deep-ocr DocumentPipeline for Vietnamese...")
+             logger.debug("VietOCREngine: Creating PaddleStructureV3Analyzer...")
+             layout_analyzer = PaddleStructureV3Analyzer()
+             logger.debug("VietOCREngine: Creating PaddleOCRTextDetector...")
+             text_detector = PaddleOCRTextDetector()
+             logger.debug("VietOCREngine: Creating VietOCRRecognizer...")
+             text_recognizer = VietOCRRecognizer()
+             logger.debug("VietOCREngine: Creating VietnameseTextPostProcessor...")
+             post_processor = VietnameseTextPostProcessor()
+             logger.debug("VietOCREngine: Creating SmartMarkdownReconstruction...")
+             reconstructor = SmartMarkdownReconstruction()
+
+             logger.debug("VietOCREngine: Assembling DocumentPipeline...")
+             self.pipeline = DocumentPipeline(
+                 layout_analyzer=layout_analyzer,
+                 text_detector=text_detector,
+                 text_recognizer=text_recognizer,
+                 post_processor=post_processor,
+                 reconstructor=reconstructor
+             )
+             logger.debug("VietOCREngine: Initialization complete")
+         except Exception as e:
+             logger.error(f"Error loading internal Deep-ocr components: {e}")
+             import traceback
+             logger.error(traceback.format_exc())
+             raise
+
+     def process(self, image_path: str) -> Dict[str, Any]:
+         logger.info(f"Using Internal Deep-ocr (DocumentPipeline) to process: {image_path}")
+         from PIL import Image
+
+         img = Image.open(image_path)
+
+         # pipeline.process returns reconstructed markdown text
+         markdown_text = self.pipeline.process(img)
+
+         return {
+             "text": markdown_text,
+             "engine": "deepdoc_vietocr",
+             "metadata": {"format": "markdown"}
+         }
+
+ class OCREngineFactory:
+     """
+     Factory to create OCR engines (Switching strategy).
+     """
+     @staticmethod
+     def get_engine(engine_type: str, **kwargs) -> OCREngineBase:
+         if engine_type == "tesseract":
+             return TesseractEngine()
+         elif engine_type == "paddle":
+             return PaddleEngine()
+         elif engine_type == "huggingface" or engine_type == "qwen":
+             return HuggingFaceVLEngine(model_id=kwargs.get("model_id"))
+         elif engine_type == "viet":
+             return VietOCREngine()
+         else:
+             raise ValueError(f"Unknown engine type: {engine_type}")
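The factory is the only public entry point for engine selection. A short sketch of how it is driven; the input file name and model id are assumptions, and each engine pulls in its own optional dependencies:

# Sketch: strategy selection via the factory. TesseractEngine needs the
# tesseract binary, PaddleEngine needs paddleocr, and HuggingFaceVLEngine
# needs torch + transformers (its weights load lazily on first process()).
from xfmr_zem.servers.ocr.engines import OCREngineFactory

engine = OCREngineFactory.get_engine("huggingface", model_id="Qwen/Qwen2-VL-2B-Instruct")
result = engine.process("invoice.png")   # hypothetical input file
print(result["engine"], result["metadata"])
print(result["text"][:200])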
xfmr_zem/servers/ocr/install_models.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import urllib.request
+ from tqdm import tqdm
+ import logging
+ from pathlib import Path
+
+ logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+ def download_file(url, file_path):
+     class DownloadProgressBar(tqdm):
+         def update_to(self, b=1, bsize=1, tsize=None):
+             if tsize is not None:
+                 self.total = tsize
+             self.update(b * bsize - self.n)
+
+     if os.path.exists(file_path):
+         logging.info(f"File already exists: {file_path}")
+         return
+
+     logging.info(f"Downloading {url} to {file_path}")
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+     try:
+         with DownloadProgressBar(unit='B', unit_scale=True,
+                                  miniters=1, desc=url.split('/')[-1]) as t:
+             urllib.request.urlretrieve(url, filename=file_path, reporthook=t.update_to)
+         logging.info("Download completed.")
+     except Exception as e:
+         logging.error(f"Failed to download: {e}")
+         if os.path.exists(file_path):
+             os.remove(file_path)
+         raise e
+
+ def main():
+     base_dir = Path(__file__).resolve().parent / "deepdoc_vietocr"
+
+     models = [
+         # Detection
+         ("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/detection/v5/det.onnx",
+          base_dir / "onnx" / "det.onnx"),
+
+         # Layout Analysis
+         ("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/layout/v1/layout.onnx",
+          base_dir / "onnx" / "layout.onnx"),
+
+         # Table Structure
+         ("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/tsr/v1/tsr.onnx",
+          base_dir / "onnx" / "tsr.onnx"),
+
+         # VietOCR Recognition
+         ("https://github.com/p_nhm/vietocr-weights/raw/main/vgg_seq2seq.pth",
+          base_dir / "vietocr" / "weight" / "vgg_seq2seq.pth")
+     ]
+
+     logging.info("Starting OCR model installation...")
+     for url, path in models:
+         try:
+             download_file(url, str(path))
+         except Exception as e:
+             logging.error(f"Skipping {path} due to error: {e}")
+
+ if __name__ == "__main__":
+     main()
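The installer is idempotent (existing files are skipped, partial downloads are removed on failure), so it can also be reused piecemeal. A sketch of fetching a single model with download_file; the target path is hypothetical:

# Sketch: fetch one ONNX model into a custom location; the call is a no-op
# if the file already exists.
from xfmr_zem.servers.ocr.install_models import download_file

download_file(
    "https://huggingface.co/monkt/paddleocr-onnx/resolve/main/detection/v5/det.onnx",
    "/tmp/xfmr_zem_models/det.onnx",  # hypothetical target path
)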
xfmr_zem/servers/ocr/parameters.yml ADDED
@@ -0,0 +1,4 @@
+ # Default parameters for OCR Server
+ extract_text:
+   engine: "tesseract"
+   model_id: null
xfmr_zem/servers/ocr/server.py ADDED
@@ -0,0 +1,44 @@
+ import pandas as pd
+ from xfmr_zem.server import ZemServer
+ from xfmr_zem.servers.ocr.engines import OCREngineFactory
+ from loguru import logger
+
+ # Initialize ZemServer for OCR
+ mcp = ZemServer("ocr")
+
+ @mcp.tool()
+ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> list:
+     """
+     Extracts text from an image using the specified OCR engine.
+
+     Args:
+         file_path: Path to the image file.
+         engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
+         model_id: Optional model ID for the 'huggingface' engine (e.g., "Qwen/Qwen2-VL-2B-Instruct").
+     """
+     logger.info(f"OCR Extraction: {file_path} using {engine} (model: {model_id})")
+
+     try:
+         # Get engine from factory (SOLID Strategy Pattern)
+         ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)
+
+         # Process image
+         result = ocr_engine.process(file_path)
+
+         # Structure as a single-row DataFrame for Zem compatibility
+         # We wrap in a list to ensure pandas creates a row
+         df = pd.DataFrame([{
+             "text": result["text"],
+             "engine": result["engine"],
+             "metadata": result["metadata"]
+         }])
+
+         logger.info(f"Successfully extracted text using {engine}")
+         return df.to_dict(orient="records")
+
+     except Exception as e:
+         logger.error(f"OCR Error with {engine}: {e}")
+         raise RuntimeError(f"OCR failed: {str(e)}")
+
+ if __name__ == "__main__":
+     mcp.run()
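With the server registered, the tool can be exercised through the wrapper's raw JSON-RPC helper. A sketch only: the command and module path are assumptions, and the params shape follows the standard MCP tools/call convention:

# Sketch: call extract_text over stdio via the zenml_wrapper helper.
import os
from xfmr_zem.zenml_wrapper import run_mcp_tool

result = run_mcp_tool(
    "python", ["-m", "xfmr_zem.servers.ocr.server"], os.environ.copy(),
    "tools/call",
    {"name": "extract_text",
     "arguments": {"file_path": "scan.png", "engine": "viet"}},  # hypothetical image
)
print(result)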
xfmr_zem/servers/profiler/parameters.yml ADDED
@@ -0,0 +1,4 @@
+ # Default parameters for Profiler Server
+ profile_data:
+   text_column: "text"
+   include_stats: true
xfmr_zem/servers/sinks/parameters.yml ADDED
@@ -0,0 +1,6 @@
+ # Default parameters for Sinks Server
+ to_huggingface:
+   private: true
+
+ to_vector_db:
+   provider: "pinecone"
xfmr_zem/servers/unstructured/parameters.yml ADDED
@@ -0,0 +1,6 @@
+ # Default parameters for Unstructured Server
+ parse_document:
+   strategy: "fast"
+
+ extract_tables:
+   strategy: "hi_res"
xfmr_zem/servers/unstructured/server.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import pandas as pd
+ from xfmr_zem.server import ZemServer
+ from unstructured.partition.auto import partition
+ from loguru import logger
+
+ # Initialize ZemServer for Unstructured
+ mcp = ZemServer("unstructured")
+
+ @mcp.tool()
+ async def parse_document(file_path: str, strategy: str = "fast") -> pd.DataFrame:
+     """
+     Parses a document (PDF, DOCX, HTML, etc.) and returns all text segments as a DataFrame.
+
+     Args:
+         file_path: Path to the document file.
+         strategy: Partitioning strategy ("fast", "hi_res", "ocr_only"). Defaults to "fast".
+     """
+     logger.info(f"Parsing document: {file_path} with strategy: {strategy}")
+
+     if not os.path.exists(file_path):
+         raise FileNotFoundError(f"File not found: {file_path}")
+
+     # Use unstructured to partition the file
+     elements = partition(filename=file_path, strategy=strategy)
+
+     # Convert elements to a list of dicts
+     data = []
+     for el in elements:
+         data.append({
+             "text": str(el),
+             "type": el.category,
+             "element_id": el.id,
+             "metadata": el.metadata.to_dict() if hasattr(el, "metadata") else {}
+         })
+
+     df = pd.DataFrame(data)
+     logger.info(f"Extracted {len(df)} elements from {file_path}")
+     return df
+
+ @mcp.tool()
+ async def extract_tables(file_path: str) -> pd.DataFrame:
+     """
+     Specifically extracts tables from a document and returns them.
+     Note: Requires 'hi_res' strategy internally.
+     """
+     logger.info(f"Extracting tables from: {file_path}")
+
+     # Partition with hi_res to get table structure
+     elements = partition(filename=file_path, strategy="hi_res")
+
+     # Filter for Table elements
+     tables = [str(el) for el in elements if el.category == "Table"]
+
+     if not tables:
+         logger.warning(f"No tables found in {file_path}")
+         return pd.DataFrame(columns=["table_content"])
+
+     return pd.DataFrame({"table_content": tables})
+
+ if __name__ == "__main__":
+     mcp.run()
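For local testing outside MCP, the tool coroutines can be awaited directly. A sketch under two assumptions: that ZemServer's tool() decorator returns the original function, and that the unstructured extras are installed:

# Sketch: direct invocation of parse_document without the MCP transport.
import asyncio
from xfmr_zem.servers.unstructured.server import parse_document

df = asyncio.run(parse_document("report.pdf", strategy="fast"))  # hypothetical file
print(df[["type", "text"]].head())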
xfmr_zem/zenml_wrapper.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, Optional, List
  from zenml import step
  from mcp import ClientSession, StdioServerParameters
  from mcp.client.stdio import stdio_client
+ from loguru import logger
  import json
  import os
 
@@ -23,6 +24,10 @@ def run_mcp_tool(
      """
      cmd = [command] + args
 
+     # Forward stderr to sys.stderr for real-time logging in verbose mode
+     import sys
+     import threading
+
      process = subprocess.Popen(
          cmd,
          stdin=subprocess.PIPE,
@@ -32,6 +37,16 @@ def run_mcp_tool(
          text=True,
          bufsize=0
      )
+
+     # Start a thread to stream stderr
+     def stream_stderr():
+         for line in process.stderr:
+             sys.stderr.write(line)
+             sys.stderr.flush()
+
+     stderr_thread = threading.Thread(target=stream_stderr, daemon=True)
+     stderr_thread.start()
+
 
      try:
          # 1. Initialize
@@ -52,8 +67,7 @@
          while True:
              line = process.stdout.readline()
              if not line:
-                 err = process.stderr.read()
-                 raise RuntimeError(f"Server closed connection during init. Stderr: {err}")
+                 raise RuntimeError("Server closed connection during init. Check logs above for details.")
 
              if line.strip().startswith("{"):
                  try:
@@ -77,8 +91,7 @@
          while True:
              line = process.stdout.readline()
              if not line:
-                 err = process.stderr.read()
-                 raise RuntimeError(f"Server closed connection during {method}. Stderr: {err}")
+                 raise RuntimeError(f"Server closed connection during {method}. Check logs above for details.")
 
              if line.strip().startswith("{"):
                  try:
@@ -122,7 +135,7 @@ def list_mcp_tools(
          result = run_mcp_tool(command, args, env, "tools/list", {})
          return result.get("tools", [])
      except Exception as e:
-         print(f"Error listing tools: {e}")
+         logger.error(f"Error listing tools: {e}")
          return []
 
 
@@ -162,7 +175,7 @@ def mcp_generic_step(
      args = server_config.get("args", [])
      env = server_config.get("env", os.environ.copy())
 
-     print(f"[{server_name}] Executing tool '{tool_name}'")
+     logger.info(f"[{server_name}] Executing tool '{tool_name}'")
      start_time = time.time()
 
      try:
@@ -172,7 +185,7 @@
          }
          result_data = run_mcp_tool(command, args, env, "tools/call", params)
          execution_time = time.time() - start_time
-         print(f"[{server_name}] Tool '{tool_name}' finished in {execution_time:.2f}s")
+         logger.info(f"[{server_name}] Tool '{tool_name}' finished in {execution_time:.2f}s")
 
          output_data = {}
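Taken together, the wrapper changes mean a failing server's stderr now streams to the parent process in real time instead of being read once after the connection drops. A sketch of listing tools with the new behavior; the command, args, and the exact list_mcp_tools signature are assumptions inferred from the call sites above:

# Sketch: list the OCR server's tools; any server-side loguru output now
# appears on stderr while the subprocess is still running.
import os
from xfmr_zem.zenml_wrapper import list_mcp_tools

tools = list_mcp_tools("python", ["-m", "xfmr_zem.servers.ocr.server"], os.environ.copy())
print([t["name"] for t in tools])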