PyPI - xfmr-zem - Versions diffs - 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl - Mend

xfmr-zem 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

xfmr_zem/servers/ocr/parameters.yml CHANGED Viewed

@@ -2,3 +2,6 @@
 extract_text:
   engine: "tesseract"
   model_id: null
+  scanned_threshold: 50
+  zoom: 2.0
+  temp_dir: "/tmp"

xfmr_zem/servers/ocr/server.py CHANGED Viewed

@@ -1,43 +1,128 @@
+import os
 import pandas as pd
 from xfmr_zem.server import ZemServer
 from xfmr_zem.servers.ocr.engines import OCREngineFactory
 from loguru import logger
+from PIL import Image
+import io
 # Initialize ZemServer for OCR
 mcp = ZemServer("ocr")
+def extract_pdf_pages(
+    file_path: str,
+    engine: str,
+    ocr_engine,
+    scanned_threshold: int = 50,
+    zoom: float = 2.0,
+    temp_dir: str = "/tmp"
+):
+    """Helper to process PDF pages with optional OCR for scanned content."""
+    import fitz  # PyMuPDF
+    results = []
+    doc = fitz.open(file_path)
+    # Ensure temp_dir exists
+    os.makedirs(temp_dir, exist_ok=True)
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        text = page.get_text().strip()
+        # Determine if we need to OCR (Strategy: text is too short or empty)
+        is_scanned = len(text) < scanned_threshold
+        if is_scanned:
+            logger.info(f"Page {page_num + 1} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
+            # Render page to image for OCR
+            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+            img_data = pix.tobytes("png")
+            img = Image.open(io.BytesIO(img_data))
+            # Temporary save for engine compatibility (engines expect path)
+            temp_path = os.path.join(temp_dir, f"ocr_page_{os.getpid()}_{page_num}.png")
+            img.save(temp_path)
+            try:
+                ocr_result = ocr_engine.process(temp_path)
+                final_text = ocr_result["text"]
+                source = f"{engine}_ocr"
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
+        else:
+            final_text = text
+            source = "digital_pdf"
+        results.append({
+            "text": final_text,
+            "page": page_num + 1,
+            "engine": source,
+            "metadata": {"file": file_path, "is_scanned": is_scanned}
+        })
+    doc.close()
+    return results
 @mcp.tool()
-async def extract_text(file_path: str, engine: str = None, model_id: str = None) -> pd.DataFrame:
+async def extract_text(
+    file_path: str,
+    engine: str = "tesseract",
+    model_id: str = None,
+    scanned_threshold: int = 50,
+    zoom: float = 2.0,
+    temp_dir: str = "/tmp"
+) -> pd.DataFrame:
     """
-    Extracts text from an image using the specified OCR engine.
+    Extracts text from an image or PDF using the specified OCR engine.
+    For PDFs, it will automatically handle scanned pages using the OCR engine.
     Args:
-        file_path: Path to the image file.
+        file_path: Path to the image or PDF file.
         engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
-        model_id: Optional model ID for the 'huggingface' engine (e.g., "Qwen/Qwen2-VL-2B-Instruct").
+        model_id: Optional model ID for the 'huggingface' engine.
+        scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
+        zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
+        temp_dir: Directory for temporary page images. Defaults to "/tmp".
     """
-    logger.info(f"OCR Extraction: {file_path} using {engine} (model: {model_id})")
+    logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
     try:
-        # Get engine from factory (SOLID Strategy Pattern)
+        # Get engine from factory
         ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)
-        # Process image
-        result = ocr_engine.process(file_path)
-        # Structure as a single-row DataFrame for Zem compatibility
-        # We wrap in a list to ensure pandas creates a row
-        df = pd.DataFrame([{
-            "text": result["text"],
-            "engine": result["engine"],
-            "metadata": result["metadata"]
-        }])
+        # Handle PDF vs Image
+        if file_path.lower().endswith(".pdf"):
+            logger.info(f"Processing PDF file: {file_path}")
+            data = extract_pdf_pages(
+                file_path,
+                engine,
+                ocr_engine,
+                scanned_threshold=scanned_threshold,
+                zoom=zoom,
+                temp_dir=temp_dir
+            )
+            df = pd.DataFrame(data)
+        else:
+            # Process image
+            result = ocr_engine.process(file_path)
+            df = pd.DataFrame([{
+                "text": result["text"],
+                "engine": result["engine"],
+                "metadata": result["metadata"]
+            }])
-        logger.info(f"Successfully extracted text using {engine}")
+        logger.info(f"Successfully extracted text from {file_path}")
         return df.to_dict(orient="records")
     except Exception as e:
         logger.error(f"OCR Error with {engine}: {e}")
+        import traceback
+        logger.error(traceback.format_exc())
         raise RuntimeError(f"OCR failed: {str(e)}")
 if __name__ == "__main__":

{xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xfmr-zem
-Version: 0.2.5
+Version: 0.2.7
 Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
 Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
 Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -51,6 +51,7 @@ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
 Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
 Requires-Dist: pillow>=10.0.0; extra == 'ocr'
 Requires-Dist: pyclipper; extra == 'ocr'
+Requires-Dist: pymupdf>=1.23.0; extra == 'ocr'
 Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
 Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
 Requires-Dist: shapely; extra == 'ocr'

{xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.7.dist-info}/RECORD RENAMED Viewed

@@ -17,8 +17,8 @@ xfmr_zem/servers/nemo_curator/parameters.yml,sha256=EGEzo0heI-ajkwFFy3xxq_YD7cXU
 xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B-XBcjGxQak,3848
 xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
 xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
-xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
-xfmr_zem/servers/ocr/server.py,sha256=Yef1CYJR5RDH38jffgbcpGE-1VZLaU4w1wi572oPZcY,1571
+xfmr_zem/servers/ocr/parameters.yml,sha256=UTMwtTu0Eeit0tFkYcZOxpuzD78UBlpONXZIx6STYwc,144
+xfmr_zem/servers/ocr/server.py,sha256=eJtQnMVBFX6PLZMxZITNlNEXGarjsvkz003-uT1iIo0,4369
 xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
 xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
 xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
 xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
 xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
 xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
-xfmr_zem-0.2.5.dist-info/METADATA,sha256=QxGjfN7Y4zZOGmcDwohYh9HcFj2JDw7XmKyC4400z6M,6332
-xfmr_zem-0.2.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-xfmr_zem-0.2.5.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
-xfmr_zem-0.2.5.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
-xfmr_zem-0.2.5.dist-info/RECORD,,
+xfmr_zem-0.2.7.dist-info/METADATA,sha256=Iv77eb-eHw6rdJhG1LfoNY4Hf9I7oFlIsx1K3K7_sH0,6379
+xfmr_zem-0.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xfmr_zem-0.2.7.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
+xfmr_zem-0.2.7.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
+xfmr_zem-0.2.7.dist-info/RECORD,,

{xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.7.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

xfmr-zem 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

xfmr-zem 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl