xfmr-zem 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,43 +1,101 @@
1
+ import os
1
2
  import pandas as pd
2
3
  from xfmr_zem.server import ZemServer
3
4
  from xfmr_zem.servers.ocr.engines import OCREngineFactory
4
5
  from loguru import logger
6
+ from PIL import Image
7
+ import io
5
8
 
6
9
  # Initialize ZemServer for OCR
7
10
  mcp = ZemServer("ocr")
8
11
 
12
def extract_pdf_pages(
    file_path: str,
    engine: str,
    ocr_engine,
    model_id: str = None,
    *,
    min_text_chars: int = 50,
    zoom: float = 2.0,
):
    """Extract text from every page of a PDF, running OCR on pages that look scanned.

    A page is treated as scanned when its embedded text layer, after
    stripping whitespace, is shorter than ``min_text_chars``. Such pages are
    rendered to a PNG and handed to ``ocr_engine``; all other pages return
    their embedded text unchanged.

    Args:
        file_path: Path to the PDF file.
        engine: Engine name. Only used for the log message and to build the
            ``"<engine>_ocr"`` label stored in each OCR'd row.
        ocr_engine: Object exposing ``process(path)`` that returns a mapping
            with a ``"text"`` key.
        model_id: Accepted for signature compatibility; not used here.
        min_text_chars: Pages with fewer stripped characters than this are OCR'd.
        zoom: Scale factor applied on both axes when rendering a page for OCR.

    Returns:
        A list with one dict per page, in page order, with the keys
        ``"text"``, ``"page"`` (1-based), ``"engine"`` (``"digital_pdf"`` or
        ``"<engine>_ocr"``) and ``"metadata"``.
    """
    import tempfile

    import fitz  # PyMuPDF

    results = []
    doc = fitz.open(file_path)
    try:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()

            # Heuristic: little or no embedded text means the page is an image scan.
            is_scanned = len(text) < min_text_chars

            if is_scanned:
                logger.info(f"Page {page_number} appears scanned. Running OCR with {engine}...")
                # Render at a higher resolution; larger glyphs help OCR accuracy.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

                # The engines take a file path, so the render has to hit disk.
                # mkstemp gives a unique name, so concurrent extractions cannot
                # overwrite each other's page images, and it is portable.
                fd, temp_path = tempfile.mkstemp(prefix="ocr_page_", suffix=".png")
                try:
                    with os.fdopen(fd, "wb") as handle:
                        handle.write(pix.tobytes("png"))
                    ocr_result = ocr_engine.process(temp_path)
                    final_text = ocr_result["text"]
                    source = f"{engine}_ocr"
                finally:
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            else:
                final_text = text
                source = "digital_pdf"

            results.append({
                "text": final_text,
                "page": page_number,
                "engine": source,
                "metadata": {"file": file_path, "is_scanned": is_scanned},
            })
    finally:
        # Release the document even when a page fails to render or OCR.
        doc.close()

    return results
57
+
@mcp.tool()
async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> pd.DataFrame:
    """
    Extracts text from an image or PDF using the specified OCR engine.
    For PDFs, it will automatically handle scanned pages using the OCR engine.

    Args:
        file_path: Path to the image or PDF file.
        engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
        model_id: Optional model ID for the 'huggingface' engine (e.g., "Qwen/Qwen2-VL-2B-Instruct").

    Returns:
        The extracted rows as a list of dicts (``DataFrame.to_dict(orient="records")``):
        one row per page for a PDF, a single row for an image.
        NOTE(review): the signature is annotated ``pd.DataFrame`` although a list
        of records is returned; the annotation is left unchanged in case the tool
        decorator inspects it -- confirm before changing.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If engine creation or extraction fails; the original
            exception is chained as ``__cause__``.
    """
    logger.info(f"OCR Extraction: {file_path} using {engine}")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        # Get engine from factory
        ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)

        # A PDF is recognised by its extension, not by its content.
        if file_path.lower().endswith(".pdf"):
            logger.info(f"Processing PDF file: {file_path}")
            data = extract_pdf_pages(file_path, engine, ocr_engine, model_id)
            df = pd.DataFrame(data)
        else:
            # Single image: wrap the result in a list so pandas builds exactly one row.
            result = ocr_engine.process(file_path)
            df = pd.DataFrame([{
                "text": result["text"],
                "engine": result["engine"],
                "metadata": result["metadata"],
            }])

        logger.info(f"Successfully extracted text from {file_path}")
        return df.to_dict(orient="records")

    except Exception as e:
        # logger.exception records the message and the active traceback together.
        logger.exception(f"OCR Error with {engine}: {e}")
        raise RuntimeError(f"OCR failed: {str(e)}") from e
42
100
 
43
101
  if __name__ == "__main__":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -51,6 +51,7 @@ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
51
51
  Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
52
52
  Requires-Dist: pillow>=10.0.0; extra == 'ocr'
53
53
  Requires-Dist: pyclipper; extra == 'ocr'
54
+ Requires-Dist: pymupdf>=1.23.0; extra == 'ocr'
54
55
  Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
55
56
  Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
56
57
  Requires-Dist: shapely; extra == 'ocr'
@@ -18,7 +18,7 @@ xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B
18
18
  xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
19
19
  xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
20
20
  xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
21
- xfmr_zem/servers/ocr/server.py,sha256=Yef1CYJR5RDH38jffgbcpGE-1VZLaU4w1wi572oPZcY,1571
21
+ xfmr_zem/servers/ocr/server.py,sha256=lhT5rfpz2vPbdbM7RZAtsJKfZEqoth2ijEifMA_vWSA,3590
22
22
  xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
23
23
  xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
24
24
  xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
51
51
  xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
52
52
  xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
53
53
  xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
54
- xfmr_zem-0.2.5.dist-info/METADATA,sha256=QxGjfN7Y4zZOGmcDwohYh9HcFj2JDw7XmKyC4400z6M,6332
55
- xfmr_zem-0.2.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
- xfmr_zem-0.2.5.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
- xfmr_zem-0.2.5.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
- xfmr_zem-0.2.5.dist-info/RECORD,,
54
+ xfmr_zem-0.2.6.dist-info/METADATA,sha256=E2bD-td6hx7ntkKGH3XIgwH8BTUADbTE5T-cFi21Qds,6379
55
+ xfmr_zem-0.2.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
+ xfmr_zem-0.2.6.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
+ xfmr_zem-0.2.6.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
+ xfmr_zem-0.2.6.dist-info/RECORD,,