xfmr-zem 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,3 +2,6 @@
2
2
  extract_text:
3
3
  engine: "tesseract"
4
4
  model_id: null
5
+ scanned_threshold: 50
6
+ zoom: 2.0
7
+ temp_dir: "/tmp"
@@ -1,43 +1,128 @@
1
+ import os
1
2
  import pandas as pd
2
3
  from xfmr_zem.server import ZemServer
3
4
  from xfmr_zem.servers.ocr.engines import OCREngineFactory
4
5
  from loguru import logger
6
+ from PIL import Image
7
+ import io
5
8
 
6
9
  # Initialize ZemServer for OCR
7
10
  mcp = ZemServer("ocr")
8
11
 
12
def extract_pdf_pages(
    file_path: str,
    engine: str,
    ocr_engine,
    scanned_threshold: int = 50,
    zoom: float = 2.0,
    temp_dir: str = "/tmp"
):
    """Extract text from every page of a PDF, running OCR on scanned pages.

    A page counts as scanned when its stripped embedded text is shorter than
    ``scanned_threshold`` characters. Scanned pages are rendered to a PNG,
    written to a temporary file inside ``temp_dir`` and passed to
    ``ocr_engine.process``; other pages keep their embedded text.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        engine: Engine name, used in log messages and to build the
            ``"<engine>_ocr"`` label of OCR'd pages.
        ocr_engine: Object whose ``process(path)`` returns a mapping with a
            ``"text"`` key.
        scanned_threshold: Minimum number of embedded characters a page needs
            in order to skip OCR.
        zoom: Scale factor applied to both axes when rendering a page.
        temp_dir: Directory for the temporary page images; created if missing.

    Returns:
        A list with one dict per page, in page order, holding the keys
        ``"text"``, ``"page"`` (1-based), ``"engine"`` and ``"metadata"``.

    Raises:
        Any exception raised while opening, rendering or OCR-ing propagates
        unchanged; the document is closed and the temporary image removed
        before it does.
    """
    import tempfile

    import fitz  # PyMuPDF

    results = []
    doc = fitz.open(file_path)
    try:
        # Ensure temp_dir exists
        os.makedirs(temp_dir, exist_ok=True)

        for page_num, page in enumerate(doc, start=1):
            text = page.get_text().strip()

            # Strategy: too little embedded text means the page is an image.
            is_scanned = len(text) < scanned_threshold

            if is_scanned:
                logger.info(f"Page {page_num} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
                # Render page to image for OCR
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img = Image.open(io.BytesIO(pix.tobytes("png")))

                # Engines expect a path. mkstemp picks a unique name, so
                # concurrent extractions in one process cannot overwrite or
                # delete each other's page image.
                fd, temp_path = tempfile.mkstemp(
                    prefix="ocr_page_", suffix=".png", dir=temp_dir
                )
                os.close(fd)
                try:
                    img.save(temp_path)
                    ocr_result = ocr_engine.process(temp_path)
                    final_text = ocr_result["text"]
                    source = f"{engine}_ocr"
                finally:
                    try:
                        os.remove(temp_path)
                    except FileNotFoundError:
                        # The engine may already have removed the file.
                        pass
            else:
                final_text = text
                source = "digital_pdf"

            results.append({
                "text": final_text,
                "page": page_num,
                "engine": source,
                "metadata": {"file": file_path, "is_scanned": is_scanned}
            })
    finally:
        # Release the file handle even when a page fails.
        doc.close()

    return results
67
+
@mcp.tool()
async def extract_text(
    file_path: str,
    engine: str = "tesseract",
    model_id: str = None,
    scanned_threshold: int = 50,
    zoom: float = 2.0,
    temp_dir: str = "/tmp"
) -> pd.DataFrame:
    """
    Extracts text from an image or PDF using the specified OCR engine.
    For PDFs, it will automatically handle scanned pages using the OCR engine.

    Args:
        file_path: Path to the image or PDF file.
        engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
        model_id: Optional model ID for the 'huggingface' engine.
        scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
        zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
        temp_dir: Directory for temporary page images. Defaults to "/tmp".

    Returns:
        A list of record dicts (``DataFrame.to_dict(orient="records")``): one
        per page for a PDF, a single one for an image.
        NOTE(review): the signature is annotated ``pd.DataFrame`` but records
        are returned; confirm whether ZemServer relies on the annotation
        before changing it.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If engine creation or extraction fails; the original
            exception is attached as ``__cause__``.
    """
    logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")

    # Checked outside the try block so a missing file is not wrapped in RuntimeError.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        # Get engine from factory
        ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)

        # Handle PDF vs Image
        if file_path.lower().endswith(".pdf"):
            logger.info(f"Processing PDF file: {file_path}")
            data = extract_pdf_pages(
                file_path,
                engine,
                ocr_engine,
                scanned_threshold=scanned_threshold,
                zoom=zoom,
                temp_dir=temp_dir
            )
            df = pd.DataFrame(data)
        else:
            # Process image
            result = ocr_engine.process(file_path)
            df = pd.DataFrame([{
                "text": result["text"],
                "engine": result["engine"],
                "metadata": result["metadata"]
            }])

        logger.info(f"Successfully extracted text from {file_path}")
        return df.to_dict(orient="records")

    except Exception as e:
        # logger.exception records the message together with the traceback.
        logger.exception(f"OCR Error with {engine}: {e}")
        # Chain explicitly so callers can reach the root cause.
        raise RuntimeError(f"OCR failed: {str(e)}") from e
42
127
 
43
128
  if __name__ == "__main__":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -51,6 +51,7 @@ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
51
51
  Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
52
52
  Requires-Dist: pillow>=10.0.0; extra == 'ocr'
53
53
  Requires-Dist: pyclipper; extra == 'ocr'
54
+ Requires-Dist: pymupdf>=1.23.0; extra == 'ocr'
54
55
  Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
55
56
  Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
56
57
  Requires-Dist: shapely; extra == 'ocr'
@@ -17,8 +17,8 @@ xfmr_zem/servers/nemo_curator/parameters.yml,sha256=EGEzo0heI-ajkwFFy3xxq_YD7cXU
17
17
  xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B-XBcjGxQak,3848
18
18
  xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
19
19
  xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
20
- xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
21
- xfmr_zem/servers/ocr/server.py,sha256=Yef1CYJR5RDH38jffgbcpGE-1VZLaU4w1wi572oPZcY,1571
20
+ xfmr_zem/servers/ocr/parameters.yml,sha256=UTMwtTu0Eeit0tFkYcZOxpuzD78UBlpONXZIx6STYwc,144
21
+ xfmr_zem/servers/ocr/server.py,sha256=eJtQnMVBFX6PLZMxZITNlNEXGarjsvkz003-uT1iIo0,4369
22
22
  xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
23
23
  xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
24
24
  xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
51
51
  xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
52
52
  xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
53
53
  xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
54
- xfmr_zem-0.2.5.dist-info/METADATA,sha256=QxGjfN7Y4zZOGmcDwohYh9HcFj2JDw7XmKyC4400z6M,6332
55
- xfmr_zem-0.2.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
- xfmr_zem-0.2.5.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
- xfmr_zem-0.2.5.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
- xfmr_zem-0.2.5.dist-info/RECORD,,
54
+ xfmr_zem-0.2.7.dist-info/METADATA,sha256=Iv77eb-eHw6rdJhG1LfoNY4Hf9I7oFlIsx1K3K7_sH0,6379
55
+ xfmr_zem-0.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
+ xfmr_zem-0.2.7.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
+ xfmr_zem-0.2.7.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
+ xfmr_zem-0.2.7.dist-info/RECORD,,