xfmr-zem 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -2,3 +2,6 @@
2
2
  extract_text:
3
3
  engine: "tesseract"
4
4
  model_id: null
5
+ scanned_threshold: 50
6
+ zoom: 2.0
7
+ temp_dir: "/tmp"
@@ -9,29 +9,39 @@ import io
9
9
  # Initialize ZemServer for OCR
10
10
  mcp = ZemServer("ocr")
11
11
 
12
- def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = None):
12
+ def extract_pdf_pages(
13
+ file_path: str,
14
+ engine: str,
15
+ ocr_engine,
16
+ scanned_threshold: int = 50,
17
+ zoom: float = 2.0,
18
+ temp_dir: str = "/tmp"
19
+ ):
13
20
  """Helper to process PDF pages with optional OCR for scanned content."""
14
21
  import fitz # PyMuPDF
15
22
 
16
23
  results = []
17
24
  doc = fitz.open(file_path)
18
25
 
26
+ # Ensure temp_dir exists
27
+ os.makedirs(temp_dir, exist_ok=True)
28
+
19
29
  for page_num in range(len(doc)):
20
30
  page = doc[page_num]
21
31
  text = page.get_text().strip()
22
32
 
23
33
  # Determine if we need to OCR (Strategy: text is too short or empty)
24
- is_scanned = len(text) < 50
34
+ is_scanned = len(text) < scanned_threshold
25
35
 
26
36
  if is_scanned:
27
- logger.info(f"Page {page_num + 1} appears scanned. Running OCR with {engine}...")
37
+ logger.info(f"Page {page_num + 1} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
28
38
  # Render page to image for OCR
29
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better OCR
39
+ pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
30
40
  img_data = pix.tobytes("png")
31
41
  img = Image.open(io.BytesIO(img_data))
32
42
 
33
43
  # Temporary save for engine compatibility (engines expect path)
34
- temp_path = f"/tmp/ocr_page_{page_num}.png"
44
+ temp_path = os.path.join(temp_dir, f"ocr_page_{os.getpid()}_{page_num}.png")
35
45
  img.save(temp_path)
36
46
 
37
47
  try:
@@ -56,7 +66,14 @@ def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = N
56
66
  return results
57
67
 
58
68
  @mcp.tool()
59
- async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> pd.DataFrame:
69
+ async def extract_text(
70
+ file_path: str,
71
+ engine: str = "tesseract",
72
+ model_id: str = None,
73
+ scanned_threshold: int = 50,
74
+ zoom: float = 2.0,
75
+ temp_dir: str = "/tmp"
76
+ ) -> pd.DataFrame:
60
77
  """
61
78
  Extracts text from an image or PDF using the specified OCR engine.
62
79
  For PDFs, it will automatically handle scanned pages using the OCR engine.
@@ -65,8 +82,11 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
65
82
  file_path: Path to the image or PDF file.
66
83
  engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
67
84
  model_id: Optional model ID for the 'huggingface' engine.
85
+ scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
86
+ zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
87
+ temp_dir: Directory for temporary page images. Defaults to "/tmp".
68
88
  """
69
- logger.info(f"OCR Extraction: {file_path} using {engine}")
89
+ logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")
70
90
 
71
91
  if not os.path.exists(file_path):
72
92
  raise FileNotFoundError(f"File not found: {file_path}")
@@ -78,7 +98,14 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
78
98
  # Handle PDF vs Image
79
99
  if file_path.lower().endswith(".pdf"):
80
100
  logger.info(f"Processing PDF file: {file_path}")
81
- data = extract_pdf_pages(file_path, engine, ocr_engine, model_id)
101
+ data = extract_pdf_pages(
102
+ file_path,
103
+ engine,
104
+ ocr_engine,
105
+ scanned_threshold=scanned_threshold,
106
+ zoom=zoom,
107
+ temp_dir=temp_dir
108
+ )
82
109
  df = pd.DataFrame(data)
83
110
  else:
84
111
  # Process image
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -17,8 +17,8 @@ xfmr_zem/servers/nemo_curator/parameters.yml,sha256=EGEzo0heI-ajkwFFy3xxq_YD7cXU
17
17
  xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B-XBcjGxQak,3848
18
18
  xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
19
19
  xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
20
- xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
21
- xfmr_zem/servers/ocr/server.py,sha256=lhT5rfpz2vPbdbM7RZAtsJKfZEqoth2ijEifMA_vWSA,3590
20
+ xfmr_zem/servers/ocr/parameters.yml,sha256=UTMwtTu0Eeit0tFkYcZOxpuzD78UBlpONXZIx6STYwc,144
21
+ xfmr_zem/servers/ocr/server.py,sha256=eJtQnMVBFX6PLZMxZITNlNEXGarjsvkz003-uT1iIo0,4369
22
22
  xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
23
23
  xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
24
24
  xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
51
51
  xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
52
52
  xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
53
53
  xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
54
- xfmr_zem-0.2.6.dist-info/METADATA,sha256=E2bD-td6hx7ntkKGH3XIgwH8BTUADbTE5T-cFi21Qds,6379
55
- xfmr_zem-0.2.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
- xfmr_zem-0.2.6.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
- xfmr_zem-0.2.6.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
- xfmr_zem-0.2.6.dist-info/RECORD,,
54
+ xfmr_zem-0.2.7.dist-info/METADATA,sha256=Iv77eb-eHw6rdJhG1LfoNY4Hf9I7oFlIsx1K3K7_sH0,6379
55
+ xfmr_zem-0.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
+ xfmr_zem-0.2.7.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
+ xfmr_zem-0.2.7.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
+ xfmr_zem-0.2.7.dist-info/RECORD,,