xfmr-zem 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfmr_zem/servers/ocr/server.py +75 -17
- {xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.6.dist-info}/METADATA +2 -1
- {xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.6.dist-info}/RECORD +6 -6
- {xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.6.dist-info}/WHEEL +0 -0
- {xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.6.dist-info}/entry_points.txt +0 -0
- {xfmr_zem-0.2.5.dist-info → xfmr_zem-0.2.6.dist-info}/licenses/LICENSE +0 -0
xfmr_zem/servers/ocr/server.py
CHANGED
|
@@ -1,43 +1,101 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import pandas as pd
|
|
2
3
|
from xfmr_zem.server import ZemServer
|
|
3
4
|
from xfmr_zem.servers.ocr.engines import OCREngineFactory
|
|
4
5
|
from loguru import logger
|
|
6
|
+
from PIL import Image
|
|
7
|
+
import io
|
|
5
8
|
|
|
6
9
|
# Initialize ZemServer for OCR
|
|
7
10
|
mcp = ZemServer("ocr")
|
|
8
11
|
|
|
12
|
+
def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = None, min_text_chars: int = 50):
    """Extract text from every page of a PDF, running OCR on pages that look scanned.

    A page is treated as scanned when its embedded text layer, after stripping
    whitespace, is shorter than ``min_text_chars``. Such pages are rendered to a
    PNG at 2x zoom and passed to ``ocr_engine.process`` by file path.

    Args:
        file_path: Path to the PDF file.
        engine: Engine name; only used for log messages and the ``engine``
            field of OCR'd pages (``"<engine>_ocr"``).
        ocr_engine: Object exposing ``process(path)`` that returns a mapping
            with a ``"text"`` key.
        model_id: Accepted for signature compatibility; not used by this helper.
        min_text_chars: Minimum stripped text length for a page to count as
            digital (not scanned). Defaults to 50.

    Returns:
        A list with one dict per page, with keys ``text``, ``page`` (1-based),
        ``engine`` and ``metadata`` (``file`` and ``is_scanned``).
    """
    import tempfile

    import fitz  # PyMuPDF

    results = []
    doc = fitz.open(file_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text().strip()

            # Strategy: a missing or very short text layer means the page is an image.
            is_scanned = len(text) < min_text_chars

            if is_scanned:
                logger.info(f"Page {page_num} appears scanned. Running OCR with {engine}...")
                # Render page to image for OCR; 2x zoom for better recognition.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

                # Engines expect a path, so write the PNG to a unique temp file.
                # mkstemp avoids collisions between concurrent requests that a
                # fixed "/tmp/ocr_page_<n>.png" name would cause.
                fd, temp_path = tempfile.mkstemp(prefix="ocr_page_", suffix=".png")
                try:
                    with os.fdopen(fd, "wb") as tmp_file:
                        # The pixmap already serialises to PNG; no re-encode needed.
                        tmp_file.write(pix.tobytes("png"))
                    ocr_result = ocr_engine.process(temp_path)
                    final_text = ocr_result["text"]
                    source = f"{engine}_ocr"
                finally:
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            else:
                final_text = text
                source = "digital_pdf"

            results.append({
                "text": final_text,
                "page": page_num,
                "engine": source,
                "metadata": {"file": file_path, "is_scanned": is_scanned},
            })
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    return results
|
|
57
|
+
|
|
9
58
|
@mcp.tool()
async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> pd.DataFrame:
    """
    Extracts text from an image or PDF using the specified OCR engine.
    For PDFs, it will automatically handle scanned pages using the OCR engine.

    Args:
        file_path: Path to the image or PDF file.
        engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
        model_id: Optional model ID for the 'huggingface' engine.

    Returns:
        A list of record dicts (``DataFrame.to_dict(orient="records")``): one
        per page for PDFs (``text``, ``page``, ``engine``, ``metadata``), or a
        single record for images (``text``, ``engine``, ``metadata``).
        NOTE(review): the ``pd.DataFrame`` return annotation does not match the
        returned list; it is left untouched in case the tool framework reads it.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If engine creation or extraction fails; the original
            exception is attached as ``__cause__``.
    """
    logger.info(f"OCR Extraction: {file_path} using {engine}")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        # Get engine from factory
        ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)

        # Handle PDF vs Image (extension check is case-insensitive)
        if file_path.lower().endswith(".pdf"):
            logger.info(f"Processing PDF file: {file_path}")
            data = extract_pdf_pages(file_path, engine, ocr_engine, model_id)
            df = pd.DataFrame(data)
        else:
            # Process image
            result = ocr_engine.process(file_path)
            df = pd.DataFrame([{
                "text": result["text"],
                "engine": result["engine"],
                "metadata": result["metadata"]
            }])

        logger.info(f"Successfully extracted text from {file_path}")
        return df.to_dict(orient="records")

    except Exception as e:
        logger.error(f"OCR Error with {engine}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        # Chain the cause so callers can still inspect the underlying failure.
        raise RuntimeError(f"OCR failed: {str(e)}") from e
|
|
42
100
|
|
|
43
101
|
if __name__ == "__main__":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xfmr-zem
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
|
|
5
5
|
Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
|
|
6
6
|
Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
|
|
@@ -51,6 +51,7 @@ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
|
|
|
51
51
|
Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
|
|
52
52
|
Requires-Dist: pillow>=10.0.0; extra == 'ocr'
|
|
53
53
|
Requires-Dist: pyclipper; extra == 'ocr'
|
|
54
|
+
Requires-Dist: pymupdf>=1.23.0; extra == 'ocr'
|
|
54
55
|
Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
|
|
55
56
|
Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
|
|
56
57
|
Requires-Dist: shapely; extra == 'ocr'
|
|
@@ -18,7 +18,7 @@ xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B
|
|
|
18
18
|
xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
|
|
19
19
|
xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
|
|
20
20
|
xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
|
|
21
|
-
xfmr_zem/servers/ocr/server.py,sha256=
|
|
21
|
+
xfmr_zem/servers/ocr/server.py,sha256=lhT5rfpz2vPbdbM7RZAtsJKfZEqoth2ijEifMA_vWSA,3590
|
|
22
22
|
xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
|
|
23
23
|
xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
|
|
24
24
|
xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
|
|
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
|
|
|
51
51
|
xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
|
|
52
52
|
xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
|
|
53
53
|
xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
|
|
54
|
-
xfmr_zem-0.2.
|
|
55
|
-
xfmr_zem-0.2.
|
|
56
|
-
xfmr_zem-0.2.
|
|
57
|
-
xfmr_zem-0.2.
|
|
58
|
-
xfmr_zem-0.2.
|
|
54
|
+
xfmr_zem-0.2.6.dist-info/METADATA,sha256=E2bD-td6hx7ntkKGH3XIgwH8BTUADbTE5T-cFi21Qds,6379
|
|
55
|
+
xfmr_zem-0.2.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
56
|
+
xfmr_zem-0.2.6.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
|
|
57
|
+
xfmr_zem-0.2.6.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
|
|
58
|
+
xfmr_zem-0.2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|