xfmr-zem 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/CHANGELOG.md +10 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/PKG-INFO +2 -1
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/pyproject.toml +2 -1
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/parameters.yml +3 -0
- xfmr_zem-0.2.7/src/xfmr_zem/servers/ocr/server.py +129 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/uv.lock +18 -1
- xfmr_zem-0.2.5/src/xfmr_zem/servers/ocr/server.py +0 -44
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/.github/workflows/deploy.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/.github/workflows/pypi-publish.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/.gitignore +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/AGENTS.md +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/LICENSE +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/README.md +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/big_data_output.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/big_data_sim.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/dup_cleaned.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/dup_data.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/dup_data_large.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/nemo_full_stack_result.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/nemo_real_result.parquet +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/ocr_test.png +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/output_result.jsonl +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/sample.jsonl +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/data/vietnamese_ocr.png +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/__init__.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/cli.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/client.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/orchestrators/parallel_local.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/schemas.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/engines.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/install_models.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/parameters.yml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/server.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/zenml_wrapper.py +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/caching_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/hf_ocr_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/llm_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/multimodal_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/ocr_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/parallel_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/parquet_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/phase4_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/profiler_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/standard_data_pipeline.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/tests/manual/viet_ocr_test.yaml +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/website/docs/docs.css +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/website/docs/index.html +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/website/index.html +0 -0
- {xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/website/style.css +0 -0
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.2.7] - 2026-02-03
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **Configurable OCR Parameters**: Added `scanned_threshold`, `zoom`, and `temp_dir` parameters to the OCR server for finer control over PDF processing.
|
|
9
|
+
|
|
10
|
+
## [0.2.6] - 2026-02-03
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- **PDF Extraction & Scanning**: Added support for processing multi-page PDFs in the OCR server. It automatically extracts digital text and falls back to OCR for scanned pages using `PyMuPDF`.
|
|
14
|
+
|
|
5
15
|
## [0.2.5] - 2026-02-03
|
|
6
16
|
|
|
7
17
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xfmr-zem
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
|
|
5
5
|
Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
|
|
6
6
|
Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
|
|
@@ -51,6 +51,7 @@ Requires-Dist: paddlepaddle>=2.6.0; extra == 'ocr'
|
|
|
51
51
|
Requires-Dist: pdfplumber>=0.11.0; extra == 'ocr'
|
|
52
52
|
Requires-Dist: pillow>=10.0.0; extra == 'ocr'
|
|
53
53
|
Requires-Dist: pyclipper; extra == 'ocr'
|
|
54
|
+
Requires-Dist: pymupdf>=1.23.0; extra == 'ocr'
|
|
54
55
|
Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
|
|
55
56
|
Requires-Dist: ruamel-yaml>=0.17.0; extra == 'ocr'
|
|
56
57
|
Requires-Dist: shapely; extra == 'ocr'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "xfmr-zem"
|
|
3
|
-
version = "0.2.5"
|
|
3
|
+
version = "0.2.7"
|
|
4
4
|
description = "Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10,<3.13"
|
|
@@ -64,6 +64,7 @@ ocr = [
|
|
|
64
64
|
"pyclipper",
|
|
65
65
|
"einops",
|
|
66
66
|
"pdfplumber>=0.11.0",
|
|
67
|
+
"pymupdf>=1.23.0",
|
|
67
68
|
"ruamel.yaml>=0.17.0",
|
|
68
69
|
"cachetools>=5.0.0",
|
|
69
70
|
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from xfmr_zem.server import ZemServer
|
|
4
|
+
from xfmr_zem.servers.ocr.engines import OCREngineFactory
|
|
5
|
+
from loguru import logger
|
|
6
|
+
from PIL import Image
|
|
7
|
+
import io
|
|
8
|
+
|
|
9
|
+
# Initialize ZemServer for OCR
|
|
10
|
+
mcp = ZemServer("ocr")
|
|
11
|
+
|
|
12
|
+
def extract_pdf_pages(
    file_path: str,
    engine: str,
    ocr_engine,
    scanned_threshold: int = 50,
    zoom: float = 2.0,
    temp_dir: str = "/tmp"
):
    """Extract text from every page of a PDF, running OCR on pages that look scanned.

    A page is treated as scanned when its embedded text, after stripping
    whitespace, is shorter than ``scanned_threshold`` characters. Such pages
    are rendered to a PNG file and handed to ``ocr_engine``; all other pages
    keep their embedded text.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        engine: Engine name; only used for log messages and to build the
            ``"<engine>_ocr"`` label stored in each OCR'd row.
        ocr_engine: Object exposing ``process(path)`` that returns a mapping
            with a ``"text"`` key.
        scanned_threshold: Minimum number of embedded characters a page needs
            in order to skip OCR.
        zoom: Scale factor applied on both axes when rendering a page image.
        temp_dir: Directory for the temporary page images; created if missing.

    Returns:
        A list with one dict per page, in page order, with the keys ``text``,
        ``page`` (1-based), ``engine`` (``"digital_pdf"`` or
        ``"<engine>_ocr"``) and ``metadata`` (``file`` and ``is_scanned``).

    Errors raised by PyMuPDF or by the OCR engine propagate to the caller; the
    document and any temporary image are released first.
    """
    import tempfile

    import fitz  # PyMuPDF

    results = []
    doc = fitz.open(file_path)

    # try/finally so the document handle is released even when rendering or
    # OCR fails part-way through the file.
    try:
        # Ensure temp_dir exists
        os.makedirs(temp_dir, exist_ok=True)

        for page_index, page in enumerate(doc):
            page_number = page_index + 1
            text = page.get_text().strip()

            # Determine if we need to OCR (Strategy: text is too short or empty)
            is_scanned = len(text) < scanned_threshold

            if is_scanned:
                logger.info(f"Page {page_number} appears scanned (text length: {len(text)}). Running OCR with {engine}...")

                # Render page to image for OCR. The pixmap is already
                # PNG-encoded here, so the bytes are written as-is instead of
                # being decoded and re-encoded.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                png_bytes = pix.tobytes("png")

                # Engines expect a path. mkstemp yields a unique file, so
                # concurrent calls in the same process cannot overwrite or
                # delete each other's page image.
                fd, temp_path = tempfile.mkstemp(
                    prefix=f"ocr_page_{os.getpid()}_{page_index}_",
                    suffix=".png",
                    dir=temp_dir,
                )
                try:
                    with os.fdopen(fd, "wb") as handle:
                        handle.write(png_bytes)

                    ocr_result = ocr_engine.process(temp_path)
                    final_text = ocr_result["text"]
                    source = f"{engine}_ocr"
                finally:
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            else:
                final_text = text
                source = "digital_pdf"

            results.append({
                "text": final_text,
                "page": page_number,
                "engine": source,
                "metadata": {"file": file_path, "is_scanned": is_scanned}
            })
    finally:
        doc.close()

    return results
|
|
67
|
+
|
|
68
|
+
@mcp.tool()
async def extract_text(
    file_path: str,
    engine: str = "tesseract",
    model_id: str = None,
    scanned_threshold: int = 50,
    zoom: float = 2.0,
    temp_dir: str = "/tmp"
) -> pd.DataFrame:
    """
    Extracts text from an image or PDF using the specified OCR engine.
    For PDFs, it will automatically handle scanned pages using the OCR engine.

    Args:
        file_path: Path to the image or PDF file. A path ending in ".pdf"
            (case-insensitive) is processed page by page; anything else is
            passed to the engine as a single image.
        engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
        model_id: Optional model ID for the 'huggingface' engine.
        scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
        zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
        temp_dir: Directory for temporary page images. Defaults to "/tmp".

    Returns:
        A list of row dicts (``DataFrame.to_dict(orient="records")``): one row
        per page for PDFs, a single row for images.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If engine creation or extraction fails; the original
            exception is attached as ``__cause__``.
    """
    # NOTE(review): the return annotation says pd.DataFrame but a list of
    # dicts is returned. Left unchanged because the tool decorator may read
    # the annotation - confirm before correcting it.
    logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        # Get engine from factory
        ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)

        # Handle PDF vs Image
        if file_path.lower().endswith(".pdf"):
            logger.info(f"Processing PDF file: {file_path}")
            data = extract_pdf_pages(
                file_path,
                engine,
                ocr_engine,
                scanned_threshold=scanned_threshold,
                zoom=zoom,
                temp_dir=temp_dir
            )
            df = pd.DataFrame(data)
        else:
            # Process image
            result = ocr_engine.process(file_path)
            df = pd.DataFrame([{
                "text": result["text"],
                "engine": result["engine"],
                "metadata": result["metadata"]
            }])

        logger.info(f"Successfully extracted text from {file_path}")
        return df.to_dict(orient="records")

    except Exception as e:
        logger.error(f"OCR Error with {engine}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        # Chain the cause so callers can still inspect the underlying error.
        raise RuntimeError(f"OCR failed: {str(e)}") from e
|
|
127
|
+
|
|
128
|
+
if __name__ == "__main__":
|
|
129
|
+
mcp.run()
|
|
@@ -3683,6 +3683,21 @@ crypto = [
|
|
|
3683
3683
|
{ name = "cryptography" },
|
|
3684
3684
|
]
|
|
3685
3685
|
|
|
3686
|
+
[[package]]
|
|
3687
|
+
name = "pymupdf"
|
|
3688
|
+
version = "1.26.7"
|
|
3689
|
+
source = { registry = "https://pypi.org/simple" }
|
|
3690
|
+
sdist = { url = "https://files.pythonhosted.org/packages/48/d6/09b28f027b510838559f7748807192149c419b30cb90e6d5f0cf916dc9dc/pymupdf-1.26.7.tar.gz", hash = "sha256:71add8bdc8eb1aaa207c69a13400693f06ad9b927bea976f5d5ab9df0bb489c3", size = 84327033, upload-time = "2025-12-11T21:48:50.694Z" }
|
|
3691
|
+
wheels = [
|
|
3692
|
+
{ url = "https://files.pythonhosted.org/packages/94/35/cd74cea1787b2247702ef8522186bdef32e9cb30a099e6bb864627ef6045/pymupdf-1.26.7-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:07085718dfdae5ab83b05eb5eb397f863bcc538fe05135318a01ea353e7a1353", size = 23179369, upload-time = "2025-12-11T21:47:21.587Z" },
|
|
3693
|
+
{ url = "https://files.pythonhosted.org/packages/72/74/448b6172927c829c6a3fba80078d7b0a016ebbe2c9ee528821f5ea21677a/pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:31aa9c8377ea1eea02934b92f4dcf79fb2abba0bf41f8a46d64c3e31546a3c02", size = 22470101, upload-time = "2025-12-11T21:47:37.105Z" },
|
|
3694
|
+
{ url = "https://files.pythonhosted.org/packages/65/e7/47af26f3ac76be7ac3dd4d6cc7ee105948a8355d774e5ca39857bf91c11c/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e419b609996434a14a80fa060adec72c434a1cca6a511ec54db9841bc5d51b3c", size = 23502486, upload-time = "2025-12-12T09:51:25.824Z" },
|
|
3695
|
+
{ url = "https://files.pythonhosted.org/packages/2a/6b/3de1714d734ff949be1e90a22375d0598d3540b22ae73eb85c2d7d1f36a9/pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:69dfc78f206a96e5b3ac22741263ebab945fdf51f0dbe7c5757c3511b23d9d72", size = 24115727, upload-time = "2025-12-11T21:47:51.274Z" },
|
|
3696
|
+
{ url = "https://files.pythonhosted.org/packages/62/9b/f86224847949577a523be2207315ae0fd3155b5d909cd66c274d095349a3/pymupdf-1.26.7-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1d5106f46e1ca0d64d46bd51892372a4f82076bdc14a9678d33d630702abca36", size = 24324386, upload-time = "2025-12-12T14:58:45.483Z" },
|
|
3697
|
+
{ url = "https://files.pythonhosted.org/packages/85/8e/a117d39092ca645fde8b903f4a941d9aa75b370a67b4f1f435f56393dc5a/pymupdf-1.26.7-cp310-abi3-win32.whl", hash = "sha256:7c9645b6f5452629c747690190350213d3e5bbdb6b2eca227d82702b327f6eee", size = 17203888, upload-time = "2025-12-12T13:59:57.613Z" },
|
|
3698
|
+
{ url = "https://files.pythonhosted.org/packages/dd/c3/d0047678146c294469c33bae167c8ace337deafb736b0bf97b9bc481aa65/pymupdf-1.26.7-cp310-abi3-win_amd64.whl", hash = "sha256:425b1befe40d41b72eb0fe211711c7ae334db5eb60307e9dd09066ed060cceba", size = 18405952, upload-time = "2025-12-11T21:48:02.947Z" },
|
|
3699
|
+
]
|
|
3700
|
+
|
|
3686
3701
|
[[package]]
|
|
3687
3702
|
name = "pymysql"
|
|
3688
3703
|
version = "1.1.2"
|
|
@@ -5519,7 +5534,7 @@ wheels = [
|
|
|
5519
5534
|
|
|
5520
5535
|
[[package]]
|
|
5521
5536
|
name = "xfmr-zem"
|
|
5522
|
-
version = "0.2.5"
|
|
5537
|
+
version = "0.2.7"
|
|
5523
5538
|
source = { editable = "." }
|
|
5524
5539
|
dependencies = [
|
|
5525
5540
|
{ name = "click" },
|
|
@@ -5569,6 +5584,7 @@ ocr = [
|
|
|
5569
5584
|
{ name = "pdfplumber" },
|
|
5570
5585
|
{ name = "pillow" },
|
|
5571
5586
|
{ name = "pyclipper" },
|
|
5587
|
+
{ name = "pymupdf" },
|
|
5572
5588
|
{ name = "pytesseract" },
|
|
5573
5589
|
{ name = "ruamel-yaml" },
|
|
5574
5590
|
{ name = "shapely" },
|
|
@@ -5608,6 +5624,7 @@ requires-dist = [
|
|
|
5608
5624
|
{ name = "pyarrow", specifier = ">=15.0.0" },
|
|
5609
5625
|
{ name = "pyclipper", marker = "extra == 'ocr'" },
|
|
5610
5626
|
{ name = "pydantic", specifier = ">=2.0.0" },
|
|
5627
|
+
{ name = "pymupdf", marker = "extra == 'ocr'", specifier = ">=1.23.0" },
|
|
5611
5628
|
{ name = "pytesseract", marker = "extra == 'ocr'", specifier = ">=0.3.10" },
|
|
5612
5629
|
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
|
|
5613
5630
|
{ name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" },
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from xfmr_zem.server import ZemServer
|
|
3
|
-
from xfmr_zem.servers.ocr.engines import OCREngineFactory
|
|
4
|
-
from loguru import logger
|
|
5
|
-
|
|
6
|
-
# Initialize ZemServer for OCR
|
|
7
|
-
mcp = ZemServer("ocr")
|
|
8
|
-
|
|
9
|
-
@mcp.tool()
|
|
10
|
-
async def extract_text(file_path: str, engine: str = None, model_id: str = None) -> pd.DataFrame:
|
|
11
|
-
"""
|
|
12
|
-
Extracts text from an image using the specified OCR engine.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
file_path: Path to the image file.
|
|
16
|
-
engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
|
|
17
|
-
model_id: Optional model ID for the 'huggingface' engine (e.g., "Qwen/Qwen2-VL-2B-Instruct").
|
|
18
|
-
"""
|
|
19
|
-
logger.info(f"OCR Extraction: {file_path} using {engine} (model: {model_id})")
|
|
20
|
-
|
|
21
|
-
try:
|
|
22
|
-
# Get engine from factory (SOLID Strategy Pattern)
|
|
23
|
-
ocr_engine = OCREngineFactory.get_engine(engine, model_id=model_id)
|
|
24
|
-
|
|
25
|
-
# Process image
|
|
26
|
-
result = ocr_engine.process(file_path)
|
|
27
|
-
|
|
28
|
-
# Structure as a single-row DataFrame for Zem compatibility
|
|
29
|
-
# We wrap in a list to ensure pandas creates a row
|
|
30
|
-
df = pd.DataFrame([{
|
|
31
|
-
"text": result["text"],
|
|
32
|
-
"engine": result["engine"],
|
|
33
|
-
"metadata": result["metadata"]
|
|
34
|
-
}])
|
|
35
|
-
|
|
36
|
-
logger.info(f"Successfully extracted text using {engine}")
|
|
37
|
-
return df.to_dict(orient="records")
|
|
38
|
-
|
|
39
|
-
except Exception as e:
|
|
40
|
-
logger.error(f"OCR Error with {engine}: {e}")
|
|
41
|
-
raise RuntimeError(f"OCR failed: {str(e)}")
|
|
42
|
-
|
|
43
|
-
if __name__ == "__main__":
|
|
44
|
-
mcp.run()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml
RENAMED
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.5 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|