xfmr-zem 0.2.6__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/CHANGELOG.md +5 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/PKG-INFO +1 -1
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/pyproject.toml +1 -1
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/parameters.yml +3 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/server.py +35 -8
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/uv.lock +1 -1
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.github/workflows/deploy.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.github/workflows/pypi-publish.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.gitignore +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/AGENTS.md +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/LICENSE +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/README.md +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/big_data_output.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/big_data_sim.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_cleaned.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_data.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_data_large.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/nemo_full_stack_result.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/nemo_real_result.parquet +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/ocr_test.png +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/output_result.jsonl +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/sample.jsonl +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/vietnamese_ocr.png +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/__init__.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/cli.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/client.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/orchestrators/parallel_local.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/schemas.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/engines.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/install_models.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/parameters.yml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/server.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/zenml_wrapper.py +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/caching_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/hf_ocr_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/llm_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/multimodal_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/ocr_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/parallel_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/parquet_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/phase4_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/profiler_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/standard_data_pipeline.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/viet_ocr_test.yaml +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/docs/docs.css +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/docs/index.html +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/index.html +0 -0
- {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/style.css +0 -0
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.2.7] - 2026-02-03
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **Configurable OCR Parameters**: Added `scanned_threshold`, `zoom`, and `temp_dir` parameters to the OCR server for finer control over PDF processing.
|
|
9
|
+
|
|
5
10
|
## [0.2.6] - 2026-02-03
|
|
6
11
|
|
|
7
12
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xfmr-zem
|
|
3
|
-
Version: 0.2.6
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
|
|
5
5
|
Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
|
|
6
6
|
Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
|
|
@@ -9,29 +9,39 @@ import io
|
|
|
9
9
|
# Initialize ZemServer for OCR
|
|
10
10
|
mcp = ZemServer("ocr")
|
|
11
11
|
|
|
12
|
-
def extract_pdf_pages(
|
|
12
|
+
def extract_pdf_pages(
|
|
13
|
+
file_path: str,
|
|
14
|
+
engine: str,
|
|
15
|
+
ocr_engine,
|
|
16
|
+
scanned_threshold: int = 50,
|
|
17
|
+
zoom: float = 2.0,
|
|
18
|
+
temp_dir: str = "/tmp"
|
|
19
|
+
):
|
|
13
20
|
"""Helper to process PDF pages with optional OCR for scanned content."""
|
|
14
21
|
import fitz # PyMuPDF
|
|
15
22
|
|
|
16
23
|
results = []
|
|
17
24
|
doc = fitz.open(file_path)
|
|
18
25
|
|
|
26
|
+
# Ensure temp_dir exists
|
|
27
|
+
os.makedirs(temp_dir, exist_ok=True)
|
|
28
|
+
|
|
19
29
|
for page_num in range(len(doc)):
|
|
20
30
|
page = doc[page_num]
|
|
21
31
|
text = page.get_text().strip()
|
|
22
32
|
|
|
23
33
|
# Determine if we need to OCR (Strategy: text is too short or empty)
|
|
24
|
-
is_scanned = len(text) <
|
|
34
|
+
is_scanned = len(text) < scanned_threshold
|
|
25
35
|
|
|
26
36
|
if is_scanned:
|
|
27
|
-
logger.info(f"Page {page_num + 1} appears scanned. Running OCR with {engine}...")
|
|
37
|
+
logger.info(f"Page {page_num + 1} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
|
|
28
38
|
# Render page to image for OCR
|
|
29
|
-
pix = page.get_pixmap(matrix=fitz.Matrix(
|
|
39
|
+
pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
|
|
30
40
|
img_data = pix.tobytes("png")
|
|
31
41
|
img = Image.open(io.BytesIO(img_data))
|
|
32
42
|
|
|
33
43
|
# Temporary save for engine compatibility (engines expect path)
|
|
34
|
-
temp_path = f"
|
|
44
|
+
temp_path = os.path.join(temp_dir, f"ocr_page_{os.getpid()}_{page_num}.png")
|
|
35
45
|
img.save(temp_path)
|
|
36
46
|
|
|
37
47
|
try:
|
|
@@ -56,7 +66,14 @@ def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = N
|
|
|
56
66
|
return results
|
|
57
67
|
|
|
58
68
|
@mcp.tool()
|
|
59
|
-
async def extract_text(
|
|
69
|
+
async def extract_text(
|
|
70
|
+
file_path: str,
|
|
71
|
+
engine: str = "tesseract",
|
|
72
|
+
model_id: str = None,
|
|
73
|
+
scanned_threshold: int = 50,
|
|
74
|
+
zoom: float = 2.0,
|
|
75
|
+
temp_dir: str = "/tmp"
|
|
76
|
+
) -> pd.DataFrame:
|
|
60
77
|
"""
|
|
61
78
|
Extracts text from an image or PDF using the specified OCR engine.
|
|
62
79
|
For PDFs, it will automatically handle scanned pages using the OCR engine.
|
|
@@ -65,8 +82,11 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
|
|
|
65
82
|
file_path: Path to the image or PDF file.
|
|
66
83
|
engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
|
|
67
84
|
model_id: Optional model ID for the 'huggingface' engine.
|
|
85
|
+
scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
|
|
86
|
+
zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
|
|
87
|
+
temp_dir: Directory for temporary page images. Defaults to "/tmp".
|
|
68
88
|
"""
|
|
69
|
-
logger.info(f"OCR Extraction: {file_path} using {engine}")
|
|
89
|
+
logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")
|
|
70
90
|
|
|
71
91
|
if not os.path.exists(file_path):
|
|
72
92
|
raise FileNotFoundError(f"File not found: {file_path}")
|
|
@@ -78,7 +98,14 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
|
|
|
78
98
|
# Handle PDF vs Image
|
|
79
99
|
if file_path.lower().endswith(".pdf"):
|
|
80
100
|
logger.info(f"Processing PDF file: {file_path}")
|
|
81
|
-
data = extract_pdf_pages(
|
|
101
|
+
data = extract_pdf_pages(
|
|
102
|
+
file_path,
|
|
103
|
+
engine,
|
|
104
|
+
ocr_engine,
|
|
105
|
+
scanned_threshold=scanned_threshold,
|
|
106
|
+
zoom=zoom,
|
|
107
|
+
temp_dir=temp_dir
|
|
108
|
+
)
|
|
82
109
|
df = pd.DataFrame(data)
|
|
83
110
|
else:
|
|
84
111
|
# Process image
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml
RENAMED
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py
RENAMED
|
File without changes
|
{xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|