xfmr-zem 0.2.6__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/CHANGELOG.md +5 -0
  2. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/PKG-INFO +1 -1
  3. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/pyproject.toml +1 -1
  4. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/parameters.yml +3 -0
  5. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/server.py +35 -8
  6. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/uv.lock +1 -1
  7. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.github/workflows/deploy.yml +0 -0
  8. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.github/workflows/pypi-publish.yml +0 -0
  9. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/.gitignore +0 -0
  10. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/AGENTS.md +0 -0
  11. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/LICENSE +0 -0
  12. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/README.md +0 -0
  13. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/big_data_output.parquet +0 -0
  14. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/big_data_sim.parquet +0 -0
  15. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_cleaned.parquet +0 -0
  16. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_data.parquet +0 -0
  17. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/dup_data_large.parquet +0 -0
  18. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/nemo_full_stack_result.parquet +0 -0
  19. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/nemo_real_result.parquet +0 -0
  20. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/ocr_test.png +0 -0
  21. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/output_result.jsonl +0 -0
  22. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/sample.jsonl +0 -0
  23. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/data/vietnamese_ocr.png +0 -0
  24. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/parameters.yml +0 -0
  25. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/__init__.py +0 -0
  26. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/cli.py +0 -0
  27. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/client.py +0 -0
  28. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/orchestrators/parallel_local.py +0 -0
  29. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/schemas.py +0 -0
  30. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/server.py +0 -0
  31. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/parameters.yml +0 -0
  32. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/data_juicer/server.py +0 -0
  33. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/parameters.yml +0 -0
  34. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/instruction_gen/server.py +0 -0
  35. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/parameters.yml +0 -0
  36. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/io/server.py +0 -0
  37. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/parameters.yml +0 -0
  38. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/llm/server.py +0 -0
  39. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/parameters.yml +0 -0
  40. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/nemo_curator/server.py +0 -0
  41. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +0 -0
  42. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +0 -0
  43. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +0 -0
  44. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +0 -0
  45. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +0 -0
  46. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +0 -0
  47. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +0 -0
  48. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +0 -0
  49. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +0 -0
  50. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +0 -0
  51. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +0 -0
  52. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +0 -0
  53. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +0 -0
  54. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +0 -0
  55. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +0 -0
  56. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  57. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +0 -0
  58. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +0 -0
  59. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  60. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +0 -0
  61. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +0 -0
  62. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +0 -0
  63. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +0 -0
  64. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +0 -0
  65. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +0 -0
  66. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +0 -0
  67. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/engines.py +0 -0
  68. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/ocr/install_models.py +0 -0
  69. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/parameters.yml +0 -0
  70. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/profiler/server.py +0 -0
  71. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/parameters.yml +0 -0
  72. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/sinks/server.py +0 -0
  73. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/parameters.yml +0 -0
  74. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/servers/unstructured/server.py +0 -0
  75. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/src/xfmr_zem/zenml_wrapper.py +0 -0
  76. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/caching_test.yaml +0 -0
  77. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/hf_ocr_test.yaml +0 -0
  78. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/llm_test.yaml +0 -0
  79. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/multimodal_test.yaml +0 -0
  80. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/ocr_test.yaml +0 -0
  81. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/parallel_test.yaml +0 -0
  82. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/parquet_test.yaml +0 -0
  83. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/phase4_test.yaml +0 -0
  84. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/profiler_test.yaml +0 -0
  85. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/standard_data_pipeline.yaml +0 -0
  86. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/tests/manual/viet_ocr_test.yaml +0 -0
  87. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/docs/docs.css +0 -0
  88. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/docs/index.html +0 -0
  89. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/index.html +0 -0
  90. {xfmr_zem-0.2.6 → xfmr_zem-0.2.7}/website/style.css +0 -0
@@ -2,6 +2,11 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [0.2.7] - 2026-02-03
6
+
7
+ ### Added
8
+ - **Configurable OCR Parameters**: Added `scanned_threshold`, `zoom`, and `temp_dir` parameters to the OCR server for finer control over PDF processing.
9
+
5
10
  ## [0.2.6] - 2026-02-03
6
11
 
7
12
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "xfmr-zem"
3
- version = "0.2.6"
3
+ version = "0.2.7"
4
4
  description = "Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10,<3.13"
@@ -2,3 +2,6 @@
2
2
  extract_text:
3
3
  engine: "tesseract"
4
4
  model_id: null
5
+ scanned_threshold: 50
6
+ zoom: 2.0
7
+ temp_dir: "/tmp"
@@ -9,29 +9,39 @@ import io
9
9
  # Initialize ZemServer for OCR
10
10
  mcp = ZemServer("ocr")
11
11
 
12
- def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = None):
12
+ def extract_pdf_pages(
13
+ file_path: str,
14
+ engine: str,
15
+ ocr_engine,
16
+ scanned_threshold: int = 50,
17
+ zoom: float = 2.0,
18
+ temp_dir: str = "/tmp"
19
+ ):
13
20
  """Helper to process PDF pages with optional OCR for scanned content."""
14
21
  import fitz # PyMuPDF
15
22
 
16
23
  results = []
17
24
  doc = fitz.open(file_path)
18
25
 
26
+ # Ensure temp_dir exists
27
+ os.makedirs(temp_dir, exist_ok=True)
28
+
19
29
  for page_num in range(len(doc)):
20
30
  page = doc[page_num]
21
31
  text = page.get_text().strip()
22
32
 
23
33
  # Determine if we need to OCR (Strategy: text is too short or empty)
24
- is_scanned = len(text) < 50
34
+ is_scanned = len(text) < scanned_threshold
25
35
 
26
36
  if is_scanned:
27
- logger.info(f"Page {page_num + 1} appears scanned. Running OCR with {engine}...")
37
+ logger.info(f"Page {page_num + 1} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
28
38
  # Render page to image for OCR
29
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better OCR
39
+ pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
30
40
  img_data = pix.tobytes("png")
31
41
  img = Image.open(io.BytesIO(img_data))
32
42
 
33
43
  # Temporary save for engine compatibility (engines expect path)
34
- temp_path = f"/tmp/ocr_page_{page_num}.png"
44
+ temp_path = os.path.join(temp_dir, f"ocr_page_{os.getpid()}_{page_num}.png")
35
45
  img.save(temp_path)
36
46
 
37
47
  try:
@@ -56,7 +66,14 @@ def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = N
56
66
  return results
57
67
 
58
68
  @mcp.tool()
59
- async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> pd.DataFrame:
69
+ async def extract_text(
70
+ file_path: str,
71
+ engine: str = "tesseract",
72
+ model_id: str = None,
73
+ scanned_threshold: int = 50,
74
+ zoom: float = 2.0,
75
+ temp_dir: str = "/tmp"
76
+ ) -> pd.DataFrame:
60
77
  """
61
78
  Extracts text from an image or PDF using the specified OCR engine.
62
79
  For PDFs, it will automatically handle scanned pages using the OCR engine.
@@ -65,8 +82,11 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
65
82
  file_path: Path to the image or PDF file.
66
83
  engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
67
84
  model_id: Optional model ID for the 'huggingface' engine.
85
+ scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
86
+ zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
87
+ temp_dir: Directory for temporary page images. Defaults to "/tmp".
68
88
  """
69
- logger.info(f"OCR Extraction: {file_path} using {engine}")
89
+ logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")
70
90
 
71
91
  if not os.path.exists(file_path):
72
92
  raise FileNotFoundError(f"File not found: {file_path}")
@@ -78,7 +98,14 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
78
98
  # Handle PDF vs Image
79
99
  if file_path.lower().endswith(".pdf"):
80
100
  logger.info(f"Processing PDF file: {file_path}")
81
- data = extract_pdf_pages(file_path, engine, ocr_engine, model_id)
101
+ data = extract_pdf_pages(
102
+ file_path,
103
+ engine,
104
+ ocr_engine,
105
+ scanned_threshold=scanned_threshold,
106
+ zoom=zoom,
107
+ temp_dir=temp_dir
108
+ )
82
109
  df = pd.DataFrame(data)
83
110
  else:
84
111
  # Process image
@@ -5534,7 +5534,7 @@ wheels = [
5534
5534
 
5535
5535
  [[package]]
5536
5536
  name = "xfmr-zem"
5537
- version = "0.2.6"
5537
+ version = "0.2.7"
5538
5538
  source = { editable = "." }
5539
5539
  dependencies = [
5540
5540
  { name = "click" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes