xfmr-zem 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xfmr_zem/client.py CHANGED
@@ -51,22 +51,48 @@ class PipelineClient:
51
51
  items.append((new_key, v))
52
52
  return dict(items)
53
53
 
54
+ def _unflatten_params(self, flat_dict: Dict[str, Any]) -> Dict[str, Any]:
55
+ """Expand dot-notation keys into nested dictionaries."""
56
+ nested = {}
57
+ for key, value in flat_dict.items():
58
+ if "." in key:
59
+ parts = key.split(".")
60
+ d = nested
61
+ for part in parts[:-1]:
62
+ if part not in d or not isinstance(d[part], dict):
63
+ d[part] = {}
64
+ d = d[part]
65
+ d[parts[-1]] = value
66
+ else:
67
+ if isinstance(value, dict) and key in nested and isinstance(nested[key], dict):
68
+ nested[key].update(value)
69
+ else:
70
+ nested[key] = value
71
+ return nested
72
+
54
73
  def _load_config_dict(self, path: Path) -> Dict[str, Any]:
55
74
  """Load YAML config and perform substitution."""
56
75
  with open(path, "r") as f:
57
76
  raw_content = f.read()
58
77
 
59
- self.params = self._load_params(None)
78
+ # 1. Load parameters from file
79
+ base_params = self._load_params(None)
80
+
81
+ # 2. Add custom parameters file if provided
82
+ if self.params_path:
83
+ custom_params = self._load_params(self.params_path)
84
+ base_params.update(custom_params)
85
+
86
+ # 3. Load internal parameters from the config file itself
60
87
  preliminary_dict = yaml.safe_load(raw_content) or {}
61
88
  internal_params = preliminary_dict.get("parameters", {})
62
89
  if internal_params:
63
- self.params.update(internal_params)
64
-
65
- if self.params_path:
66
- custom_params = self._load_params(self.params_path)
67
- self.params.update(custom_params)
90
+ base_params.update(internal_params)
68
91
 
69
- # Flatten params for template substitution
92
+ # Store unflattened parameters for hierarchical lookup
93
+ self.params = self._unflatten_params(base_params)
94
+
95
+ # 4. Flatten all params for template substitution ({{ key }})
70
96
  flat_params = self._flatten_params(self.params)
71
97
 
72
98
  content = raw_content
@@ -105,11 +131,12 @@ class PipelineClient:
105
131
  env["PYTHONPATH"] = f"{src_path}:{current_pythonpath}" if current_pythonpath else src_path
106
132
 
107
133
  server_specific_params = {}
108
- prefix = f"{name}."
109
134
  for key, value in self.params.items():
110
- if key.startswith(prefix):
111
- server_specific_params[key[len(prefix):]] = value
112
- else:
135
+ if key == name and isinstance(value, dict):
136
+ # Direct match: ocr -> { ... }
137
+ server_specific_params.update(value)
138
+ elif not isinstance(value, dict):
139
+ # Global scalars
113
140
  server_specific_params[key] = value
114
141
 
115
142
  env["ZEM_PARAMETERS"] = yaml.dump(server_specific_params)
@@ -2,3 +2,6 @@
2
2
  extract_text:
3
3
  engine: "tesseract"
4
4
  model_id: null
5
+ scanned_threshold: 50
6
+ zoom: 2.0
7
+ temp_dir: "/tmp"
@@ -9,30 +9,41 @@ import io
9
9
  # Initialize ZemServer for OCR
10
10
  mcp = ZemServer("ocr")
11
11
 
12
- def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = None):
12
+ def extract_pdf_pages(
13
+ file_path: str,
14
+ engine: str,
15
+ ocr_engine,
16
+ scanned_threshold: int = 50,
17
+ zoom: float = 2.0,
18
+ temp_dir: str = "/tmp"
19
+ ):
13
20
  """Helper to process PDF pages with optional OCR for scanned content."""
14
21
  import fitz # PyMuPDF
15
22
 
16
23
  results = []
17
24
  doc = fitz.open(file_path)
18
25
 
26
+ # Ensure temp_dir exists
27
+ os.makedirs(temp_dir, exist_ok=True)
28
+
19
29
  for page_num in range(len(doc)):
20
30
  page = doc[page_num]
21
31
  text = page.get_text().strip()
22
32
 
23
33
  # Determine if we need to OCR (Strategy: text is too short or empty)
24
- is_scanned = len(text) < 50
34
+ is_scanned = len(text) < scanned_threshold
25
35
 
26
36
  if is_scanned:
27
- logger.info(f"Page {page_num + 1} appears scanned. Running OCR with {engine}...")
37
+ logger.info(f"Page {page_num + 1} appears scanned (text length: {len(text)}). Running OCR with {engine}...")
28
38
  # Render page to image for OCR
29
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for better OCR
39
+ pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
30
40
  img_data = pix.tobytes("png")
31
41
  img = Image.open(io.BytesIO(img_data))
32
42
 
33
43
  # Temporary save for engine compatibility (engines expect path)
34
- temp_path = f"/tmp/ocr_page_{page_num}.png"
44
+ temp_path = os.path.join(temp_dir, f"ocr_page_{os.getpid()}_{page_num}.png")
35
45
  img.save(temp_path)
46
+ logger.debug(f"Saved temporary page image to: {temp_path}")
36
47
 
37
48
  try:
38
49
  ocr_result = ocr_engine.process(temp_path)
@@ -56,7 +67,14 @@ def extract_pdf_pages(file_path: str, engine: str, ocr_engine, model_id: str = N
56
67
  return results
57
68
 
58
69
  @mcp.tool()
59
- async def extract_text(file_path: str, engine: str = "tesseract", model_id: str = None) -> pd.DataFrame:
70
+ async def extract_text(
71
+ file_path: str,
72
+ engine: str = "tesseract",
73
+ model_id: str = None,
74
+ scanned_threshold: int = 50,
75
+ zoom: float = 2.0,
76
+ temp_dir: str = "/tmp"
77
+ ) -> pd.DataFrame:
60
78
  """
61
79
  Extracts text from an image or PDF using the specified OCR engine.
62
80
  For PDFs, it will automatically handle scanned pages using the OCR engine.
@@ -65,8 +83,11 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
65
83
  file_path: Path to the image or PDF file.
66
84
  engine: The OCR engine to use ("tesseract", "paddle", "huggingface", "viet"). Defaults to "tesseract".
67
85
  model_id: Optional model ID for the 'huggingface' engine.
86
+ scanned_threshold: Min characters required to skip OCR on PDF page. Defaults to 50.
87
+ zoom: Rendering zoom factor for scanned PDF pages. Defaults to 2.0.
88
+ temp_dir: Directory for temporary page images. Defaults to "/tmp".
68
89
  """
69
- logger.info(f"OCR Extraction: {file_path} using {engine}")
90
+ logger.info(f"OCR Extraction: {file_path} using {engine} (scanned_threshold={scanned_threshold}, zoom={zoom})")
70
91
 
71
92
  if not os.path.exists(file_path):
72
93
  raise FileNotFoundError(f"File not found: {file_path}")
@@ -78,7 +99,14 @@ async def extract_text(file_path: str, engine: str = "tesseract", model_id: str
78
99
  # Handle PDF vs Image
79
100
  if file_path.lower().endswith(".pdf"):
80
101
  logger.info(f"Processing PDF file: {file_path}")
81
- data = extract_pdf_pages(file_path, engine, ocr_engine, model_id)
102
+ data = extract_pdf_pages(
103
+ file_path,
104
+ engine,
105
+ ocr_engine,
106
+ scanned_threshold=scanned_threshold,
107
+ zoom=zoom,
108
+ temp_dir=temp_dir
109
+ )
82
110
  df = pd.DataFrame(data)
83
111
  else:
84
112
  # Process image
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xfmr-zem
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
5
5
  Project-URL: Homepage, https://github.com/OAI-Labs/xfmr-zem
6
6
  Project-URL: Repository, https://github.com/OAI-Labs/xfmr-zem
@@ -1,6 +1,6 @@
1
1
  xfmr_zem/__init__.py,sha256=Abx2BepsZu-e7E93N2lOgu9w0b4TBZLN6MEzCzDCn_A,1138
2
2
  xfmr_zem/cli.py,sha256=5oz4qxXthU4mXu7bSbfKreVkAvCqrieXpGoKhJBXBvk,12538
3
- xfmr_zem/client.py,sha256=wf9N_fILDBvWd-08TnNq3B1PqKQPhR0pvVuJq0vidk0,11435
3
+ xfmr_zem/client.py,sha256=2PkJavZ8kMVq0dXoeZvpRODO96tWiXyT1alZLcw5RH0,12601
4
4
  xfmr_zem/schemas.py,sha256=0tHM0ftOWTWxNiqmAZn_MyIYJwF2p9brHK0MHlOMlKY,494
5
5
  xfmr_zem/server.py,sha256=EeohfqhUiCm0cGnV85H2ODZ4FLXjcTjbkdHrHuGHW4I,8363
6
6
  xfmr_zem/zenml_wrapper.py,sha256=LHgDewuPBjCl4EiU6JZVU-_lyEi-ATURDSG9Vf7PbEY,6739
@@ -17,8 +17,8 @@ xfmr_zem/servers/nemo_curator/parameters.yml,sha256=EGEzo0heI-ajkwFFy3xxq_YD7cXU
17
17
  xfmr_zem/servers/nemo_curator/server.py,sha256=zcHoSwxxoK_rMaDIAbEy1s8qfdp68Ue4B-XBcjGxQak,3848
18
18
  xfmr_zem/servers/ocr/engines.py,sha256=zScn4Qjxbpl2nB8UXEf3kd9l8z84TEwGs6bV5ka8Lks,10295
19
19
  xfmr_zem/servers/ocr/install_models.py,sha256=t02zpoy8djVhITOLEaRJ2mjiMrFfA9H6fpeHD3hXuio,2135
20
- xfmr_zem/servers/ocr/parameters.yml,sha256=04v59-6QXwN6XEpnHLc5pz6iTgNBDhloHtCCjHr8YRA,89
21
- xfmr_zem/servers/ocr/server.py,sha256=lhT5rfpz2vPbdbM7RZAtsJKfZEqoth2ijEifMA_vWSA,3590
20
+ xfmr_zem/servers/ocr/parameters.yml,sha256=UTMwtTu0Eeit0tFkYcZOxpuzD78UBlpONXZIx6STYwc,144
21
+ xfmr_zem/servers/ocr/server.py,sha256=wfk9L1776TOpFNlmc73jknEMDDobfcFgqBUhcVX2elc,4441
22
22
  xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py,sha256=XJE7RnOu5oo5p902HPWPDBd7FhVQXetmnr2-kWEG0nI,2419
23
23
  xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py,sha256=79fYr76fx8yZda3HaFcK1d5G-4sDVf1JFHNW_OBQAk8,47348
24
24
  xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py,sha256=7BeLHzf9FQUkkHMb5jDpggruJmfXVMU78MF_EeZ9PG4,10462
@@ -51,8 +51,8 @@ xfmr_zem/servers/sinks/parameters.yml,sha256=9HAnv84Utw2qWsVZH8uOjVE62lnAKBkzv4P
51
51
  xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
52
52
  xfmr_zem/servers/unstructured/parameters.yml,sha256=N31cmc56GTr3rkVhbni4yOpbnHISReN8f-KnRZTDbBc,118
53
53
  xfmr_zem/servers/unstructured/server.py,sha256=0XmXWMAUNEJboX-J4bn_8EBUfMHIqu_ylNC_s9YOZdk,1996
54
- xfmr_zem-0.2.6.dist-info/METADATA,sha256=E2bD-td6hx7ntkKGH3XIgwH8BTUADbTE5T-cFi21Qds,6379
55
- xfmr_zem-0.2.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
- xfmr_zem-0.2.6.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
- xfmr_zem-0.2.6.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
- xfmr_zem-0.2.6.dist-info/RECORD,,
54
+ xfmr_zem-0.2.8.dist-info/METADATA,sha256=sv4boGlSzTYgE1MlKIZieIVvRoioVKoWwTOPXhrqKeE,6379
55
+ xfmr_zem-0.2.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
56
+ xfmr_zem-0.2.8.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
57
+ xfmr_zem-0.2.8.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
58
+ xfmr_zem-0.2.8.dist-info/RECORD,,