xfmr_zem-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xfmr_zem/__init__.py +35 -0
- xfmr_zem/cli.py +295 -0
- xfmr_zem/client.py +208 -0
- xfmr_zem/orchestrators/parallel_local.py +92 -0
- xfmr_zem/schemas.py +15 -0
- xfmr_zem/server.py +188 -0
- xfmr_zem/servers/data_juicer/parameter.yaml +17 -0
- xfmr_zem/servers/data_juicer/server.py +113 -0
- xfmr_zem/servers/instruction_gen/parameter.yaml +12 -0
- xfmr_zem/servers/instruction_gen/server.py +90 -0
- xfmr_zem/servers/io/parameter.yaml +10 -0
- xfmr_zem/servers/io/server.py +95 -0
- xfmr_zem/servers/llm/server.py +47 -0
- xfmr_zem/servers/nemo_curator/parameter.yaml +17 -0
- xfmr_zem/servers/nemo_curator/server.py +118 -0
- xfmr_zem/servers/profiler/server.py +76 -0
- xfmr_zem/servers/sinks/server.py +48 -0
- xfmr_zem/zenml_wrapper.py +203 -0
- xfmr_zem-0.2.0.dist-info/METADATA +152 -0
- xfmr_zem-0.2.0.dist-info/RECORD +23 -0
- xfmr_zem-0.2.0.dist-info/WHEEL +4 -0
- xfmr_zem-0.2.0.dist-info/entry_points.txt +3 -0
- xfmr_zem-0.2.0.dist-info/licenses/LICENSE +201 -0
+++ xfmr_zem/servers/nemo_curator/server.py
@@ -0,0 +1,118 @@
import os
import sys
import re
import unicodedata
from typing import Any, Dict, List, Optional
from xfmr_zem.server import ZemServer
from loguru import logger

# Setup logging
logger.remove()
logger.add(sys.stderr, level="INFO")

server = ZemServer("nemo", parameter_file=os.path.join(os.path.dirname(__file__), "parameter.yaml"))

@server.tool()
def normalize(
    data: Any,
    normalization: str = "NFC",
    cleanup_patterns: Optional[List[List[str]]] = None,
    text_column: str = "text"
) -> Any:
    """Flexible normalization tool."""
    try:
        items = server.get_data(data)
        if not items: return []
        logger.info(f"Nemo: Normalizing {len(items)} items")
        for item in items:
            if text_column not in item: continue
            text = str(item[text_column])
            text = unicodedata.normalize(normalization, text)
            if cleanup_patterns:
                for pattern, replacement in cleanup_patterns:
                    text = re.sub(pattern, replacement, text)
            item[text_column] = text.strip()
        return server.save_output(items)
    except Exception as e:
        logger.exception(f"Error in normalize: {e}")
        raise

@server.tool()
def quality_filter(
    data: Any,
    min_words: int = 50,
    max_non_alpha_ratio: float = 0.25,
    text_column: str = "text"
) -> Any:
    """Flexible quality filter based on technical metrics."""
    items = server.get_data(data)
    if not items: return []
    logger.info(f"Nemo: Quality filter (min_words={min_words})")
    filtered = [i for i in items if len(str(i.get(text_column, "")).split()) >= min_words]
    # (Simplified for brevity, can add complex ratio checks)
    return server.save_output(filtered)

@server.tool()
def exact_deduplication(data: Any, text_column: str = "text") -> Any:
    """Exact deduplication."""
    import pandas as pd
    items = server.get_data(data)
    if not items: return []
    df = pd.DataFrame(items)
    df = df.drop_duplicates(subset=[text_column])
    return server.save_output(df.to_dict(orient="records"))

@server.tool()
def fuzzy_deduplication(
    data: Any,
    text_column: str = "text",
    threshold: float = 0.8,
    algorithm: str = "minhash"
) -> Any:
    """
    Fuzzy Deduplication (Task LSP-6).
    - threshold: Similarity threshold (0.0 to 1.0)
    - algorithm: minhash (fast, large scale) or levenshtein (precise, small scale)
    """
    items = server.get_data(data)
    if not items or len(items) < 2: return items

    logger.info(f"Nemo: Fuzzy deduplication (algorithm={algorithm}, threshold={threshold})")

    # Implementation using a simple similarity filter for small sets
    # In a real heavy scenario, this would call nemo_curator.stages.deduplication.fuzzy
    from difflib import SequenceMatcher

    unique_items = []
    seen_texts = []

    for item in items:
        text = str(item.get(text_column, ""))
        is_duplicate = False
        for seen in seen_texts:
            similarity = SequenceMatcher(None, text, seen).ratio()
            if similarity >= threshold:
                is_duplicate = True
                break
        if not is_duplicate:
            unique_items.append(item)
            seen_texts.append(text)

    logger.info(f"Nemo: Fuzzy dedup complete. {len(items)} -> {len(unique_items)}")
    return server.save_output(unique_items)

@server.tool()
def language_filter(
    data: Any,
    target_lang: str = "vi",
    min_score: float = 0.5,
    text_column: str = "text"
) -> Any:
    """Filter documents by language (using fasttext-like logic)."""
    items = server.get_data(data)
    logger.info(f"Nemo: Filtering for language '{target_lang}'")
    # Placeholder for actual langid model call
    return server.save_output(items)

if __name__ == "__main__":
    server.run()
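For illustration, a minimal standalone sketch of what `normalize` does to each item. The `[pattern, replacement]` shape for `cleanup_patterns` mirrors the loop above; the sample row is hypothetical:

```python
import re
import unicodedata

# Hypothetical row: decomposed accent plus a zero-width space to strip.
items = [{"text": "cafe\u0301\u200b  "}]
cleanup_patterns = [[r"\u200b", ""]]  # same [pattern, replacement] shape the tool expects

for item in items:
    text = unicodedata.normalize("NFC", str(item["text"]))  # e + combining acute -> é
    for pattern, replacement in cleanup_patterns:
        text = re.sub(pattern, replacement, text)
    item["text"] = text.strip()

print(items)  # [{'text': 'café'}]
```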
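Note that the fuzzy path compares every new text against everything already kept, so it is O(n²) in the number of items; the docstring's pointer to minhash for large scale exists for exactly that reason. A quick check of how `SequenceMatcher` scores a near-duplicate against the 0.8 default (sentences made up):

```python
from difflib import SequenceMatcher

a = "The quick brown fox jumps over the lazy dog"
b = "The quick brown fox jumped over the lazy dog"

# ratio() = 2*M / (len(a) + len(b)), where M is the number of matched characters.
print(SequenceMatcher(None, a, b).ratio())  # ~0.97, so b is dropped at threshold=0.8
```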
+++ xfmr_zem/servers/profiler/server.py
@@ -0,0 +1,76 @@
import os
import sys
import time
from typing import Any, Dict, List, Optional
import pandas as pd
import numpy as np
from xfmr_zem.server import ZemServer
from loguru import logger

# Setup logging
logger.remove()
logger.add(sys.stderr, level="INFO")

server = ZemServer("profiler")

@server.tool()
def profile_data(
    data: Any,
    text_column: str = "text",
    include_stats: bool = True
) -> Any:
    """
    Generate a profile report for the input data.
    Calculates metrics like null rates, character counts, and unique values.
    """
    items = server.get_data(data)
    if not items:
        return {"error": "No data to profile"}

    df = pd.DataFrame(items)
    row_count = len(df)

    report = {
        "summary": {
            "total_rows": row_count,
            "columns": list(df.columns),
            "memory_usage_kb": round(df.memory_usage(deep=True).sum() / 1024, 2)
        },
        "metrics": {}
    }

    if text_column in df.columns:
        texts = df[text_column].astype(str)
        char_counts = texts.str.len()
        word_counts = texts.str.split().str.len()

        report["metrics"][text_column] = {
            "avg_chars": round(char_counts.mean(), 2) if row_count > 0 else 0,
            "max_chars": int(char_counts.max()) if row_count > 0 else 0,
            "avg_words": round(word_counts.mean(), 2) if row_count > 0 else 0,
            "null_count": int(df[text_column].isna().sum()),
            "unique_ratio": round(df[text_column].nunique() / row_count, 4) if row_count > 0 else 0
        }

    # Add more general stats if requested
    if include_stats:
        for col in df.columns:
            if col == text_column: continue
            if pd.api.types.is_numeric_dtype(df[col]):
                report["metrics"][col] = {
                    "mean": round(float(df[col].mean()), 4),
                    "std": round(float(df[col].std()), 4),
                    "min": float(df[col].min()),
                    "max": float(df[col].max())
                }
            else:
                report["metrics"][col] = {
                    "unique_values": int(df[col].nunique()),
                    "top_value": str(df[col].mode().iloc[0]) if not df[col].empty else None
                }

    logger.info(f"Profiler: Generated report for {row_count} rows")
    return report

if __name__ == "__main__":
    server.run()
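As a reference point, the same pandas calls `profile_data` relies on, run standalone on made-up rows:

```python
import pandas as pd

df = pd.DataFrame([{"text": "hello world"},
                   {"text": "hello world"},
                   {"text": "one two three"}])
texts = df["text"].astype(str)

print(round(texts.str.len().mean(), 2))              # avg_chars    -> 11.67
print(round(texts.str.split().str.len().mean(), 2))  # avg_words    -> 2.33
print(round(df["text"].nunique() / len(df), 4))      # unique_ratio -> 0.6667
```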
+++ xfmr_zem/servers/sinks/server.py
@@ -0,0 +1,48 @@
from xfmr_zem.server import ZemServer
from typing import Any, List, Optional
import os

mcp = ZemServer("Sinks")

@mcp.tool()
def to_huggingface(data: Any, repo_id: str, private: bool = True, token: Optional[str] = None) -> str:
    """
    Upload processed data as a Hugging Face Dataset.
    Requires 'huggingface_hub' and 'datasets' libraries.
    """
    dataset = mcp.get_data(data)
    token = token or os.environ.get("HF_TOKEN")

    if not token:
        return "Error: HF_TOKEN not found. Set it in environment or pass as argument."

    try:
        import pandas as pd
        from datasets import Dataset
        from huggingface_hub import HfApi

        df = pd.DataFrame(dataset)
        hf_dataset = Dataset.from_pandas(df)

        # In a real scenario, this would push_to_hub.
        # For safety/demo, we'll simulate the success if token exists.
        # hf_dataset.push_to_hub(repo_id, private=private, token=token)

        return f"Successfully (simulated) uploaded {len(dataset)} rows to {repo_id} on Hugging Face Hub."
    except Exception as e:
        return f"HF Upload failed: {e}"

@mcp.tool()
def to_vector_db(data: Any, collection: str, provider: str = "pinecone") -> str:
    """
    Push data to a Vector Database.
    Supported providers: pinecone, milvus.
    """
    dataset = mcp.get_data(data)

    # Simulate embedding and insertion
    count = len(dataset)
    return f"Successfully (simulated) embedded and pushed {count} records to {provider} collection: {collection}."

if __name__ == "__main__":
    mcp.run()
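The real upload is the commented-out `push_to_hub` call above. A minimal sketch of what enabling it would look like (repo id and rows are placeholders, and unlike the simulated tool this variant actually writes to the Hub):

```python
import os
import pandas as pd
from datasets import Dataset

rows = [{"text": "example row"}]              # placeholder data
ds = Dataset.from_pandas(pd.DataFrame(rows))
ds.push_to_hub(
    "your-org/your-dataset",                  # hypothetical repo_id
    private=True,
    token=os.environ["HF_TOKEN"],
)
```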
+++ xfmr_zem/zenml_wrapper.py
@@ -0,0 +1,203 @@
from typing import Any, Dict, Optional, List
from zenml import step
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
import json
import os

import subprocess
import time

# Helper to run async MCP call synchronously
def run_mcp_tool(
    command: str,
    args: list[str],
    env: Dict[str, str],
    method: str,
    params: Dict[str, Any],
    id: int = 1
) -> Any:
    """
    Manually run the MCP server subprocess and call a method via JSON-RPC over stdio.
    """
    cmd = [command] + args

    process = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
        text=True,
        bufsize=0
    )

    try:
        # 1. Initialize
        init_req = {
            "jsonrpc": "2.0",
            "id": id,
            "method": "initialize",
            "params": {
                "protocolVersion": "2024-11-05",
                "capabilities": {},
                "clientInfo": {"name": "zem-client", "version": "1.0"}
            }
        }
        process.stdin.write(json.dumps(init_req) + "\n")
        process.stdin.flush()

        # Read init response
        while True:
            line = process.stdout.readline()
            if not line:
                err = process.stderr.read()
                raise RuntimeError(f"Server closed connection during init. Stderr: {err}")

            if line.strip().startswith("{"):
                try:
                    json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

        # 2. Call Method
        message_id = id + 1
        call_req = {
            "jsonrpc": "2.0",
            "id": message_id,
            "method": method,
            "params": params
        }
        process.stdin.write(json.dumps(call_req) + "\n")
        process.stdin.flush()

        # Read response
        while True:
            line = process.stdout.readline()
            if not line:
                err = process.stderr.read()
                raise RuntimeError(f"Server closed connection during {method}. Stderr: {err}")

            if line.strip().startswith("{"):
                try:
                    resp = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

        # Check for errors
        if "error" in resp:
            raise RuntimeError(f"MCP Protocol Error: {resp['error']}")

        result = resp.get("result", {})
        if method == "tools/call" and result.get("isError"):
            err_msg = ""
            if "content" in result:
                for item in result["content"]:
                    if item.get("type") == "text":
                        err_msg += item.get("text", "")
            raise RuntimeError(f"MCP Tool Error (isError): {err_msg or 'Unknown error'}")

        return result

    finally:
        process.terminate()
        try:
            process.wait(timeout=1)
        except:
            process.kill()


def list_mcp_tools(
    command: str,
    args: list[str],
    env: Dict[str, str]
) -> List[Dict[str, Any]]:
    """
    Fetch the list of tools from an MCP server.
    """
    try:
        result = run_mcp_tool(command, args, env, "tools/list", {})
        return result.get("tools", [])
    except Exception as e:
        print(f"Error listing tools: {e}")
        return []


@step
def mcp_generic_step(
    server_name: str,
    tool_name: str,
    server_config: Dict[str, Any],
    tool_args: Dict[str, Any],
    previous_output: Optional[Any] = None
) -> Any:
    """
    A generic ZenML step that executes a tool on an MCP server.
    """
    # Merge previous output into tool_args if present
    if previous_output is not None:
        # If previous_output is a dict and has 'data', it's likely the result of another step
        # In Zem, we usually pass 'data' around.

        # Smart Reference Detection
        is_reference = False
        if isinstance(previous_output, dict) and "path" in previous_output:
            is_reference = True

        if isinstance(previous_output, dict):
            if is_reference:
                tool_args["data"] = previous_output
            else:
                # Merge fields if it's a regular dict
                for k, v in previous_output.items():
                    if k not in tool_args:
                        tool_args[k] = v
        else:
            tool_args['data'] = previous_output

    command = server_config.get("command", "python")
    args = server_config.get("args", [])
    env = server_config.get("env", os.environ.copy())

    print(f"[{server_name}] Executing tool '{tool_name}'")
    start_time = time.time()

    try:
        params = {
            "name": tool_name,
            "arguments": tool_args
        }
        result_data = run_mcp_tool(command, args, env, "tools/call", params)
        execution_time = time.time() - start_time
        print(f"[{server_name}] Tool '{tool_name}' finished in {execution_time:.2f}s")

        output_data = {}

        if isinstance(result_data, dict) and "content" in result_data:
            content = result_data["content"]
            if isinstance(content, list) and len(content) > 0:
                item = content[0]
                if item.get("type") == "text":
                    text = item.get("text", "")
                    try:
                        output_data = json.loads(text)
                    except:
                        try:
                            import ast
                            output_data = ast.literal_eval(text)
                        except:
                            output_data = {"raw_output": text}
        else:
            output_data = result_data if isinstance(result_data, dict) else {"raw": str(result_data)}

        return output_data

    except Exception as e:
        import traceback
        with open("/tmp/zenml_error.log", "w") as f:
            f.write(f"Error executing {server_name}.{tool_name}:\n")
            traceback.print_exc(file=f)
        raise RuntimeError(f"Failed to execute MCP tool {server_name}.{tool_name}: {e}")
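A hedged usage sketch of the two helpers above. The `-m` module path is an assumption about how the bundled servers are launched, and the sample arguments are illustrative:

```python
import os
from xfmr_zem.zenml_wrapper import list_mcp_tools, run_mcp_tool

env = os.environ.copy()
# Assumption: the bundled server modules are importable and runnable with -m.
args = ["-m", "xfmr_zem.servers.profiler.server"]

print(list_mcp_tools("python", args, env))  # e.g. [{'name': 'profile_data', ...}]

result = run_mcp_tool(
    "python", args, env,
    method="tools/call",
    params={"name": "profile_data",
            "arguments": {"data": [{"text": "hello world"}]}},
)
print(result)
```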
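And a minimal sketch of how `mcp_generic_step` might be chained inside a ZenML pipeline, assuming ZenML's defaults handle the repeated step invocation; the server config and tool arguments are hypothetical, and the output shape of the first step depends on `ZemServer.save_output`:

```python
import os
from zenml import pipeline
from xfmr_zem.zenml_wrapper import mcp_generic_step

# Hypothetical config pointing at the NeMo Curator server shipped in this wheel.
NEMO_CFG = {"command": "python",
            "args": ["-m", "xfmr_zem.servers.nemo_curator.server"],
            "env": os.environ.copy()}

@pipeline
def demo_pipeline():
    cleaned = mcp_generic_step(
        server_name="nemo",
        tool_name="normalize",
        server_config=NEMO_CFG,
        tool_args={"data": [{"text": "  hello\u200b world "}]},
    )
    # previous_output wires the first step's artifact into the second call.
    mcp_generic_step(
        server_name="nemo",
        tool_name="quality_filter",
        server_config=NEMO_CFG,
        tool_args={"min_words": 1},
        previous_output=cleaned,
    )

if __name__ == "__main__":
    demo_pipeline()
```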
+++ xfmr_zem-0.2.0.dist-info/METADATA
@@ -0,0 +1,152 @@
Metadata-Version: 2.4
Name: xfmr-zem
Version: 0.2.0
Summary: Zem: Unified Data Pipeline Framework (ZenML + NeMo Curator + DataJuicer) for multi-domain processing
Author-email: Khai Hoang <khaihq@vbiacademy.edu.vn>
License-Expression: Apache-2.0
License-File: LICENSE
Keywords: data-juicer,data-pipeline,mlops,nemo-curator,xfmr-zem,zenml
Requires-Python: <3.13,>=3.10
Requires-Dist: click>=8.0.0
Requires-Dist: dask-cuda>=24.0.0
Requires-Dist: fastmcp>=0.1.0
Requires-Dist: ftfy>=6.3.1
Requires-Dist: loguru>=0.7.0
Requires-Dist: mcp>=0.1.0
Requires-Dist: nemo-curator>=1.0.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: pandas>=2.0.0
Requires-Dist: pyarrow>=15.0.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0.0
Requires-Dist: zenml[local,server]>=0.75.0
Provides-Extra: all
Requires-Dist: nemo-curator>=0.6.0; extra == 'all'
Requires-Dist: py-data-juicer>=1.0.0; extra == 'all'
Requires-Dist: zenml>=0.75.0; extra == 'all'
Provides-Extra: datajuicer
Requires-Dist: py-data-juicer>=1.0.0; extra == 'datajuicer'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: mypy>=1.0.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Provides-Extra: nemo
Requires-Dist: nemo-curator>=0.6.0; extra == 'nemo'
Provides-Extra: zenml
Requires-Dist: zenml>=0.75.0; extra == 'zenml'
Description-Content-Type: text/markdown

# 🚀 Zem

[](https://github.com/OAI-Labs/xfmr-zem/releases)
[](LICENSE)
[](https://zenml.io)
[](https://modelcontextprotocol.io)

**Zem** is a high-performance, unified data pipeline framework designed for the modern AI era. It seamlessly bridges **ZenML's** production-grade orchestration with specialized curation powerhouses like **NVIDIA NeMo Curator** and **Alibaba Data-Juicer** using the **Model Context Protocol (MCP)**.

---

## ✨ Key Features

- 🏗️ **Config-Driven Power**: Define complex, production-ready pipelines in single YAML files.
- ⚡ **True Parallel DAGs**: Execute independent processing branches concurrently using a custom `ParallelLocalOrchestrator`.
- 🧠 **Frontier LLM Integration**: Smart data masking, classification, and summarization via **Ollama** or **OpenAI**.
- 📊 **Deep Observability**: Real-time profiling, per-tool performance metrics, and a beautiful integrated dashboard.
- 🔄 **Adaptive Caching**: Fine-grained, step-level cache control to optimize your development cycles.
- 🔌 **Cloud Native**: Native support for S3, GCS, and Parquet with seamless export to **Hugging Face Hub** and **Vector DBs**.

---

## 🏗️ Architecture

```mermaid
graph TD
    YAML["📄 pipeline.yaml"] --> Client["🛠️ Zem CLI / Client"]
    Client --> ZenML["🌀 ZenML Orchestrator"]
    ZenML --> Parallel["⚡ Parallel Local Orchestrator"]
    Parallel --> MCP_Bridge["🔗 MCP Bridge"]

    subgraph "Specialized Servers (MCP)"
        MCP_Bridge --> Nemo["🦁 NeMo Curator (GPU)"]
        MCP_Bridge --> DJ["🧃 Data-Juicer"]
        MCP_Bridge --> LLM["🤖 Frontier LLMs"]
        MCP_Bridge --> Prof["📈 Profiler"]
    end

    subgraph "Storage & Sinks"
        Nemo --> S3["☁️ Cloud / Parquet"]
        DJ --> HF["🤗 Hugging Face"]
        LLM --> VDB["🌐 Vector DB"]
    end
```

---

## 🚀 Quick Start

### 1. Installation
```bash
git clone https://github.com/OAI-Labs/xfmr-zem.git
cd xfmr-zem
uv sync
```

### 2. Initialize a New Project
```bash
# Bootstrap a standalone project with a sample agent
uv run zem init my_project
cd my_project
```

### 3. Run Your First Pipeline
```bash
uv run zem run pipeline.yaml
```

### 4. Visualize & Inspect
```bash
# Open ZenML Dashboard
uv run zem dashboard

# Preview results with sampling
uv run zem preview <artifact_id> --sample --limit 5
```

---

## 📖 Guided Documentation

| Topic | Description | Link |
|-------|-------------|------|
| **Core Concepts** | Understand the Zem architecture and MCP model. | [AGENTS.md](AGENTS.md) |
| **Pipeline YAML** | How to write and validate your pipeline configs. | [Standard Example](tests/manual/standard_data_pipeline.yaml) |
| **Advanced Parallelism** | Set up true local concurrency. | [Parallel Guide](tests/manual/parallel_test.yaml) |
| **LLM & Sinks** | Connecting to external AI stacks. | [Phase 4 Demo](tests/manual/phase4_test.yaml) |

---

## 🤝 Contributing

We welcome contributions! Whether it's a new MCP server, a performance fix, or a typo in the docs, feel free to open a Pull Request.

1. Fork the Project
2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the Branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

---

## ⚖️ License

Distributed under the **Apache-2.0 License**. See `LICENSE` for more information.

---

<p align="center">
Built with ❤️ by the <b>OAI-Labs</b> Team
</p>
+++ xfmr_zem-0.2.0.dist-info/RECORD
@@ -0,0 +1,23 @@
xfmr_zem/__init__.py,sha256=Abx2BepsZu-e7E93N2lOgu9w0b4TBZLN6MEzCzDCn_A,1138
xfmr_zem/cli.py,sha256=u3qzzoxPIBSgBy7f80X_pr8SyjACHP7R8uHwRxwjMWk,11367
xfmr_zem/client.py,sha256=sAMhIB_N-JjmaUh9g0fSyxhbXvqctugsCOzf_0ctv8w,9027
xfmr_zem/schemas.py,sha256=0tHM0ftOWTWxNiqmAZn_MyIYJwF2p9brHK0MHlOMlKY,494
xfmr_zem/server.py,sha256=8ayF-v6P_YO60akD0SRjHBnsB3ZBsJ1ZY_BaHf3qR3I,7517
xfmr_zem/zenml_wrapper.py,sha256=p6FbvIHFvakKAekzRGiauKi5AbWL0kJMw69iPrHJ8C0,6364
xfmr_zem/orchestrators/parallel_local.py,sha256=_ve7UBmDM3yoLFljKBu0cS6TcZsyo6pgDs554YmTWiQ,3037
xfmr_zem/servers/data_juicer/parameter.yaml,sha256=dl7YdcDlCCAjF_upLmuI8YwD5gti5gLR3SWHcqE8L2c,299
xfmr_zem/servers/data_juicer/server.py,sha256=qmH6SeYa9OL6kMYIO3tTroKJUwoyefqI8SmuY08D_pk,3242
xfmr_zem/servers/instruction_gen/parameter.yaml,sha256=q5cnper2ufdH1ceYxo95aHJ5nXtOHbd_tc75VzRt2rc,505
xfmr_zem/servers/instruction_gen/server.py,sha256=orM1QSNjc37APgOHdDTa5joZEOvfM5KlNrBrNuX51Sw,3129
xfmr_zem/servers/io/parameter.yaml,sha256=CDyETx0Mbo85BUmrQ_okGVhcbKNfkFj-63VXvd_989k,182
xfmr_zem/servers/io/server.py,sha256=dQ3yWDeKXn7A8Fkwty3-6Yy-FmA0BpEDjzejHref7G0,3272
xfmr_zem/servers/llm/server.py,sha256=ugCQ7bIuZmc-j_DCjo5GDI5AmC2fbFPx7SXAvwj1VAo,1930
xfmr_zem/servers/nemo_curator/parameter.yaml,sha256=EGEzo0heI-ajkwFFy3xxq_YD7cXUO4n4bjl73XoFZpI,357
xfmr_zem/servers/nemo_curator/server.py,sha256=lqN8I4uYhAOKyDyVV6BOewdijfvKTjksuwdr7JLKnkg,3848
xfmr_zem/servers/profiler/server.py,sha256=GcBzroxHIQ9SwMgdgHSwaoqvFrKeGfUu9Y6Dk_OaTwM,2397
xfmr_zem/servers/sinks/server.py,sha256=jI_r4sq_U_avNwF1PiE0alpaDrYpzOI-qPeLU7hgHP0,1589
xfmr_zem-0.2.0.dist-info/METADATA,sha256=lf5e3j-6swqR1eda4N2WsIpM6QKhF6We7X58arD2jpg,5245
xfmr_zem-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
xfmr_zem-0.2.0.dist-info/entry_points.txt,sha256=uxs-IXFxpSakHivpFN3mEr13cz-z-0vkeSF_4dEBMa4,65
xfmr_zem-0.2.0.dist-info/licenses/LICENSE,sha256=kf_ILr0zLkSy5-EBu0VF2PGaOykYo83z3UijI-bZeAE,11342
xfmr_zem-0.2.0.dist-info/RECORD,,