@vespermcp/mcp-server 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +199 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +272 -72
- package/build/ingestion/ingestor.js +17 -16
- package/build/ingestion/kaggle-downloader.js +25 -2
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +73 -5
- package/build/quality/media-analyzer.js +74 -5
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +23 -8
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
package/build/python/splitter_engine.py
@@ -0,0 +1,283 @@
import sys
import json
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split

def execute_split(file_path, config):
    # Load Data
    if file_path.endswith(".csv"):
        df = pl.read_csv(file_path, ignore_errors=True)
    elif file_path.endswith(".parquet"):
        df = pl.read_parquet(file_path)
    else:
        raise ValueError("Unsupported format")

    train_ratio = config["ratios"]["train"]
    val_ratio = config["ratios"]["val"]
    test_ratio = config["ratios"]["test"]
    holdout_ratio = config["ratios"].get("holdout", 0)
    seed = config.get("random_seed", 42)
    shuffle = config.get("shuffle", True)

    # Strategy
    strategy = config["type"]
    target_col = config.get("target_column", None)
    time_col = config.get("time_column", None)

    train_df, val_df, test_df, holdout_df = None, None, None, None

    # --- 1. RANDOM / STRATIFIED SPLIT ---
    if strategy in ["random", "stratified"]:
        if strategy == "random":
            if shuffle:
                df = df.sample(fraction=1.0, seed=seed, shuffle=True)

            n = len(df)
            n_train = int(n * train_ratio)
            n_val = int(n * val_ratio)
            n_test = int(n * test_ratio)

            train_df = df.slice(0, n_train)
            val_df = df.slice(n_train, n_val)
            test_df = df.slice(n_train + n_val, n_test)
            holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))

        elif strategy == "stratified":
            if not target_col or target_col not in df.columns:
                return {"error": f"Target column '{target_col}' not found needed for stratification"}

            y = df[target_col].to_list()
            indices = np.arange(len(df))

            # Split 1: Train vs Others
            others_ratio = val_ratio + test_ratio + holdout_ratio
            if others_ratio == 0:
                train_idx, others_idx = indices, []
            else:
                train_idx, others_idx = train_test_split(indices, test_size=others_ratio, stratify=y, random_state=seed, shuffle=True)

            train_df = df[train_idx]

            if len(others_idx) > 0:
                y_others = [y[i] for i in others_idx]

                # Split 2: Val vs (Test + Holdout)
                test_holdout_ratio = (test_ratio + holdout_ratio) / others_ratio
                if test_holdout_ratio > 0 and test_holdout_ratio < 1:
                    val_idx, test_holdout_idx = train_test_split(others_idx, test_size=test_holdout_ratio, stratify=y_others, random_state=seed, shuffle=True)
                    val_df = df[val_idx]

                    if len(test_holdout_idx) > 0:
                        y_th = [y[i] for i in test_holdout_idx]
                        relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)

                        if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
                            test_idx, holdout_idx = train_test_split(test_holdout_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
                            test_df = df[test_idx]
                            holdout_df = df[holdout_idx]
                        elif relative_holdout_ratio >= 1:
                            test_df = df.slice(0, 0)
                            holdout_df = df[test_holdout_idx]
                        else:
                            test_df = df[test_holdout_idx]
                            holdout_df = df.slice(0, 0)
                elif test_holdout_ratio >= 1:
                    val_df = df.slice(0, 0)
                    # Chained split for Test/Holdout
                    y_th = y_others
                    relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)
                    if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
                        test_idx, holdout_idx = train_test_split(others_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
                        test_df = df[test_idx]
                        holdout_df = df[holdout_idx]
                    else:
                        test_df = df[others_idx]
                        holdout_df = df.slice(0, 0)
                else:
                    val_df = df[others_idx]
                    test_df = df.slice(0, 0)
                    holdout_df = df.slice(0, 0)

    # --- 2. TIME-BASED SPLIT ---
    elif strategy == "time":
        if not time_col or time_col not in df.columns:
            return {"error": f"Time column '{time_col}' not found"}

        df = df.sort(time_col)

        n = len(df)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)
        n_test = int(n * test_ratio)

        train_df = df.slice(0, n_train)
        val_df = df.slice(n_train, n_val)
        test_df = df.slice(n_train + n_val, n_test)
        holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))

    # --- 3. GROUP-BASED SPLIT ---
    elif strategy == "group":
        if not config.get("group_column") or config["group_column"] not in df.columns:
            return {"error": f"Group column '{config.get('group_column')}' not found"}

        group_col = config["group_column"]
        groups = df[group_col].unique().to_list()

        # Split groups first to ensure zero leakage
        n_grps = len(groups)
        n_train = int(n_grps * train_ratio)
        n_val = int(n_grps * val_ratio)
        n_test = int(n_grps * test_ratio)

        if shuffle:
            np.random.seed(seed)
            np.random.shuffle(groups)

        train_grps = set(groups[:n_train])
        val_grps = set(groups[n_train:n_train+n_val])
        test_grps = set(groups[n_train+n_val:n_train+n_val+n_test])
        holdout_grps = set(groups[n_train+n_val+n_test:])

        train_df = df.filter(pl.col(group_col).is_in(train_grps))
        val_df = df.filter(pl.col(group_col).is_in(val_grps))
        test_df = df.filter(pl.col(group_col).is_in(test_grps))
        holdout_df = df.filter(pl.col(group_col).is_in(holdout_grps))

    else:
        return {"error": f"Strategy {strategy} not implemented yet"}

    # Save outputs
    base_name = file_path.replace(".csv", "").replace(".parquet", "")
    train_path = f"{base_name}_train.csv"
    val_path = f"{base_name}_val.csv"
    test_path = f"{base_name}_test.csv"
    holdout_path = f"{base_name}_holdout.csv"

    train_df.write_csv(train_path)
    val_df.write_csv(val_path)
    test_df.write_csv(test_path)
    holdout_df.write_csv(holdout_path)

    return {
        "success": True,
        "paths": { "train": train_path, "val": val_path, "test": test_path, "holdout": holdout_path },
        "stats": {
            "train_rows": len(train_df),
            "val_rows": len(val_df),
            "test_rows": len(test_df),
            "holdout_rows": len(holdout_df)
        }
    }

def validate_split(config):
    # Config contains paths to check and optional ID column
    train_path = config["paths"]["train"]
    val_path = config["paths"]["val"]
    test_path = config["paths"]["test"]
    holdout_path = config["paths"].get("holdout")
    id_col = config.get("id_column", "id") # Default to 'id' if exists
    target_col = config.get("target_column", None)

    # Load dfs
    try:
        train_df = pl.read_csv(train_path) if train_path.endswith(".csv") else pl.read_parquet(train_path)
        val_df = pl.read_csv(val_path) if val_path.endswith(".csv") else pl.read_parquet(val_path)
        test_df = pl.read_csv(test_path) if test_path.endswith(".csv") else pl.read_parquet(test_path)
        holdout_df = None
        if holdout_path:
            holdout_df = pl.read_csv(holdout_path) if holdout_path.endswith(".csv") else pl.read_parquet(holdout_path)
    except:
        return {"error": "Failed to load split files for validation"}

    report = {
        "leakage_detected": False,
        "leakage_count": 0,
        "distribution_mismatch": False,
        "warnings": []
    }

    # 1. Leakage Check (ID intersection)
    if id_col in train_df.columns:
        train_ids = set(train_df[id_col].to_list())
        val_ids = set(val_df[id_col].to_list())
        test_ids = set(test_df[id_col].to_list())
        holdout_ids = set(holdout_df[id_col].to_list()) if holdout_df is not None else set()

        leakage_tv = len(train_ids.intersection(val_ids))
        leakage_tt = len(train_ids.intersection(test_ids))
        leakage_th = len(train_ids.intersection(holdout_ids))
        leakage_vt = len(val_ids.intersection(test_ids))
        leakage_vh = len(val_ids.intersection(holdout_ids))
        leakage_th_val = len(test_ids.intersection(holdout_ids))

        total_leakage = leakage_tv + leakage_tt + leakage_th + leakage_vt + leakage_vh + leakage_th_val

        if total_leakage > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = total_leakage
            report["warnings"].append(f"Found {total_leakage} overlapping IDs between splits.")
    else:
        report["warnings"].append(f"ID column '{id_col}' not found. Skipping exact leakage check.")

    # 2. Distribution Check (Target Distribution)
    if target_col and target_col in train_df.columns:
        try:
            def get_ratios(df, col):
                counts = df[col].value_counts()
                total = len(df)
                ratios = {}
                for row in counts.rows():
                    ratios[str(row[0])] = row[1] / total
                return ratios

            train_metrics = get_ratios(train_df, target_col)
            val_metrics = get_ratios(val_df, target_col)
            # test_metrics = get_ratios(test_df, target_col) # Optional: could check all

            for cls in train_metrics:
                train_r = train_metrics[cls]
                val_r = val_metrics.get(cls, 0)
                diff = abs(train_r - val_r)
                if diff > 0.1: # 10% drift
                    report["distribution_mismatch"] = True
                    report["warnings"].append(f"Class '{cls}' drift: Train={train_r:.2f}, Val={val_r:.2f}")
        except:
            pass

    return report

def main():
    # Usage:
    # split: python splitter_engine.py split <file_path> <config_json>
    # validate: python splitter_engine.py validate <config_json> (dummy file arg ignored)

    if len(sys.argv) < 3:
        print(json.dumps({"error": "Usage: splitter_engine.py <action> <arg1> [arg2]"}), file=sys.stderr)
        sys.exit(1)

    action = sys.argv[1]

    try:
        if action == "split":
            file_path = sys.argv[2]
            config = json.loads(sys.argv[3])
            result = execute_split(file_path, config)
            print(json.dumps(result))

        elif action == "validate":
            config = json.loads(sys.argv[2])
            result = validate_split(config)
            print(json.dumps(result))

        else:
            # Fallback for old calls (implicit split) - if users used old signature
            # But since we control the caller, we can just update the caller (DataSplitter.ts).
            raise ValueError(f"Unknown action: {action}")

    except Exception as e:
        print(json.dumps({"success": False, "error": str(e)}))
        sys.exit(1)

if __name__ == "__main__":
    main()
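For context, here is a minimal sketch of how the new splitter engine can be driven from the command line. The `split`/`validate` actions and the JSON config shape come from the script's own usage comments; the file name, column names, and ratios are illustrative placeholders (not values from the package), and paths assume the package root as the working directory.

import json
import subprocess

# Illustrative config: a stratified 70/15/15 split on a hypothetical "label" column.
config = {
    "type": "stratified",
    "target_column": "label",
    "ratios": {"train": 0.7, "val": 0.15, "test": 0.15},
    "random_seed": 42,
}

# splitter_engine.py prints a JSON result ({"success": ..., "paths": ..., "stats": ...}) to stdout.
out = subprocess.run(
    ["python", "build/python/splitter_engine.py", "split", "data.csv", json.dumps(config)],
    capture_output=True, text=True,
)
print(json.loads(out.stdout))

# The validate action takes only a JSON config listing the split file paths.
check = subprocess.run(
    ["python", "build/python/splitter_engine.py", "validate",
     json.dumps({"paths": {"train": "data_train.csv", "val": "data_val.csv", "test": "data_test.csv"},
                 "id_column": "id", "target_column": "label"})],
    capture_output=True, text=True,
)
print(json.loads(check.stdout))  # leakage / distribution-drift report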
package/build/python/target_engine.py
@@ -0,0 +1,154 @@
import sys
import json
import pandas as pd
import numpy as np

# Common names for target variables in datasets
TARGET_CANDIDATES = [
    'target', 'label', 'class', 'outcome', 'y',
    'price', 'saleprice', 'sales', 'cost', 'value', 'total',
    'diagnosis', 'species', 'churn', 'survived', 'credit_risk'
]

def load_data(file_path):
    if file_path.endswith('.csv'):
        return pd.read_csv(file_path)
    elif file_path.endswith('.parquet'):
        return pd.read_parquet(file_path)
    else:
        raise ValueError("Unsupported file format")

def detect_target(file_path):
    try:
        df = load_data(file_path)
        columns = [c.lower() for c in df.columns]
        candidates = []

        # 1. Exact Name Match
        for col_original in df.columns:
            col_lower = col_original.lower()
            confidence = 0.0
            reasons = []

            if col_lower in TARGET_CANDIDATES:
                confidence += 0.6
                reasons.append(f"Matches common target name '{col_lower}'")

                # Boost if exact match 'target' or 'label'
                if col_lower in ['target', 'label', 'class']:
                    confidence += 0.2

            # 2. Position Heuristic (Last column is often target)
            if col_original == df.columns[-1]:
                confidence += 0.3
                reasons.append("Is the last column")

            # 3. Completeness
            missing_rate = df[col_original].isnull().mean()
            if missing_rate > 0.5:
                confidence -= 0.5
                reasons.append(f"High missing rate ({missing_rate:.1%})")
            elif missing_rate > 0:
                confidence -= 0.1
                reasons.append(f"Has missing values ({missing_rate:.1%})")

            # 4. Cardinality / Unique Values
            # If regression-like (many unique numeric values) or class-like (few unique values)
            # This is hard to score generally, but extremes are bad for targets (e.g. all unique = ID usually)
            n_unique = df[col_original].nunique()
            if n_unique == len(df):
                confidence -= 0.8
                reasons.append("All values are unique (likely ID)")

            if confidence > 0.3:
                candidates.append({
                    "column": col_original,
                    "confidence": min(confidence, 1.0),
                    "reason": reasons
                })

        # Sort by confidence
        candidates.sort(key=lambda x: x['confidence'], reverse=True)

        best_target = None
        best_conf = 0.0

        if candidates:
            best_target = candidates[0]['column']
            best_conf = candidates[0]['confidence']

        return {
            "target_column": best_target,
            "confidence": best_conf,
            "candidates": candidates,
            "is_unified": False # Wrapper will handle unification logic
        }

    except Exception as e:
        return {"error": str(e)}

def validate_target(file_path, target_column):
    try:
        df = load_data(file_path)
        if target_column not in df.columns:
            return {"error": f"Column '{target_column}' not found in dataset."}

        series = df[target_column]
        total_rows = len(df)
        missing_count = series.isnull().sum()

        # Determine type
        is_numeric = pd.api.types.is_numeric_dtype(series)
        n_unique = series.nunique()

        problem_type = "unknown"
        if is_numeric and n_unique > 20:
            problem_type = "regression"
        elif n_unique < 50: # String or few numeric values
            problem_type = "classification"
        else:
            # Heuristic fallback
            problem_type = "regression" if is_numeric else "classification"

        warnings = []
        if missing_count > 0:
            warnings.append(f"Target has {missing_count} missing values.")

        # Imbalance check for classification
        if problem_type == "classification":
            counts = series.value_counts(normalize=True)
            if counts.iloc[0] > 0.9: # Dominant class > 90%
                warnings.append(f"Highly imbalanced target: Class '{counts.index[0]}' is {counts.iloc[0]:.1%}")

        return {
            "valid": True,
            "problem_type": problem_type,
            "missing_count": int(missing_count),
            "total_rows": total_rows,
            "warnings": warnings
        }

    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(json.dumps({"error": "Usage: target_engine.py <action> <file_path> [args]"}));
        sys.exit(1)

    action = sys.argv[1]
    file_path = sys.argv[2]

    result = {}
    if action == "detect":
        result = detect_target(file_path)
    elif action == "validate":
        target_col = sys.argv[3] if len(sys.argv) > 3 else None
        if target_col:
            result = validate_target(file_path, target_col)
        else:
            result = {"error": "Target column required for validation"}
    else:
        result = {"error": f"Unknown action: {action}"}

    print(json.dumps(result))
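A similar sketch for the new target-detection engine. The `detect` and `validate` actions mirror the script's `__main__` block; the file path and the "label" column name below are placeholders for illustration only.

import json
import subprocess

# Ask the engine to guess the target column of a hypothetical data.csv.
detect = subprocess.run(
    ["python", "build/python/target_engine.py", "detect", "data.csv"],
    capture_output=True, text=True,
)
print(json.loads(detect.stdout))  # {"target_column": ..., "confidence": ..., "candidates": [...]}

# Validate a specific column and get the inferred problem type plus warnings.
validate = subprocess.run(
    ["python", "build/python/target_engine.py", "validate", "data.csv", "label"],
    capture_output=True, text=True,
)
print(json.loads(validate.stdout))  # {"valid": true, "problem_type": ..., "warnings": [...]}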
package/build/python/test_framework_adapters.py
@@ -0,0 +1,61 @@

import sys
import os
import polars as pl
import numpy as np

# Mock data creation
def create_mock_data():
    df = pl.DataFrame({
        "feature1": np.random.rand(100),
        "feature2": np.random.rand(100),
        "label": np.random.randint(0, 2, 100)
    })
    os.makedirs("test_adapters", exist_ok=True)
    df.write_parquet("test_adapters/data.parquet")
    df.write_csv("test_adapters/data.csv")
    print("Created mock data in test_adapters/")

def test_pytorch():
    print("\n--- Testing PyTorch Adapter ---")
    try:
        from framework_adapters import VesperPyTorchDataset
        import torch
        from torch.utils.data import DataLoader

        dataset = VesperPyTorchDataset("test_adapters/data.parquet", target_col="label")
        loader = DataLoader(dataset, batch_size=10, shuffle=True)

        batch = next(iter(loader))
        print(f"Loaded batch: {batch}")
        print("PASS: PyTorch DataLoader works")

    except ImportError:
        print("SKIP: PyTorch not installed")
    except Exception as e:
        print(f"FAIL: PyTorch test failed: {e}")

def test_huggingface():
    print("\n--- Testing HuggingFace Adapter ---")
    try:
        from framework_adapters import load_vesper_dataset
        ds = load_vesper_dataset("test_adapters/data.csv")
        print(f"Loaded dataset: {ds}")
        print("PASS: HuggingFace Dataset works")

    except ImportError:
        print("SKIP: HuggingFace datasets not installed")
    except Exception as e:
        print(f"FAIL: HuggingFace test failed: {e}")

if __name__ == "__main__":
    create_mock_data()
    # Add src/python to path to import adapters
    sys.path.append(os.path.join(os.getcwd(), "src", "python"))

    test_pytorch()
    test_huggingface()

    # Cleanup
    import shutil
    shutil.rmtree("test_adapters")
package/build/python/uci_adapter.py
@@ -0,0 +1,94 @@
import sys
import json
import argparse
import urllib.request
import urllib.parse
from datetime import datetime

# API Endpoint found in network inspection of UCI website
UCI_API_URL = "https://archive.ics.uci.edu/api/datasets/list"

def search_uci(query: str, limit: int = 10):
    """
    Search UCI datasets using their internal API.
    """
    try:
        # Fetch data dictionary from API
        # Only fetching first 100 to filter locally
        params = {
            "skip": 0,
            "take": 100,
            "sort": "desc",
            "orderBy": "NumHits",
            "search": query
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{UCI_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        with urllib.request.urlopen(req) as response:
            data = json.load(response)

        datasets = data.get('data', [])
        if not datasets:
            datasets = []

        results = []
        count = 0

        # We trust the API search mostly, but can do extra filtering if needed
        # The API "search" param is supported

        for ds in datasets:
            # Normalize to Vesper schema
            # API fields: id, name, abstract, numHits, area, task, dateDonated

            metadata = {
                "id": f"uci:{ds.get('id')}",
                "source": "uci",
                "name": ds.get('name'),
                "description": ds.get('abstract') or "No description available.",
                "downloads": ds.get('numHits') or 0,
                "likes": 0,
                "last_updated": ds.get('dateDonated') or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
                "quality_score": 80,
                "license": {
                    "id": "other",
                    "category": "open",
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": [t for t in [ds.get('area'), ds.get('task')] if t],
                "total_examples": ds.get('numInstances'),
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://archive.ics.uci.edu/dataset/{ds.get('id')}/{ds.get('name').replace(' ', '+')}"
            }

            results.append(metadata)
            count += 1
            if count >= limit:
                break

        return results

    except Exception as e:
        # Fallback empty or specific error
        return {"error": str(e)}

def main():
    parser = argparse.ArgumentParser(description="UCI Adapter")
    parser.add_argument("--action", required=True, choices=["search"])
    parser.add_argument("--query", required=True)
    parser.add_argument("--limit", type=int, default=10)

    args = parser.parse_args()

    if args.action == "search":
        results = search_uci(args.query, args.limit)
        # JSON dump print for stdout capture
        print(json.dumps(results))

if __name__ == "__main__":
    main()
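The UCI adapter is a standalone CLI; a minimal sketch of calling it follows. The --action/--query/--limit flags come from its argparse definition, while the query string and relative path are just examples.

import json
import subprocess

# Search UCI for datasets matching a query; the adapter prints a JSON array to stdout,
# or a {"error": ...} object if the request fails.
out = subprocess.run(
    ["python", "build/python/uci_adapter.py",
     "--action", "search", "--query", "heart disease", "--limit", "5"],
    capture_output=True, text=True,
)
results = json.loads(out.stdout)
if isinstance(results, list):
    for ds in results:
        print(ds["id"], ds["name"])  # ids are prefixed "uci:" per the normalized schema
else:
    print("adapter error:", results.get("error"))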
package/build/python/worldbank_adapter.py
@@ -0,0 +1,99 @@
import sys
import json
import argparse
import urllib.request
import urllib.parse
from datetime import datetime

# WB API for indicators (Series)
# Source 2 is World Development Indicators
WB_API_URL = "https://api.worldbank.org/v2/indicator"

def search_worldbank(query: str, limit: int = 10):
    """
    Search World Bank indicators.
    """
    try:
        # The World Bank Indicators API doesn't have a direct "search" parameter for indicators
        # that works exactly like a search engine. We fetch a page and filter by query terms.
        # Alternatively, we could use the 'qterm' on the documents API, but indicators are more tabular.

        params = {
            "format": "json",
            "per_page": 299, # Max per page to search through more indicators
            "source": 2
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{WB_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        with urllib.request.urlopen(req) as response:
            data = json.load(response)

        # WB response is [metadata, data_list]
        if len(data) < 2:
            return []

        indicators = data[1]

        results = []
        count = 0

        query_terms = query.lower().split()

        for ind in indicators:
            name = ind.get('name', '')
            source_note = ind.get('sourceNote', '')
            text = (name + " " + source_note).lower()

            # Simple keyword matching
            if all(term in text for term in query_terms):
                metadata = {
                    "id": f"wb:{ind.get('id')}",
                    "source": "worldbank",
                    "name": name,
                    "description": source_note or "No description available.",
                    "downloads": 1000, # Placeholder (high relevance for WB)
                    "likes": 100,
                    "last_updated": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
                    "quality_score": 95, # Institutional data is high quality
                    "license": {
                        "id": "cc-by-4.0",
                        "name": "Creative Commons Attribution 4.0",
                        "category": "safe",
                        "usage_restrictions": [],
                        "warnings": []
                    },
                    "tags": [ind.get('source', {}).get('value')] if ind.get('source') else [],
                    "total_examples": 0, # Time series length varies
                    "is_safe_source": True,
                    "is_structured": True,
                    "metadata_url": f"https://data.worldbank.org/indicator/{ind.get('id')}",
                    "domain": "economics"
                }

                results.append(metadata)
                count += 1
                if count >= limit:
                    break

        return results

    except Exception as e:
        return {"error": str(e)}

def main():
    parser = argparse.ArgumentParser(description="World Bank Adapter")
    parser.add_argument("--action", required=True, choices=["search"])
    parser.add_argument("--query", required=True)
    parser.add_argument("--limit", type=int, default=10)

    args = parser.parse_args()

    if args.action == "search":
        results = search_worldbank(args.query, args.limit)
        print(json.dumps(results))

if __name__ == "__main__":
    main()
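The World Bank adapter exposes the same CLI surface. A minimal sketch, assuming an example query: because the adapter filters the fetched indicator page locally, every whitespace-separated query term must appear in an indicator's name or sourceNote for it to be returned.

import json
import subprocess

# Example query; multi-word terms are AND-matched against name + sourceNote.
out = subprocess.run(
    ["python", "build/python/worldbank_adapter.py",
     "--action", "search", "--query", "gdp growth", "--limit", "5"],
    capture_output=True, text=True,
)
results = json.loads(out.stdout)
if isinstance(results, list):
    for ind in results:
        print(ind["id"], ind["name"])  # ids are prefixed "wb:" followed by the indicator code
else:
    print("adapter error:", results.get("error"))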