@vespermcp/mcp-server 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +199 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +272 -72
- package/build/ingestion/ingestor.js +17 -16
- package/build/ingestion/kaggle-downloader.js +25 -2
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +73 -5
- package/build/quality/media-analyzer.js +74 -5
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +23 -8
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
package/build/metadata/store.js
CHANGED
@@ -17,7 +17,7 @@ export class MetadataStore {
         // Add install_path if missing
         try {
             this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
-            console.
+            console.error("[MetadataStore] Migrated: Added install_path column");
         }
         catch (e) {
             // Probably already exists
@@ -315,10 +315,25 @@ export class MetadataStore {
      * Perform database maintenance (VACUUM, ANALYZE).
      */
     optimize() {
-        console.
+        console.error("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
         this.db.exec("VACUUM");
         this.db.exec("ANALYZE");
     }
+    /**
+     * Delete all datasets from a specific source
+     */
+    deleteBySource(source) {
+        const info = this.db.prepare("DELETE FROM datasets WHERE source = ?").run(source);
+        console.error(`[MetadataStore] Deleted ${info.changes} datasets from source: ${source}`);
+        return info.changes;
+    }
+    /**
+     * Get all dataset IDs from a specific source
+     */
+    getDatasetIdsBySource(source) {
+        const rows = this.db.prepare("SELECT id FROM datasets WHERE source = ?").all(source);
+        return rows.map(r => r.id);
+    }
     close() {
         this.db.close();
     }
@@ -2,11 +2,11 @@ export class MockErrorTracker {
     exceptions = [];
     messages = [];
     captureException(error, context) {
-        console.
+        console.error(`[ErrorTracker] Exception captured: ${error.message}`);
         this.exceptions.push({ error, context });
     }
     captureMessage(message, level = "info") {
-        console.
+        console.error(`[ErrorTracker] Message captured (${level}): ${message}`);
         this.messages.push({ message, level });
     }
 }
package/build/preparation/target-detector.js
ADDED
@@ -0,0 +1,75 @@
+import { spawn } from "child_process";
+import path from "path";
+import fs from "fs";
+export class TargetDetector {
+    pythonPath = "python";
+    scriptPath;
+    constructor(buildDir = process.cwd()) {
+        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
+        const dataRoot = path.join(homeDir, ".vesper");
+        // Use same robust path resolution as other services
+        const scriptPath0 = path.resolve(dataRoot, "python", "target_engine.py");
+        const scriptPath1 = path.resolve(buildDir, "python", "target_engine.py");
+        const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "target_engine.py");
+        const scriptPath3 = path.resolve(buildDir, "..", "python", "target_engine.py");
+        if (fs.existsSync(scriptPath0)) {
+            this.scriptPath = scriptPath0;
+        }
+        else if (fs.existsSync(scriptPath1)) {
+            this.scriptPath = scriptPath1;
+        }
+        else if (fs.existsSync(scriptPath2)) {
+            this.scriptPath = scriptPath2;
+        }
+        else if (fs.existsSync(scriptPath3)) {
+            this.scriptPath = scriptPath3;
+        }
+        else {
+            this.scriptPath = scriptPath0;
+        }
+        if (process.platform === "win32") {
+            this.pythonPath = "py";
+        }
+    }
+    /**
+     * Detect probable target columns in a dataset
+     */
+    async detectTarget(filePath) {
+        return this.runPython("detect", [filePath]);
+    }
+    /**
+     * Validate a specific column as a target
+     */
+    async validateTarget(filePath, targetColumn) {
+        return this.runPython("validate", [filePath, targetColumn]);
+    }
+    async runPython(action, args) {
+        return new Promise((resolve, reject) => {
+            const childProcess = spawn(this.pythonPath, [this.scriptPath, action, ...args], {
+                env: { ...process.env, PYTHONIOENCODING: 'utf-8' }
+            });
+            let stdout = "";
+            let stderr = "";
+            childProcess.stdout.on("data", (data) => stdout += data.toString());
+            childProcess.stderr.on("data", (data) => stderr += data.toString());
+            childProcess.on("close", (code) => {
+                if (code !== 0) {
+                    reject(new Error(`Target Detector (${action}) failed: ${stderr}`));
+                    return;
+                }
+                try {
+                    const result = JSON.parse(stdout);
+                    if (result.error) {
+                        reject(new Error(result.error));
+                    }
+                    else {
+                        resolve(result);
+                    }
+                }
+                catch (e) {
+                    reject(new Error(`Failed to parse target detector output: ${stdout}\nError: ${e}`));
+                }
+            });
+        });
+    }
+}
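
The wrapper above resolves target_engine.py from a handful of candidate locations, spawns it as `python target_engine.py <action> <args...>`, and parses a single JSON object from stdout, rejecting when the object carries an `error` key. As a rough illustration only, a minimal Python sketch of an equivalent call; the script location, interpreter name, and "data/train.csv" input are assumptions, not package API.

# Hypothetical standalone call mirroring TargetDetector.runPython(); the script
# location and the dataset path are assumptions based on the wrapper above.
import json
import subprocess
from pathlib import Path

script = Path.home() / ".vesper" / "python" / "target_engine.py"
proc = subprocess.run(
    ["python", str(script), "detect", "data/train.csv"],  # action + positional args, as the wrapper passes them
    capture_output=True, text=True,
)
if proc.returncode != 0:
    raise RuntimeError(f"Target Detector (detect) failed: {proc.stderr}")
result = json.loads(proc.stdout)  # the wrapper treats stdout as one JSON document
if isinstance(result, dict) and result.get("error"):
    raise RuntimeError(result["error"])
print(result)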
package/build/python/cleaner.py
ADDED
@@ -0,0 +1,226 @@
+import sys
+import json
+import polars as pl
+import numpy as np
+
+# --- Operations Library ---
+
+def op_remove_duplicates(df, params):
+    subset = params.get("subset", None)  # List of cols or None
+    before = len(df)
+    if subset:
+        df = df.unique(subset=subset)
+    else:
+        df = df.unique()
+    return df, {"rows_removed": before - len(df)}
+
+def op_drop_columns(df, params):
+    cols = params.get("columns", [])
+    before = len(df.columns)
+    # Filter only existing cols to avoid errors
+    cols_to_drop = [c for c in cols if c in df.columns]
+    df = df.drop(cols_to_drop)
+    return df, {"columns_dropped": len(cols_to_drop)}
+
+def op_fill_missing(df, params):
+    col = params["column"]
+    method = params.get("method", "mean")  # mean, median, mode, constant
+    value = params.get("value", None)
+
+    if col not in df.columns:
+        return df, {"error": f"Column {col} not found"}
+
+    affected = df[col].null_count()
+
+    if method == "constant":
+        df = df.with_columns(pl.col(col).fill_null(value))
+    elif method == "mean":
+        mean_val = df[col].mean()
+        df = df.with_columns(pl.col(col).fill_null(mean_val))
+    elif method == "median":
+        median_val = df[col].median()
+        df = df.with_columns(pl.col(col).fill_null(median_val))
+
+    return df, {"rows_imputed": affected}
+
+def op_fix_types(df, params):
+    col = params["column"]
+    target_type = params["type"]  # "int", "float", "string", "date"
+
+    if col not in df.columns:
+        return df, {"error": f"Column {col} not found"}
+
+    try:
+        if target_type == "int":
+            df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
+        elif target_type == "float":
+            df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
+        elif target_type == "string":
+            df = df.with_columns(pl.col(col).cast(pl.Utf8))
+        elif target_type == "date":
+            df = df.with_columns(pl.col(col).str.to_date(strict=False))
+
+        return df, {"status": "Converted"}
+    except Exception as e:
+        return df, {"error": str(e)}
+
+def op_remove_outliers(df, params):
+    col = params["column"]
+    method = params.get("method", "iqr")
+    threshold = params.get("threshold", 1.5)
+
+    if col not in df.columns:
+        return df, {"error": f"Column {col} not found"}
+
+    before = len(df)
+
+    if method == "iqr":
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower = q1 - (threshold * iqr)
+        upper = q3 + (threshold * iqr)
+
+        df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
+
+    return df, {"rows_removed": before - len(df)}
+
+def op_encode_categories(df, params):
+    col = params["column"]
+    method = params.get("method", "label")  # label, onehot
+
+    if col not in df.columns:
+        return df, {"error": f"Column {col} not found"}
+
+    if method == "label":
+        # Polars dense_rank acts similar to label encoding
+        df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
+    elif method == "onehot":
+        dummies = df[col].to_dummies()
+        df = pl.concat([df, dummies], how="horizontal")
+
+    return df, {"status": f"Encoded using {method}"}
+
+# --- Registry ---
+
+OPERATIONS = {
+    "RemoveDuplicates": op_remove_duplicates,
+    "DropColumns": op_drop_columns,
+    "FillMissing": op_fill_missing,
+    "FixTypes": op_fix_types,
+    "RemoveOutliers": op_remove_outliers,
+    "EncodeCategories": op_encode_categories
+}
+
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    ops_json = sys.argv[2]
+
+    try:
+        operations = json.loads(ops_json)
+
+        # Load Data
+        file_path_lower = file_path.lower()
+        if file_path_lower.endswith(".csv"):
+            df = pl.read_csv(file_path, ignore_errors=True)
+        elif file_path_lower.endswith(".parquet"):
+            df = pl.read_parquet(file_path)
+        elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
+            # Explicit NDJSON
+            df = pl.read_ndjson(file_path)
+        elif file_path_lower.endswith(".json"):
+            # Ambiguous .json
+            try:
+                df = pl.read_json(file_path)
+            except Exception:
+                try:
+                    df = pl.read_ndjson(file_path)
+                except Exception as e:
+                    raise ValueError(f"Failed to read JSON: {str(e)}")
+        else:
+            raise ValueError(f"Unsupported format: {file_path}")
+
+        logs = []
+        total_rows_affected = 0
+
+        # Execute Pipeline
+        for op in operations:
+            op_type = op["type"]
+            params = op.get("params", {})
+
+            if op_type == "RenameTarget":
+                old_name = params.get("old_name")
+                new_name = params.get("new_name", "target")
+                if old_name and old_name in df.columns:
+                    df = df.rename({old_name: new_name})
+                    logs.append(f"Renamed column '{old_name}' to '{new_name}'")
+                else:
+                    logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
+            elif op_type in OPERATIONS:
+                try:
+                    df, stats = OPERATIONS[op_type](df, params)
+                    logs.append(f"Executed {op_type}: {stats}")
+                    total_rows_affected += stats.get("rows_removed", 0)
+                except Exception as e:
+                    logs.append(f"Failed {op_type}: {str(e)}")
+            else:
+                logs.append(f"Unknown operation: {op_type}")
+
+        # Save Result (overwrite or new file)
+        # Save Result (overwrite or new file)
+        output_format = sys.argv[3] if len(sys.argv) > 3 else None
+
+        if not output_format:
+            # Legacy logic: preserve CSV or default to parquet
+            if file_path_lower.endswith(".csv"):
+                output_format = "csv"
+            else:
+                output_format = "parquet"
+
+        base_name = file_path.rsplit(".", 1)[0]
+        if output_format == "csv":
+            output_path = f"{base_name}_cleaned.csv"
+            # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+            for col in df.columns:
+                dtype = df.schema[col]
+                # Only keep simple types; stringify everything else for CSV
+                is_simple = (
+                    dtype.is_numeric() or
+                    dtype.is_temporal() or
+                    str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                )
+                if not is_simple:
+                    # Use a robust helper for clean JSON serialization
+                    def safe_serialize(val):
+                        try:
+                            # Handle Polars nested types (convert to Python list/dict first)
+                            if hasattr(val, "to_list"):
+                                return json.dumps(val.to_list())
+                            if hasattr(val, "to_dict"):
+                                return json.dumps(val.to_dict())
+                            return json.dumps(val)
+                        except:
+                            return str(val)
+                    df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
+            df.write_csv(output_path)
+        else:
+            output_path = f"{base_name}_cleaned.parquet"
+            df.write_parquet(output_path)
+
+        print(json.dumps({
+            "success": True,
+            "output_path": output_path,
+            "rows_affected": total_rows_affected,
+            "logs": logs
+        }, default=str))
+
+    except Exception as e:
+        print(json.dumps({"success": False, "error": str(e)}, default=str))
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
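
The cleaner is driven entirely by its second CLI argument: a JSON array of operations, each with a type taken from the OPERATIONS registry above (plus the special-cased RenameTarget) and an optional params object, with an optional third argument selecting "csv" or "parquet" output. A minimal sketch of composing that payload and invoking the script; the "sales.csv" input and column names are invented for illustration.

# Sketch of driving cleaner.py directly; file and column names are illustrative only.
import json
import subprocess

operations = [
    {"type": "RemoveDuplicates"},  # no params: full-row dedupe
    {"type": "FillMissing", "params": {"column": "price", "method": "median"}},
    {"type": "RemoveOutliers", "params": {"column": "price", "threshold": 1.5}},
    {"type": "RenameTarget", "params": {"old_name": "label", "new_name": "target"}},
]

proc = subprocess.run(
    ["python", "cleaner.py", "sales.csv", json.dumps(operations), "csv"],
    capture_output=True, text=True,
)
report = json.loads(proc.stdout)
print(report["output_path"], report["rows_affected"])  # e.g. sales_cleaned.csv plus per-op logs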
package/build/python/export_engine.py
ADDED
@@ -0,0 +1,131 @@
+import sys
+import json
+import polars as pl
+import os
+
+# Optional TensorFlow import for TFRecord support
+try:
+    import tensorflow as tf
+    HAS_TENSORFLOW = True
+except ImportError:
+    HAS_TENSORFLOW = False
+
+def export_data(file_path, output_path, format, options=None):
+    options = options or {}
+
+    # Load Data
+    try:
+        if file_path.endswith(".csv"):
+            df = pl.read_csv(file_path, ignore_errors=True)
+        elif file_path.endswith(".parquet"):
+            df = pl.read_parquet(file_path)
+        else:
+            return {"error": f"Unsupported input format: {file_path}"}
+    except Exception as e:
+        return {"error": f"Failed to load input file: {str(e)}"}
+
+    output_dir = os.path.dirname(output_path)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        # Export Logic
+        if format == "csv":
+            # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+            for col in df.columns:
+                dtype = df.schema[col]
+                is_simple = (
+                    dtype.is_numeric() or
+                    dtype.is_temporal() or
+                    str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                )
+                if not is_simple:
+                    def safe_serialize(val):
+                        try:
+                            if hasattr(val, "to_list"):
+                                return json.dumps(val.to_list())
+                            if hasattr(val, "to_dict"):
+                                return json.dumps(val.to_dict())
+                            return json.dumps(val)
+                        except:
+                            return str(val)
+                    df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
+            df.write_csv(output_path)
+
+        elif format == "parquet":
+            compression = options.get("compression", "snappy")
+            df.write_parquet(output_path, compression=compression)
+
+        elif format == "jsonl":
+            df.write_ndjson(output_path)
+
+        elif format == "arrow" or format == "ipc":
+            compression = options.get("compression", "uncompressed")
+            if compression == "uncompressed": compression = None
+            df.write_ipc(output_path, compression=compression)
+
+        elif format == "tfrecord":
+            if not HAS_TENSORFLOW:
+                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
+
+            # TFRecord Export Logic (using TensorFlow)
+            with tf.io.TFRecordWriter(output_path) as writer:
+                # Convert Polars -> Pandas for iteration (simpler for now)
+                # TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
+                pdf = df.to_pandas()
+                for _, row in pdf.iterrows():
+                    feature = {}
+                    for col, value in row.items():
+                        if value is None:
+                            continue
+
+                        # Type inference for TFRecord features
+                        if isinstance(value, int):
+                            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+                        elif isinstance(value, float):
+                            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
+                        elif isinstance(value, str):
+                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
+                        elif isinstance(value, bytes):
+                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+                        else:
+                            # Fallback to string for unknown types
+                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
+
+                    example = tf.train.Example(features=tf.train.Features(feature=feature))
+                    writer.write(example.SerializeToString())
+
+        else:
+            return {"error": f"Unknown export format: {format}"}
+
+        return {
+            "success": True,
+            "output_path": output_path,
+            "rows": len(df),
+            "format": format
+        }
+
+    except Exception as e:
+        return {"error": f"Export failed: {str(e)}"}
+
+def main():
+    if len(sys.argv) < 4:
+        print(json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}), file=sys.stderr)
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+    fmt = sys.argv[3]
+
+    options = {}
+    if len(sys.argv) > 4:
+        try:
+            options = json.loads(sys.argv[4])
+        except:
+            pass
+
+    result = export_data(input_file, output_file, fmt, options)
+    print(json.dumps(result))
+
+if __name__ == "__main__":
+    main()
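
Invocation follows the same positional convention: input path, output path, format, and an optional JSON options object (only compression is currently read, by the parquet and arrow/ipc branches). A hedged sketch with made-up file names.

# Sketch of calling export_engine.py; paths are illustrative, not part of the package API.
import json
import subprocess

proc = subprocess.run(
    [
        "python", "export_engine.py",
        "dataset_cleaned.parquet",            # input must be .csv or .parquet per export_data()
        "exports/dataset.parquet",            # parent directory is created if missing
        "parquet",
        json.dumps({"compression": "zstd"}),  # optional options JSON
    ],
    capture_output=True, text=True,
)
print(json.loads(proc.stdout))  # {"success": true, "rows": ..., "format": "parquet"} or {"error": ...}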
package/build/python/framework_adapters.py
ADDED
@@ -0,0 +1,100 @@
+import os
+import json
+import warnings
+
+# --- PyTorch Adapter ---
+try:
+    import torch
+    from torch.utils.data import Dataset
+    import polars as pl
+
+    class VesperPyTorchDataset(Dataset):
+        """
+        PyTorch Dataset wrapper for Vesper exported files (Parquet/CSV/Arrow).
+        Efficiently loads data using Polars and converts to Tensors on demand.
+        """
+        def __init__(self, file_path, target_col=None, transform=None):
+            self.file_path = file_path
+            self.target_col = target_col
+            self.transform = transform
+
+            # Auto-detect format
+            if file_path.endswith(".parquet"):
+                self.df = pl.read_parquet(file_path)
+            elif file_path.endswith(".csv"):
+                self.df = pl.read_csv(file_path, ignore_errors=True)
+            elif file_path.endswith(".arrow"):
+                self.df = pl.read_ipc(file_path)
+            else:
+                raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")
+
+            self.data = self.df.to_pandas()  # Convert to pandas for easier row access in __getitem__ (Polars slice can be slow row-wise)
+
+        def __len__(self):
+            return len(self.data)
+
+        def __getitem__(self, idx):
+            row = self.data.iloc[idx]
+
+            # Simple assumption: all numeric columns except target are features
+            # In production, metadata would tell us which columns are features
+            if self.target_col and self.target_col in row:
+                y = row[self.target_col]
+                x = row.drop(self.target_col).values
+
+                # Convert to tensors
+                x = torch.tensor(x, dtype=torch.float32)
+                # Auto-detect target type (scalar vs class index)
+                if isinstance(y, (int, float)):
+                    y = torch.tensor(y, dtype=torch.float32)  # Regression/Binary
+                else:
+                    # TODO: Label encoding if string
+                    pass
+
+                sample = (x, y)
+            else:
+                # Unsupervised
+                x = torch.tensor(row.values, dtype=torch.float32)
+                sample = x
+
+            if self.transform:
+                sample = self.transform(sample)
+
+            return sample
+
+except ImportError:
+    class VesperPyTorchDataset:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("PyTorch or Polars not installed.")
+
+# --- HuggingFace Adapter ---
+try:
+    from datasets import load_dataset as hf_load_dataset
+
+    def load_vesper_dataset(file_path):
+        """
+        Loads a Vesper export into a Hugging Face Dataset.
+        Supported: Parquet, CSV, JSONL, Arrow.
+        """
+        output_format = "parquet"  # Default fallback
+        if file_path.endswith(".csv"): output_format = "csv"
+        elif file_path.endswith(".jsonl"): output_format = "json"
+        elif file_path.endswith(".arrow"): output_format = "arrow"
+
+        # 'arrow' format in HF might need custom script, but usually parquet/csv/json are native
+        if output_format == "arrow":
+            # Use pandas/polars to read then convert to HF dataset
+            import polars as pl
+            from datasets import Dataset
+            df = pl.read_ipc(file_path).to_pandas()
+            return Dataset.from_pandas(df)
+
+        return hf_load_dataset(output_format, data_files=file_path, split="train")
+
+except ImportError:
+    def load_vesper_dataset(*args, **kwargs):
+        raise ImportError("HuggingFace 'datasets' library not installed.")
+
+if __name__ == "__main__":
+    print("Vesper Framework Adapters Library")
+    print("Usage: import this module in your training script.")
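
As the __main__ banner says, the adapters are meant to be imported from a training script. A minimal sketch of both entry points, assuming framework_adapters.py is importable and that "train_cleaned.parquet" is a purely numeric export whose label column is literally named "target" (both assumptions, not package guarantees).

# Sketch: consuming a Vesper export through the adapters above.
# Assumes framework_adapters.py is on the import path and the file/column names exist.
from torch.utils.data import DataLoader
from framework_adapters import VesperPyTorchDataset, load_vesper_dataset

ds = VesperPyTorchDataset("train_cleaned.parquet", target_col="target")
loader = DataLoader(ds, batch_size=32, shuffle=True)
x_batch, y_batch = next(iter(loader))  # float32 feature batch, float32 target batch

hf_ds = load_vesper_dataset("train_cleaned.parquet")  # Hugging Face Dataset, split="train"
print(len(ds), hf_ds.column_names)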
package/build/python/github_adapter.py
ADDED
@@ -0,0 +1,106 @@
+import sys
+import json
+import argparse
+import urllib.request
+import urllib.parse
+import os
+from datetime import datetime
+
+GITHUB_API_URL = "https://api.github.com/search/repositories"
+
+def search_github(query: str, limit: int = 10):
+    """
+    Search GitHub for dataset repositories.
+    """
+    try:
+        # Construct refined query:
+        # User query + (topic:dataset OR topic:data)
+        # We also filter for repositories with > 5 stars to ensure some relevance
+        refined_query = f"{query} topic:dataset stars:>5"
+
+        params = {
+            "q": refined_query,
+            "sort": "stars",
+            "order": "desc",
+            "per_page": limit
+        }
+
+        query_string = urllib.parse.urlencode(params)
+        url = f"{GITHUB_API_URL}?{query_string}"
+
+        req = urllib.request.Request(url)
+
+        # Add User-Agent (Required by GitHub)
+        req.add_header("User-Agent", "Vesper-Dataset-Search")
+
+        # Add Authorization if token exists
+        token = os.environ.get("GITHUB_TOKEN")
+        if token:
+            req.add_header("Authorization", f"token {token}")
+
+        with urllib.request.urlopen(req) as response:
+            data = json.load(response)
+
+        items = data.get('items', [])
+        results = []
+
+        for item in items:
+            # Map GitHub fields to Vesper schema
+            # repo: owner/name
+            repo_id = item.get("full_name")
+
+            # Simple licensing map
+            license_info = item.get("license") or {}
+            license_key = license_info.get("key", "unknown")
+            license_category = "safe" if license_key in ["mit", "apache-2.0", "cc0-1.0", "bsd-3-clause"] else "unknown"
+
+            metadata = {
+                "id": f"github:{repo_id}",
+                "source": "github",
+                "name": item.get("name"),
+                "description": item.get("description") or "No description provided.",
+                "downloads": item.get("forks_count") * 10,  # Proxy estimation
+                "likes": item.get("stargazers_count"),
+                "stars": item.get("stargazers_count"),
+                "last_updated": item.get("updated_at"),
+                "quality_score": min(100, 50 + (item.get("stargazers_count", 0))),  # Rough heuristic
+                "license": {
+                    "id": license_key,
+                    "name": license_info.get("name", "Unknown"),
+                    "category": license_category,
+                    "usage_restrictions": [],
+                    "warnings": []
+                },
+                "tags": item.get("topics", []),
+                "total_examples": 0,  # Unknown without drilling deeper
+                "is_safe_source": True,  # GitHub is generally safe code, content varies
+                "is_structured": False,  # Often contains code + data
+                "metadata_url": item.get("html_url"),
+                "domain": "general"
+            }
+
+            results.append(metadata)
+
+        return results
+
+    except urllib.error.HTTPError as e:
+        if e.code == 403:
+            return {"error": "Rate limit exceeded. Set GITHUB_TOKEN environment variable."}
+        return {"error": f"HTTP Error {e.code}: {e.reason}"}
+    except Exception as e:
+        return {"error": str(e)}
+
+def main():
+    parser = argparse.ArgumentParser(description="GitHub Adapter")
+    parser.add_argument("--action", required=True, choices=["search"])
+    parser.add_argument("--query", required=True)
+    parser.add_argument("--limit", type=int, default=10)
+
+    args = parser.parse_args()
+
+    if args.action == "search":
+        results = search_github(args.query, args.limit)
+        print(json.dumps(results))
+
+if __name__ == "__main__":
+    main()
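
The adapter is a small argparse CLI that prints either a JSON list of metadata records or a JSON object with an error key, so callers need to check which shape came back. A rough sketch of invoking it; the query string is arbitrary, and setting GITHUB_TOKEN avoids the 403 rate-limit branch.

# Sketch of calling github_adapter.py and handling both result shapes.
import json
import subprocess

proc = subprocess.run(
    ["python", "github_adapter.py", "--action", "search",
     "--query", "climate temperature", "--limit", "5"],
    capture_output=True, text=True,
)
payload = json.loads(proc.stdout)
if isinstance(payload, dict) and "error" in payload:
    raise RuntimeError(payload["error"])
for record in payload:
    print(record["id"], record["stars"], record["license"]["id"])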