@vespermcp/mcp-server 1.0.4 → 1.0.6

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (52)
  1. package/README.md +6 -4
  2. package/build/cleaning/cleaner.js +27 -2
  3. package/build/cleaning/executor.js +7 -6
  4. package/build/cleaning/planner.js +16 -4
  5. package/build/config/config-manager.js +199 -0
  6. package/build/export/exporter.js +26 -2
  7. package/build/index.js +272 -72
  8. package/build/ingestion/ingestor.js +17 -16
  9. package/build/ingestion/kaggle-downloader.js +25 -2
  10. package/build/install/install-service.js +1 -1
  11. package/build/jobs/manager.js +17 -10
  12. package/build/metadata/monitoring-service.js +2 -2
  13. package/build/metadata/scraper.js +8 -8
  14. package/build/metadata/store.js +17 -2
  15. package/build/monitoring/observability.js +2 -2
  16. package/build/preparation/target-detector.js +75 -0
  17. package/build/python/cleaner.py +226 -0
  18. package/build/python/export_engine.py +131 -0
  19. package/build/python/framework_adapters.py +100 -0
  20. package/build/python/github_adapter.py +106 -0
  21. package/build/python/image_engine.py +86 -0
  22. package/build/python/media_engine.py +133 -0
  23. package/build/python/nasa_adapter.py +82 -0
  24. package/build/python/quality_engine.py +243 -0
  25. package/build/python/splitter_engine.py +283 -0
  26. package/build/python/target_engine.py +154 -0
  27. package/build/python/test_framework_adapters.py +61 -0
  28. package/build/python/uci_adapter.py +94 -0
  29. package/build/python/worldbank_adapter.py +99 -0
  30. package/build/quality/analyzer.js +40 -4
  31. package/build/quality/image-analyzer.js +73 -5
  32. package/build/quality/media-analyzer.js +74 -5
  33. package/build/scripts/cleanup-kaggle.js +41 -0
  34. package/build/scripts/repro-bug.js +37 -0
  35. package/build/scripts/repro-export-bug.js +56 -0
  36. package/build/scripts/test-mcp-v5.js +12 -11
  37. package/build/scripts/test-production-sync.js +36 -0
  38. package/build/scripts/test-target-detector.js +29 -0
  39. package/build/scripts/test-write.js +14 -0
  40. package/build/scripts/verify-integration.js +57 -0
  41. package/build/scripts/verify-priority.js +33 -0
  42. package/build/search/engine.js +13 -2
  43. package/build/search/jit-orchestrator.js +6 -40
  44. package/build/search/vector-store.js +18 -0
  45. package/build/splitting/splitter.js +27 -2
  46. package/build/tools/formatter.js +23 -8
  47. package/build/utils/downloader.js +2 -2
  48. package/build/utils/selector.js +69 -0
  49. package/package.json +8 -4
  50. package/src/python/cleaner.py +33 -3
  51. package/src/python/export_engine.py +19 -0
  52. package/src/python/target_engine.py +154 -0
package/build/metadata/store.js
@@ -17,7 +17,7 @@ export class MetadataStore {
          // Add install_path if missing
          try {
              this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
-             console.log("[MetadataStore] Migrated: Added install_path column");
+             console.error("[MetadataStore] Migrated: Added install_path column");
          }
          catch (e) {
              // Probably already exists
@@ -315,10 +315,25 @@ export class MetadataStore {
       * Perform database maintenance (VACUUM, ANALYZE).
       */
      optimize() {
-         console.log("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
+         console.error("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
          this.db.exec("VACUUM");
          this.db.exec("ANALYZE");
      }
+     /**
+      * Delete all datasets from a specific source
+      */
+     deleteBySource(source) {
+         const info = this.db.prepare("DELETE FROM datasets WHERE source = ?").run(source);
+         console.error(`[MetadataStore] Deleted ${info.changes} datasets from source: ${source}`);
+         return info.changes;
+     }
+     /**
+      * Get all dataset IDs from a specific source
+      */
+     getDatasetIdsBySource(source) {
+         const rows = this.db.prepare("SELECT id FROM datasets WHERE source = ?").all(source);
+         return rows.map(r => r.id);
+     }
      close() {
          this.db.close();
      }
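
The two new store methods are thin wrappers over source-scoped SQL against the datasets table. As an illustration only, here is a minimal Python sqlite3 sketch of the same two queries, assuming a local SQLite file named metadata.db (the actual store runs these through better-sqlite3 in Node, as shown above):

import sqlite3

# Hypothetical connection; the real MetadataStore manages its own database handle.
conn = sqlite3.connect("metadata.db")

def delete_by_source(source):
    # Mirrors MetadataStore.deleteBySource: drop every dataset row for one source.
    cur = conn.execute("DELETE FROM datasets WHERE source = ?", (source,))
    conn.commit()
    return cur.rowcount

def dataset_ids_by_source(source):
    # Mirrors MetadataStore.getDatasetIdsBySource: list the ids for one source.
    rows = conn.execute("SELECT id FROM datasets WHERE source = ?", (source,)).fetchall()
    return [r[0] for r in rows]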
@@ -2,11 +2,11 @@ export class MockErrorTracker {
      exceptions = [];
      messages = [];
      captureException(error, context) {
-         console.log(`[ErrorTracker] Exception captured: ${error.message}`);
+         console.error(`[ErrorTracker] Exception captured: ${error.message}`);
          this.exceptions.push({ error, context });
      }
      captureMessage(message, level = "info") {
-         console.log(`[ErrorTracker] Message captured (${level}): ${message}`);
+         console.error(`[ErrorTracker] Message captured (${level}): ${message}`);
          this.messages.push({ message, level });
      }
  }
package/build/preparation/target-detector.js
@@ -0,0 +1,75 @@
+ import { spawn } from "child_process";
+ import path from "path";
+ import fs from "fs";
+ export class TargetDetector {
+     pythonPath = "python";
+     scriptPath;
+     constructor(buildDir = process.cwd()) {
+         const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
+         const dataRoot = path.join(homeDir, ".vesper");
+         // Use same robust path resolution as other services
+         const scriptPath0 = path.resolve(dataRoot, "python", "target_engine.py");
+         const scriptPath1 = path.resolve(buildDir, "python", "target_engine.py");
+         const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "target_engine.py");
+         const scriptPath3 = path.resolve(buildDir, "..", "python", "target_engine.py");
+         if (fs.existsSync(scriptPath0)) {
+             this.scriptPath = scriptPath0;
+         }
+         else if (fs.existsSync(scriptPath1)) {
+             this.scriptPath = scriptPath1;
+         }
+         else if (fs.existsSync(scriptPath2)) {
+             this.scriptPath = scriptPath2;
+         }
+         else if (fs.existsSync(scriptPath3)) {
+             this.scriptPath = scriptPath3;
+         }
+         else {
+             this.scriptPath = scriptPath0;
+         }
+         if (process.platform === "win32") {
+             this.pythonPath = "py";
+         }
+     }
+     /**
+      * Detect probable target columns in a dataset
+      */
+     async detectTarget(filePath) {
+         return this.runPython("detect", [filePath]);
+     }
+     /**
+      * Validate a specific column as a target
+      */
+     async validateTarget(filePath, targetColumn) {
+         return this.runPython("validate", [filePath, targetColumn]);
+     }
+     async runPython(action, args) {
+         return new Promise((resolve, reject) => {
+             const childProcess = spawn(this.pythonPath, [this.scriptPath, action, ...args], {
+                 env: { ...process.env, PYTHONIOENCODING: 'utf-8' }
+             });
+             let stdout = "";
+             let stderr = "";
+             childProcess.stdout.on("data", (data) => stdout += data.toString());
+             childProcess.stderr.on("data", (data) => stderr += data.toString());
+             childProcess.on("close", (code) => {
+                 if (code !== 0) {
+                     reject(new Error(`Target Detector (${action}) failed: ${stderr}`));
+                     return;
+                 }
+                 try {
+                     const result = JSON.parse(stdout);
+                     if (result.error) {
+                         reject(new Error(result.error));
+                     }
+                     else {
+                         resolve(result);
+                     }
+                 }
+                 catch (e) {
+                     reject(new Error(`Failed to parse target detector output: ${stdout}\nError: ${e}`));
+                 }
+             });
+         });
+     }
+ }
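
The wrapper above implies a CLI contract for target_engine.py: the first argument is an action ("detect" or "validate"), the remaining arguments are positional, and the script prints a single JSON object to stdout (with an "error" key on failure). A minimal sketch of driving the script directly, mirroring what runPython does; the script location and train.csv are placeholders:

import json
import os
import subprocess
import sys

# Placeholder path; TargetDetector resolves the script under ~/.vesper/python first.
cmd = [sys.executable, "target_engine.py", "detect", "train.csv"]
proc = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    env={**os.environ, "PYTHONIOENCODING": "utf-8"},  # same encoding hint as the Node wrapper
)
if proc.returncode != 0:
    raise RuntimeError(f"Target Detector (detect) failed: {proc.stderr}")

result = json.loads(proc.stdout)
if isinstance(result, dict) and result.get("error"):
    raise RuntimeError(result["error"])
print(result)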
package/build/python/cleaner.py
@@ -0,0 +1,226 @@
+ import sys
+ import json
+ import polars as pl
+ import numpy as np
+
+ # --- Operations Library ---
+
+ def op_remove_duplicates(df, params):
+     subset = params.get("subset", None) # List of cols or None
+     before = len(df)
+     if subset:
+         df = df.unique(subset=subset)
+     else:
+         df = df.unique()
+     return df, {"rows_removed": before - len(df)}
+
+ def op_drop_columns(df, params):
+     cols = params.get("columns", [])
+     before = len(df.columns)
+     # Filter only existing cols to avoid errors
+     cols_to_drop = [c for c in cols if c in df.columns]
+     df = df.drop(cols_to_drop)
+     return df, {"columns_dropped": len(cols_to_drop)}
+
+ def op_fill_missing(df, params):
+     col = params["column"]
+     method = params.get("method", "mean") # mean, median, mode, constant
+     value = params.get("value", None)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     affected = df[col].null_count()
+
+     if method == "constant":
+         df = df.with_columns(pl.col(col).fill_null(value))
+     elif method == "mean":
+         mean_val = df[col].mean()
+         df = df.with_columns(pl.col(col).fill_null(mean_val))
+     elif method == "median":
+         median_val = df[col].median()
+         df = df.with_columns(pl.col(col).fill_null(median_val))
+
+     return df, {"rows_imputed": affected}
+
+ def op_fix_types(df, params):
+     col = params["column"]
+     target_type = params["type"] # "int", "float", "string", "date"
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     try:
+         if target_type == "int":
+             df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
+         elif target_type == "float":
+             df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
+         elif target_type == "string":
+             df = df.with_columns(pl.col(col).cast(pl.Utf8))
+         elif target_type == "date":
+             df = df.with_columns(pl.col(col).str.to_date(strict=False))
+
+         return df, {"status": "Converted"}
+     except Exception as e:
+         return df, {"error": str(e)}
+
+ def op_remove_outliers(df, params):
+     col = params["column"]
+     method = params.get("method", "iqr")
+     threshold = params.get("threshold", 1.5)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     before = len(df)
+
+     if method == "iqr":
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower = q1 - (threshold * iqr)
+         upper = q3 + (threshold * iqr)
+
+         df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
+
+     return df, {"rows_removed": before - len(df)}
+
+ def op_encode_categories(df, params):
+     col = params["column"]
+     method = params.get("method", "label") # label, onehot
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     if method == "label":
+         # Polars dense_rank acts similar to label encoding
+         df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
+     elif method == "onehot":
+         dummies = df[col].to_dummies()
+         df = pl.concat([df, dummies], how="horizontal")
+
+     return df, {"status": f"Encoded using {method}"}
+
+ # --- Registry ---
+
+ OPERATIONS = {
+     "RemoveDuplicates": op_remove_duplicates,
+     "DropColumns": op_drop_columns,
+     "FillMissing": op_fill_missing,
+     "FixTypes": op_fix_types,
+     "RemoveOutliers": op_remove_outliers,
+     "EncodeCategories": op_encode_categories
+ }
+
+ def main():
+     if len(sys.argv) < 3:
+         print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
+         sys.exit(1)
+
+     file_path = sys.argv[1]
+     ops_json = sys.argv[2]
+
+     try:
+         operations = json.loads(ops_json)
+
+         # Load Data
+         file_path_lower = file_path.lower()
+         if file_path_lower.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path_lower.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
+             # Explicit NDJSON
+             df = pl.read_ndjson(file_path)
+         elif file_path_lower.endswith(".json"):
+             # Ambiguous .json
+             try:
+                 df = pl.read_json(file_path)
+             except Exception:
+                 try:
+                     df = pl.read_ndjson(file_path)
+                 except Exception as e:
+                     raise ValueError(f"Failed to read JSON: {str(e)}")
+         else:
+             raise ValueError(f"Unsupported format: {file_path}")
+
+         logs = []
+         total_rows_affected = 0
+
+         # Execute Pipeline
+         for op in operations:
+             op_type = op["type"]
+             params = op.get("params", {})
+
+             if op_type == "RenameTarget":
+                 old_name = params.get("old_name")
+                 new_name = params.get("new_name", "target")
+                 if old_name and old_name in df.columns:
+                     df = df.rename({old_name: new_name})
+                     logs.append(f"Renamed column '{old_name}' to '{new_name}'")
+                 else:
+                     logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
+             elif op_type in OPERATIONS:
+                 try:
+                     df, stats = OPERATIONS[op_type](df, params)
+                     logs.append(f"Executed {op_type}: {stats}")
+                     total_rows_affected += stats.get("rows_removed", 0)
+                 except Exception as e:
+                     logs.append(f"Failed {op_type}: {str(e)}")
+             else:
+                 logs.append(f"Unknown operation: {op_type}")
+
+         # Save Result (overwrite or new file)
+         # Save Result (overwrite or new file)
+         output_format = sys.argv[3] if len(sys.argv) > 3 else None
+
+         if not output_format:
+             # Legacy logic: preserve CSV or default to parquet
+             if file_path_lower.endswith(".csv"):
+                 output_format = "csv"
+             else:
+                 output_format = "parquet"
+
+         base_name = file_path.rsplit(".", 1)[0]
+         if output_format == "csv":
+             output_path = f"{base_name}_cleaned.csv"
+             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+             for col in df.columns:
+                 dtype = df.schema[col]
+                 # Only keep simple types; stringify everything else for CSV
+                 is_simple = (
+                     dtype.is_numeric() or
+                     dtype.is_temporal() or
+                     str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                 )
+                 if not is_simple:
+                     # Use a robust helper for clean JSON serialization
+                     def safe_serialize(val):
+                         try:
+                             # Handle Polars nested types (convert to Python list/dict first)
+                             if hasattr(val, "to_list"):
+                                 return json.dumps(val.to_list())
+                             if hasattr(val, "to_dict"):
+                                 return json.dumps(val.to_dict())
+                             return json.dumps(val)
+                         except:
+                             return str(val)
+                     df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
+             df.write_csv(output_path)
+         else:
+             output_path = f"{base_name}_cleaned.parquet"
+             df.write_parquet(output_path)
+
+         print(json.dumps({
+             "success": True,
+             "output_path": output_path,
+             "rows_affected": total_rows_affected,
+             "logs": logs
+         }, default=str))
+
+     except Exception as e:
+         print(json.dumps({"success": False, "error": str(e)}, default=str))
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
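
cleaner.py is driven entirely by its argv: a data file, a JSON array of operations (each an object with "type" and "params", where "type" is either "RenameTarget" or a key in the OPERATIONS registry), and an optional output format. On success it prints a JSON report with output_path, rows_affected, and per-operation logs. A hedged sketch of invoking it the way the Node cleaning executor would; the file names are placeholders:

import json
import subprocess
import sys

# Hypothetical pipeline: dedupe, impute a numeric column, then drop an ID column.
operations = [
    {"type": "RemoveDuplicates", "params": {}},
    {"type": "FillMissing", "params": {"column": "age", "method": "median"}},
    {"type": "DropColumns", "params": {"columns": ["row_id"]}},
]

proc = subprocess.run(
    [sys.executable, "cleaner.py", "train.csv", json.dumps(operations), "csv"],
    capture_output=True,
    text=True,
)
report = json.loads(proc.stdout)
if report.get("success"):
    print(report["output_path"])   # e.g. train_cleaned.csv
    print(report["logs"])          # one entry per executed operation
else:
    print("Cleaning failed:", report.get("error"))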
package/build/python/export_engine.py
@@ -0,0 +1,131 @@
+ import sys
+ import json
+ import polars as pl
+ import os
+
+ # Optional TensorFlow import for TFRecord support
+ try:
+     import tensorflow as tf
+     HAS_TENSORFLOW = True
+ except ImportError:
+     HAS_TENSORFLOW = False
+
+ def export_data(file_path, output_path, format, options=None):
+     options = options or {}
+
+     # Load Data
+     try:
+         if file_path.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         else:
+             return {"error": f"Unsupported input format: {file_path}"}
+     except Exception as e:
+         return {"error": f"Failed to load input file: {str(e)}"}
+
+     output_dir = os.path.dirname(output_path)
+     if output_dir and not os.path.exists(output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+     try:
+         # Export Logic
+         if format == "csv":
+             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
+             for col in df.columns:
+                 dtype = df.schema[col]
+                 is_simple = (
+                     dtype.is_numeric() or
+                     dtype.is_temporal() or
+                     str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
+                 )
+                 if not is_simple:
+                     def safe_serialize(val):
+                         try:
+                             if hasattr(val, "to_list"):
+                                 return json.dumps(val.to_list())
+                             if hasattr(val, "to_dict"):
+                                 return json.dumps(val.to_dict())
+                             return json.dumps(val)
+                         except:
+                             return str(val)
+                     df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
+             df.write_csv(output_path)
+
+         elif format == "parquet":
+             compression = options.get("compression", "snappy")
+             df.write_parquet(output_path, compression=compression)
+
+         elif format == "jsonl":
+             df.write_ndjson(output_path)
+
+         elif format == "arrow" or format == "ipc":
+             compression = options.get("compression", "uncompressed")
+             if compression == "uncompressed": compression = None
+             df.write_ipc(output_path, compression=compression)
+
+         elif format == "tfrecord":
+             if not HAS_TENSORFLOW:
+                 return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
+
+             # TFRecord Export Logic (using TensorFlow)
+             with tf.io.TFRecordWriter(output_path) as writer:
+                 # Convert Polars -> Pandas for iteration (simpler for now)
+                 # TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
+                 pdf = df.to_pandas()
+                 for _, row in pdf.iterrows():
+                     feature = {}
+                     for col, value in row.items():
+                         if value is None:
+                             continue
+
+                         # Type inference for TFRecord features
+                         if isinstance(value, int):
+                             feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+                         elif isinstance(value, float):
+                             feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
+                         elif isinstance(value, str):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
+                         elif isinstance(value, bytes):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+                         else:
+                             # Fallback to string for unknown types
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
+
+                     example = tf.train.Example(features=tf.train.Features(feature=feature))
+                     writer.write(example.SerializeToString())
+
+         else:
+             return {"error": f"Unknown export format: {format}"}
+
+         return {
+             "success": True,
+             "output_path": output_path,
+             "rows": len(df),
+             "format": format
+         }
+
+     except Exception as e:
+         return {"error": f"Export failed: {str(e)}"}
+
+ def main():
+     if len(sys.argv) < 4:
+         print(json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}), file=sys.stderr)
+         sys.exit(1)
+
+     input_file = sys.argv[1]
+     output_file = sys.argv[2]
+     fmt = sys.argv[3]
+
+     options = {}
+     if len(sys.argv) > 4:
+         try:
+             options = json.loads(sys.argv[4])
+         except:
+             pass
+
+     result = export_data(input_file, output_file, fmt, options)
+     print(json.dumps(result))
+
+ if __name__ == "__main__":
+     main()
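
export_engine.py follows the same convention: positional arguments for input file, output path, and format, plus an optional JSON options string, with a single JSON result printed to stdout. A minimal invocation sketch; the paths are placeholders and the "compression" option is the one read by the parquet branch above:

import json
import subprocess
import sys

proc = subprocess.run(
    [
        sys.executable, "export_engine.py",
        "train_cleaned.csv",          # input: csv or parquet
        "exports/train.parquet",      # output path; the parent directory is created if missing
        "parquet",                    # csv | parquet | jsonl | arrow/ipc | tfrecord
        json.dumps({"compression": "zstd"}),
    ],
    capture_output=True,
    text=True,
)
result = json.loads(proc.stdout)
print(result)  # {"success": true, "output_path": ..., "rows": ..., "format": "parquet"} or {"error": ...}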
package/build/python/framework_adapters.py
@@ -0,0 +1,100 @@
+ import os
+ import json
+ import warnings
+
+ # --- PyTorch Adapter ---
+ try:
+     import torch
+     from torch.utils.data import Dataset
+     import polars as pl
+
+     class VesperPyTorchDataset(Dataset):
+         """
+         PyTorch Dataset wrapper for Vesper exported files (Parquet/CSV/Arrow).
+         Efficiently loads data using Polars and converts to Tensors on demand.
+         """
+         def __init__(self, file_path, target_col=None, transform=None):
+             self.file_path = file_path
+             self.target_col = target_col
+             self.transform = transform
+
+             # Auto-detect format
+             if file_path.endswith(".parquet"):
+                 self.df = pl.read_parquet(file_path)
+             elif file_path.endswith(".csv"):
+                 self.df = pl.read_csv(file_path, ignore_errors=True)
+             elif file_path.endswith(".arrow"):
+                 self.df = pl.read_ipc(file_path)
+             else:
+                 raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")
+
+             self.data = self.df.to_pandas() # Convert to pandas for easier row access in __getitem__ (Polars slice can be slow row-wise)
+
+         def __len__(self):
+             return len(self.data)
+
+         def __getitem__(self, idx):
+             row = self.data.iloc[idx]
+
+             # Simple assumption: all numeric columns except target are features
+             # In production, metadata would tell us which columns are features
+             if self.target_col and self.target_col in row:
+                 y = row[self.target_col]
+                 x = row.drop(self.target_col).values
+
+                 # Convert to tensors
+                 x = torch.tensor(x, dtype=torch.float32)
+                 # Auto-detect target type (scalar vs class index)
+                 if isinstance(y, (int, float)):
+                     y = torch.tensor(y, dtype=torch.float32) # Regression/Binary
+                 else:
+                     # TODO: Label encoding if string
+                     pass
+
+                 sample = (x, y)
+             else:
+                 # Unsupervised
+                 x = torch.tensor(row.values, dtype=torch.float32)
+                 sample = x
+
+             if self.transform:
+                 sample = self.transform(sample)
+
+             return sample
+
+ except ImportError:
+     class VesperPyTorchDataset:
+         def __init__(self, *args, **kwargs):
+             raise ImportError("PyTorch or Polars not installed.")
+
+ # --- HuggingFace Adapter ---
+ try:
+     from datasets import load_dataset as hf_load_dataset
+
+     def load_vesper_dataset(file_path):
+         """
+         Loads a Vesper export into a Hugging Face Dataset.
+         Supported: Parquet, CSV, JSONL, Arrow.
+         """
+         output_format = "parquet" # Default fallback
+         if file_path.endswith(".csv"): output_format = "csv"
+         elif file_path.endswith(".jsonl"): output_format = "json"
+         elif file_path.endswith(".arrow"): output_format = "arrow"
+
+         # 'arrow' format in HF might need custom script, but usually parquet/csv/json are native
+         if output_format == "arrow":
+             # Use pandas/polars to read then convert to HF dataset
+             import polars as pl
+             from datasets import Dataset
+             df = pl.read_ipc(file_path).to_pandas()
+             return Dataset.from_pandas(df)
+
+         return hf_load_dataset(output_format, data_files=file_path, split="train")
+
+ except ImportError:
+     def load_vesper_dataset(*args, **kwargs):
+         raise ImportError("HuggingFace 'datasets' library not installed.")
+
+ if __name__ == "__main__":
+     print("Vesper Framework Adapters Library")
+     print("Usage: import this module in your training script.")
package/build/python/github_adapter.py
@@ -0,0 +1,106 @@
+ import sys
+ import json
+ import argparse
+ import urllib.request
+ import urllib.parse
+ import os
+ from datetime import datetime
+
+ GITHUB_API_URL = "https://api.github.com/search/repositories"
+
+ def search_github(query: str, limit: int = 10):
+     """
+     Search GitHub for dataset repositories.
+     """
+     try:
+         # Construct refined query:
+         # User query + (topic:dataset OR topic:data)
+         # We also filter for repositories with > 5 stars to ensure some relevance
+         refined_query = f"{query} topic:dataset stars:>5"
+
+         params = {
+             "q": refined_query,
+             "sort": "stars",
+             "order": "desc",
+             "per_page": limit
+         }
+
+         query_string = urllib.parse.urlencode(params)
+         url = f"{GITHUB_API_URL}?{query_string}"
+
+         req = urllib.request.Request(url)
+
+         # Add User-Agent (Required by GitHub)
+         req.add_header("User-Agent", "Vesper-Dataset-Search")
+
+         # Add Authorization if token exists
+         token = os.environ.get("GITHUB_TOKEN")
+         if token:
+             req.add_header("Authorization", f"token {token}")
+
+         with urllib.request.urlopen(req) as response:
+             data = json.load(response)
+
+         items = data.get('items', [])
+         results = []
+
+         for item in items:
+             # Map GitHub fields to Vesper schema
+             # repo: owner/name
+             repo_id = item.get("full_name")
+
+             # Simple licensing map
+             license_info = item.get("license") or {}
+             license_key = license_info.get("key", "unknown")
+             license_category = "safe" if license_key in ["mit", "apache-2.0", "cc0-1.0", "bsd-3-clause"] else "unknown"
+
+             metadata = {
+                 "id": f"github:{repo_id}",
+                 "source": "github",
+                 "name": item.get("name"),
+                 "description": item.get("description") or "No description provided.",
+                 "downloads": item.get("forks_count") * 10, # Proxy estimation
+                 "likes": item.get("stargazers_count"),
+                 "stars": item.get("stargazers_count"),
+                 "last_updated": item.get("updated_at"),
+                 "quality_score": min(100, 50 + (item.get("stargazers_count", 0))), # Rough heuristic
+                 "license": {
+                     "id": license_key,
+                     "name": license_info.get("name", "Unknown"),
+                     "category": license_category,
+                     "usage_restrictions": [],
+                     "warnings": []
+                 },
+                 "tags": item.get("topics", []),
+                 "total_examples": 0, # Unknown without drilling deeper
+                 "is_safe_source": True, # GitHub is generally safe code, content varies
+                 "is_structured": False, # Often contains code + data
+                 "metadata_url": item.get("html_url"),
+                 "domain": "general"
+             }
+
+             results.append(metadata)
+
+         return results
+
+     except urllib.error.HTTPError as e:
+         if e.code == 403:
+             return {"error": "Rate limit exceeded. Set GITHUB_TOKEN environment variable."}
+         return {"error": f"HTTP Error {e.code}: {e.reason}"}
+     except Exception as e:
+         return {"error": str(e)}
+
+ def main():
+     parser = argparse.ArgumentParser(description="GitHub Adapter")
+     parser.add_argument("--action", required=True, choices=["search"])
+     parser.add_argument("--query", required=True)
+     parser.add_argument("--limit", type=int, default=10)
+
+     args = parser.parse_args()
+
+     if args.action == "search":
+         results = search_github(args.query, args.limit)
+         print(json.dumps(results))
+
+ if __name__ == "__main__":
+     main()
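
Like the other adapters, github_adapter.py is a standalone CLI that prints JSON to stdout; setting GITHUB_TOKEN avoids the unauthenticated rate limit that the 403 branch reports. A minimal invocation sketch; the query is an arbitrary example:

import json
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "github_adapter.py",
     "--action", "search", "--query", "air quality", "--limit", "5"],
    capture_output=True,
    text=True,
)
results = json.loads(proc.stdout)
if isinstance(results, dict) and "error" in results:
    print("Search failed:", results["error"])
else:
    for item in results:
        print(item["id"], item["stars"], item["license"]["id"])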