vesper-wizard 2.2.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1352,6 +1352,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1352
1352
  properties: {},
1353
1353
  },
1354
1354
  },
1355
+ {
1356
+ name: "vesper_convert_format",
1357
+ description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
1358
+ inputSchema: {
1359
+ type: "object",
1360
+ properties: {
1361
+ file_path: {
1362
+ type: "string",
1363
+ description: "Absolute path to the input dataset file.",
1364
+ },
1365
+ target_format: {
1366
+ type: "string",
1367
+ enum: ["csv", "parquet", "json", "jsonl"],
1368
+ description: "The desired output format.",
1369
+ },
1370
+ },
1371
+ required: ["file_path", "target_format"],
1372
+ },
1373
+ },
1355
1374
  {
1356
1375
  name: "fuse_datasets",
1357
1376
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1784,7 +1803,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1784
1803
  max_items: maxItems,
1785
1804
  workers,
1786
1805
  image_column: imageColumn,
1787
- output_root: path.join(dataRoot, "data", "assets"),
1806
+ output_root: requestedOutputDir || process.cwd(),
1788
1807
  recipes_dir: path.join(dataRoot, "recipes"),
1789
1808
  };
1790
1809
  try {
@@ -2272,6 +2291,63 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2272
2291
  content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
2273
2292
  };
2274
2293
  }
2294
+ case "vesper_convert_format": {
2295
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2296
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
2297
+ if (!filePath) {
2298
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
2299
+ }
2300
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
2301
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2302
+ }
2303
+ if (!fs.existsSync(filePath)) {
2304
+ return {
2305
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
2306
+ isError: true,
2307
+ };
2308
+ }
2309
+ const inputExt = path.extname(filePath).toLowerCase();
2310
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
2311
+ const outputExt = extMap[targetFormat];
2312
+ if (inputExt === outputExt) {
2313
+ return {
2314
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2315
+ };
2316
+ }
2317
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
2318
+ try {
2319
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2320
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
2321
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
2322
+ if (!result.ok) {
2323
+ return {
2324
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
2325
+ isError: true,
2326
+ };
2327
+ }
2328
+ // Register converted file in the registry
2329
+ const datasetId = path.basename(outputPath, outputExt);
2330
+ try {
2331
+ upsertRegistry(datasetId, outputPath, "completed");
2332
+ }
2333
+ catch (e) {
2334
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2335
+ }
2336
+ let msg = `**Conversion complete**\n`;
2337
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2338
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
2339
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2340
+ if (result.size_mb !== undefined)
2341
+ msg += `- **Size**: ${result.size_mb} MB\n`;
2342
+ return { content: [{ type: "text", text: msg }] };
2343
+ }
2344
+ catch (error) {
2345
+ return {
2346
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
2347
+ isError: true,
2348
+ };
2349
+ }
2350
+ }
2275
2351
  case "fuse_datasets": {
2276
2352
  const rawSources = request.params.arguments?.sources;
2277
2353
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -2312,10 +2388,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2312
2388
  try {
2313
2389
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2314
2390
  const ext = extMap[outputFormat] || ".feather";
2315
- const outDir = path.join(dataRoot, "fusion");
2391
+ const outDir = process.cwd();
2316
2392
  if (!fs.existsSync(outDir))
2317
2393
  fs.mkdirSync(outDir, { recursive: true });
2318
2394
  const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2395
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
2319
2396
  const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2320
2397
  strategy,
2321
2398
  join_on: joinOn,
@@ -2874,10 +2951,12 @@ async function runExportCli(args) {
2874
2951
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2875
2952
  const ext = extMap[requestedFormat] || ".parquet";
2876
2953
  const safeName = toSafeDatasetPathFragment(datasetId);
2877
- const outDir = targetDir || path.join(dataRoot, "exports");
2954
+ const outDir = targetDir || process.cwd();
2878
2955
  if (!fs.existsSync(outDir))
2879
2956
  fs.mkdirSync(outDir, { recursive: true });
2880
2957
  const outputFile = path.join(outDir, `${safeName}${ext}`);
2958
+ console.error(`[Export] Resolved output directory: ${outDir}`);
2959
+ console.error(`[Export] Output file: ${outputFile}`);
2881
2960
  const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2882
2961
  console.log(`Export complete: ${result.output_path}`);
2883
2962
  console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
@@ -21,11 +21,12 @@ export class InstallService {
21
21
  // Create target directory
22
22
  const installLabel = dataset?.name || datasetId;
23
23
  const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
- // If caller specified a target dir, use it directly (don't nest under datasets/)
25
- // Otherwise fall back to the project root's datasets/ folder
24
+ // If caller specified a target dir, use it directly
25
+ // Otherwise use the current working directory
26
26
  const installDir = targetDir
27
27
  ? path.resolve(targetDir)
28
- : path.join(this.projectRoot, "datasets", sanitizedName);
28
+ : path.resolve(process.cwd(), sanitizedName);
29
+ console.error(`[InstallService] Resolved install directory: ${installDir}`);
29
30
  if (!fs.existsSync(installDir)) {
30
31
  fs.mkdirSync(installDir, { recursive: true });
31
32
  }
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
- import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
7
7
  export class HuggingFaceScraper {
8
8
  /**
9
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
@@ -85,6 +85,9 @@ export class HuggingFaceScraper {
85
85
  has_readme: false,
86
86
  is_incomplete: true // Flag for Phase 2
87
87
  };
88
+ // Hard language exclusion
89
+ if (intent && shouldExcludeByLanguage(metadata, intent))
90
+ continue;
88
91
  results.push(metadata);
89
92
  }
90
93
  }
@@ -302,10 +305,16 @@ export class HuggingFaceScraper {
302
305
  description_length: description.length,
303
306
  has_readme: !!(cardData.readme || cardData.readme_content)
304
307
  };
305
- if (intent) {
306
- metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
308
+ // Hard language exclusion — drop bilingual/multilingual for single-language queries
309
+ if (intent && shouldExcludeByLanguage(metadata, intent)) {
310
+ // skip — do not push
311
+ }
312
+ else {
313
+ if (intent) {
314
+ metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
315
+ }
316
+ results.push(metadata);
307
317
  }
308
- results.push(metadata);
309
318
  }
310
319
  catch (e) {
311
320
  // Track all errors for user feedback
@@ -0,0 +1,92 @@
1
"""
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
Usage: convert_engine.py <input_path> <output_path>
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
"""
import sys
import json
import os

try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
    sys.exit(1)


def _load(src: str) -> pl.DataFrame:
    """Load `src` into a DataFrame, dispatching on the file extension.

    Unrecognized extensions fall back to a lenient CSV parse.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # A .json file may be a record list, NDJSON mislabeled as .json,
        # or a wrapper object holding the records under a common key.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)


def _write(df: pl.DataFrame, dst: str) -> None:
    """Write `df` to `dst`, dispatching on the target extension.

    Raises ValueError for unsupported output extensions.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        # BUG FIX: requirements pin polars==1.2.0, and polars >= 1.0 removed
        # the `row_oriented` kwarg from write_json (it now always writes
        # row-oriented) — so `df.write_json(dst, row_oriented=True)` raised
        # TypeError on every JSON conversion. Emit row-oriented JSON in a
        # version-independent way instead; default=str stringifies values
        # (e.g. dates) that the json module cannot serialize natively.
        with open(dst, "w", encoding="utf-8") as fh:
            json.dump(df.to_dicts(), fh, ensure_ascii=False, default=str)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")


def main():
    """CLI entry point: convert argv[1] to argv[2], print a JSON status line."""
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        df = _load(input_path)
        _write(df, output_path)
        size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": size_mb,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -1,5 +1,5 @@
1
1
  import { JITOrchestrator } from "./jit-orchestrator.js";
2
- import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
2
+ import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
3
3
  import fs from "fs";
4
4
  function log(msg) {
5
5
  fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -68,6 +68,12 @@ export class SearchEngine {
68
68
  // Filter: Safe only
69
69
  if (options.safeOnly && metadata.license.category === "restricted")
70
70
  continue;
71
+ // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
72
+ // when user explicitly requested a single language
73
+ if (shouldExcludeByLanguage(metadata, intent)) {
74
+ log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
75
+ continue;
76
+ }
71
77
  const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
72
78
  // Filter: Explicit Negative Keywords
73
79
  if (negativeKeywords.some(neg => text.includes(neg))) {
@@ -172,6 +172,51 @@ const LANGUAGE_TO_CODE = {
172
172
  russian: "ru",
173
173
  hindi: "hi",
174
174
  };
175
const BILINGUAL_INDICATORS = [
    "translation", "parallel", "bilingual", "multilingual",
    "cross-lingual", "crosslingual", "machine-translation",
    "aligned", "comparable corpus",
];
/**
 * Hard-exclude a dataset when the user requests a single specific language
 * and the dataset is bilingual, multilingual, or tagged with other languages.
 * @param {object} dataset - Dataset metadata; `languages`, `tags`, `name`,
 *   `description` may each be missing on sparse scraper results.
 * @param {object} [intent] - Parsed query intent with a `language` field.
 * @returns {boolean} true if the dataset should be EXCLUDED from results.
 */
export function shouldExcludeByLanguage(dataset, intent) {
    if (!intent?.language || intent.language === "multilingual")
        return false;
    const aliases = getLanguageAliases(intent.language);
    // BUG FIX: guard missing fields — `dataset.languages.map(...)` threw
    // TypeError whenever a scraper result omitted `languages`.
    const datasetLanguages = (dataset.languages ?? []).map(normalizeToken).filter(l => l && l !== "unknown");
    // If the dataset has language tags and ANY of them are NOT the requested language, exclude
    if (datasetLanguages.length > 0) {
        const hasRequestedLang = aliases.some(a => datasetLanguages.includes(a));
        const hasOtherLangs = datasetLanguages.some(lang => !aliases.includes(lang));
        if (hasOtherLangs)
            return true; // bilingual/multilingual → exclude
        if (!hasRequestedLang)
            return true; // wrong language entirely
    }
    // Check name, description, and tags for bilingual indicators or other
    // language names. Coalesce missing fields so the literal string
    // "undefined" never leaks into the scanned text.
    const text = [
        dataset.name ?? "",
        dataset.description ?? "",
        (dataset.tags ?? []).join(" "),
    ].join(" ").toLowerCase();
    // Check for bilingual/translation keywords
    if (BILINGUAL_INDICATORS.some(indicator => text.includes(indicator))) {
        return true;
    }
    // Check if the text mentions other specific languages by name
    const otherLanguageNames = Object.keys(LANGUAGE_ALIASES).filter(lang => lang !== intent.language && lang !== "multilingual");
    for (const otherLang of otherLanguageNames) {
        const otherAliases = LANGUAGE_ALIASES[otherLang];
        // Only check the full language name (not 2-letter codes which could appear in regular text)
        if (otherAliases && otherAliases[0] && text.includes(otherAliases[0])) {
            return true;
        }
    }
    return false;
}
175
220
  function buildHeuristicIntent(query, requirements) {
176
221
  const originalQuery = `${query || ""} ${requirements || ""}`.trim();
177
222
  const normalizedQuery = originalQuery.toLowerCase();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vesper-wizard",
3
- "version": "2.2.0",
3
+ "version": "2.3.1",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -43,7 +43,7 @@ const managedVenvDir = path.join(vesperDataDir, '.venv');
43
43
  const managedPython = process.platform === 'win32'
44
44
  ? path.join(managedVenvDir, 'Scripts', 'python.exe')
45
45
  : path.join(managedVenvDir, 'bin', 'python');
46
- const requirementsPath = path.resolve(__dirname, '..', 'requirements.txt');
46
+ const requirementsPath = path.resolve(__dirname, '..', 'src', 'python', 'requirements.txt');
47
47
 
48
48
  // 2. Create data directories
49
49
  const dirs = [
@@ -62,7 +62,7 @@ const VESPER_API_URL = process.env.VESPER_API_URL || '';
62
62
  const DEFAULT_VESPER_API_CANDIDATES = [
63
63
  'http://localhost:3000',
64
64
  'http://127.0.0.1:3000',
65
- 'https://vesper.dev',
65
+ 'https://getvesper.dev',
66
66
  ];
67
67
 
68
68
  // ── Device Auth Helpers ──────────────────────────────────────
package/scripts/wizard.js CHANGED
@@ -62,7 +62,7 @@ const VESPER_API_URL = process.env.VESPER_API_URL || '';
62
62
  const DEFAULT_VESPER_API_CANDIDATES = [
63
63
  'http://localhost:3000',
64
64
  'http://127.0.0.1:3000',
65
- 'https://vesper.dev',
65
+ 'https://getvesper.dev',
66
66
  ];
67
67
 
68
68
  // ── Device Auth Helpers ──────────────────────────────────────
@@ -0,0 +1,92 @@
1
"""
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
Usage: convert_engine.py <input_path> <output_path>
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
"""
import sys
import json
import os

try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
    sys.exit(1)


def _load(src: str) -> pl.DataFrame:
    """Load `src` into a DataFrame, dispatching on the file extension.

    Unrecognized extensions fall back to a lenient CSV parse.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # A .json file may be a record list, NDJSON mislabeled as .json,
        # or a wrapper object holding the records under a common key.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)


def _write(df: pl.DataFrame, dst: str) -> None:
    """Write `df` to `dst`, dispatching on the target extension.

    Raises ValueError for unsupported output extensions.
    """
    ext = os.path.splitext(dst)[1].lower()
    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    if ext in (".parquet", ".pq"):
        df.write_parquet(dst)
    elif ext == ".csv":
        df.write_csv(dst)
    elif ext == ".json":
        # BUG FIX: requirements pin polars==1.2.0, and polars >= 1.0 removed
        # the `row_oriented` kwarg from write_json (it now always writes
        # row-oriented) — so `df.write_json(dst, row_oriented=True)` raised
        # TypeError on every JSON conversion. Emit row-oriented JSON in a
        # version-independent way instead; default=str stringifies values
        # (e.g. dates) that the json module cannot serialize natively.
        with open(dst, "w", encoding="utf-8") as fh:
            json.dump(df.to_dicts(), fh, ensure_ascii=False, default=str)
    elif ext in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
    else:
        raise ValueError(f"Unsupported output format: {ext}")


def main():
    """CLI entry point: convert argv[1] to argv[2], print a JSON status line."""
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        df = _load(input_path)
        _write(df, output_path)
        size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
        print(json.dumps({
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": size_mb,
        }))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
+ polars==1.2.0
2
+ pandas==2.2.0
3
+ numpy==1.26.0
4
+ scikit-learn==1.4.0
5
+ # Optional source/download extras:
6
+ kaggle>=1.6.17
7
+ aiohttp>=3.9.0
8
+ aiofiles>=24.1.0
9
+ datasets>=2.20.0
10
+ webdataset>=0.2.86
11
+ # Optional for secure key storage (preferred over file fallback):
12
+ # keyring>=24.0.0