vesper-wizard 2.1.6 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -896,6 +896,49 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
896
896
  quality_score: qualityScore
897
897
  });
898
898
  }
899
+ else {
900
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
901
+ try {
902
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
903
+ if (!existingMeta) {
904
+ metadataStore.saveDataset({
905
+ id: datasetIdForDownload,
906
+ source: source,
907
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
908
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
909
+ quality_warnings: [],
910
+ downloads: 0,
911
+ likes: 0,
912
+ stars: 0,
913
+ tags: [],
914
+ last_updated: new Date().toISOString(),
915
+ task: "unknown",
916
+ domain: "unknown",
917
+ languages: [],
918
+ splits: [],
919
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
920
+ quality_score: qualityScore,
921
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
922
+ total_examples: 0,
923
+ is_structured: false,
924
+ has_target_column: false,
925
+ is_safe_source: true,
926
+ has_personal_data: false,
927
+ is_paywalled: false,
928
+ is_scraped_web_data: false,
929
+ uses_https: true,
930
+ has_train_split: false,
931
+ has_test_split: false,
932
+ has_validation_split: false,
933
+ description_length: 0,
934
+ has_readme: false,
935
+ });
936
+ }
937
+ }
938
+ catch (e) {
939
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
940
+ }
941
+ }
899
942
  markPipelineStep("register", "running");
900
943
  update({ progress: 85, status_text: "Installing dataset into project..." });
901
944
  const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
@@ -1309,6 +1352,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1309
1352
  properties: {},
1310
1353
  },
1311
1354
  },
1355
+ {
1356
+ name: "vesper_convert_format",
1357
+ description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
1358
+ inputSchema: {
1359
+ type: "object",
1360
+ properties: {
1361
+ file_path: {
1362
+ type: "string",
1363
+ description: "Absolute path to the input dataset file.",
1364
+ },
1365
+ target_format: {
1366
+ type: "string",
1367
+ enum: ["csv", "parquet", "json", "jsonl"],
1368
+ description: "The desired output format.",
1369
+ },
1370
+ },
1371
+ required: ["file_path", "target_format"],
1372
+ },
1373
+ },
1312
1374
  {
1313
1375
  name: "fuse_datasets",
1314
1376
  description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1741,7 +1803,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1741
1803
  max_items: maxItems,
1742
1804
  workers,
1743
1805
  image_column: imageColumn,
1744
- output_root: path.join(dataRoot, "data", "assets"),
1806
+ output_root: requestedOutputDir || process.cwd(),
1745
1807
  recipes_dir: path.join(dataRoot, "recipes"),
1746
1808
  };
1747
1809
  try {
@@ -1845,8 +1907,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1845
1907
  }
1846
1908
  const dataset = metadataStore.getDataset(datasetId);
1847
1909
  if (!dataset) {
1910
+ // Fallback: check the registry for local path info
1911
+ const regEntry = getRegistryEntry(datasetId);
1912
+ const regPath = regEntry?.local_path || regEntry?.path;
1913
+ if (regEntry) {
1914
+ const exists = regPath && fs.existsSync(regPath);
1915
+ return {
1916
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
1917
+ };
1918
+ }
1848
1919
  return {
1849
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1920
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1850
1921
  isError: true,
1851
1922
  };
1852
1923
  }
@@ -2167,7 +2238,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2167
2238
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2168
2239
  const ext = extMap[requestedFormat] || ".feather";
2169
2240
  const safeName = toSafeDatasetPathFragment(datasetId);
2170
- const outDir = targetDir || path.join(dataRoot, "exports");
2241
+ const outDir = targetDir;
2171
2242
  if (!fs.existsSync(outDir))
2172
2243
  fs.mkdirSync(outDir, { recursive: true });
2173
2244
  const outputFile = path.join(outDir, `${safeName}${ext}`);
@@ -2203,6 +2274,80 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2203
2274
  };
2204
2275
  }
2205
2276
  }
2277
+ case "vesper_list_datasets": {
2278
+ const entries = readRegistry();
2279
+ if (entries.length === 0) {
2280
+ return {
2281
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2282
+ };
2283
+ }
2284
+ const lines = entries.map((e, i) => {
2285
+ const id = e.dataset_id || e.id || "unknown";
2286
+ const localPath = e.local_path || e.path || "unknown";
2287
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2288
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
2289
+ });
2290
+ return {
2291
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
2292
+ };
2293
+ }
2294
+ case "vesper_convert_format": {
2295
+ const filePath = String(request.params.arguments?.file_path || "").trim();
2296
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
2297
+ if (!filePath) {
2298
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
2299
+ }
2300
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
2301
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2302
+ }
2303
+ if (!fs.existsSync(filePath)) {
2304
+ return {
2305
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
2306
+ isError: true,
2307
+ };
2308
+ }
2309
+ const inputExt = path.extname(filePath).toLowerCase();
2310
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
2311
+ const outputExt = extMap[targetFormat];
2312
+ if (inputExt === outputExt) {
2313
+ return {
2314
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2315
+ };
2316
+ }
2317
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
2318
+ try {
2319
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2320
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
2321
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
2322
+ if (!result.ok) {
2323
+ return {
2324
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
2325
+ isError: true,
2326
+ };
2327
+ }
2328
+ // Register converted file in the registry
2329
+ const datasetId = path.basename(outputPath, outputExt);
2330
+ try {
2331
+ upsertRegistry(datasetId, outputPath, "completed");
2332
+ }
2333
+ catch (e) {
2334
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2335
+ }
2336
+ let msg = `**Conversion complete**\n`;
2337
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2338
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
2339
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2340
+ if (result.size_mb !== undefined)
2341
+ msg += `- **Size**: ${result.size_mb} MB\n`;
2342
+ return { content: [{ type: "text", text: msg }] };
2343
+ }
2344
+ catch (error) {
2345
+ return {
2346
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
2347
+ isError: true,
2348
+ };
2349
+ }
2350
+ }
2206
2351
  case "fuse_datasets": {
2207
2352
  const rawSources = request.params.arguments?.sources;
2208
2353
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -2243,10 +2388,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2243
2388
  try {
2244
2389
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
2245
2390
  const ext = extMap[outputFormat] || ".feather";
2246
- const outDir = path.join(dataRoot, "fusion");
2391
+ const outDir = process.cwd();
2247
2392
  if (!fs.existsSync(outDir))
2248
2393
  fs.mkdirSync(outDir, { recursive: true });
2249
2394
  const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
2395
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
2250
2396
  const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
2251
2397
  strategy,
2252
2398
  join_on: joinOn,
@@ -2805,10 +2951,12 @@ async function runExportCli(args) {
2805
2951
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2806
2952
  const ext = extMap[requestedFormat] || ".parquet";
2807
2953
  const safeName = toSafeDatasetPathFragment(datasetId);
2808
- const outDir = targetDir || path.join(dataRoot, "exports");
2954
+ const outDir = targetDir || process.cwd();
2809
2955
  if (!fs.existsSync(outDir))
2810
2956
  fs.mkdirSync(outDir, { recursive: true });
2811
2957
  const outputFile = path.join(outDir, `${safeName}${ext}`);
2958
+ console.error(`[Export] Resolved output directory: ${outDir}`);
2959
+ console.error(`[Export] Output file: ${outputFile}`);
2812
2960
  const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
2813
2961
  console.log(`Export complete: ${result.output_path}`);
2814
2962
  console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
@@ -21,7 +21,12 @@ export class InstallService {
21
21
  // Create target directory
22
22
  const installLabel = dataset?.name || datasetId;
23
23
  const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
- const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
24
+ // If caller specified a target dir, use it directly
25
+ // Otherwise use the current working directory
26
+ const installDir = targetDir
27
+ ? path.resolve(targetDir)
28
+ : path.resolve(process.cwd(), sanitizedName);
29
+ console.error(`[InstallService] Resolved install directory: ${installDir}`);
25
30
  if (!fs.existsSync(installDir)) {
26
31
  fs.mkdirSync(installDir, { recursive: true });
27
32
  }
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
- import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
7
7
  export class HuggingFaceScraper {
8
8
  /**
9
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
@@ -21,10 +21,11 @@ export class HuggingFaceScraper {
21
21
  let processed = 0;
22
22
  try {
23
23
  const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
24
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
24
25
  for await (const ds of listDatasets({
25
26
  limit: limit,
26
27
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
27
- search: { query: hfQuery },
28
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
28
29
  ...(hfToken ? { accessToken: hfToken } : {})
29
30
  })) {
30
31
  if (results.length >= limit)
@@ -84,6 +85,9 @@ export class HuggingFaceScraper {
84
85
  has_readme: false,
85
86
  is_incomplete: true // Flag for Phase 2
86
87
  };
88
+ // Hard language exclusion
89
+ if (intent && shouldExcludeByLanguage(metadata, intent))
90
+ continue;
87
91
  results.push(metadata);
88
92
  }
89
93
  }
@@ -120,10 +124,11 @@ export class HuggingFaceScraper {
120
124
  }
121
125
  // Add delay between batches to avoid rate limits
122
126
  const BATCH_DELAY = hfToken ? 500 : 2000;
127
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
123
128
  for await (const ds of listDatasets({
124
129
  limit: fetchLimit,
125
130
  additionalFields: ["description", "tags"],
126
- search: { query: hfQuery },
131
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
127
132
  ...(hfToken ? { accessToken: hfToken } : {})
128
133
  })) {
129
134
  if (results.length >= limit)
@@ -300,10 +305,16 @@ export class HuggingFaceScraper {
300
305
  description_length: description.length,
301
306
  has_readme: !!(cardData.readme || cardData.readme_content)
302
307
  };
303
- if (intent) {
304
- metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
308
+ // Hard language exclusion — drop bilingual/multilingual for single-language queries
309
+ if (intent && shouldExcludeByLanguage(metadata, intent)) {
310
+ // skip — do not push
311
+ }
312
+ else {
313
+ if (intent) {
314
+ metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
315
+ }
316
+ results.push(metadata);
305
317
  }
306
- results.push(metadata);
307
318
  }
308
319
  catch (e) {
309
320
  // Track all errors for user feedback
@@ -0,0 +1,92 @@
1
+ """
2
+ Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
3
+ Usage: convert_engine.py <input_path> <output_path>
4
+ Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
5
+ """
6
+ import sys
7
+ import json
8
+ import os
9
+
10
+ try:
11
+ import polars as pl
12
+ except Exception:
13
+ print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
14
+ sys.exit(1)
15
+
16
+
17
+ def _load(src: str) -> pl.DataFrame:
18
+ ext = os.path.splitext(src)[1].lower()
19
+ if ext == ".csv":
20
+ return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
21
+ if ext in (".tsv", ".tab"):
22
+ return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
23
+ if ext in (".parquet", ".pq"):
24
+ return pl.read_parquet(src)
25
+ if ext in (".feather", ".ftr", ".arrow", ".ipc"):
26
+ return pl.read_ipc(src)
27
+ if ext in (".jsonl", ".ndjson"):
28
+ return pl.read_ndjson(src)
29
+ if ext == ".json":
30
+ raw = open(src, "r", encoding="utf-8").read().strip()
31
+ if raw.startswith("["):
32
+ return pl.read_json(src)
33
+ if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
34
+ return pl.read_ndjson(src)
35
+ obj = json.loads(raw)
36
+ if isinstance(obj, dict):
37
+ for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
38
+ if key in obj and isinstance(obj[key], list):
39
+ return pl.DataFrame(obj[key])
40
+ for v in obj.values():
41
+ if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
42
+ return pl.DataFrame(v)
43
+ return pl.read_json(src)
44
+ # Fallback: try csv
45
+ return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
46
+
47
+
48
+ def _write(df: pl.DataFrame, dst: str) -> None:
49
+ ext = os.path.splitext(dst)[1].lower()
50
+ os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
51
+ if ext in (".parquet", ".pq"):
52
+ df.write_parquet(dst)
53
+ elif ext == ".csv":
54
+ df.write_csv(dst)
55
+ elif ext == ".json":
56
+ df.write_json(dst, row_oriented=True)
57
+ elif ext in (".jsonl", ".ndjson"):
58
+ df.write_ndjson(dst)
59
+ else:
60
+ raise ValueError(f"Unsupported output format: {ext}")
61
+
62
+
63
+ def main():
64
+ if len(sys.argv) < 3:
65
+ print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
66
+ sys.exit(1)
67
+
68
+ input_path = sys.argv[1]
69
+ output_path = sys.argv[2]
70
+
71
+ if not os.path.exists(input_path):
72
+ print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
73
+ sys.exit(1)
74
+
75
+ try:
76
+ df = _load(input_path)
77
+ _write(df, output_path)
78
+ size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
79
+ print(json.dumps({
80
+ "ok": True,
81
+ "output_path": output_path,
82
+ "rows": df.height,
83
+ "columns": df.width,
84
+ "size_mb": size_mb,
85
+ }))
86
+ except Exception as e:
87
+ print(json.dumps({"ok": False, "error": str(e)}))
88
+ sys.exit(1)
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
@@ -1,5 +1,5 @@
1
1
  import { JITOrchestrator } from "./jit-orchestrator.js";
2
- import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
2
+ import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
3
3
  import fs from "fs";
4
4
  function log(msg) {
5
5
  fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -68,6 +68,12 @@ export class SearchEngine {
68
68
  // Filter: Safe only
69
69
  if (options.safeOnly && metadata.license.category === "restricted")
70
70
  continue;
71
+ // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
72
+ // when user explicitly requested a single language
73
+ if (shouldExcludeByLanguage(metadata, intent)) {
74
+ log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
75
+ continue;
76
+ }
71
77
  const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
72
78
  // Filter: Explicit Negative Keywords
73
79
  if (negativeKeywords.some(neg => text.includes(neg))) {
@@ -68,11 +68,23 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
68
68
  const aliases = getLanguageAliases(intent.language);
69
69
  const datasetLanguages = dataset.languages.map(normalizeToken);
70
70
  const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
71
+ const isMultilingualIntent = intent.language === "multilingual";
71
72
  if (languageMatch) {
72
- score += 0.45;
73
+ // Check if the dataset is monolingual in the requested language vs multilingual
74
+ const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
75
+ if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
76
+ // Purely the requested language (or user wants multilingual) → full boost
77
+ score += 0.55;
78
+ }
79
+ else {
80
+ // Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
81
+ // Penalize proportionally to how many other languages are present
82
+ const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
83
+ score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
84
+ }
73
85
  }
74
86
  else if (dataset.languages.length > 0) {
75
- score -= 0.55;
87
+ score -= 0.65;
76
88
  }
77
89
  else {
78
90
  score -= 0.1;
@@ -131,6 +143,80 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
131
143
  export function buildIntentSearchQuery(intent) {
132
144
  return intent.searchQuery;
133
145
  }
146
/**
 * Build HuggingFace-compatible filter tags from the parsed intent.
 * Returns e.g. ["language:en", "task_ids:text-classification"].
 */
export function buildHuggingFaceFilterTags(intent) {
    const filterTags = [];
    const wantsSpecificLanguage = intent.language && intent.language !== "multilingual";
    if (wantsSpecificLanguage) {
        // Only emit a language tag when we can map the name to an ISO code.
        const code = LANGUAGE_TO_CODE[intent.language];
        if (code) {
            filterTags.push(`language:${code}`);
        }
    }
    if (intent.task) {
        filterTags.push(`task_ids:${intent.task}`);
    }
    return filterTags;
}
// Full language names (as produced by intent parsing) → ISO 639-1 codes.
const LANGUAGE_TO_CODE = {
    english: "en",
    spanish: "es",
    french: "fr",
    german: "de",
    portuguese: "pt",
    chinese: "zh",
    japanese: "ja",
    korean: "ko",
    arabic: "ar",
    russian: "ru",
    hindi: "hi",
};
175
// Keywords that strongly suggest a parallel/translation corpus rather than a
// monolingual dataset.
const BILINGUAL_INDICATORS = [
    "translation", "parallel", "bilingual", "multilingual",
    "cross-lingual", "crosslingual", "machine-translation",
    "aligned", "comparable corpus",
];
/**
 * Hard-exclude a dataset when the user requests a single specific language
 * and the dataset is bilingual, multilingual, or tagged with other languages.
 *
 * @param {object} dataset - Dataset metadata; `languages`, `tags`, `name`,
 *   and `description` may be missing on incomplete records (e.g. bulk
 *   discovery rows flagged `is_incomplete`) and are treated as empty.
 * @param {object|null|undefined} intent - Parsed query intent.
 * @returns {boolean} true if the dataset should be EXCLUDED from results.
 */
export function shouldExcludeByLanguage(dataset, intent) {
    if (!intent?.language || intent.language === "multilingual")
        return false;
    const aliases = getLanguageAliases(intent.language);
    // Fix: guard against missing fields — the original threw a TypeError on
    // records without `languages`/`tags`/`name`/`description`.
    const datasetLanguages = (dataset.languages ?? []).map(normalizeToken).filter(l => l && l !== "unknown");
    // If the dataset has language tags and ANY of them are NOT the requested language, exclude
    if (datasetLanguages.length > 0) {
        const hasRequestedLang = aliases.some(a => datasetLanguages.includes(a));
        const hasOtherLangs = datasetLanguages.some(lang => !aliases.includes(lang));
        if (hasOtherLangs)
            return true; // bilingual/multilingual → exclude
        if (!hasRequestedLang)
            return true; // wrong language entirely
    }
    // Check name, description, and tags for bilingual indicators or other language names
    const text = [
        dataset.name ?? "",
        dataset.description ?? "",
        (dataset.tags ?? []).join(" "),
    ].join(" ").toLowerCase();
    // Check for bilingual/translation keywords
    if (BILINGUAL_INDICATORS.some(indicator => text.includes(indicator))) {
        return true;
    }
    // Check if the text mentions other specific languages by name
    const otherLanguageNames = Object.keys(LANGUAGE_ALIASES).filter(lang => lang !== intent.language && lang !== "multilingual");
    for (const otherLang of otherLanguageNames) {
        const otherAliases = LANGUAGE_ALIASES[otherLang];
        // Only check the full language name (not 2-letter codes which could appear in regular text)
        if (otherAliases?.[0] && text.includes(otherAliases[0])) {
            return true;
        }
    }
    return false;
}
134
220
  function buildHeuristicIntent(query, requirements) {
135
221
  const originalQuery = `${query || ""} ${requirements || ""}`.trim();
136
222
  const normalizedQuery = originalQuery.toLowerCase();
@@ -342,6 +428,20 @@ function normalizeToken(value) {
342
428
  return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
343
429
  }
344
430
  function extractRequestedRows(text) {
431
+ // Match "1 million", "2.5 billion", "500 thousand" etc.
432
+ const wordMultipliers = {
433
+ thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
434
+ mil: 1_000_000, bil: 1_000_000_000,
435
+ };
436
+ const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
437
+ const wordMatch = text.match(wordPattern);
438
+ if (wordMatch) {
439
+ const base = Number(wordMatch[1]);
440
+ const multiplier = wordMultipliers[wordMatch[2].toLowerCase()];
441
+ const value = Math.round(base * multiplier);
442
+ if (Number.isFinite(value) && value > 0)
443
+ return value;
444
+ }
345
445
  const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
346
446
  if (explicit) {
347
447
  const value = Number(explicit[1].replace(/[\s,]/g, ""));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vesper-wizard",
3
- "version": "2.1.6",
3
+ "version": "2.3.0",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -0,0 +1,92 @@
1
+ """
2
+ Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
3
+ Usage: convert_engine.py <input_path> <output_path>
4
+ Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
5
+ """
6
+ import sys
7
+ import json
8
+ import os
9
+
10
+ try:
11
+ import polars as pl
12
+ except Exception:
13
+ print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
14
+ sys.exit(1)
15
+
16
+
17
+ def _load(src: str) -> pl.DataFrame:
18
+ ext = os.path.splitext(src)[1].lower()
19
+ if ext == ".csv":
20
+ return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
21
+ if ext in (".tsv", ".tab"):
22
+ return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
23
+ if ext in (".parquet", ".pq"):
24
+ return pl.read_parquet(src)
25
+ if ext in (".feather", ".ftr", ".arrow", ".ipc"):
26
+ return pl.read_ipc(src)
27
+ if ext in (".jsonl", ".ndjson"):
28
+ return pl.read_ndjson(src)
29
+ if ext == ".json":
30
+ raw = open(src, "r", encoding="utf-8").read().strip()
31
+ if raw.startswith("["):
32
+ return pl.read_json(src)
33
+ if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
34
+ return pl.read_ndjson(src)
35
+ obj = json.loads(raw)
36
+ if isinstance(obj, dict):
37
+ for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
38
+ if key in obj and isinstance(obj[key], list):
39
+ return pl.DataFrame(obj[key])
40
+ for v in obj.values():
41
+ if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
42
+ return pl.DataFrame(v)
43
+ return pl.read_json(src)
44
+ # Fallback: try csv
45
+ return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
46
+
47
+
48
+ def _write(df: pl.DataFrame, dst: str) -> None:
49
+ ext = os.path.splitext(dst)[1].lower()
50
+ os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
51
+ if ext in (".parquet", ".pq"):
52
+ df.write_parquet(dst)
53
+ elif ext == ".csv":
54
+ df.write_csv(dst)
55
+ elif ext == ".json":
56
+ df.write_json(dst, row_oriented=True)
57
+ elif ext in (".jsonl", ".ndjson"):
58
+ df.write_ndjson(dst)
59
+ else:
60
+ raise ValueError(f"Unsupported output format: {ext}")
61
+
62
+
63
+ def main():
64
+ if len(sys.argv) < 3:
65
+ print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
66
+ sys.exit(1)
67
+
68
+ input_path = sys.argv[1]
69
+ output_path = sys.argv[2]
70
+
71
+ if not os.path.exists(input_path):
72
+ print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
73
+ sys.exit(1)
74
+
75
+ try:
76
+ df = _load(input_path)
77
+ _write(df, output_path)
78
+ size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
79
+ print(json.dumps({
80
+ "ok": True,
81
+ "output_path": output_path,
82
+ "rows": df.height,
83
+ "columns": df.width,
84
+ "size_mb": size_mb,
85
+ }))
86
+ except Exception as e:
87
+ print(json.dumps({"ok": False, "error": str(e)}))
88
+ sys.exit(1)
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()