vesper-wizard 2.1.6 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -896,6 +896,49 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
896
896
  quality_score: qualityScore
897
897
  });
898
898
  }
899
+ else {
900
+ // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
901
+ try {
902
+ const existingMeta = metadataStore.getDataset(datasetIdForDownload);
903
+ if (!existingMeta) {
904
+ metadataStore.saveDataset({
905
+ id: datasetIdForDownload,
906
+ source: source,
907
+ name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
908
+ description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
909
+ quality_warnings: [],
910
+ downloads: 0,
911
+ likes: 0,
912
+ stars: 0,
913
+ tags: [],
914
+ last_updated: new Date().toISOString(),
915
+ task: "unknown",
916
+ domain: "unknown",
917
+ languages: [],
918
+ splits: [],
919
+ license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
920
+ quality_score: qualityScore,
921
+ download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
922
+ total_examples: 0,
923
+ is_structured: false,
924
+ has_target_column: false,
925
+ is_safe_source: true,
926
+ has_personal_data: false,
927
+ is_paywalled: false,
928
+ is_scraped_web_data: false,
929
+ uses_https: true,
930
+ has_train_split: false,
931
+ has_test_split: false,
932
+ has_validation_split: false,
933
+ description_length: 0,
934
+ has_readme: false,
935
+ });
936
+ }
937
+ }
938
+ catch (e) {
939
+ console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
940
+ }
941
+ }
899
942
  markPipelineStep("register", "running");
900
943
  update({ progress: 85, status_text: "Installing dataset into project..." });
901
944
  const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
@@ -1845,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1845
1888
  }
1846
1889
  const dataset = metadataStore.getDataset(datasetId);
1847
1890
  if (!dataset) {
1891
+ // Fallback: check the registry for local path info
1892
+ const regEntry = getRegistryEntry(datasetId);
1893
+ const regPath = regEntry?.local_path || regEntry?.path;
1894
+ if (regEntry) {
1895
+ const exists = regPath && fs.existsSync(regPath);
1896
+ return {
1897
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
1898
+ };
1899
+ }
1848
1900
  return {
1849
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
1901
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
1850
1902
  isError: true,
1851
1903
  };
1852
1904
  }
@@ -2167,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2167
2219
  const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
2168
2220
  const ext = extMap[requestedFormat] || ".feather";
2169
2221
  const safeName = toSafeDatasetPathFragment(datasetId);
2170
- const outDir = targetDir || path.join(dataRoot, "exports");
2222
+ const outDir = targetDir;
2171
2223
  if (!fs.existsSync(outDir))
2172
2224
  fs.mkdirSync(outDir, { recursive: true });
2173
2225
  const outputFile = path.join(outDir, `${safeName}${ext}`);
@@ -2203,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2203
2255
  };
2204
2256
  }
2205
2257
  }
2258
+ case "vesper_list_datasets": {
2259
+ const entries = readRegistry();
2260
+ if (entries.length === 0) {
2261
+ return {
2262
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2263
+ };
2264
+ }
2265
+ const lines = entries.map((e, i) => {
2266
+ const id = e.dataset_id || e.id || "unknown";
2267
+ const localPath = e.local_path || e.path || "unknown";
2268
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
2269
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
2270
+ });
2271
+ return {
2272
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
2273
+ };
2274
+ }
2206
2275
  case "fuse_datasets": {
2207
2276
  const rawSources = request.params.arguments?.sources;
2208
2277
  if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -21,7 +21,11 @@ export class InstallService {
21
21
  // Create target directory
22
22
  const installLabel = dataset?.name || datasetId;
23
23
  const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
- const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
24
+ // If caller specified a target dir, use it directly (don't nest under datasets/)
25
+ // Otherwise fall back to the project root's datasets/ folder
26
+ const installDir = targetDir
27
+ ? path.resolve(targetDir)
28
+ : path.join(this.projectRoot, "datasets", sanitizedName);
25
29
  if (!fs.existsSync(installDir)) {
26
30
  fs.mkdirSync(installDir, { recursive: true });
27
31
  }
@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
3
3
  import { calculateQualityScore } from "./quality.js";
4
4
  import { classifyDomain } from "./domain.js";
5
5
  import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
- import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
6
+ import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
7
7
  export class HuggingFaceScraper {
8
8
  /**
9
9
  * Bulk discovery: Fetch many datasets quickly without deep details.
@@ -21,10 +21,11 @@ export class HuggingFaceScraper {
21
21
  let processed = 0;
22
22
  try {
23
23
  const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
24
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
24
25
  for await (const ds of listDatasets({
25
26
  limit: limit,
26
27
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
27
- search: { query: hfQuery },
28
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
28
29
  ...(hfToken ? { accessToken: hfToken } : {})
29
30
  })) {
30
31
  if (results.length >= limit)
@@ -120,10 +121,11 @@ export class HuggingFaceScraper {
120
121
  }
121
122
  // Add delay between batches to avoid rate limits
122
123
  const BATCH_DELAY = hfToken ? 500 : 2000;
124
+ const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
123
125
  for await (const ds of listDatasets({
124
126
  limit: fetchLimit,
125
127
  additionalFields: ["description", "tags"],
126
- search: { query: hfQuery },
128
+ search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
127
129
  ...(hfToken ? { accessToken: hfToken } : {})
128
130
  })) {
129
131
  if (results.length >= limit)
@@ -68,11 +68,23 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
68
68
  const aliases = getLanguageAliases(intent.language);
69
69
  const datasetLanguages = dataset.languages.map(normalizeToken);
70
70
  const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
71
+ const isMultilingualIntent = intent.language === "multilingual";
71
72
  if (languageMatch) {
72
- score += 0.45;
73
+ // Check if the dataset is monolingual in the requested language vs multilingual
74
+ const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
75
+ if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
76
+ // Purely the requested language (or user wants multilingual) → full boost
77
+ score += 0.55;
78
+ }
79
+ else {
80
+ // Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
81
+ // Penalize proportionally to how many other languages are present
82
+ const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
83
+ score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
84
+ }
73
85
  }
74
86
  else if (dataset.languages.length > 0) {
75
- score -= 0.55;
87
+ score -= 0.65;
76
88
  }
77
89
  else {
78
90
  score -= 0.1;
@@ -131,6 +143,35 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
131
143
  export function buildIntentSearchQuery(intent) {
132
144
  return intent.searchQuery;
133
145
  }
146
+ /**
147
+ * Build HuggingFace-compatible filter tags from the parsed intent.
148
+ * Returns e.g. ["language:en", "task_ids:text-classification"].
149
+ */
150
+ export function buildHuggingFaceFilterTags(intent) {
151
+ const tags = [];
152
+ if (intent.language && intent.language !== "multilingual") {
153
+ const langCode = LANGUAGE_TO_CODE[intent.language];
154
+ if (langCode)
155
+ tags.push(`language:${langCode}`);
156
+ }
157
+ if (intent.task) {
158
+ tags.push(`task_ids:${intent.task}`);
159
+ }
160
+ return tags;
161
+ }
162
+ const LANGUAGE_TO_CODE = {
163
+ english: "en",
164
+ spanish: "es",
165
+ french: "fr",
166
+ german: "de",
167
+ portuguese: "pt",
168
+ chinese: "zh",
169
+ japanese: "ja",
170
+ korean: "ko",
171
+ arabic: "ar",
172
+ russian: "ru",
173
+ hindi: "hi",
174
+ };
134
175
  function buildHeuristicIntent(query, requirements) {
135
176
  const originalQuery = `${query || ""} ${requirements || ""}`.trim();
136
177
  const normalizedQuery = originalQuery.toLowerCase();
@@ -342,6 +383,20 @@ function normalizeToken(value) {
342
383
  return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
343
384
  }
344
385
  function extractRequestedRows(text) {
386
+ // Match "1 million", "2.5 billion", "500 thousand" etc.
387
+ const wordMultipliers = {
388
+ thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
389
+ mil: 1_000_000, bil: 1_000_000_000,
390
+ };
391
+ const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
392
+ const wordMatch = text.match(wordPattern);
393
+ if (wordMatch) {
394
+ const base = Number(wordMatch[1]);
395
+ const multiplier = wordMultipliers[wordMatch[2].toLowerCase()];
396
+ const value = Math.round(base * multiplier);
397
+ if (Number.isFinite(value) && value > 0)
398
+ return value;
399
+ }
345
400
  const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
346
401
  if (explicit) {
347
402
  const value = Number(explicit[1].replace(/[\s,]/g, ""));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vesper-wizard",
3
- "version": "2.1.6",
3
+ "version": "2.2.0",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",