npm - vesper-wizard - Versions diffs - 2.1.6 → 2.2.0 - Mend

vesper-wizard 2.1.6 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/build/index.js +71 -2
package/build/install/install-service.js +5 -1
package/build/metadata/scraper.js +5 -3
package/build/search/query-intent.js +57 -2
package/package.json +1 -1

package/build/index.js CHANGED Viewed

@@ -896,6 +896,49 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
             quality_score: qualityScore
         });
     }
+    else {
+        // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
+        try {
+            const existingMeta = metadataStore.getDataset(datasetIdForDownload);
+            if (!existingMeta) {
+                metadataStore.saveDataset({
+                    id: datasetIdForDownload,
+                    source: source,
+                    name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
+                    description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
+                    quality_warnings: [],
+                    downloads: 0,
+                    likes: 0,
+                    stars: 0,
+                    tags: [],
+                    last_updated: new Date().toISOString(),
+                    task: "unknown",
+                    domain: "unknown",
+                    languages: [],
+                    splits: [],
+                    license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
+                    quality_score: qualityScore,
+                    download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
+                    total_examples: 0,
+                    is_structured: false,
+                    has_target_column: false,
+                    is_safe_source: true,
+                    has_personal_data: false,
+                    is_paywalled: false,
+                    is_scraped_web_data: false,
+                    uses_https: true,
+                    has_train_split: false,
+                    has_test_split: false,
+                    has_validation_split: false,
+                    description_length: 0,
+                    has_readme: false,
+                });
+            }
+        }
+        catch (e) {
+            console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
+        }
+    }
     markPipelineStep("register", "running");
     update({ progress: 85, status_text: "Installing dataset into project..." });
     const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
@@ -1845,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 }
                 const dataset = metadataStore.getDataset(datasetId);
                 if (!dataset) {
+                    // Fallback: check the registry for local path info
+                    const regEntry = getRegistryEntry(datasetId);
+                    const regPath = regEntry?.local_path || regEntry?.path;
+                    if (regEntry) {
+                        const exists = regPath && fs.existsSync(regPath);
+                        return {
+                            content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
+                        };
+                    }
                     return {
-                        content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
+                        content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
                         isError: true,
                     };
                 }
@@ -2167,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
                     const ext = extMap[requestedFormat] || ".feather";
                     const safeName = toSafeDatasetPathFragment(datasetId);
-                    const outDir = targetDir || path.join(dataRoot, "exports");
+                    const outDir = targetDir;
                     if (!fs.existsSync(outDir))
                         fs.mkdirSync(outDir, { recursive: true });
                     const outputFile = path.join(outDir, `${safeName}${ext}`);
@@ -2203,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     };
                 }
             }
+            case "vesper_list_datasets": { // Tool branch: renders every entry returned by readRegistry() as one numbered text report
+                const entries = readRegistry(); // assumes readRegistry() returns an array of entry objects — TODO confirm
+                if (entries.length === 0) {
+                    return {
+                        content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
+                    };
+                }
+                const lines = entries.map((e, i) => {
+                    const id = e.dataset_id || e.id || "unknown"; // two key spellings are accepted; the first truthy one wins
+                    const localPath = e.local_path || e.path || "unknown"; // same fallback pattern for the path
+                    const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath); // the placeholder "unknown" is never checked on disk
+                    return `${i + 1}. **${id}**\n   Path: ${localPath}\n   Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
+                });
+                return {
+                    content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
+                };
+            }
             case "fuse_datasets": {
                 const rawSources = request.params.arguments?.sources;
                 if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {

package/build/install/install-service.js CHANGED Viewed

@@ -21,7 +21,11 @@ export class InstallService {
         // Create target directory
         const installLabel = dataset?.name || datasetId;
         const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
-        const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
+        // If caller specified a target dir, use it directly (don't nest under datasets/)
+        // Otherwise fall back to the project root's datasets/ folder
+        const installDir = targetDir
+            ? path.resolve(targetDir)
+            : path.join(this.projectRoot, "datasets", sanitizedName);
         if (!fs.existsSync(installDir)) {
             fs.mkdirSync(installDir, { recursive: true });
         }

package/build/metadata/scraper.js CHANGED Viewed

@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
 import { calculateQualityScore } from "./quality.js";
 import { classifyDomain } from "./domain.js";
 import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
-import { analyzeDatasetQuery, buildIntentSearchQuery, scoreDatasetAgainstIntent } from "../search/query-intent.js";
+import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
 export class HuggingFaceScraper {
     /**
      * Bulk discovery: Fetch many datasets quickly without deep details.
@@ -21,10 +21,11 @@ export class HuggingFaceScraper {
         let processed = 0;
         try {
             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: limit,
                 additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
-                search: { query: hfQuery },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)
@@ -120,10 +121,11 @@ export class HuggingFaceScraper {
             }
             // Add delay between batches to avoid rate limits
             const BATCH_DELAY = hfToken ? 500 : 2000;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: fetchLimit,
                 additionalFields: ["description", "tags"],
-                search: { query: hfQuery },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)

package/build/search/query-intent.js CHANGED Viewed

@@ -68,11 +68,23 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
         const aliases = getLanguageAliases(intent.language);
         const datasetLanguages = dataset.languages.map(normalizeToken);
         const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || text.includes(alias));
+        const isMultilingualIntent = intent.language === "multilingual";
         if (languageMatch) {
-            score += 0.45;
+            // Check if the dataset is monolingual in the requested language vs multilingual
+            const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
+            if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
+                // Purely the requested language (or user wants multilingual) → full boost
+                score += 0.55;
+            }
+            else {
+                // Bilingual/multilingual dataset that CONTAINS the language but isn't exclusive
+                // Penalize proportionally to how many other languages are present
+                const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
+                score += 0.1 - (ratio * 0.4); // ranges from +0.1 (mostly target lang) to -0.3 (mostly other langs)
+            }
         }
         else if (dataset.languages.length > 0) {
-            score -= 0.55;
+            score -= 0.65;
         }
         else {
             score -= 0.1;
@@ -131,6 +143,35 @@ export function scoreDatasetAgainstIntent(dataset, intent) {
 /**
  * Return the search query carried by a parsed intent.
  *
  * @param intent - Parsed intent object; only its `searchQuery` property is read.
  * @returns The value of `intent.searchQuery`, unchanged.
  */
 export function buildIntentSearchQuery(intent) {
     return intent.searchQuery;
 }
+/**
+ * Build HuggingFace-compatible filter tags from the parsed intent.
+ * Returns e.g. ["language:en", "task_ids:text-classification"].
+ *
+ * @param intent - Parsed intent; only `intent.language` and `intent.task` are read.
+ * @returns Array of tag strings (language tag first, then task tag); empty when
+ *          neither field produces a tag.
+ *
+ * NOTE(review): on the Hugging Face Hub, coarse task names such as
+ * "text-classification" are presumably published under the `task_categories:`
+ * namespace, with `task_ids:` reserved for finer-grained ids. Verify that the
+ * values stored in `intent.task` really match `task_ids:` tags, otherwise this
+ * filter may exclude every result.
+ */
+export function buildHuggingFaceFilterTags(intent) {
+    const tags = [];
+    if (intent.language && intent.language !== "multilingual") { // a "multilingual" intent adds no language filter
+        const langCode = LANGUAGE_TO_CODE[intent.language];
+        if (langCode) // languages missing from LANGUAGE_TO_CODE are skipped, not guessed
+            tags.push(`language:${langCode}`);
+    }
+    if (intent.task) {
+        tags.push(`task_ids:${intent.task}`); // task value is interpolated as-is, with no mapping or validation
+    }
+    return tags;
+}
+const LANGUAGE_TO_CODE = { // Lookup read by buildHuggingFaceFilterTags: intent.language value -> code placed after "language:"
+    english: "en", // NOTE(review): values look like ISO 639-1 two-letter codes — confirm against the Hub's language tags
+    spanish: "es",
+    french: "fr",
+    german: "de",
+    portuguese: "pt",
+    chinese: "zh",
+    japanese: "ja",
+    korean: "ko",
+    arabic: "ar",
+    russian: "ru",
+    hindi: "hi",
+}; // a language absent from this table yields no language filter tag
 function buildHeuristicIntent(query, requirements) {
     const originalQuery = `${query || ""} ${requirements || ""}`.trim();
     const normalizedQuery = originalQuery.toLowerCase();
@@ -342,6 +383,20 @@ function normalizeToken(value) {
     return value.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
 }
 function extractRequestedRows(text) {
+    // Match "1 million", "2.5 billion", "500 thousand" etc.
+    const wordMultipliers = {
+        thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
+        mil: 1_000_000, bil: 1_000_000_000,
+    };
+    const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
+    const wordMatch = text.match(wordPattern);
+    if (wordMatch) {
+        const base = Number(wordMatch[1]);
+        const multiplier = wordMultipliers[wordMatch[2].toLowerCase()];
+        const value = Math.round(base * multiplier);
+        if (Number.isFinite(value) && value > 0)
+            return value;
+    }
     const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
     if (explicit) {
         const value = Number(explicit[1].replace(/[\s,]/g, ""));

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "vesper-wizard",
-  "version": "2.1.6",
+  "version": "2.2.0",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",