npm - vesper-wizard - Versions diffs - 2.1.5 → 2.2.0 - Mend

vesper-wizard 2.1.5 → 2.2.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (11)

package/build/export/exporter.js +22 -9
package/build/gateway/unified-dataset-gateway.js +2 -1
package/build/index.js +138 -17
package/build/install/install-service.js +5 -1
package/build/metadata/scraper.js +26 -7
package/build/search/engine.js +9 -5
package/build/search/jit-orchestrator.js +18 -14
package/build/search/query-intent.js +464 -0
package/build/utils/python-runtime.js +130 -0
package/package.json +1 -1
package/scripts/postinstall.cjs +74 -32

package/build/export/exporter.js CHANGED Viewed

@@ -1,10 +1,12 @@
 import { spawn } from "child_process";
 import path from "path";
 import fs from "fs";
+import { ensurePythonPackages, resolvePythonCommand } from "../utils/python-runtime.js";
 export class DataExporter {
-    pythonPath = "python";
+    buildDir;
     scriptPath;
     constructor(buildDir = process.cwd()) {
+        this.buildDir = buildDir;
         const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
         const dataRoot = path.join(homeDir, ".vesper");
         const scriptPath0 = path.resolve(dataRoot, "python", "export_engine.py");
@@ -26,27 +28,38 @@ export class DataExporter {
         else {
             this.scriptPath = scriptPath0;
         }
-        // Detect Python command
-        if (process.platform === "win32") {
-            this.pythonPath = "py";
-        }
     }
     /**
      * Exports a dataset file to a specified format
      */
     async export(inputFile, outputFile, format, options = {}) {
+        const pythonRequirements = [
+            { module: "polars", packageName: "polars" },
+        ];
+        if (format === "feather") {
+            pythonRequirements.push({ module: "pyarrow", packageName: "pyarrow" });
+        }
+        if (format === "tfrecord") {
+            pythonRequirements.push({ module: "tensorflow", packageName: "tensorflow" });
+        }
+        const pythonPath = await ensurePythonPackages(this.buildDir, pythonRequirements).catch(() => resolvePythonCommand(this.buildDir));
         return new Promise((resolve, reject) => {
             if (!fs.existsSync(inputFile)) {
                 reject(new Error(`Input file not found: ${inputFile}`));
                 return;
             }
             const args = [this.scriptPath, inputFile, outputFile, format, JSON.stringify(options)];
-            const process = spawn(this.pythonPath, args);
+            const childProcess = spawn(pythonPath, args, {
+                env: {
+                    ...process.env,
+                    PYTHONIOENCODING: "utf-8",
+                },
+            });
             let stdout = "";
             let stderr = "";
-            process.stdout.on("data", (data) => stdout += data.toString());
-            process.stderr.on("data", (data) => stderr += data.toString());
-            process.on("close", (code) => {
+            childProcess.stdout.on("data", (data) => stdout += data.toString());
+            childProcess.stderr.on("data", (data) => stderr += data.toString());
+            childProcess.on("close", (code) => {
                 if (code !== 0) {
                     reject(new Error(`Export failed: ${stderr || stdout}`));
                     return;

package/build/gateway/unified-dataset-gateway.js CHANGED Viewed

@@ -3,6 +3,7 @@ import path from "path";
 import http from "http";
 import https from "https";
 import { HuggingFaceScraper } from "../metadata/scraper.js";
+import { analyzeDatasetQuery } from "../search/query-intent.js";
 export class UnifiedDatasetGateway {
     deps;
     constructor(deps) {
@@ -236,7 +237,7 @@ export class UnifiedDatasetGateway {
     async discoverFromSource(source, query, limit) {
         switch (source) {
             case "huggingface":
-                return await new HuggingFaceScraper().scrape(limit, true, query);
+                return await new HuggingFaceScraper().scrape(limit, true, await analyzeDatasetQuery(query));
             case "openml":
                 return await this.deps.openmlSource.discover(query, limit);
             case "kaggle":

package/build/index.js CHANGED Viewed

@@ -361,6 +361,21 @@ function extractRequestedRows(query, requirements) {
         if (Number.isFinite(n) && n > 0)
             return n;
     }
+    const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
+        .map(m => Number(m[0].replace(/,/g, "")))
+        .filter(n => Number.isFinite(n) && n > 0);
+    if (commaNumbers.length > 0)
+        return Math.max(...commaNumbers);
+    const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
+        .map(m => {
+        const base = Number(m[1]);
+        const suffix = m[2].toLowerCase();
+        const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
+        return Math.round(base * multiplier);
+    })
+        .filter(n => Number.isFinite(n) && n > 0);
+    if (humanSized.length > 0)
+        return Math.max(...humanSized);
     const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
         .map(m => Number(m[0]))
         .filter(n => Number.isFinite(n) && n > 0);
@@ -644,7 +659,7 @@ jobManager.on("processJob", async (job, execute) => {
         console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
         const metadata = job.metadata ? JSON.parse(job.metadata) : {};
         switch (job.type) {
-            case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
+            case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
             case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
             default: throw new Error(`Unhandled job type: ${job.type}`);
         }
@@ -662,7 +677,7 @@ jobManager.on("processJob", async (job, execute) => {
 /**
  * Logic for preparing a dataset (Search + Ingest + Process)
  */
-async function handlePrepareJob(jobId, query, requirements) {
+async function handlePrepareJob(jobId, query, requirements, outputDir) {
     hydrateExternalKeys();
     const update = (updates) => jobManager.updateJob(jobId, updates);
     const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
@@ -689,6 +704,7 @@ async function handlePrepareJob(jobId, query, requirements) {
         // Continue anyway - direct file downloads may still work without datasets lib
     }
     const requestedRows = extractRequestedRows(query, requirements);
+    const searchQuery = requirements ? `${query} ${requirements}` : query;
     let selectedDataset;
     let datasetIdForDownload = "";
     let source;
@@ -729,7 +745,7 @@ async function handlePrepareJob(jobId, query, requirements) {
     else {
         markPipelineStep("search", "running");
         update({ progress: 10, status_text: "Searching for best dataset matching query..." });
-        const results = await searchEngine.search(query, { limit: 10 });
+        const results = await searchEngine.search(searchQuery, { limit: 10 });
         if (results.length === 0) {
             markPipelineStep("search", "failed");
             throw new Error("No datasets found matching the query. Try refining your search terms.");
@@ -777,7 +793,7 @@ async function handlePrepareJob(jobId, query, requirements) {
         let currentRows = await countRows(rawFilePath);
         if (currentRows < requestedRows) {
             update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
-            const additional = await searchEngine.search(query, { limit: 8 });
+            const additional = await searchEngine.search(searchQuery, { limit: 8 });
             const sourceFiles = [rawFilePath];
             let totalRows = currentRows;
             for (const ds of additional) {
@@ -880,9 +896,52 @@ async function handlePrepareJob(jobId, query, requirements) {
             quality_score: qualityScore
         });
     }
+    else {
+        // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
+        try {
+            const existingMeta = metadataStore.getDataset(datasetIdForDownload);
+            if (!existingMeta) {
+                metadataStore.saveDataset({
+                    id: datasetIdForDownload,
+                    source: source,
+                    name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
+                    description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
+                    quality_warnings: [],
+                    downloads: 0,
+                    likes: 0,
+                    stars: 0,
+                    tags: [],
+                    last_updated: new Date().toISOString(),
+                    task: "unknown",
+                    domain: "unknown",
+                    languages: [],
+                    splits: [],
+                    license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
+                    quality_score: qualityScore,
+                    download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
+                    total_examples: 0,
+                    is_structured: false,
+                    has_target_column: false,
+                    is_safe_source: true,
+                    has_personal_data: false,
+                    is_paywalled: false,
+                    is_scraped_web_data: false,
+                    uses_https: true,
+                    has_train_split: false,
+                    has_test_split: false,
+                    has_validation_split: false,
+                    description_length: 0,
+                    has_readme: false,
+                });
+            }
+        }
+        catch (e) {
+            console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
+        }
+    }
     markPipelineStep("register", "running");
     update({ progress: 85, status_text: "Installing dataset into project..." });
-    const installPath = await installService.install(datasetIdForDownload, rawFilePath);
+    const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
     update({ progress: 100, status_text: "Preparation complete!" });
     // Register prepared dataset in local registry for lookup by export/list tools
     try {
@@ -1013,7 +1072,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                         },
                         target_dir: {
                             type: "string",
-                            description: "Optional output directory for operation='download'.",
+                            description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
+                        },
+                        output_dir: {
+                            type: "string",
+                            description: "Alias for target_dir. Defaults to the current working directory when omitted.",
                         },
                         public_only: {
                             type: "boolean",
@@ -1052,7 +1115,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
             },
             {
                 name: "download_dataset",
-                description: "Download a dataset by source and ID/slug into local Vesper storage. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
+                description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
                 inputSchema: {
                     type: "object",
                     properties: {
@@ -1067,7 +1130,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                         },
                         target_dir: {
                             type: "string",
-                            description: "Optional target directory for downloaded files.",
+                            description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
+                        },
+                        output_dir: {
+                            type: "string",
+                            description: "Alias for target_dir. Defaults to the current working directory when omitted.",
                         }
                     },
                     required: ["dataset_id"],
@@ -1194,6 +1261,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                     properties: {
                         query: { type: "string" },
                         requirements: { type: "string" },
+                        target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
+                        output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
                         download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
                         cleaning_options: { type: "object" },
                         split_config: { type: "object" },
@@ -1238,7 +1307,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                         },
                         target_dir: {
                             type: "string",
-                            description: "Optional custom local directory for export (e.g., './naruto-quotes').",
+                            description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
+                        },
+                        output_dir: {
+                            type: "string",
+                            description: "Alias for target_dir. Defaults to the current working directory when omitted.",
                         },
                         format: {
                             type: "string",
@@ -1425,7 +1498,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     if (tool === "vesper_export" && req === "split") {
                         // Auto-trigger prepare_dataset (start a background prepare job)
                         try {
-                            jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false });
+                            jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
                             // Mark split as complete so export can proceed; export handler will also wait for data if needed.
                             markStepComplete(String(datasetId), "split");
                         }
@@ -1481,6 +1554,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                         if (!datasetId) {
                             throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
                         }
+                        const requestedTargetDir = request.params.arguments?.target_dir
+                            ? String(request.params.arguments.target_dir).trim()
+                            : request.params.arguments?.output_dir
+                                ? String(request.params.arguments.output_dir).trim()
+                                : "";
+                        const targetDir = requestedTargetDir || process.cwd();
                         try {
                             await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
                         }
@@ -1490,7 +1569,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                         const result = await unifiedDatasetGateway.download({
                             datasetId,
                             source,
-                            targetDir: request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined,
+                            targetDir,
                         });
                         try {
                             upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
@@ -1597,7 +1676,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 hydrateExternalKeys();
                 const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
                 const datasetId = String(request.params.arguments?.dataset_id || "").trim();
-                const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments.target_dir) : undefined;
+                const requestedTargetDir = request.params.arguments?.target_dir
+                    ? String(request.params.arguments.target_dir).trim()
+                    : request.params.arguments?.output_dir
+                        ? String(request.params.arguments.output_dir).trim()
+                        : "";
+                const targetDir = requestedTargetDir || process.cwd();
                 if (!datasetId) {
                     throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
                 }
@@ -1804,8 +1888,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 }
                 const dataset = metadataStore.getDataset(datasetId);
                 if (!dataset) {
+                    // Fallback: check the registry for local path info
+                    const regEntry = getRegistryEntry(datasetId);
+                    const regPath = regEntry?.local_path || regEntry?.path;
+                    if (regEntry) {
+                        const exists = regPath && fs.existsSync(regPath);
+                        return {
+                            content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
+                        };
+                    }
                     return {
-                        content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}` }],
+                        content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
                         isError: true,
                     };
                 }
@@ -1975,10 +2068,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 const query = String(request.params.arguments?.query);
                 const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
                 const downloadImages = request.params.arguments?.download_images === true;
+                const requestedOutputDir = request.params.arguments?.target_dir
+                    ? String(request.params.arguments.target_dir).trim()
+                    : request.params.arguments?.output_dir
+                        ? String(request.params.arguments.output_dir).trim()
+                        : "";
+                const outputDir = requestedOutputDir || process.cwd();
                 if (!query || query === "undefined") {
                     throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
                 }
-                const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
+                const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
                 return {
                     content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
                 };
@@ -2019,7 +2118,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             }
             case "export_dataset": {
                 const datasetId = String(request.params.arguments?.dataset_id);
-                const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
+                const requestedTargetDir = request.params.arguments?.target_dir
+                    ? String(request.params.arguments?.target_dir).trim()
+                    : request.params.arguments?.output_dir
+                        ? String(request.params.arguments?.output_dir).trim()
+                        : "";
+                const targetDir = requestedTargetDir || process.cwd();
                 const requestedFormat = String(request.params.arguments?.format || "feather");
                 const fastMode = request.params.arguments?.fast === true;
                 const preview = request.params.arguments?.preview === true;
@@ -2032,7 +2136,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
                     // Start a prepare job for this dataset id (acts like calling prepare_dataset)
                     try {
-                        jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false });
+                        jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
                     }
                     catch (e) {
                         console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
@@ -2115,7 +2219,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
                     const ext = extMap[requestedFormat] || ".feather";
                     const safeName = toSafeDatasetPathFragment(datasetId);
-                    const outDir = targetDir || path.join(dataRoot, "exports");
+                    const outDir = targetDir;
                     if (!fs.existsSync(outDir))
                         fs.mkdirSync(outDir, { recursive: true });
                     const outputFile = path.join(outDir, `${safeName}${ext}`);
@@ -2151,6 +2255,23 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     };
                 }
             }
+            case "vesper_list_datasets": {
+                const entries = readRegistry();
+                if (entries.length === 0) {
+                    return {
+                        content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
+                    };
+                }
+                const lines = entries.map((e, i) => {
+                    const id = e.dataset_id || e.id || "unknown";
+                    const localPath = e.local_path || e.path || "unknown";
+                    const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
+                    return `${i + 1}. **${id}**\n   Path: ${localPath}\n   Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
+                });
+                return {
+                    content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
+                };
+            }
             case "fuse_datasets": {
                 const rawSources = request.params.arguments?.sources;
                 if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {

package/build/install/install-service.js CHANGED Viewed

@@ -21,7 +21,11 @@ export class InstallService {
         // Create target directory
         const installLabel = dataset?.name || datasetId;
         const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
-        const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
+        // If caller specified a target dir, use it directly (don't nest under datasets/)
+        // Otherwise fall back to the project root's datasets/ folder
+        const installDir = targetDir
+            ? path.resolve(targetDir)
+            : path.join(this.projectRoot, "datasets", sanitizedName);
         if (!fs.existsSync(installDir)) {
             fs.mkdirSync(installDir, { recursive: true });
         }

package/build/metadata/scraper.js CHANGED Viewed

@@ -3,22 +3,29 @@ import { categorizeLicense } from "./license.js";
 import { calculateQualityScore } from "./quality.js";
 import { classifyDomain } from "./domain.js";
 import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
+import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
 export class HuggingFaceScraper {
     /**
      * Bulk discovery: Fetch many datasets quickly without deep details.
      * Hits the 25k target in minutes.
      */
-    async scrapeBulk(limit = 1000, query) {
+    async scrapeBulk(limit = 1000, queryOrIntent) {
+        const intent = typeof queryOrIntent === "string"
+            ? await analyzeDatasetQuery(queryOrIntent)
+            : queryOrIntent;
+        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
+        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
         const filterMsg = query ? `, query: ${query}` : "";
         console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
         const results = [];
         let processed = 0;
         try {
             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: limit,
                 additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
-                search: { query: query },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)
@@ -86,8 +93,12 @@ export class HuggingFaceScraper {
         }
         return results;
     }
-    async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
-    ) {
+    async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
+        const intent = typeof queryOrIntent === "string"
+            ? await analyzeDatasetQuery(queryOrIntent)
+            : queryOrIntent;
+        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
+        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
         const filterMsg = query ? `, query: ${query}` : "";
         console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
         const results = [];
@@ -110,10 +121,11 @@ export class HuggingFaceScraper {
             }
             // Add delay between batches to avoid rate limits
             const BATCH_DELAY = hfToken ? 500 : 2000;
+            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
             for await (const ds of listDatasets({
                 limit: fetchLimit,
                 additionalFields: ["description", "tags"],
-                search: { query: query },
+                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                 ...(hfToken ? { accessToken: hfToken } : {})
             })) {
                 if (results.length >= limit)
@@ -290,6 +302,9 @@ export class HuggingFaceScraper {
                             description_length: description.length,
                             has_readme: !!(cardData.readme || cardData.readme_content)
                         };
+                        if (intent) {
+                            metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
+                        }
                         results.push(metadata);
                     }
                     catch (e) {
@@ -340,8 +355,12 @@ export class HuggingFaceScraper {
         if (otherErrors > 0) {
             console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
         }
-        // Sort by downloads descending
-        return results.sort((a, b) => b.downloads - a.downloads);
+        return results.sort((a, b) => {
+            const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
+            if (intentDelta !== 0)
+                return intentDelta;
+            return b.downloads - a.downloads;
+        });
     }
     extractTask(tags) {
         const taskTags = [

package/build/search/engine.js CHANGED Viewed

@@ -1,4 +1,5 @@
 import { JITOrchestrator } from "./jit-orchestrator.js";
+import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
 import fs from "fs";
 function log(msg) {
     fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -17,9 +18,10 @@ export class SearchEngine {
     async search(query, options = {}) {
         const limit = options.limit || 5;
         const enableJIT = options.enableJIT !== false; // Default: true
+        const intent = await analyzeDatasetQuery(query);
         log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
         // 1. Perform local search
-        const localResults = await this.localSearch(query, options);
+        const localResults = await this.localSearch(query, options, intent);
         // 2. Check if JIT should be triggered
         const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
         if (!shouldTrigger) {
@@ -28,10 +30,10 @@ export class SearchEngine {
         }
         // 3. Trigger JIT fallback
         console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
-        await this.jitOrchestrator.fetchAndIngest(query, 10);
+        await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
         // 4. Re-run local search with updated index
         console.error(`Re-searching with updated library...`);
-        const enhancedResults = await this.localSearch(query, options);
+        const enhancedResults = await this.localSearch(query, options, intent);
         const newCount = enhancedResults.length - localResults.length;
         if (newCount > 0) {
             console.error(`Found ${newCount} additional results\n`);
@@ -41,7 +43,7 @@ export class SearchEngine {
     /**
      * Perform hybrid search (Vector + Lexical + Penalties)
      */
-    async localSearch(query, options) {
+    async localSearch(query, options, intent) {
         const limit = options.limit || 5;
         // 1. Parse Query
         const words = query.toLowerCase().split(/\s+/);
@@ -136,11 +138,13 @@ export class SearchEngine {
             bonus = sourceBonuses[metadata.source] || 0;
             // Final Combined Score
             // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
-            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus;
+            const intentScore = scoreDatasetAgainstIntent(metadata, intent);
+            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
             metadata.relevance_score = Math.round(finalScore * 100) / 100;
             metadata.vector_score = Math.round(vectorScore * 100) / 100;
             metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
             metadata.accessibility_bonus = bonus;
+            metadata.intent_score = intentScore;
             results.push(metadata);
         }
         // Sort by final score and limit

package/build/search/jit-orchestrator.js CHANGED Viewed

@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
 import { UCIScraper } from "../metadata/uci-scraper.js";
 import { GitHubScraper } from "../metadata/github-scraper.js";
 import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
+import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
 // Common stop words to filter out for better search
 const STOP_WORDS = new Set([
     "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
@@ -61,7 +62,7 @@ export class JITOrchestrator {
     /**
      * Main JIT workflow: fetch, save, index, return new datasets
      */
-    async fetchAndIngest(query, limit = 10) {
+    async fetchAndIngest(query, limit = 10, providedIntent) {
         // Rate limiting check
         if (!this.canTrigger(query)) {
             console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
@@ -69,9 +70,12 @@ export class JITOrchestrator {
         }
         console.error(`\n[JIT] Searching live sources for: "${query}"`);
         this.lastTriggerTime.set(query, Date.now());
-        // Simplify query for better API results
-        const keywords = this.simplifyQuery(query);
-        if (keywords.length > 0) {
+        const intent = providedIntent || await analyzeDatasetQuery(query);
+        const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
+        if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
+            console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
+        }
+        else if (keywords.length > 0) {
             console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
         }
         const newDatasets = [];
@@ -81,15 +85,16 @@ export class JITOrchestrator {
             // Get existing dataset IDs to avoid duplicates
             const existing = this.metadataStore.getAllDatasets();
             existing.forEach(ds => existingIds.add(ds.id));
-            // 1. Scrape HuggingFace - try each keyword separately for better results
-            let hfResults = [];
-            for (const keyword of keywords) {
-                if (hfResults.length >= limit)
-                    break;
-                const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
-                for (const ds of results) {
-                    if (!hfResults.some(existing => existing.id === ds.id)) {
-                        hfResults.push(ds);
+            let hfResults = await this.scrapeHuggingFace(intent, limit);
+            if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
+                for (const keyword of keywords) {
+                    if (hfResults.length >= limit)
+                        break;
+                    const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
+                    for (const ds of results) {
+                        if (!hfResults.some(existing => existing.id === ds.id)) {
+                            hfResults.push(ds);
+                        }
                     }
                 }
             }
@@ -170,7 +175,6 @@ export class JITOrchestrator {
     async scrapeHuggingFace(query, limit) {
         const scraper = new HuggingFaceScraper();
         try {
-            // Pass the query as a general search term
             return await scraper.scrape(limit, true, query);
         }
         catch (error) {