npm - vesper-wizard - Versions diffs - 2.2.0 → 2.3.0 - Mend

vesper-wizard 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/build/index.js +82 -3
package/build/install/install-service.js +4 -3
package/build/metadata/scraper.js +13 -4
package/build/python/convert_engine.py +92 -0
package/build/search/engine.js +7 -1
package/build/search/query-intent.js +45 -0
package/package.json +1 -1
package/src/python/convert_engine.py +92 -0

package/build/index.js CHANGED Viewed

@@ -1352,6 +1352,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                     properties: {},
                 },
             },
+            {
+                name: "vesper_convert_format",
+                description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
+                inputSchema: {
+                    type: "object",
+                    properties: {
+                        file_path: {
+                            type: "string",
+                            description: "Absolute path to the input dataset file.",
+                        },
+                        target_format: {
+                            type: "string",
+                            enum: ["csv", "parquet", "json", "jsonl"],
+                            description: "The desired output format.",
+                        },
+                    },
+                    required: ["file_path", "target_format"],
+                },
+            },
             {
                 name: "fuse_datasets",
                 description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
@@ -1784,7 +1803,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     max_items: maxItems,
                     workers,
                     image_column: imageColumn,
-                    output_root: path.join(dataRoot, "data", "assets"),
+                    output_root: requestedOutputDir || process.cwd(),
                     recipes_dir: path.join(dataRoot, "recipes"),
                 };
                 try {
@@ -2272,6 +2291,63 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
                 };
             }
+            case "vesper_convert_format": {
+                // Converts a dataset file to another format by running the Python
+                // convert engine, then records the converted file in the registry.
+                // Invalid arguments throw McpError; every other failure is returned
+                // as an `isError` tool result.
+                const filePath = String(request.params.arguments?.file_path || "").trim();
+                const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
+                if (!filePath) {
+                    throw new McpError(ErrorCode.InvalidParams, "file_path is required");
+                }
+                if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
+                    throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
+                }
+                if (!fs.existsSync(filePath)) {
+                    return {
+                        content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
+                        isError: true,
+                    };
+                }
+                // Keep the extension in its original case so it can be stripped from
+                // the basename; compare the lower-cased form against the target.
+                const rawInputExt = path.extname(filePath);
+                const inputExt = rawInputExt.toLowerCase();
+                const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
+                const outputExt = extMap[targetFormat];
+                if (inputExt === outputExt) {
+                    return {
+                        content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
+                    };
+                }
+                // Swap only the basename's extension. A regex over the whole path can
+                // match a dot inside a directory name ("/data/my.dir/file") and matches
+                // nothing for extension-less files, which would make output === input.
+                const outputPath = path.join(path.dirname(filePath), `${path.basename(filePath, rawInputExt)}${outputExt}`);
+                try {
+                    await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
+                    const convertScript = path.join(dataRoot, "python", "convert_engine.py");
+                    const result = await runPythonJson(convertScript, [filePath, outputPath]);
+                    if (!result.ok) {
+                        return {
+                            content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
+                            isError: true,
+                        };
+                    }
+                    // Register converted file in the registry (best effort: a registry
+                    // failure is logged but does not fail the conversion).
+                    const datasetId = path.basename(outputPath, outputExt);
+                    try {
+                        upsertRegistry(datasetId, outputPath, "completed");
+                    }
+                    catch (e) {
+                        console.error(`[Convert] Registry write failed: ${e?.message || e}`);
+                    }
+                    let msg = `**Conversion complete**\n`;
+                    msg += `- **Input**: ${filePath} (${inputExt.slice(1) || "unknown"})\n`;
+                    msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
+                    msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
+                    if (result.size_mb !== undefined)
+                        msg += `- **Size**: ${result.size_mb} MB\n`;
+                    return { content: [{ type: "text", text: msg }] };
+                }
+                catch (error) {
+                    // Thrown values are not guaranteed to be Error instances.
+                    return {
+                        content: [{ type: "text", text: `ERROR: Conversion failed: ${error?.message || error}` }],
+                        isError: true,
+                    };
+                }
+            }
             case "fuse_datasets": {
                 const rawSources = request.params.arguments?.sources;
                 if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
@@ -2312,10 +2388,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 try {
                     const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
                     const ext = extMap[outputFormat] || ".feather";
-                    const outDir = path.join(dataRoot, "fusion");
+                    const outDir = process.cwd();
                     if (!fs.existsSync(outDir))
                         fs.mkdirSync(outDir, { recursive: true });
                     const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
+                    console.error(`[Fusion] Resolved output directory: ${outDir}`);
                     const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
                         strategy,
                         join_on: joinOn,
@@ -2874,10 +2951,12 @@ async function runExportCli(args) {
     const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
     const ext = extMap[requestedFormat] || ".parquet";
     const safeName = toSafeDatasetPathFragment(datasetId);
-    const outDir = targetDir || path.join(dataRoot, "exports");
+    const outDir = targetDir || process.cwd();
     if (!fs.existsSync(outDir))
         fs.mkdirSync(outDir, { recursive: true });
     const outputFile = path.join(outDir, `${safeName}${ext}`);
+    console.error(`[Export] Resolved output directory: ${outDir}`);
+    console.error(`[Export] Output file: ${outputFile}`);
     const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
     console.log(`Export complete: ${result.output_path}`);
     console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);

package/build/install/install-service.js CHANGED Viewed

@@ -21,11 +21,12 @@ export class InstallService {
         // Create target directory
         const installLabel = dataset?.name || datasetId;
         const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
-        // If caller specified a target dir, use it directly (don't nest under datasets/)
-        // Otherwise fall back to the project root's datasets/ folder
+        // If caller specified a target dir, use it directly
+        // Otherwise use the current working directory
         const installDir = targetDir
             ? path.resolve(targetDir)
-            : path.join(this.projectRoot, "datasets", sanitizedName);
+            : path.resolve(process.cwd(), sanitizedName);
+        console.error(`[InstallService] Resolved install directory: ${installDir}`);
         if (!fs.existsSync(installDir)) {
             fs.mkdirSync(installDir, { recursive: true });
         }

package/build/metadata/scraper.js CHANGED Viewed

@@ -3,7 +3,7 @@ import { categorizeLicense } from "./license.js";
 import { calculateQualityScore } from "./quality.js";
 import { classifyDomain } from "./domain.js";
 import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
-import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent } from "../search/query-intent.js";
+import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
 export class HuggingFaceScraper {
     /**
      * Bulk discovery: Fetch many datasets quickly without deep details.
@@ -85,6 +85,9 @@ export class HuggingFaceScraper {
                     has_readme: false,
                     is_incomplete: true // Flag for Phase 2
                 };
+                // Hard language exclusion
+                if (intent && shouldExcludeByLanguage(metadata, intent))
+                    continue;
                 results.push(metadata);
             }
         }
@@ -302,10 +305,16 @@ export class HuggingFaceScraper {
                             description_length: description.length,
                             has_readme: !!(cardData.readme || cardData.readme_content)
                         };
-                        if (intent) {
-                            metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
+                        // Hard language exclusion — drop bilingual/multilingual for single-language queries
+                        if (intent && shouldExcludeByLanguage(metadata, intent)) {
+                            // skip — do not push
+                        }
+                        else {
+                            if (intent) {
+                                metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
+                            }
+                            results.push(metadata);
                         }
-                        results.push(metadata);
                     }
                     catch (e) {
                         // Track all errors for user feedback

package/build/python/convert_engine.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""
+Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
+Usage: convert_engine.py <input_path> <output_path>
+Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N, "size_mb": X} or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+try:
+    import polars as pl
+except Exception:
+    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
+    sys.exit(1)
+def _load(src: str) -> pl.DataFrame:
+    """Read ``src`` into a polars DataFrame, picking the reader from the file extension.
+
+    ``.json`` files are sniffed: a top-level array goes to ``pl.read_json``,
+    newline-delimited objects go to ``pl.read_ndjson``, and a single object is
+    searched for a list of records under a well-known key or any list-of-dicts value.
+    Unrecognised extensions are read as CSV.
+
+    Errors from the underlying readers (and from ``json.loads``) propagate to the caller.
+    """
+    ext = os.path.splitext(src)[1].lower()
+    if ext == ".csv":
+        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+    if ext in (".tsv", ".tab"):
+        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
+    if ext in (".parquet", ".pq"):
+        return pl.read_parquet(src)
+    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
+        return pl.read_ipc(src)
+    if ext in (".jsonl", ".ndjson"):
+        return pl.read_ndjson(src)
+    if ext == ".json":
+        # Context manager so the handle is closed as soon as the text is read.
+        with open(src, "r", encoding="utf-8") as fh:
+            raw = fh.read().strip()
+        if raw.startswith("["):
+            return pl.read_json(src)
+        first_line = raw.split("\n", 1)[0].strip()
+        if "\n" in raw and first_line.startswith("{"):
+            # A pretty-printed object also starts with "{" on its first line, so
+            # only treat the file as NDJSON when that line is a complete JSON
+            # object by itself; otherwise fall through to whole-document parsing.
+            try:
+                first_is_record = isinstance(json.loads(first_line), dict)
+            except ValueError:
+                first_is_record = False
+            if first_is_record:
+                return pl.read_ndjson(src)
+        obj = json.loads(raw)
+        if isinstance(obj, dict):
+            # Prefer conventional wrapper keys, then any non-empty list of dicts.
+            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
+                if key in obj and isinstance(obj[key], list):
+                    return pl.DataFrame(obj[key])
+            for v in obj.values():
+                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
+                    return pl.DataFrame(v)
+        return pl.read_json(src)
+    # Fallback: try csv
+    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+def _write(df: pl.DataFrame, dst: str) -> None:
+    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.
+
+    Creates the destination directory if needed. Supports Parquet, CSV, JSON
+    and JSONL/NDJSON.
+
+    Raises:
+        ValueError: if the extension is not one of the supported output formats.
+    """
+    ext = os.path.splitext(dst)[1].lower()
+    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
+    if ext in (".parquet", ".pq"):
+        df.write_parquet(dst)
+    elif ext == ".csv":
+        df.write_csv(dst)
+    elif ext == ".json":
+        try:
+            df.write_json(dst, row_oriented=True)
+        except TypeError:
+            # NOTE(review): newer polars releases no longer accept ``row_oriented``
+            # (write_json is presumably always row-oriented there - confirm against
+            # the installed version). Retry without the keyword instead of failing.
+            df.write_json(dst)
+    elif ext in (".jsonl", ".ndjson"):
+        df.write_ndjson(dst)
+    else:
+        raise ValueError(f"Unsupported output format: {ext}")
+def main():
+    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.
+
+    Prints exactly one JSON object to stdout. On success it carries ``ok``,
+    ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any failure it
+    carries ``ok: false`` plus ``error`` and the process exits with status 1.
+    """
+    def fail(message):
+        # Report the problem as JSON, then terminate with a non-zero status.
+        print(json.dumps({"ok": False, "error": message}))
+        sys.exit(1)
+
+    args = sys.argv[1:]
+    if len(args) < 2:
+        fail("Usage: convert_engine.py <input> <output>")
+    input_path, output_path = args[0], args[1]
+    if not os.path.exists(input_path):
+        fail(f"File not found: {input_path}")
+    try:
+        frame = _load(input_path)
+        _write(frame, output_path)
+        bytes_written = os.path.getsize(output_path)
+        report = {
+            "ok": True,
+            "output_path": output_path,
+            "rows": frame.height,
+            "columns": frame.width,
+            "size_mb": round(bytes_written / (1024 * 1024), 2),
+        }
+        print(json.dumps(report))
+    except Exception as exc:
+        fail(str(exc))
+if __name__ == "__main__":
+    main()

package/build/search/engine.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { JITOrchestrator } from "./jit-orchestrator.js";
-import { analyzeDatasetQuery, scoreDatasetAgainstIntent } from "./query-intent.js";
+import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
 import fs from "fs";
 function log(msg) {
     fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
@@ -68,6 +68,12 @@ export class SearchEngine {
             // Filter: Safe only
             if (options.safeOnly && metadata.license.category === "restricted")
                 continue;
+            // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
+            // when user explicitly requested a single language
+            if (shouldExcludeByLanguage(metadata, intent)) {
+                log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
+                continue;
+            }
             const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
             // Filter: Explicit Negative Keywords
             if (negativeKeywords.some(neg => text.includes(neg))) {

package/build/search/query-intent.js CHANGED Viewed

@@ -172,6 +172,51 @@ const LANGUAGE_TO_CODE = {
     russian: "ru",
     hindi: "hi",
 };
+// Substrings whose presence in a dataset's name/description/tags marks it as
+// bilingual, multilingual or translation-oriented.
+const BILINGUAL_INDICATORS = [
+    "translation", "parallel", "bilingual", "multilingual",
+    "cross-lingual", "crosslingual", "machine-translation",
+    "aligned", "comparable corpus",
+];
+/**
+ * Hard-exclude a dataset when the user requests a single specific language
+ * and the dataset is bilingual, multilingual, or tagged with other languages.
+ *
+ * Matching against name/description/tags is plain substring matching.
+ *
+ * @param {object} dataset - Dataset metadata; reads `languages`, `name`,
+ *   `description` and `tags`. Missing or non-array `languages`/`tags` are
+ *   treated as empty.
+ * @param {object} [intent] - Parsed query intent; only `intent.language` is read.
+ * @returns {boolean} true if the dataset should be EXCLUDED from results.
+ */
+export function shouldExcludeByLanguage(dataset, intent) {
+    if (!intent?.language || intent.language === "multilingual")
+        return false;
+    const aliases = getLanguageAliases(intent.language);
+    const rawLanguages = Array.isArray(dataset.languages) ? dataset.languages : [];
+    const datasetLanguages = rawLanguages.map(normalizeToken).filter(l => l && l !== "unknown");
+    // If the dataset has language tags and ANY of them are NOT the requested language, exclude.
+    // (When every tag is an alias of the requested language, the dataset necessarily
+    // contains the requested language, so no separate "wrong language" check is needed.)
+    if (datasetLanguages.some(lang => !aliases.includes(lang)))
+        return true;
+    // Check name, description, and tags for bilingual indicators or other language names
+    const tags = Array.isArray(dataset.tags) ? dataset.tags : [];
+    const text = [
+        dataset.name,
+        dataset.description,
+        tags.join(" "),
+    ].join(" ").toLowerCase();
+    // Check for bilingual/translation keywords
+    if (BILINGUAL_INDICATORS.some(indicator => text.includes(indicator))) {
+        return true;
+    }
+    // Check if the text mentions other specific languages by name
+    const otherLanguageNames = Object.keys(LANGUAGE_ALIASES).filter(lang => lang !== intent.language && lang !== "multilingual");
+    return otherLanguageNames.some((otherLang) => {
+        const otherAliases = LANGUAGE_ALIASES[otherLang];
+        // Only check the full language name (not 2-letter codes which could appear in regular text)
+        return Boolean(otherAliases && otherAliases[0] && text.includes(otherAliases[0]));
+    });
+}
 function buildHeuristicIntent(query, requirements) {
     const originalQuery = `${query || ""} ${requirements || ""}`.trim();
     const normalizedQuery = originalQuery.toLowerCase();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "vesper-wizard",
-  "version": "2.2.0",
+  "version": "2.3.0",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/src/python/convert_engine.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""
+Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
+Usage: convert_engine.py <input_path> <output_path>
+Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N, "size_mb": X} or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+try:
+    import polars as pl
+except Exception:
+    print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
+    sys.exit(1)
+def _load(src: str) -> pl.DataFrame:
+    """Read ``src`` into a polars DataFrame, picking the reader from the file extension.
+
+    ``.json`` files are sniffed: a top-level array goes to ``pl.read_json``,
+    newline-delimited objects go to ``pl.read_ndjson``, and a single object is
+    searched for a list of records under a well-known key or any list-of-dicts value.
+    Unrecognised extensions are read as CSV.
+
+    Errors from the underlying readers (and from ``json.loads``) propagate to the caller.
+    """
+    ext = os.path.splitext(src)[1].lower()
+    if ext == ".csv":
+        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+    if ext in (".tsv", ".tab"):
+        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
+    if ext in (".parquet", ".pq"):
+        return pl.read_parquet(src)
+    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
+        return pl.read_ipc(src)
+    if ext in (".jsonl", ".ndjson"):
+        return pl.read_ndjson(src)
+    if ext == ".json":
+        # Context manager so the handle is closed as soon as the text is read.
+        with open(src, "r", encoding="utf-8") as fh:
+            raw = fh.read().strip()
+        if raw.startswith("["):
+            return pl.read_json(src)
+        first_line = raw.split("\n", 1)[0].strip()
+        if "\n" in raw and first_line.startswith("{"):
+            # A pretty-printed object also starts with "{" on its first line, so
+            # only treat the file as NDJSON when that line is a complete JSON
+            # object by itself; otherwise fall through to whole-document parsing.
+            try:
+                first_is_record = isinstance(json.loads(first_line), dict)
+            except ValueError:
+                first_is_record = False
+            if first_is_record:
+                return pl.read_ndjson(src)
+        obj = json.loads(raw)
+        if isinstance(obj, dict):
+            # Prefer conventional wrapper keys, then any non-empty list of dicts.
+            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
+                if key in obj and isinstance(obj[key], list):
+                    return pl.DataFrame(obj[key])
+            for v in obj.values():
+                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
+                    return pl.DataFrame(v)
+        return pl.read_json(src)
+    # Fallback: try csv
+    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
+def _write(df: pl.DataFrame, dst: str) -> None:
+    """Write ``df`` to ``dst`` in the format implied by ``dst``'s extension.
+
+    Creates the destination directory if needed. Supports Parquet, CSV, JSON
+    and JSONL/NDJSON.
+
+    Raises:
+        ValueError: if the extension is not one of the supported output formats.
+    """
+    ext = os.path.splitext(dst)[1].lower()
+    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
+    if ext in (".parquet", ".pq"):
+        df.write_parquet(dst)
+    elif ext == ".csv":
+        df.write_csv(dst)
+    elif ext == ".json":
+        try:
+            df.write_json(dst, row_oriented=True)
+        except TypeError:
+            # NOTE(review): newer polars releases no longer accept ``row_oriented``
+            # (write_json is presumably always row-oriented there - confirm against
+            # the installed version). Retry without the keyword instead of failing.
+            df.write_json(dst)
+    elif ext in (".jsonl", ".ndjson"):
+        df.write_ndjson(dst)
+    else:
+        raise ValueError(f"Unsupported output format: {ext}")
+def main():
+    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.
+
+    Prints exactly one JSON object to stdout. On success it carries ``ok``,
+    ``output_path``, ``rows``, ``columns`` and ``size_mb``; on any failure it
+    carries ``ok: false`` plus ``error`` and the process exits with status 1.
+    """
+    def fail(message):
+        # Report the problem as JSON, then terminate with a non-zero status.
+        print(json.dumps({"ok": False, "error": message}))
+        sys.exit(1)
+
+    args = sys.argv[1:]
+    if len(args) < 2:
+        fail("Usage: convert_engine.py <input> <output>")
+    input_path, output_path = args[0], args[1]
+    if not os.path.exists(input_path):
+        fail(f"File not found: {input_path}")
+    try:
+        frame = _load(input_path)
+        _write(frame, output_path)
+        bytes_written = os.path.getsize(output_path)
+        report = {
+            "ok": True,
+            "output_path": output_path,
+            "rows": frame.height,
+            "columns": frame.width,
+            "size_mb": round(bytes_written / (1024 * 1024), 2),
+        }
+        print(json.dumps(report))
+    except Exception as exc:
+        fail(str(exc))
+if __name__ == "__main__":
+    main()