npm - @vespermcp/mcp-server - Versions diffs - 1.1.3 → 1.2.1 - Mend

@vespermcp/mcp-server 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +34 -0
package/build/config/secure-keys.js +51 -0
package/build/config/user-config.js +48 -0
package/build/fusion/engine.js +69 -0
package/build/index.js +900 -50
package/build/ingestion/hf-downloader.js +12 -3
package/build/ingestion/ingestor.js +33 -9
package/build/ingestion/kaggle-downloader.js +2 -2
package/build/metadata/kaggle-source.js +70 -0
package/build/metadata/scraper.js +34 -10
package/build/python/config.py +259 -0
package/build/python/export_engine.py +148 -52
package/build/python/fusion_engine.py +368 -0
package/build/python/kaggle_engine.py +204 -0
package/build/python/row_count.py +54 -0
package/build/python/test_fusion_engine.py +89 -0
package/build/scripts/build-index.js +5 -5
package/build/search/jit-orchestrator.js +72 -12
package/build/tools/formatter.js +14 -14
package/package.json +9 -3
package/scripts/refresh-index.cjs +87 -0
package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
package/src/python/config.py +259 -0
package/src/python/export_engine.py +148 -52
package/src/python/fusion_engine.py +368 -0
package/src/python/kaggle_engine.py +204 -0
package/src/python/row_count.py +54 -0
package/src/python/test_fusion_engine.py +89 -0

package/build/ingestion/hf-downloader.js CHANGED Viewed

@@ -8,12 +8,16 @@ export class HFDownloader {
         this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
         this.downloader = new RobustDownloader();
     }
+    getToken() {
+        return this.hfToken || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+    }
     /**
      * Finds the most suitable data file in a repository
      * Returns the relative path within the repo
      */
     async findBestFile(repoId) {
         try {
+            const token = this.getToken();
             const files = [];
             const blacklist = [
                 ".gitattributes",
@@ -28,7 +32,7 @@ export class HFDownloader {
             for await (const file of listFiles({
                 repo: { type: "dataset", name: repoId },
                 recursive: true,
-                ...(this.hfToken ? { accessToken: this.hfToken } : {})
+                ...(token ? { accessToken: token } : {})
             })) {
                 if (file.type === "file") {
                     const fileName = path.basename(file.path);
@@ -62,7 +66,11 @@ export class HFDownloader {
             return fallback || null;
         }
         catch (error) {
-            console.error(`[HF] Failed to list files for ${repoId}:`, error.message);
+            const msg = String(error?.message || error);
+            if (msg.includes("401") || msg.includes("403") || msg.toLowerCase().includes("unauthorized")) {
+                throw new Error("Hugging Face gated/private dataset requires token. Run 'vespermcp config keys' to set HF token.");
+            }
+            console.error(`[HF] Failed to list files for ${repoId}:`, msg);
             return null;
         }
     }
@@ -70,9 +78,10 @@ export class HFDownloader {
      * Downloads a file from HF to local path
      */
     async download(repoId, filePath, targetPath, onProgress) {
+        const token = this.getToken();
         const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${filePath}`;
         await this.downloader.download(url, targetPath, {
-            headers: this.hfToken ? { 'Authorization': `Bearer ${this.hfToken}` } : {},
+            headers: token ? { 'Authorization': `Bearer ${token}` } : {},
             resume: true,
             onProgress: (bytes, total) => {
                 if (total > 0 && onProgress) {

package/build/ingestion/ingestor.js CHANGED Viewed

@@ -1,13 +1,15 @@
 import path from "path";
 import fs from "fs";
 import { HFDownloader } from "./hf-downloader.js";
-import { KaggleDownloader } from "./kaggle-downloader.js";
+import { KaggleSource } from "../metadata/kaggle-source.js";
+import { SecureKeysManager } from "../config/secure-keys.js";
 export class DataIngestor {
     projectRoot;
     store;
     rawDataDir;
     hfDownloader;
-    kaggleDownloader;
+    kaggleSource;
+    secureKeys;
     constructor(projectRoot, store) {
         this.projectRoot = projectRoot;
         this.store = store;
@@ -16,19 +18,26 @@ export class DataIngestor {
             fs.mkdirSync(this.rawDataDir, { recursive: true });
         }
         this.hfDownloader = new HFDownloader();
-        this.kaggleDownloader = new KaggleDownloader();
+        this.kaggleSource = new KaggleSource();
+        this.secureKeys = new SecureKeysManager();
     }
     /**
      * Check if Kaggle credentials are available
      */
     hasKaggleCredentials() {
-        return this.kaggleDownloader.hasCredentials();
+        if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY)
+            return true;
+        const keys = this.secureKeys.getAll();
+        if (keys.kaggle_username && keys.kaggle_key)
+            return true;
+        const kaggleJsonPath = path.join(process.env.HOME || process.env.USERPROFILE || "", ".kaggle", "kaggle.json");
+        return !!(kaggleJsonPath && fs.existsSync(kaggleJsonPath));
     }
     /**
      * Get helpful error message if Kaggle credentials are missing
      */
     getKaggleCredentialError() {
-        return this.kaggleDownloader.getCredentialError();
+        return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
     }
     /**
      * Ensures a dataset is available locally
@@ -67,10 +76,25 @@ export class DataIngestor {
             }
         }
         else if (source === "kaggle") {
-            // Kaggle support has been disabled
-            const errorMsg = "Kaggle datasets are no longer supported. Please use HuggingFace or other open-access sources.";
-            this.failDownload(datasetId, errorMsg);
-            throw new Error(errorMsg);
+            if (!this.hasKaggleCredentials()) {
+                const errorMsg = this.getKaggleCredentialError();
+                this.failDownload(datasetId, errorMsg);
+                throw new Error(errorMsg);
+            }
+            const targetDir = path.join(this.rawDataDir, datasetId.replace(/\//g, "_"));
+            this.store.registerDownload(datasetId, targetDir, "downloading");
+            try {
+                onProgress?.("Downloading from Kaggle...");
+                const result = await this.kaggleSource.download(datasetId, targetDir);
+                const stats = fs.statSync(result.local_path);
+                this.completeDownload(datasetId, result.local_path, stats.size);
+                onProgress?.("Kaggle download complete", 100);
+                return result.local_path;
+            }
+            catch (e) {
+                this.failDownload(datasetId, e.message);
+                throw e;
+            }
         }
         throw new Error(`Download logic for ${source} not yet implemented`);
     }

package/build/ingestion/kaggle-downloader.js CHANGED Viewed

@@ -23,8 +23,8 @@ export class KaggleDownloader {
     getCredentialError() {
         if (!this.username && !this.key) {
             return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
-                "💡 Tip: Get your API token from https://www.kaggle.com/settings → API → Create New Token\n" +
-                "💡 Alternative: Download the dataset manually and use analyze_quality() on local files.";
+                "Tip: Get your API token from https://www.kaggle.com/settings -> API -> Create New Token\n" +
+                "Alternative: Download the dataset manually and use analyze_quality() on local files.";
         }
         if (!this.username) {
             return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";

package/build/metadata/kaggle-source.js ADDED Viewed

@@ -0,0 +1,70 @@
+import { spawn } from "child_process";
+import path from "path";
+import fs from "fs";
+import os from "os";
+export class KaggleSource {
+    pythonPath = "python";
+    scriptPath;
+    constructor(buildDir = process.cwd()) {
+        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
+        const dataRoot = path.join(homeDir, ".vesper");
+        const scriptPath0 = path.resolve(dataRoot, "python", "kaggle_engine.py");
+        const scriptPath1 = path.resolve(buildDir, "python", "kaggle_engine.py");
+        const scriptPath2 = path.resolve(buildDir, "..", "src", "python", "kaggle_engine.py");
+        if (fs.existsSync(scriptPath0)) {
+            this.scriptPath = scriptPath0;
+        }
+        else if (fs.existsSync(scriptPath1)) {
+            this.scriptPath = scriptPath1;
+        }
+        else if (fs.existsSync(scriptPath2)) {
+            this.scriptPath = scriptPath2;
+        }
+        else {
+            this.scriptPath = scriptPath0;
+        }
+        if (process.platform === "win32")
+            this.pythonPath = "py";
+    }
+    async discover(query, limit = 20) {
+        const result = await this.run(["discover", query, String(limit)]);
+        if (!result.ok) {
+            throw new Error(result.error || "Kaggle discover failed");
+        }
+        return (result.results || []);
+    }
+    async download(datasetRef, targetDir) {
+        const args = ["download", datasetRef];
+        if (targetDir)
+            args.push(targetDir);
+        const result = await this.run(args);
+        if (!result.ok) {
+            throw new Error(result.error || "Kaggle download failed");
+        }
+        return {
+            local_path: result.local_path,
+            target_dir: result.target_dir,
+        };
+    }
+    async run(args) {
+        return new Promise((resolve, reject) => {
+            const processRef = spawn(this.pythonPath, [this.scriptPath, ...args]);
+            let stdout = "";
+            let stderr = "";
+            processRef.stdout.on("data", (d) => (stdout += d.toString()));
+            processRef.stderr.on("data", (d) => (stderr += d.toString()));
+            processRef.on("close", (code) => {
+                if (code !== 0) {
+                    reject(new Error(stderr || stdout || `kaggle_engine exited with code ${code}`));
+                    return;
+                }
+                try {
+                    resolve(JSON.parse(stdout));
+                }
+                catch {
+                    reject(new Error(`Failed to parse kaggle_engine output: ${stdout}`));
+                }
+            });
+        });
+    }
+}

package/build/metadata/scraper.js CHANGED Viewed

@@ -93,15 +93,23 @@ export class HuggingFaceScraper {
         const results = [];
         let processed = 0;
         let skippedMVP = 0;
+        let rateLimitHits = 0;
+        let otherErrors = 0;
         try {
             // Fetch more datasets to account for filtering
             const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
-            const CONCURRENCY = 25; // Increased for high-volume indexing
-            const queue = [];
             // Support HuggingFace token from environment variable
             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+            // CRITICAL: Low concurrency without token to avoid rate limits
+            // With token: 10 concurrent (HF allows more)
+            // Without token: 2 concurrent (stay under ~30 req/min limit)
+            const CONCURRENCY = hfToken ? 10 : 2;
+            const queue = [];
+            if (!hfToken) {
+                console.error(`[HF Scraper] ⚠️ No HF_TOKEN set - using conservative rate limits`);
+            }
             // Add delay between batches to avoid rate limits
-            const BATCH_DELAY = 1000; // 1 second delay between batches
+            const BATCH_DELAY = hfToken ? 500 : 2000;
             for await (const ds of listDatasets({
                 limit: fetchLimit,
                 additionalFields: ["description", "tags"],
@@ -242,11 +250,20 @@ export class HuggingFaceScraper {
                         results.push(metadata);
                     }
                     catch (e) {
-                        // Log rate limit errors, silently skip others
+                        // Track all errors for user feedback
                         if (e?.status === 429 || e?.message?.includes('rate limit')) {
-                            console.error(`[HF Scraper] Rate limit error for ${repoId}: ${e.message}`);
+                            rateLimitHits++;
+                            if (rateLimitHits <= 3) {
+                                console.error(`[HF Scraper] Rate limit hit for ${repoId}`);
+                            }
+                        }
+                        else {
+                            otherErrors++;
+                            // Log first few non-rate-limit errors for debugging
+                            if (otherErrors <= 2) {
+                                console.error(`[HF Scraper] Error for ${repoId}: ${e.message?.slice(0, 80)}`);
+                            }
                         }
-                        // Silently skip other errors
                     }
                 })();
                 queue.push(processTask);
@@ -265,14 +282,21 @@ export class HuggingFaceScraper {
         catch (e) {
             // Handle rate limit errors with better messaging
             if (e?.status === 429 || e?.message?.includes('rate limit')) {
-                console.error("Scraping failed due to rate limit:", e.message);
-                console.error("Consider setting HF_TOKEN environment variable to increase rate limits");
+                console.error("[HF Scraper] ❌ Scraping failed due to rate limit:", e.message);
+                console.error("[HF Scraper] 💡 Set HF_TOKEN environment variable for unlimited access");
             }
             else {
-                console.error("Scraping failed overall:", e.message);
+                console.error("[HF Scraper] ❌ Scraping failed:", e.message);
             }
         }
-        console.error(`[HF Scraper] Complete: ${results.length} datasets scraped, ${skippedMVP} skipped`);
+        // User-friendly summary
+        console.error(`[HF Scraper] ✅ Complete: ${results.length} datasets found`);
+        if (rateLimitHits > 0) {
+            console.error(`[HF Scraper] ⚠️ ${rateLimitHits} requests rate-limited. Set HF_TOKEN for better results.`);
+        }
+        if (otherErrors > 0) {
+            console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
+        }
         // Sort by downloads descending
         return results.sort((a, b) => b.downloads - a.downloads);
     }

package/build/python/config.py ADDED Viewed

@@ -0,0 +1,259 @@
+import os
+import sys
+import json
+import base64
+import hashlib
+import secrets
+from pathlib import Path
+from typing import Dict, Optional
+# Service name passed to the keyring backend when reading or writing secrets.
+SERVICE_NAME = "vesper"
+# Logical key name -> environment variables consulted for it, in lookup order.
+KEY_ALIASES = {
+    "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
+    "kaggle_username": ["KAGGLE_USERNAME"],
+    "kaggle_key": ["KAGGLE_KEY"],
+}
+# Optional dependency: HAS_KEYRING records whether `keyring` could be imported.
+try:
+    import keyring  # type: ignore
+    HAS_KEYRING = True
+except Exception:
+    HAS_KEYRING = False
+# Optional dependency: HAS_FERNET records whether `cryptography` could be imported.
+# NOTE: when this import fails, the names `Fernet` and `InvalidToken` are left
+# undefined, so code must check HAS_FERNET before referencing either of them.
+try:
+    from cryptography.fernet import Fernet, InvalidToken  # type: ignore
+    HAS_FERNET = True
+except Exception:
+    HAS_FERNET = False
+def _config_path() -> Path:
+    return Path.home() / ".vesper" / "config.toml"
+def _secret_path() -> Path:
+    return Path.home() / ".vesper" / ".config_key"
+def _ensure_parent(path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+def _read_fallback_toml() -> Dict[str, str]:
+    """Read the fallback config file with a minimal line-based parser.
+
+    Returns:
+        A dict of the ``key = value`` pairs found inside the ``[keys]`` section.
+        If a ``method = ...`` line was seen, its value is added under the
+        reserved key ``"__method__"``. Returns ``{}`` when the file is missing.
+    """
+    path = _config_path()
+    if not path.exists():
+        return {}
+    values: Dict[str, str] = {}
+    in_keys = False
+    method = ""
+    for raw in path.read_text(encoding="utf-8").splitlines():
+        line = raw.strip()
+        # Skip blank lines and full-line comments.
+        if not line or line.startswith("#"):
+            continue
+        # Section header: only "[keys]" turns key collection on; any other section turns it off.
+        if line.startswith("[") and line.endswith("]"):
+            in_keys = (line == "[keys]")
+            continue
+        # NOTE(review): this check runs before the section check below, so a line
+        # starting with "method" is treated as the method setting in ANY section,
+        # including [keys] - confirm no stored key name can begin with "method".
+        if line.startswith("method") and "=" in line:
+            method = line.split("=", 1)[1].strip().strip('"').strip("'")
+            continue
+        if not in_keys or "=" not in line:
+            continue
+        key, val = line.split("=", 1)
+        key = key.strip()
+        # Surrounding quotes are stripped; backslash escapes are not interpreted.
+        val = val.strip().strip('"').strip("'")
+        values[key] = val
+    if method:
+        values["__method__"] = method
+    return values
+def _get_or_create_local_secret() -> str:
+    secret_file = _secret_path()
+    _ensure_parent(secret_file)
+    if secret_file.exists():
+        return secret_file.read_text(encoding="utf-8").strip()
+    secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
+    secret_file.write_text(secret, encoding="utf-8")
+    try:
+        os.chmod(secret_file, 0o600)
+    except Exception:
+        pass
+    return secret
+def _xor_encrypt(plain: str, secret: str) -> str:
+    key = hashlib.sha256(secret.encode("utf-8")).digest()
+    data = plain.encode("utf-8")
+    out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
+    return base64.urlsafe_b64encode(out).decode("utf-8")
+def _xor_decrypt(cipher_text: str, secret: str) -> str:
+    key = hashlib.sha256(secret.encode("utf-8")).digest()
+    data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
+    out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
+    return out.decode("utf-8")
+def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
+    if HAS_FERNET:
+        token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8")).decode("utf-8")
+        return {"method": "fernet", "value": token}
+    # fallback encryption (weaker than fernet, but still not plaintext)
+    return {"method": "xor", "value": _xor_encrypt(value, secret)}
+def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
+    try:
+        if method == "fernet" and HAS_FERNET:
+            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
+        if method == "xor":
+            return _xor_decrypt(value, secret)
+        return None
+    except InvalidToken:
+        return None
+    except Exception:
+        return None
+def _write_fallback_toml(values: Dict[str, str]) -> None:
+    path = _config_path()
+    _ensure_parent(path)
+    method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
+    lines = [
+        "# Vesper optional API keys fallback storage",
+        "# Encrypted fallback (keyring is preferred)",
+        "[meta]",
+        f'method = "{method}"',
+        "[keys]",
+    ]
+    for key in sorted(values.keys()):
+        if key.startswith("__"):
+            continue
+        val = str(values[key]).replace('"', '\\"')
+        lines.append(f'{key} = "{val}"')
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+def _get_from_env(name: str) -> Optional[str]:
+    for env_key in KEY_ALIASES.get(name, []):
+        val = os.getenv(env_key)
+        if val:
+            return val
+    return None
+def get_key(name: str) -> Optional[str]:
+    # 1) env vars (highest priority)
+    env_val = _get_from_env(name)
+    if env_val:
+        return env_val
+    # 2) keyring (secure)
+    if HAS_KEYRING:
+        try:
+            val = keyring.get_password(SERVICE_NAME, name)
+            if val:
+                return val
+        except Exception:
+            pass
+    # 3) encrypted fallback config.toml
+    fallback = _read_fallback_toml()
+    enc = fallback.get(name)
+    if not enc:
+        return None
+    secret = _get_or_create_local_secret()
+    method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
+    return _decrypt_value(enc, method, secret)
+def set_key(name: str, value: str) -> Dict[str, str]:
+    if not value:
+        return {"ok": "false", "method": "none", "error": "Empty value"}
+    if HAS_KEYRING:
+        try:
+            keyring.set_password(SERVICE_NAME, name, value)
+            return {"ok": "true", "method": "keyring"}
+        except Exception:
+            pass
+    fallback = _read_fallback_toml()
+    secret = _get_or_create_local_secret()
+    enc = _encrypt_value(value, secret)
+    fallback["__method__"] = enc["method"]
+    fallback[name] = enc["value"]
+    _write_fallback_toml(fallback)
+    return {"ok": "true", "method": f'toml:{enc["method"]}'}
+def has_key(name: str) -> bool:
+    return bool(get_key(name))
+def get_all() -> Dict[str, Optional[str]]:
+    return {
+        "hf_token": get_key("hf_token"),
+        "kaggle_username": get_key("kaggle_username"),
+        "kaggle_key": get_key("kaggle_key"),
+    }
+def _print_json(data):
+    print(json.dumps(data))
+def main() -> None:
+    if len(sys.argv) < 2:
+        _print_json({
+            "ok": False,
+            "error": "Usage: config.py <get|set|has|all> [name] [value]",
+        })
+        sys.exit(1)
+    cmd = sys.argv[1].lower()
+    if cmd == "all":
+        _print_json({"ok": True, "data": get_all()})
+        return
+    if len(sys.argv) < 3:
+        _print_json({"ok": False, "error": "Missing key name"})
+        sys.exit(1)
+    name = sys.argv[2]
+    if cmd == "get":
+        _print_json({"ok": True, "name": name, "value": get_key(name)})
+        return
+    if cmd == "has":
+        _print_json({"ok": True, "name": name, "value": has_key(name)})
+        return
+    if cmd == "set":
+        if len(sys.argv) < 4:
+            _print_json({"ok": False, "error": "Missing value for set"})
+            sys.exit(1)
+        value = sys.argv[3]
+        result = set_key(name, value)
+        _print_json({"ok": result.get("ok") == "true", "name": name, "method": result.get("method"), "error": result.get("error")})
+        return
+    _print_json({"ok": False, "error": f"Unknown command: {cmd}"})
+    sys.exit(1)
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()