@vespermcp/mcp-server 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ import sys
2
+ import os
3
+ import json
4
+ import tempfile
5
+ from typing import Dict, Any, List
6
+ from config import get_all
7
+
8
+ try:
9
+ from kaggle.api.kaggle_api_extended import KaggleApi
10
+ HAS_KAGGLE = True
11
+ except Exception:
12
+ HAS_KAGGLE = False
13
+
14
+
15
def _ensure_auth() -> Dict[str, Any]:
    """Authenticate against the Kaggle API.

    Credentials are sourced in priority order:
      1) Existing env vars
      2) secure local store (keyring or ~/.vesper/config.toml)
      3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()

    Returns {"ok": True, "api": <KaggleApi>} on success, otherwise an
    {"ok": False, "error": ...} dict with a user-actionable message.
    """
    if not HAS_KAGGLE:
        return {
            "ok": False,
            "error": "kaggle package not installed. Install with: pip install kaggle",
        }

    # Only consult the local store when the env vars are not already set.
    env_ready = os.getenv("KAGGLE_USERNAME") and os.getenv("KAGGLE_KEY")
    if not env_ready:
        stored = get_all()
        username = stored.get("kaggle_username")
        key = stored.get("kaggle_key")
        if username and key:
            os.environ["KAGGLE_USERNAME"] = username
            os.environ["KAGGLE_KEY"] = key

    api = KaggleApi()
    try:
        api.authenticate()
    except Exception as exc:
        # authenticate() also falls back to ~/.kaggle/kaggle.json; if it
        # still fails there are genuinely no usable credentials.
        return {
            "ok": False,
            "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
            "details": str(exc),
        }

    return {"ok": True, "api": api}
43
+
44
+
45
def _dataset_to_dict(ds) -> Dict[str, Any]:
    """Normalize a Kaggle dataset object into Vesper's common metadata dict.

    Kaggle API object fields differ by library version, so every attribute
    is read defensively via getattr. Fields Kaggle does not expose here
    (task, domain, columns, split counts, ...) are filled with conservative
    placeholders so downstream scoring treats all sources uniformly.
    (Fix: the original also computed an ``owner`` value that was never used.)
    """
    ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
    title = getattr(ds, "title", None) or ref
    subtitle = getattr(ds, "subtitle", None) or ""
    votes = int(getattr(ds, "voteCount", 0) or 0)
    downloads = int(getattr(ds, "downloadCount", 0) or 0)
    size = int(getattr(ds, "totalBytes", 0) or 0)
    last_updated = str(getattr(ds, "lastUpdated", ""))
    # Tag entries may be rich objects (with .name) or plain strings.
    raw_tags = getattr(ds, "tags", None) or []
    tags = [getattr(t, "name", str(t)) for t in raw_tags]

    return {
        "id": ref,
        "source": "kaggle",
        "name": title,
        "description": subtitle or title,
        "downloads": downloads,
        "likes": votes,
        "stars": 0,
        "tags": tags,
        "last_updated": last_updated,
        "task": "unknown",
        "domain": "unknown",
        "languages": [],
        "splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
        "license": {
            "id": "unknown",
            "name": "unknown",
            "category": "unknown",
            "usage_restrictions": [],
            "warnings": ["Kaggle license details may vary by dataset"],
        },
        "quality_score": 40,
        "quality_warnings": ["Review dataset card and competition rules before use"],
        "download_url": f"https://www.kaggle.com/datasets/{ref}",
        "format": None,
        "total_examples": 0,
        "total_size_bytes": size,
        "total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
        "columns": [],
        "is_structured": False,
        "has_target_column": False,
        "is_safe_source": True,
        "has_personal_data": False,
        "is_paywalled": False,
        "is_scraped_web_data": False,
        "uses_https": True,
        "has_train_split": False,
        "has_test_split": False,
        "has_validation_split": False,
        "description_length": len(subtitle or title),
        "has_readme": True,
    }
103
+
104
+
105
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Kaggle for datasets matching *query*.

    Returns {"ok": True, "results": [...], "count": N} with normalized
    metadata dicts, or an {"ok": False, "error": ...} dict on auth/API
    failure. *limit* is clamped to 1..100 (Kaggle's page-size ceiling).
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]
    # Clamp once and reuse for both page_size and the final slice: the
    # original clamped only page_size, so a zero/negative limit would
    # slice from the wrong end of the result list.
    capped = max(1, min(limit, 100))
    try:
        datasets = api.dataset_list(search=query, page_size=capped)
        items = [_dataset_to_dict(ds) for ds in list(datasets)[:capped]]
        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
117
+
118
+
119
def _pick_best_file(root: str) -> str:
    """Walk *root* recursively and return the most suitable data file.

    Preference order favours efficient/tabular formats: parquet, csv,
    jsonl, json, feather, arrow.

    Raises:
        RuntimeError: when no file with a recognized extension is found.
    """
    # Single source of truth for both candidate filtering and ranking
    # (the original duplicated this list in two places).
    priorities: List[str] = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
    recognized = tuple(priorities)

    candidates: List[str] = []
    for base, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(recognized):
                candidates.append(os.path.join(base, name))

    if not candidates:
        raise RuntimeError("No suitable data file found after download")

    # Return the first candidate in priority order of its extension.
    for ext in priorities:
        for c in candidates:
            if c.lower().endswith(ext):
                return c
    return candidates[0]
138
+
139
+
140
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download and unzip a Kaggle dataset into *target_dir*.

    *dataset_ref* may be an "owner/name" slug or a full kaggle.com URL.
    When *target_dir* is falsy a fresh temp directory is created.

    Returns {"ok": True, "dataset_id", "target_dir", "local_path"} on
    success, or an {"ok": False, "error": ...} dict with a user-actionable
    message for auth, rate-limit, and generic failures.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    if not target_dir:
        target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")

    os.makedirs(target_dir, exist_ok=True)

    try:
        if "kaggle.com/datasets/" in dataset_ref:
            dataset_ref = dataset_ref.split("kaggle.com/datasets/")[1].lstrip("/")
            # URLs copied from a browser often carry query/fragment parts
            # (e.g. "?select=train.csv") which the API rejects as part of
            # the owner/name slug — strip them.
            dataset_ref = dataset_ref.split("?")[0].split("#")[0].rstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
        best_file = _pick_best_file(target_dir)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": target_dir,
            "local_path": best_file,
        }
    except Exception as e:
        msg = str(e)
        if "401" in msg or "Unauthorized" in msg:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in msg or "Too Many Requests" in msg:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {msg}"}
172
+
173
+
174
def main():
    """CLI entry point. Emits exactly one JSON object on stdout.

    Usage:
        kaggle_engine.py discover <query> [limit]
        kaggle_engine.py download <dataset_ref> [target_dir]
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
        sys.exit(1)

    command = sys.argv[1]

    if command == "discover":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
            sys.exit(1)
        query = sys.argv[2]
        # A malformed limit must surface as JSON, not as a traceback,
        # because callers parse this script's stdout as JSON.
        try:
            limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit: {sys.argv[3]!r} (expected an integer)"}))
            sys.exit(1)
        print(json.dumps(discover(query, limit)))
        return

    if command == "download":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
            sys.exit(1)
        dataset_ref = sys.argv[2]
        target_dir = sys.argv[3] if len(sys.argv) > 3 else ""
        print(json.dumps(download(dataset_ref, target_dir)))
        return

    print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
    sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,54 @@
1
+ import sys
2
+ import json
3
+ import os
4
+
5
+ try:
6
+ import polars as pl
7
+ except Exception:
8
+ print(json.dumps({"ok": False, "error": "polars is required"}))
9
+ sys.exit(1)
10
+
11
+
12
def count_rows(path: str) -> int:
    """Return the number of rows in a data file.

    Uses polars lazy scans wherever the format allows, so large files are
    counted without being fully materialized in memory; falls back to an
    eager read for plain JSON and unknown extensions.
    """
    ext = os.path.splitext(path)[1].lower()

    # Formats with a lazy scanner: count via a streaming pl.len() query.
    lazy_scanners = {
        ".csv": lambda p: pl.scan_csv(p, ignore_errors=True),
        ".parquet": pl.scan_parquet,
        ".pq": pl.scan_parquet,
        ".feather": pl.scan_ipc,
        ".ftr": pl.scan_ipc,
        ".arrow": pl.scan_ipc,
        ".ipc": pl.scan_ipc,
        ".jsonl": pl.scan_ndjson,
        ".ndjson": pl.scan_ndjson,
    }

    scanner = lazy_scanners.get(ext)
    if scanner is not None:
        return int(scanner(path).select(pl.len()).collect().item())

    if ext == ".json":
        # Plain JSON arrays need an eager read; if that fails the file may
        # actually be line-delimited, so retry with an NDJSON scan.
        try:
            return int(pl.read_json(path).height)
        except Exception:
            return int(pl.scan_ndjson(path).select(pl.len()).collect().item())

    # Unknown extension: best-effort eager CSV parse.
    return int(pl.read_csv(path, ignore_errors=True).height)
33
+
34
+
35
def main():
    """CLI: print {"ok": true, "rows": N} for the given file, or a JSON error."""
    args = sys.argv
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    p = args[1]
    if not os.path.exists(p):
        print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
        sys.exit(1)

    try:
        print(json.dumps({"ok": True, "rows": count_rows(p)}))
    except Exception as e:
        # Any parse/IO failure is reported as JSON so callers never see a traceback.
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,89 @@
1
+ import os
2
+ import tempfile
3
+ import polars as pl
4
+ from fusion_engine import fuse_datasets
5
+
6
+
7
def run_basic_tests():
    """Smoke-test fuse_datasets end to end.

    Covers two scenarios:
      1. concat of two CSVs with an overlapping row (dedup on, leakage check on)
      2. inner join with conflicting column names (expects a conflict rename)

    Runs inside a private temporary directory so concurrent runs cannot
    collide on file names and all artifacts are removed afterwards (the
    original wrote fixed names into the shared temp dir and never cleaned up).
    """
    with tempfile.TemporaryDirectory(prefix="vesper_fusion_test_") as tmp:
        # ----- Test 1: concat -----
        p1 = os.path.join(tmp, "fuse_test_a.csv")
        p2 = os.path.join(tmp, "fuse_test_b.csv")
        out_concat = os.path.join(tmp, "fuse_test_concat.feather")

        df1 = pl.DataFrame({
            "id": [1, 2, 3],
            "text": ["a", "b", "c"],
            "price": [10.0, 20.0, 30.0],
        })
        # Row id=3 duplicates df1 so the dedup path is exercised.
        df2 = pl.DataFrame({
            "id": [4, 5, 3],
            "text": ["d", "e", "c"],
            "price": [40.0, 50.0, 30.0],
            "image_path": ["img1.jpg", "img2.jpg", "img3.jpg"],
        })

        df1.write_csv(p1)
        df2.write_csv(p2)

        concat_res = fuse_datasets(
            sources=[p1, p2],
            strategy="concat",
            dedup=True,
            run_quality_after=False,
            leakage_check=True,
            output_path=out_concat,
            output_format="feather",
            compression="lz4",
            preview=True,
            id_column="id",
        )

        assert concat_res.get("success") is True, f"Concat failed: {concat_res}"
        assert os.path.exists(out_concat), "Concat output file missing"

        # ----- Test 2: join with conflicting column names -----
        p3 = os.path.join(tmp, "fuse_test_c.csv")
        p4 = os.path.join(tmp, "fuse_test_d.csv")
        out_join = os.path.join(tmp, "fuse_test_join.parquet")

        # Both frames carry a "price" column, forcing a conflict rename.
        left = pl.DataFrame({
            "id": [1, 2, 3],
            "price": [100, 200, 300],
            "text": ["x", "y", "z"],
        })
        right = pl.DataFrame({
            "id": [2, 3, 4],
            "price": [999, 888, 777],
            "caption": ["two", "three", "four"],
        })

        left.write_csv(p3)
        right.write_csv(p4)

        join_res = fuse_datasets(
            sources=[p3, p4],
            strategy="join",
            join_on="id",
            how="inner",
            dedup=True,
            run_quality_after=False,
            leakage_check=False,
            output_path=out_join,
            output_format="parquet",
            compression="snappy",
            preview=True,
        )

        assert join_res.get("success") is True, f"Join failed: {join_res}"
        assert os.path.exists(out_join), "Join output file missing"
        assert len(join_res.get("stats", {}).get("conflict_renames", [])) >= 1, "Expected conflict rename for price column"

        print("✅ Fusion tests passed")
        print("Concat:", concat_res["stats"])
        print("Join:", join_res["stats"])


if __name__ == "__main__":
    run_basic_tests()
@@ -13,7 +13,7 @@ async function main() {
13
13
  // Filter to only new datasets
14
14
  const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
15
15
  console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
16
- const BATCH_SIZE = 50;
16
+ const BATCH_SIZE = 20;
17
17
  let processed = 0;
18
18
  for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
19
19
  const batch = toIndex.slice(i, i + BATCH_SIZE);
@@ -26,9 +26,9 @@ async function main() {
26
26
  `Languages: ${ds.languages?.join(", ") || ""}`,
27
27
  `Tags: ${ds.tags?.join(" ") || ""}`
28
28
  ].join(" ").slice(0, 1500));
29
- // Embed batch (Xenova supports array input)
30
- // Note: Parallelizing at the embed level is better for CPU utilization
31
- await Promise.all(batch.map(async (ds, idx) => {
29
+ // Memory-safe sequential embedding (avoids OOM on large libraries)
30
+ for (let idx = 0; idx < batch.length; idx++) {
31
+ const ds = batch[idx];
32
32
  try {
33
33
  const vector = await embedder.embed(texts[idx]);
34
34
  vectorStore.add(ds.id, vector);
@@ -36,7 +36,7 @@ async function main() {
36
36
  catch (err) {
37
37
  console.error(`Failed to index ${ds.id}:`, err);
38
38
  }
39
- }));
39
+ }
40
40
  processed += batch.length;
41
41
  if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
42
42
  console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
@@ -2,6 +2,19 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ // Common stop words to filter out for better search
6
+ const STOP_WORDS = new Set([
7
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
8
+ "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
9
+ "be", "have", "has", "had", "do", "does", "did", "will", "would",
10
+ "could", "should", "may", "might", "must", "shall", "can", "need",
11
+ "about", "into", "through", "during", "before", "after", "above",
12
+ "below", "between", "under", "again", "further", "then", "once",
13
+ "here", "there", "when", "where", "why", "how", "all", "each",
14
+ "few", "more", "most", "other", "some", "such", "no", "nor", "not",
15
+ "only", "own", "same", "so", "than", "too", "very", "just", "also",
16
+ "dataset", "datasets", "data", "find", "search", "get", "looking"
17
+ ]);
5
18
  /**
6
19
  * Just-In-Time Orchestrator
7
20
  * Automatically fetches and indexes new datasets when local search is insufficient
@@ -17,6 +30,24 @@ export class JITOrchestrator {
17
30
  this.vectorStore = vectorStore;
18
31
  this.embedder = embedder;
19
32
  }
33
+ /**
34
+ * Simplify a complex user query into keywords that work better with APIs
35
+ * HuggingFace and other APIs often fail on long multi-word queries
36
+ */
37
+ simplifyQuery(query) {
38
+ // Split into words, lowercase, remove punctuation
39
+ const words = query.toLowerCase()
40
+ .replace(/[^\w\s-]/g, "")
41
+ .split(/\s+/)
42
+ .filter(w => w.length > 2 && !STOP_WORDS.has(w));
43
+ // Return unique keywords (max 3 for API-friendly queries)
44
+ const unique = [...new Set(words)];
45
+ // If we have a lot of words, prioritize longer/more specific ones
46
+ if (unique.length > 3) {
47
+ unique.sort((a, b) => b.length - a.length);
48
+ }
49
+ return unique.slice(0, 3);
50
+ }
20
51
  /**
21
52
  * Check if JIT should be triggered based on rate limiting
22
53
  */
@@ -33,20 +64,37 @@ export class JITOrchestrator {
33
64
  async fetchAndIngest(query, limit = 10) {
34
65
  // Rate limiting check
35
66
  if (!this.canTrigger(query)) {
36
- console.error(`[JIT] Rate limit: Query "${query}" triggered too recently`);
67
+ console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
37
68
  return [];
38
69
  }
39
- console.error(`\n[JIT] Searching live sources for: "${query}"`);
70
+ console.error(`\n[JIT] 🔍 Searching live sources for: "${query}"`);
40
71
  this.lastTriggerTime.set(query, Date.now());
72
+ // Simplify query for better API results
73
+ const keywords = this.simplifyQuery(query);
74
+ if (keywords.length > 0) {
75
+ console.error(`[JIT] 🔑 Keywords extracted: ${keywords.join(", ")}`);
76
+ }
41
77
  const newDatasets = [];
42
78
  const existingIds = new Set();
79
+ const sourceResults = {};
43
80
  try {
44
81
  // Get existing dataset IDs to avoid duplicates
45
82
  const existing = this.metadataStore.getAllDatasets();
46
83
  existing.forEach(ds => existingIds.add(ds.id));
47
- // 1. Scrape HuggingFace (Open Access)
48
- const hfResults = await this.scrapeHuggingFace(query, limit);
49
- console.error(` HuggingFace: Found ${hfResults.length} datasets`);
84
+ // 1. Scrape HuggingFace - try each keyword separately for better results
85
+ let hfResults = [];
86
+ for (const keyword of keywords) {
87
+ if (hfResults.length >= limit)
88
+ break;
89
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
+ for (const ds of results) {
91
+ if (!hfResults.some(existing => existing.id === ds.id)) {
92
+ hfResults.push(ds);
93
+ }
94
+ }
95
+ }
96
+ sourceResults["HuggingFace"] = hfResults.length;
97
+ console.error(` 📦 HuggingFace: ${hfResults.length} datasets`);
50
98
  for (const ds of hfResults) {
51
99
  if (!existingIds.has(ds.id)) {
52
100
  newDatasets.push(ds);
@@ -55,7 +103,8 @@ export class JITOrchestrator {
55
103
  }
56
104
  // 2. Scrape UCI (Open Access)
57
105
  const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
58
- console.error(` UCI: Found ${uciResults.length} datasets`);
106
+ sourceResults["UCI"] = uciResults.length;
107
+ console.error(` 📦 UCI: ${uciResults.length} datasets`);
59
108
  for (const ds of uciResults) {
60
109
  if (!existingIds.has(ds.id)) {
61
110
  newDatasets.push(ds);
@@ -64,38 +113,49 @@ export class JITOrchestrator {
64
113
  }
65
114
  // 3. Scrape GitHub (Open Access)
66
115
  const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
67
- console.error(` GitHub: Found ${githubResults.length} datasets`);
116
+ sourceResults["GitHub"] = githubResults.length;
117
+ console.error(` 📦 GitHub: ${githubResults.length} datasets`);
68
118
  for (const ds of githubResults) {
69
119
  if (!existingIds.has(ds.id)) {
70
120
  newDatasets.push(ds);
71
121
  existingIds.add(ds.id);
72
122
  }
73
123
  }
74
- // 4. Scrape World Bank (Open Access)
124
+ // 4. Scrape World Bank (Open Access) - Economic/demographic data
75
125
  const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
76
- console.error(` World Bank: Found ${wbResults.length} datasets`);
126
+ sourceResults["WorldBank"] = wbResults.length;
127
+ console.error(` 📦 World Bank: ${wbResults.length} datasets`);
77
128
  for (const ds of wbResults) {
78
129
  if (!existingIds.has(ds.id)) {
79
130
  newDatasets.push(ds);
80
131
  existingIds.add(ds.id);
81
132
  }
82
133
  }
83
- // 5. Scrape NASA (Open Access)
134
+ // 5. Scrape NASA (Open Access) - Scientific/space data
84
135
  const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
85
- console.error(` NASA: Found ${nasaResults.length} datasets`);
136
+ sourceResults["NASA"] = nasaResults.length;
137
+ console.error(` 📦 NASA: ${nasaResults.length} datasets`);
86
138
  for (const ds of nasaResults) {
87
139
  if (!existingIds.has(ds.id)) {
88
140
  newDatasets.push(ds);
89
141
  existingIds.add(ds.id);
90
142
  }
91
143
  }
92
- // 3. Save and index new datasets
144
+ // Save and index new datasets
93
145
  if (newDatasets.length > 0) {
94
146
  await this.saveAndIndex(newDatasets);
95
- console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
147
+ console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
96
148
  }
97
149
  else {
98
- console.error(` [JIT] No new datasets found`);
150
+ // Provide helpful feedback when no results found
151
+ const allZero = Object.values(sourceResults).every(v => v === 0);
152
+ if (allZero) {
153
+ console.error(`[JIT] ⚠️ No datasets found across all sources.`);
154
+ console.error(`[JIT] 💡 Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
155
+ }
156
+ else {
157
+ console.error(`[JIT] ℹ️ All found datasets already in index`);
158
+ }
99
159
  }
100
160
  return newDatasets;
101
161
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -27,8 +27,14 @@
27
27
  "check-db": "tsx src/scripts/check-db.ts",
28
28
  "test-jit": "tsx src/scripts/test-jit.ts",
29
29
  "demo-ui": "tsx src/scripts/demo-ui.ts",
30
+ "fuse": "node build/index.js fuse",
31
+ "discover": "node build/index.js discover",
32
+ "download": "node build/index.js download",
33
+ "config": "node build/index.js config",
34
+ "test-fusion-engine": "py src/python/test_fusion_engine.py",
30
35
  "setup": "node build/index.js --setup",
31
36
  "setup:silent": "node build/index.js --setup --silent",
37
+ "refresh-index": "node scripts/refresh-index.cjs",
32
38
  "test": "vitest",
33
39
  "start": "node build/index.js"
34
40
  },
@@ -86,4 +92,4 @@
86
92
  "typescript": "^5.9.3",
87
93
  "vitest": "^4.0.17"
88
94
  }
89
- }
95
+ }
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { spawnSync } = require("child_process");
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+ const os = require("os");
7
+ const Database = require("better-sqlite3");
8
+
9
function runCommand(command, args, options = {}) {
  // Run a child process synchronously, inheriting stdio so progress is
  // visible in the terminal. Throws on any failure.
  const result = spawnSync(command, args, {
    stdio: "inherit",
    shell: process.platform === "win32",
    ...options,
  });

  if (result.error) {
    // spawnSync reports spawn-level failures (e.g. ENOENT: command not
    // found) via result.error with status === null; surface that message
    // explicitly instead of the misleading "exit null".
    throw new Error(`Command failed to start: ${command} ${args.join(" ")} (${result.error.message})`);
  }
  if (result.status !== 0) {
    throw new Error(`Command failed: ${command} ${args.join(" ")} (exit ${result.status})`);
  }
}
20
+
21
function countDatasets(dbPath) {
  // Count rows in the datasets table; "N/A" when the DB file is absent.
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Always release the handle, even when the table is missing/corrupt
    // and prepare()/get() throws (the original leaked it in that case).
    db.close();
  }
}
28
+
29
+ function countVectors(jsonPath) {
30
+ if (!fs.existsSync(jsonPath)) return "N/A";
31
+ const data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
32
+ if (typeof data.count === "number") return data.count;
33
+ if (Array.isArray(data.ids)) return data.ids.length;
34
+ return "N/A";
35
+ }
36
+
37
function syncRuntime(workspaceRoot) {
  // Copy the freshly built index artifacts from the workspace into the
  // runtime data directory under ~/.vesper, creating it if needed.
  // Throws when any expected artifact is missing from the workspace.
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  fs.mkdirSync(runtimeDir, { recursive: true });

  for (const file of ["metadata.db", "vectors.json", "vectors.bin"]) {
    const sourcePath = path.join(workspaceRoot, "data", file);
    if (!fs.existsSync(sourcePath)) {
      throw new Error(`Missing source file: ${sourcePath}`);
    }
    fs.copyFileSync(sourcePath, path.join(runtimeDir, file));
  }

  return runtimeDir;
}
53
+
54
function main() {
  // Orchestrate the full refresh: scrape -> index -> sync, then print a
  // summary of dataset/vector counts for both workspace and runtime copies.
  const workspaceRoot = process.cwd();
  const homeData = path.join(os.homedir(), ".vesper", "data");
  const workspaceData = path.join(workspaceRoot, "data");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // Indexing the full library needs a larger V8 heap than the default.
  runCommand("npm", ["run", "index"], {
    env: { ...process.env, NODE_OPTIONS: "--max-old-space-size=8192" },
  });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Gather all counts before declaring success, so a counting failure
  // aborts before the "Completed" banner is printed.
  const wsDb = countDatasets(path.join(workspaceData, "metadata.db"));
  const wsVec = countVectors(path.join(workspaceData, "vectors.json"));
  const rtDb = countDatasets(path.join(homeData, "metadata.db"));
  const rtVec = countVectors(path.join(homeData, "vectors.json"));

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
  console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}