@vespermcp/mcp-server 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +900 -50
- package/build/ingestion/hf-downloader.js +12 -3
- package/build/ingestion/ingestor.js +33 -9
- package/build/ingestion/kaggle-downloader.js +2 -2
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +72 -12
- package/build/tools/formatter.js +14 -14
- package/package.json +9 -3
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const { spawnSync } = require("child_process");
|
|
4
|
+
const fs = require("fs");
|
|
5
|
+
const path = require("path");
|
|
6
|
+
const os = require("os");
|
|
7
|
+
const Database = require("better-sqlite3");
|
|
8
|
+
|
|
9
|
+
/**
 * Run a command synchronously with inherited stdio and throw on any failure.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Argument list.
 * @param {object} [options] - Extra options forwarded to spawnSync (e.g. env).
 * @throws {Error} If the process could not be spawned or exited non-zero.
 */
function runCommand(command, args, options = {}) {
  const result = spawnSync(command, args, {
    stdio: "inherit",
    // npm and friends are .cmd shims on Windows and need a shell to resolve.
    shell: process.platform === "win32",
    ...options,
  });

  // A spawn failure (e.g. ENOENT) leaves status null and sets result.error;
  // surface the underlying cause instead of a misleading "exit null".
  if (result.error) {
    throw new Error(`Command failed to start: ${command} ${args.join(" ")}`, {
      cause: result.error,
    });
  }
  if (result.status !== 0) {
    throw new Error(`Command failed: ${command} ${args.join(" ")} (exit ${result.status})`);
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Count the rows of the `datasets` table in a SQLite index file.
 *
 * @param {string} dbPath - Path to the SQLite database file.
 * @returns {number|string} Row count, or "N/A" when the file does not exist.
 */
function countDatasets(dbPath) {
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Release the handle even if the table is missing or the file is corrupt;
    // the original leaked the connection on a query error.
    db.close();
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Read the vector count from a vectors.json artifact.
 *
 * @param {string} jsonPath - Path to the vectors JSON file.
 * @returns {number|string} `count` if present, else `ids.length`, else "N/A".
 */
function countVectors(jsonPath) {
  if (!fs.existsSync(jsonPath)) return "N/A";

  let data;
  try {
    data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
  } catch {
    // Corrupt or partially-written file: report as unavailable instead of
    // crashing the whole refresh summary (siblings return "N/A" too).
    return "N/A";
  }

  if (data && typeof data.count === "number") return data.count;
  if (data && Array.isArray(data.ids)) return data.ids.length;
  return "N/A";
}
|
|
36
|
+
|
|
37
|
+
/**
 * Copy the freshly built index artifacts from the workspace into the
 * per-user runtime directory (~/.vesper/data), creating it if needed.
 *
 * @param {string} workspaceRoot - Workspace root containing a ./data folder.
 * @returns {string} Absolute path of the runtime data directory.
 * @throws {Error} If any expected artifact is missing from the workspace.
 */
function syncRuntime(workspaceRoot) {
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  fs.mkdirSync(runtimeDir, { recursive: true });

  ["metadata.db", "vectors.json", "vectors.bin"].forEach((artifact) => {
    const source = path.join(workspaceRoot, "data", artifact);
    if (!fs.existsSync(source)) {
      throw new Error(`Missing source file: ${source}`);
    }
    fs.copyFileSync(source, path.join(runtimeDir, artifact));
  });

  return runtimeDir;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Orchestrate a full index refresh:
 *   1. run the "massive-scrape" npm script,
 *   2. run the "index" npm script with an enlarged V8 heap,
 *   3. copy the built artifacts into the ~/.vesper/data runtime directory,
 * then log dataset/vector counts for both copies as a sanity check.
 */
function main() {
  const workspaceRoot = process.cwd();
  const runtimeDbPath = path.join(os.homedir(), ".vesper", "data", "metadata.db");
  const runtimeVecPath = path.join(os.homedir(), ".vesper", "data", "vectors.json");
  const workspaceDbPath = path.join(workspaceRoot, "data", "metadata.db");
  const workspaceVecPath = path.join(workspaceRoot, "data", "vectors.json");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // Indexing is memory-hungry; raise the old-space limit to 8 GB for this step.
  const env = { ...process.env, NODE_OPTIONS: "--max-old-space-size=8192" };
  runCommand("npm", ["run", "index"], { env });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Report workspace vs runtime counts so a failed copy is visible in the log.
  const wsDb = countDatasets(workspaceDbPath);
  const wsVec = countVectors(workspaceVecPath);
  const rtDb = countDatasets(runtimeDbPath);
  const rtVec = countVectors(runtimeVecPath);

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
  console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

// Top-level driver: any step failure aborts with a non-zero exit code.
try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import json
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
import secrets
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, Optional
|
|
9
|
+
|
|
10
|
+
# Keyring service name under which all Vesper secrets are stored.
SERVICE_NAME = "vesper"

# Maps each logical key name to the environment variables that may override it
# (checked in order; the first non-empty value wins).
KEY_ALIASES = {
    "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
    "kaggle_username": ["KAGGLE_USERNAME"],
    "kaggle_key": ["KAGGLE_KEY"],
}
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import keyring # type: ignore
|
|
20
|
+
HAS_KEYRING = True
|
|
21
|
+
except Exception:
|
|
22
|
+
HAS_KEYRING = False
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from cryptography.fernet import Fernet, InvalidToken # type: ignore
|
|
26
|
+
HAS_FERNET = True
|
|
27
|
+
except Exception:
|
|
28
|
+
HAS_FERNET = False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _config_path() -> Path:
|
|
32
|
+
return Path.home() / ".vesper" / "config.toml"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _secret_path() -> Path:
|
|
36
|
+
return Path.home() / ".vesper" / ".config_key"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _ensure_parent(path: Path) -> None:
|
|
40
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _read_fallback_toml() -> Dict[str, str]:
    """Parse the fallback config.toml into a flat dict.

    Keys from the [keys] section are returned as-is; the meta ``method``
    value is exposed under the ``__method__`` key.  Returns an empty dict
    when the file does not exist.
    """
    config_file = _config_path()
    if not config_file.exists():
        return {}

    parsed: Dict[str, str] = {}
    inside_keys = False
    method_name = ""

    for raw_line in config_file.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if stripped.startswith("[") and stripped.endswith("]"):
            inside_keys = stripped == "[keys]"
            continue
        # NOTE: a "method = ..." assignment is treated as metadata no matter
        # which section it appears in (preserves the original behaviour).
        if stripped.startswith("method") and "=" in stripped:
            method_name = stripped.split("=", 1)[1].strip().strip('"').strip("'")
            continue
        if not inside_keys or "=" not in stripped:
            continue

        name, _, value = stripped.partition("=")
        parsed[name.strip()] = value.strip().strip('"').strip("'")

    if method_name:
        parsed["__method__"] = method_name

    return parsed
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _get_or_create_local_secret() -> str:
    """Return the machine-local secret, generating and persisting it on first use.

    The secret is the urlsafe-base64 encoding of 32 random bytes (the exact
    shape of a Fernet key), stored at ~/.vesper/.config_key with permissions
    tightened to 0600 where the platform allows it.
    """
    secret_file = _secret_path()
    _ensure_parent(secret_file)

    if not secret_file.exists():
        new_secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
        secret_file.write_text(new_secret, encoding="utf-8")
        try:
            # Best effort on POSIX; harmless failure elsewhere (e.g. Windows).
            os.chmod(secret_file, 0o600)
        except Exception:
            pass
        return new_secret

    return secret_file.read_text(encoding="utf-8").strip()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _xor_encrypt(plain: str, secret: str) -> str:
|
|
93
|
+
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
94
|
+
data = plain.encode("utf-8")
|
|
95
|
+
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
96
|
+
return base64.urlsafe_b64encode(out).decode("utf-8")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _xor_decrypt(cipher_text: str, secret: str) -> str:
|
|
100
|
+
key = hashlib.sha256(secret.encode("utf-8")).digest()
|
|
101
|
+
data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
|
|
102
|
+
out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
|
|
103
|
+
return out.decode("utf-8")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
    """Encrypt *value*, preferring Fernet and falling back to XOR obfuscation.

    Returns {"method": ..., "value": ...} so the caller can record which
    scheme was used alongside the ciphertext.
    """
    if not HAS_FERNET:
        # Weaker than Fernet, but still keeps the value out of plaintext.
        return {"method": "xor", "value": _xor_encrypt(value, secret)}

    token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8")).decode("utf-8")
    return {"method": "fernet", "value": token}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
    """Decrypt *value* that was stored with *method* ("fernet" or "xor").

    Returns None for an unknown method or any decryption failure.
    """
    try:
        if method == "fernet" and HAS_FERNET:
            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
        if method == "xor":
            return _xor_decrypt(value, secret)
        return None
    except Exception:
        # Bug fix: the previous code had an `except InvalidToken:` clause,
        # which raised NameError whenever the cryptography package was missing
        # (InvalidToken is then undefined) -- exactly the environment the xor
        # fallback serves.  InvalidToken is an Exception subclass, so this
        # single handler covers it without the name dependency.
        return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _write_fallback_toml(values: Dict[str, str]) -> None:
    """Serialise *values* to ~/.vesper/config.toml.

    Names starting with "__" are metadata and are not written under [keys];
    the encryption method is recorded in the [meta] section instead.
    """
    config_file = _config_path()
    _ensure_parent(config_file)

    method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
    lines = [
        "# Vesper optional API keys fallback storage",
        "# Encrypted fallback (keyring is preferred)",
        "[meta]",
        f'method = "{method}"',
        "[keys]",
    ]

    for name in sorted(values):
        if name.startswith("__"):
            continue
        # Escape embedded double quotes so the value stays a valid TOML string.
        escaped = str(values[name]).replace('"', '\\"')
        lines.append(f'{name} = "{escaped}"')

    config_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _get_from_env(name: str) -> Optional[str]:
    """Return the first non-empty environment variable aliased to *name*."""
    aliases = KEY_ALIASES.get(name, [])
    return next((os.getenv(alias) for alias in aliases if os.getenv(alias)), None)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_key(name: str) -> Optional[str]:
    """Resolve *name* from, in priority order: environment variables, the OS
    keyring, then the encrypted fallback file.

    Returns None when no source yields a value.
    """
    from_env = _get_from_env(name)
    if from_env:
        return from_env

    if HAS_KEYRING:
        try:
            stored = keyring.get_password(SERVICE_NAME, name)
        except Exception:
            # Broken/unavailable keyring backend: fall through to the file.
            stored = None
        if stored:
            return stored

    fallback = _read_fallback_toml()
    encrypted = fallback.get(name)
    if not encrypted:
        return None

    secret = _get_or_create_local_secret()
    method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
    return _decrypt_value(encrypted, method, secret)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def set_key(name: str, value: str) -> Dict[str, str]:
    """Store *value* under *name*, preferring the OS keyring.

    Falls back to the encrypted ~/.vesper/config.toml when keyring is
    unavailable.  The "ok" field is a string ("true"/"false") because the
    result is bridged to the Node side as flat JSON strings.
    """
    if not value:
        return {"ok": "false", "method": "none", "error": "Empty value"}

    if HAS_KEYRING:
        try:
            keyring.set_password(SERVICE_NAME, name, value)
        except Exception:
            # Broken keyring backend: use the encrypted file instead.
            pass
        else:
            return {"ok": "true", "method": "keyring"}

    stored = _read_fallback_toml()
    encrypted = _encrypt_value(value, _get_or_create_local_secret())
    stored["__method__"] = encrypted["method"]
    stored[name] = encrypted["value"]
    _write_fallback_toml(stored)
    return {"ok": "true", "method": f'toml:{encrypted["method"]}'}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def has_key(name: str) -> bool:
    """Return True if *name* resolves to a non-empty value from any source."""
    return bool(get_key(name))
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def get_all() -> Dict[str, Optional[str]]:
    """Resolve every known key via get_key (env > keyring > fallback file)."""
    return {name: get_key(name) for name in ("hf_token", "kaggle_username", "kaggle_key")}
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _print_json(data):
|
|
214
|
+
print(json.dumps(data))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def main() -> None:
    """CLI bridge: config.py <get|set|has|all> [name] [value].

    Always prints exactly one JSON object on stdout so the Node caller can
    parse the result; exits with status 1 on usage errors or unknown commands.
    """
    if len(sys.argv) < 2:
        _print_json({
            "ok": False,
            "error": "Usage: config.py <get|set|has|all> [name] [value]",
        })
        sys.exit(1)

    cmd = sys.argv[1].lower()

    # "all" takes no key name, so handle it before the name check below.
    if cmd == "all":
        _print_json({"ok": True, "data": get_all()})
        return

    if len(sys.argv) < 3:
        _print_json({"ok": False, "error": "Missing key name"})
        sys.exit(1)

    name = sys.argv[2]

    if cmd == "get":
        _print_json({"ok": True, "name": name, "value": get_key(name)})
        return

    if cmd == "has":
        _print_json({"ok": True, "name": name, "value": has_key(name)})
        return

    if cmd == "set":
        if len(sys.argv) < 4:
            _print_json({"ok": False, "error": "Missing value for set"})
            sys.exit(1)
        value = sys.argv[3]
        result = set_key(name, value)
        # set_key reports ok as the string "true"/"false"; convert to a real bool.
        _print_json({"ok": result.get("ok") == "true", "name": name, "method": result.get("method"), "error": result.get("error")})
        return

    _print_json({"ok": False, "error": f"Unknown command: {cmd}"})
    sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -2,25 +2,102 @@ import sys
|
|
|
2
2
|
import json
|
|
3
3
|
import polars as pl
|
|
4
4
|
import os
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
# Optional imports for extra formats
|
|
8
|
+
try:
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.feather as pf
|
|
11
|
+
HAS_PYARROW = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_PYARROW = False
|
|
5
14
|
|
|
6
|
-
# Optional TensorFlow import for TFRecord support
|
|
7
15
|
try:
|
|
8
16
|
import tensorflow as tf
|
|
9
17
|
HAS_TENSORFLOW = True
|
|
10
18
|
except ImportError:
|
|
11
19
|
HAS_TENSORFLOW = False
|
|
12
20
|
|
|
13
|
-
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Supported extensions: .csv, .parquet/.pq, .feather/.ftr/.arrow/.ipc,
    and .jsonl/.ndjson.  Honors two options: "columns" (projection to the
    listed column names, silently skipping unknown ones) and "sample_rows"
    with "random_seed" (deterministic down-sampling).

    Raises ValueError for unsupported extensions.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext in (".jsonl", ".ndjson"):
        # .ndjson is the same newline-delimited JSON format as .jsonl;
        # accepting it generalizes the loader at no cost.
        df = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    # Column selection (before sampling for speed)
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional deterministic sampling
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-scalar columns JSON-stringified so CSV export works."""
    simple_names = {"string", "utf8", "boolean", "bool"}

    for column in df.columns:
        dtype = df.schema[column]
        if dtype.is_numeric() or dtype.is_temporal() or str(dtype).lower() in simple_names:
            continue

        def _to_text(cell):
            # Nested values (lists/structs) expose to_list/to_dict hooks;
            # anything JSON cannot handle falls back to str().
            try:
                if hasattr(cell, "to_list"):
                    return json.dumps(cell.to_list())
                if hasattr(cell, "to_dict"):
                    return json.dumps(cell.to_dict())
                return json.dumps(cell)
            except Exception:
                return str(cell)

        df = df.with_columns(pl.col(column).map_elements(_to_text, return_dtype=pl.Utf8))

    return df
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write an up-to-*n*-row CSV preview alongside *output_path*; return its path."""
    base, _ = os.path.splitext(output_path)
    preview_path = base + "_preview.csv"
    rows = min(n, len(df))
    _safe_csv_df(df.head(rows)).write_csv(preview_path)
    return preview_path
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Main export function
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
|
|
14
95
|
options = options or {}
|
|
15
|
-
|
|
16
|
-
|
|
96
|
+
t0 = time.perf_counter()
|
|
97
|
+
|
|
98
|
+
# ---- Load ----
|
|
17
99
|
try:
|
|
18
|
-
|
|
19
|
-
df = pl.read_csv(file_path, ignore_errors=True)
|
|
20
|
-
elif file_path.endswith(".parquet"):
|
|
21
|
-
df = pl.read_parquet(file_path)
|
|
22
|
-
else:
|
|
23
|
-
return {"error": f"Unsupported input format: {file_path}"}
|
|
100
|
+
df = _load(file_path, options)
|
|
24
101
|
except Exception as e:
|
|
25
102
|
return {"error": f"Failed to load input file: {str(e)}"}
|
|
26
103
|
|
|
@@ -28,104 +105,123 @@ def export_data(file_path, output_path, format, options=None):
|
|
|
28
105
|
if output_dir and not os.path.exists(output_dir):
|
|
29
106
|
os.makedirs(output_dir, exist_ok=True)
|
|
30
107
|
|
|
108
|
+
preview_path = None
|
|
109
|
+
generate_preview = options.get("preview", False)
|
|
110
|
+
|
|
31
111
|
try:
|
|
32
|
-
#
|
|
33
|
-
if format == "
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return json.dumps(val.to_list())
|
|
47
|
-
if hasattr(val, "to_dict"):
|
|
48
|
-
return json.dumps(val.to_dict())
|
|
49
|
-
return json.dumps(val)
|
|
50
|
-
except:
|
|
51
|
-
return str(val)
|
|
52
|
-
df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
|
|
53
|
-
df.write_csv(output_path)
|
|
54
|
-
|
|
112
|
+
# ---- Feather (Arrow IPC) – fastest binary format ----
|
|
113
|
+
if format == "feather":
|
|
114
|
+
if not HAS_PYARROW:
|
|
115
|
+
return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
|
|
116
|
+
compression = options.get("compression", "lz4")
|
|
117
|
+
if compression in ("uncompressed", "none", "None", None):
|
|
118
|
+
compression = "uncompressed"
|
|
119
|
+
# Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
|
|
120
|
+
arrow_table = df.to_arrow()
|
|
121
|
+
pf.write_feather(arrow_table, output_path, compression=compression)
|
|
122
|
+
if generate_preview:
|
|
123
|
+
preview_path = _write_preview(df, output_path)
|
|
124
|
+
|
|
125
|
+
# ---- Parquet – best compression, big-data friendly ----
|
|
55
126
|
elif format == "parquet":
|
|
56
127
|
compression = options.get("compression", "snappy")
|
|
128
|
+
if compression in ("uncompressed", "none", "None", None):
|
|
129
|
+
compression = "uncompressed"
|
|
57
130
|
df.write_parquet(output_path, compression=compression)
|
|
58
|
-
|
|
131
|
+
if generate_preview:
|
|
132
|
+
preview_path = _write_preview(df, output_path)
|
|
133
|
+
|
|
134
|
+
# ---- CSV – human-readable fallback ----
|
|
135
|
+
elif format == "csv":
|
|
136
|
+
df = _safe_csv_df(df)
|
|
137
|
+
df.write_csv(output_path)
|
|
138
|
+
|
|
139
|
+
# ---- JSONL ----
|
|
59
140
|
elif format == "jsonl":
|
|
60
141
|
df.write_ndjson(output_path)
|
|
61
|
-
|
|
62
|
-
|
|
142
|
+
if generate_preview:
|
|
143
|
+
preview_path = _write_preview(df, output_path)
|
|
144
|
+
|
|
145
|
+
# ---- Arrow IPC (legacy name kept for compat) ----
|
|
146
|
+
elif format in ("arrow", "ipc"):
|
|
63
147
|
compression = options.get("compression", "uncompressed")
|
|
64
|
-
if compression == "uncompressed":
|
|
148
|
+
if compression == "uncompressed":
|
|
149
|
+
compression = None
|
|
65
150
|
df.write_ipc(output_path, compression=compression)
|
|
151
|
+
if generate_preview:
|
|
152
|
+
preview_path = _write_preview(df, output_path)
|
|
66
153
|
|
|
154
|
+
# ---- TFRecord ----
|
|
67
155
|
elif format == "tfrecord":
|
|
68
156
|
if not HAS_TENSORFLOW:
|
|
69
157
|
return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
|
|
70
|
-
|
|
71
|
-
# TFRecord Export Logic (using TensorFlow)
|
|
72
158
|
with tf.io.TFRecordWriter(output_path) as writer:
|
|
73
|
-
# Convert Polars -> Pandas for iteration (simpler for now)
|
|
74
|
-
# TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
|
|
75
159
|
pdf = df.to_pandas()
|
|
76
160
|
for _, row in pdf.iterrows():
|
|
77
161
|
feature = {}
|
|
78
162
|
for col, value in row.items():
|
|
79
163
|
if value is None:
|
|
80
164
|
continue
|
|
81
|
-
|
|
82
|
-
# Type inference for TFRecord features
|
|
83
165
|
if isinstance(value, int):
|
|
84
166
|
feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
|
|
85
167
|
elif isinstance(value, float):
|
|
86
168
|
feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
|
|
87
169
|
elif isinstance(value, str):
|
|
88
|
-
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode(
|
|
170
|
+
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
|
|
89
171
|
elif isinstance(value, bytes):
|
|
90
172
|
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
|
|
91
173
|
else:
|
|
92
|
-
|
|
93
|
-
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
|
|
94
|
-
|
|
174
|
+
feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
|
|
95
175
|
example = tf.train.Example(features=tf.train.Features(feature=feature))
|
|
96
176
|
writer.write(example.SerializeToString())
|
|
97
177
|
|
|
98
178
|
else:
|
|
99
179
|
return {"error": f"Unknown export format: {format}"}
|
|
100
180
|
|
|
101
|
-
|
|
181
|
+
elapsed = round(time.perf_counter() - t0, 3)
|
|
182
|
+
file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
|
|
183
|
+
|
|
184
|
+
result = {
|
|
102
185
|
"success": True,
|
|
103
186
|
"output_path": output_path,
|
|
104
187
|
"rows": len(df),
|
|
105
|
-
"
|
|
188
|
+
"columns": len(df.columns),
|
|
189
|
+
"format": format,
|
|
190
|
+
"compression": options.get("compression", "default"),
|
|
191
|
+
"file_size_mb": file_size_mb,
|
|
192
|
+
"elapsed_seconds": elapsed,
|
|
106
193
|
}
|
|
194
|
+
if preview_path:
|
|
195
|
+
result["preview_path"] = preview_path
|
|
196
|
+
|
|
197
|
+
return result
|
|
107
198
|
|
|
108
199
|
except Exception as e:
|
|
109
200
|
return {"error": f"Export failed: {str(e)}"}
|
|
110
201
|
|
|
202
|
+
|
|
111
203
|
def main():
    """CLI entry point: export_engine.py <input> <output> <format> [options_json]."""
    if len(sys.argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    input_file, output_file, fmt = sys.argv[1:4]

    options = {}
    if len(sys.argv) > 4:
        try:
            options = json.loads(sys.argv[4])
        except Exception:
            # Malformed options JSON is ignored; export runs with defaults.
            pass

    print(json.dumps(export_data(input_file, output_file, fmt, options)))


if __name__ == "__main__":
    main()
|