npm - @vespermcp/mcp-server - Versions diffs - 1.2.26 → 1.2.28 - Mend

@vespermcp/mcp-server 1.2.26 → 1.2.28

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +43 -0
package/build/index.js +904 -4
package/build/metadata/semantic-scholar-source.js +64 -11
package/build/python/normalize_schema_engine.py +224 -0
package/build/web/fusion-engine.js +3 -0
package/build/web/web-core.js +2 -0
package/package.json +6 -1
package/src/python/normalize_schema_engine.py +224 -0

package/build/metadata/semantic-scholar-source.js CHANGED Viewed

@@ -1,6 +1,19 @@
-import { rateLimitedFetch } from "./rate-limiter.js";
 import { CircuitBreaker } from "./circuit-breaker.js";
 import { estimateQualityScore } from "./quality.js";
+function sleep(ms) {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+}
+function parseRetryAfterMs(value) {
+    if (!value)
+        return null;
+    const asNum = Number(value);
+    if (Number.isFinite(asNum) && asNum >= 0)
+        return asNum * 1000;
+    const ts = Date.parse(value);
+    if (!Number.isFinite(ts))
+        return null;
+    return Math.max(0, ts - Date.now());
+}
 export class SemanticScholarSource {
     cache;
     breaker = new CircuitBreaker("semantic_scholar", {
@@ -19,12 +32,12 @@ export class SemanticScholarSource {
         const start = Date.now();
         const cleanQuery = String(query || "").trim();
         if (!cleanQuery)
-            return { results: [], cacheHit: false, latencyMs: Date.now() - start };
+            return { results: [], cacheHit: false, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
         const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
         const cacheKey = `webcore:semantic_scholar:discover:${cleanQuery.toLowerCase()}:limit=${perPage}`;
         const cached = await this.cache?.getJson(cacheKey);
         if (cached)
-            return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
+            return { results: cached, cacheHit: true, latencyMs: Date.now() - start, rateLimited: false, rateLimitAttempts: 0 };
         if (!this.breaker.canAttempt()) {
             throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
         }
@@ -43,16 +56,32 @@ export class SemanticScholarSource {
             "publicationTypes",
             "openAccessPdf",
         ].join(","));
-        const response = await rateLimitedFetch(url.toString(), {
-            headers: {
-                "Accept": "application/json",
-                "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
-            },
-        }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 }).catch((e) => {
+        const headers = {
+            "Accept": "application/json",
+            "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
+        };
+        // Handle S2 429s gracefully: respect Retry-After and avoid failing whole fusion runs.
+        const fetched = await this.fetchWith429Retry(url.toString(), headers).catch((e) => {
+            const msg = String(e?.message || e || "");
+            if (msg.includes("429")) {
+                this.breaker.onFailure();
+                return { response: null, rateLimited: true, attempts: 6 };
+            }
             this.breaker.onFailure();
             throw e;
         });
-        const data = await response.json().catch((e) => {
+        if (!fetched?.response) {
+            // Cache short empty result to avoid immediate retry storms on repeated identical queries.
+            await this.cache?.setJson(cacheKey, [], 120);
+            return {
+                results: [],
+                cacheHit: false,
+                latencyMs: Date.now() - start,
+                rateLimited: !!fetched?.rateLimited,
+                rateLimitAttempts: fetched?.attempts ?? 0,
+            };
+        }
+        const data = await fetched.response.json().catch((e) => {
             this.breaker.onFailure();
             throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
         });
@@ -60,7 +89,31 @@ export class SemanticScholarSource {
         const result = papers.map((p) => this.toDatasetMetadata(p)).filter(Boolean);
         this.breaker.onSuccess();
         await this.cache?.setJson(cacheKey, result, 86400); // 24h
-        return { results: result, cacheHit: false, latencyMs: Date.now() - start };
+        return {
+            results: result,
+            cacheHit: false,
+            latencyMs: Date.now() - start,
+            rateLimited: false,
+            rateLimitAttempts: fetched.attempts,
+        };
+    }
+    async fetchWith429Retry(url, headers) {
+        const maxAttempts = 6;
+        for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+            const response = await fetch(url, { headers });
+            if (response.ok)
+                return { response, rateLimited: false, attempts: attempt };
+            if (response.status !== 429) {
+                const error = new Error(`HTTP error: ${response.status}`);
+                error.status = response.status;
+                throw error;
+            }
+            const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
+            const backoffMs = Math.min(30000, 1500 * Math.pow(2, attempt - 1));
+            const jitterMs = Math.floor(Math.random() * 400);
+            await sleep((retryAfterMs ?? backoffMs) + jitterMs);
+        }
+        return { response: null, rateLimited: true, attempts: maxAttempts };
     }
     toDatasetMetadata(paper) {
         const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();

package/build/python/normalize_schema_engine.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
+Usage:
+  normalize_schema_engine.py <input_path> <output_path> [options_json]
+options_json (optional):
+  {
+    "flatten_metadata_json": true,
+    "max_keys": 200,
+    "extras_mode": "blob" | "drop"
+  }
+Outputs JSON:
+  {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
+or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+import re
+from typing import Any, Dict, List, Tuple
+def _safe_col(name: str) -> str:
+    s = str(name or "").strip()
+    s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
+    s = re.sub(r"_+", "_", s).strip("_")
+    if not s:
+        return "unknown"
+    if len(s) > 64:
+        s = s[:64]
+    return s
+def _coerce_cell(v: Any) -> Any:
+    if v is None:
+        return None
+    if isinstance(v, (str, int, float, bool)):
+        return v
+    # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
+    try:
+        return json.dumps(v, ensure_ascii=False)
+    except Exception:
+        return str(v)
+def _load_records(src: str) -> List[Dict[str, Any]]:
+    ext = os.path.splitext(src)[1].lower()
+    if ext in (".jsonl", ".ndjson"):
+        rows: List[Dict[str, Any]] = []
+        with open(src, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if isinstance(obj, dict):
+                    rows.append(obj)
+        return rows
+    raw = open(src, "r", encoding="utf-8").read().strip()
+    if not raw:
+        return []
+    obj = json.loads(raw)
+    if isinstance(obj, list):
+        return [r for r in obj if isinstance(r, dict)]
+    if isinstance(obj, dict):
+        for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
+            v = obj.get(key)
+            if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
+                return [r for r in v if isinstance(r, dict)]
+        # Sometimes the dict itself is the record
+        return [obj]
+    return []
+def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
+    keys: List[str] = []
+    seen = set()
+    for r in records:
+        mj = r.get("metadata_json")
+        if not isinstance(mj, dict):
+            continue
+        for k in mj.keys():
+            col = f"metadata__{_safe_col(k)}"
+            if col in seen:
+                continue
+            seen.add(col)
+            keys.append(col)
+            if len(keys) >= max_keys:
+                return keys
+    return keys
+def _normalize_records(
+    records: List[Dict[str, Any]],
+    flat_keys: List[str],
+    extras_mode: str,
+) -> Tuple[List[Dict[str, Any]], int]:
+    flat_set = set(flat_keys)
+    extras_count = 0
+    out: List[Dict[str, Any]] = []
+    for r in records:
+        base: Dict[str, Any] = {}
+        # Keep top-level stable fields as-is.
+        for k in (
+            "source_type",
+            "source_url",
+            "content",
+            "quality_score",
+            "collected_at",
+            "content_type",
+        ):
+            if k in r:
+                base[k] = _coerce_cell(r.get(k))
+        # Preserve source_chain as a JSON string (it is nested).
+        if "source_chain" in r:
+            base["source_chain"] = _coerce_cell(r.get("source_chain"))
+        mj = r.get("metadata_json")
+        extras: Dict[str, Any] = {}
+        if isinstance(mj, dict):
+            for k, v in mj.items():
+                col = f"metadata__{_safe_col(k)}"
+                if col in flat_set:
+                    base[col] = _coerce_cell(v)
+                else:
+                    extras[k] = v
+        # Fill missing flattened keys with nulls for uniform schema.
+        for col in flat_keys:
+            if col not in base:
+                base[col] = None
+        if extras:
+            extras_count += 1
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = _coerce_cell(extras)
+            # extras_mode == "drop" => ignore
+        else:
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = None
+        out.append(base)
+    return out, extras_count
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
+        sys.exit(1)
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"
+    if not os.path.exists(input_path):
+        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
+        sys.exit(1)
+    try:
+        options = json.loads(options_raw) if options_raw else {}
+    except Exception:
+        options = {}
+    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
+    max_keys = int(options.get("max_keys", 200) or 200)
+    max_keys = max(0, min(2000, max_keys))
+    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
+    if extras_mode not in ("blob", "drop"):
+        extras_mode = "blob"
+    try:
+        records = _load_records(input_path)
+        if not records:
+            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("")
+            print(json.dumps({"ok": True, "output_path": output_path, "rows": 0, "columns": 0, "flattened_keys": 0, "extras_mode": extras_mode}))
+            return
+        flat_keys: List[str] = []
+        if flatten_metadata_json and max_keys > 0:
+            flat_keys = _gather_flat_keys(records, max_keys)
+        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)
+        ext = os.path.splitext(output_path)[1].lower()
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        if ext in (".jsonl", ".ndjson"):
+            with open(output_path, "w", encoding="utf-8") as f:
+                for r in normalized:
+                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
+        elif ext == ".json":
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(normalized, f, ensure_ascii=False)
+        else:
+            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")
+        columns = len(normalized[0].keys()) if normalized else 0
+        print(json.dumps({
+            "ok": True,
+            "output_path": output_path,
+            "rows": len(normalized),
+            "columns": columns,
+            "flattened_keys": len(flat_keys),
+            "extras_mode": extras_mode,
+            "extras_rows": extras_rows,
+        }))
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": str(e)}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

package/build/web/fusion-engine.js CHANGED Viewed

@@ -176,6 +176,7 @@ export class WebFusionEngine {
                 let docs = [];
                 let cacheHit = false;
                 let latencyMs = 0;
+                let rateLimited = false;
                 if (spec.type === "s3") {
                     const out = await this.collectFromS3(spec);
                     docs = out.docs;
@@ -194,6 +195,7 @@ export class WebFusionEngine {
                     const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
                     cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
                     latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
+                    rateLimited = perSrcTel ? !!perSrcTel.rate_limited : false;
                 }
                 const filtered = spec.min_stars !== undefined
                     ? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
@@ -204,6 +206,7 @@ export class WebFusionEngine {
                     cache_hit: cacheHit,
                     latency_ms: latencyMs || (Date.now() - start),
                     result_count: filtered.length,
+                    ...(spec.type === "s3" ? {} : { rate_limited: rateLimited }),
                 });
             }
             catch (e) {

package/build/web/web-core.js CHANGED Viewed

@@ -87,6 +87,7 @@ export class WebCoreEngine {
                         cache_hit: out.cacheHit,
                         latency_ms: out.latencyMs || (Date.now() - t0),
                         result_count: docs.length,
+                        rate_limited: !!out.rateLimited,
                     });
                 }
                 catch (e) {
@@ -96,6 +97,7 @@ export class WebCoreEngine {
                         latency_ms: Date.now() - t0,
                         result_count: 0,
                         error: e?.message || String(e),
+                        rate_limited: String(e?.message || "").includes("429"),
                     });
                 }
             }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.26",
+  "version": "1.2.28",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",
@@ -37,6 +37,7 @@
     "setup": "node build/index.js --setup",
     "setup:silent": "node build/index.js --setup --silent",
     "refresh-index": "node scripts/refresh-index.cjs",
+    "telemetry:receiver": "tsx telemetry/lineage-receiver.ts",
     "test": "vitest",
     "start": "node build/index.js"
   },
@@ -79,9 +80,13 @@
     "ajv": "^8.17.1",
     "ajv-formats": "^3.0.1",
     "better-sqlite3": "^12.6.0",
+    "chalk": "^5.6.2",
+    "cli-table3": "^0.6.5",
+    "express": "^5.1.0",
     "inquirer": "^13.3.0",
     "lodash": "^4.17.21",
     "pdf-parse": "^2.4.5",
+    "pg": "^8.16.3",
     "uuid": "^13.0.0",
     "zod": "^4.3.5",
     "zod-to-json-schema": "^3.25.1"

package/src/python/normalize_schema_engine.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Normalize schema for semi-structured JSON/JSONL exports so they can be converted to Parquet safely.
+Usage:
+  normalize_schema_engine.py <input_path> <output_path> [options_json]
+options_json (optional):
+  {
+    "flatten_metadata_json": true,
+    "max_keys": 200,
+    "extras_mode": "blob" | "drop"
+  }
+Outputs JSON:
+  {"ok": true, "output_path": "...", "rows": N, "columns": M, "flattened_keys": K, "extras_mode": "..."}
+or {"ok": false, "error": "..."}
+"""
+import sys
+import json
+import os
+import re
+from typing import Any, Dict, List, Tuple
+def _safe_col(name: str) -> str:
+    s = str(name or "").strip()
+    s = re.sub(r"[^a-zA-Z0-9_]", "_", s)
+    s = re.sub(r"_+", "_", s).strip("_")
+    if not s:
+        return "unknown"
+    if len(s) > 64:
+        s = s[:64]
+    return s
+def _coerce_cell(v: Any) -> Any:
+    if v is None:
+        return None
+    if isinstance(v, (str, int, float, bool)):
+        return v
+    # Keep nested values as JSON strings so downstream Parquet has stable scalar columns.
+    try:
+        return json.dumps(v, ensure_ascii=False)
+    except Exception:
+        return str(v)
+def _load_records(src: str) -> List[Dict[str, Any]]:
+    ext = os.path.splitext(src)[1].lower()
+    if ext in (".jsonl", ".ndjson"):
+        rows: List[Dict[str, Any]] = []
+        with open(src, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                if isinstance(obj, dict):
+                    rows.append(obj)
+        return rows
+    raw = open(src, "r", encoding="utf-8").read().strip()
+    if not raw:
+        return []
+    obj = json.loads(raw)
+    if isinstance(obj, list):
+        return [r for r in obj if isinstance(r, dict)]
+    if isinstance(obj, dict):
+        for key in ("results", "rows", "items", "records", "data", "entries", "samples"):
+            v = obj.get(key)
+            if isinstance(v, list) and (len(v) == 0 or isinstance(v[0], dict)):
+                return [r for r in v if isinstance(r, dict)]
+        # Sometimes the dict itself is the record
+        return [obj]
+    return []
+def _gather_flat_keys(records: List[Dict[str, Any]], max_keys: int) -> List[str]:
+    keys: List[str] = []
+    seen = set()
+    for r in records:
+        mj = r.get("metadata_json")
+        if not isinstance(mj, dict):
+            continue
+        for k in mj.keys():
+            col = f"metadata__{_safe_col(k)}"
+            if col in seen:
+                continue
+            seen.add(col)
+            keys.append(col)
+            if len(keys) >= max_keys:
+                return keys
+    return keys
+def _normalize_records(
+    records: List[Dict[str, Any]],
+    flat_keys: List[str],
+    extras_mode: str,
+) -> Tuple[List[Dict[str, Any]], int]:
+    flat_set = set(flat_keys)
+    extras_count = 0
+    out: List[Dict[str, Any]] = []
+    for r in records:
+        base: Dict[str, Any] = {}
+        # Keep top-level stable fields as-is.
+        for k in (
+            "source_type",
+            "source_url",
+            "content",
+            "quality_score",
+            "collected_at",
+            "content_type",
+        ):
+            if k in r:
+                base[k] = _coerce_cell(r.get(k))
+        # Preserve source_chain as a JSON string (it is nested).
+        if "source_chain" in r:
+            base["source_chain"] = _coerce_cell(r.get("source_chain"))
+        mj = r.get("metadata_json")
+        extras: Dict[str, Any] = {}
+        if isinstance(mj, dict):
+            for k, v in mj.items():
+                col = f"metadata__{_safe_col(k)}"
+                if col in flat_set:
+                    base[col] = _coerce_cell(v)
+                else:
+                    extras[k] = v
+        # Fill missing flattened keys with nulls for uniform schema.
+        for col in flat_keys:
+            if col not in base:
+                base[col] = None
+        if extras:
+            extras_count += 1
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = _coerce_cell(extras)
+            # extras_mode == "drop" => ignore
+        else:
+            if extras_mode == "blob":
+                base["metadata_json_blob"] = None
+        out.append(base)
+    return out, extras_count
+def main():
+    if len(sys.argv) < 3:
+        print(json.dumps({"ok": False, "error": "Usage: normalize_schema_engine.py <input> <output> [options_json]"}))
+        sys.exit(1)
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    options_raw = sys.argv[3] if len(sys.argv) >= 4 else "{}"
+    if not os.path.exists(input_path):
+        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
+        sys.exit(1)
+    try:
+        options = json.loads(options_raw) if options_raw else {}
+    except Exception:
+        options = {}
+    flatten_metadata_json = options.get("flatten_metadata_json", True) is True
+    max_keys = int(options.get("max_keys", 200) or 200)
+    max_keys = max(0, min(2000, max_keys))
+    extras_mode = str(options.get("extras_mode", "blob") or "blob").lower()
+    if extras_mode not in ("blob", "drop"):
+        extras_mode = "blob"
+    try:
+        records = _load_records(input_path)
+        if not records:
+            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("")
+            print(json.dumps({"ok": True, "output_path": output_path, "rows": 0, "columns": 0, "flattened_keys": 0, "extras_mode": extras_mode}))
+            return
+        flat_keys: List[str] = []
+        if flatten_metadata_json and max_keys > 0:
+            flat_keys = _gather_flat_keys(records, max_keys)
+        normalized, extras_rows = _normalize_records(records, flat_keys, extras_mode)
+        ext = os.path.splitext(output_path)[1].lower()
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        if ext in (".jsonl", ".ndjson"):
+            with open(output_path, "w", encoding="utf-8") as f:
+                for r in normalized:
+                    f.write(json.dumps(r, ensure_ascii=False) + "\n")
+        elif ext == ".json":
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(normalized, f, ensure_ascii=False)
+        else:
+            raise ValueError(f"Unsupported output format: {ext}. Use .jsonl or .json")
+        columns = len(normalized[0].keys()) if normalized else 0
+        print(json.dumps({
+            "ok": True,
+            "output_path": output_path,
+            "rows": len(normalized),
+            "columns": columns,
+            "flattened_keys": len(flat_keys),
+            "extras_mode": extras_mode,
+            "extras_rows": extras_rows,
+        }))
+    except Exception as e:
+        print(json.dumps({"ok": False, "error": str(e)}))
+        sys.exit(1)
+if __name__ == "__main__":
+    main()