@vespermcp/mcp-server 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ import sys
2
+ import os
3
+ import json
4
+ import tempfile
5
+ from typing import Dict, Any, List
6
+ from config import get_all
7
+
8
+ try:
9
+ from kaggle.api.kaggle_api_extended import KaggleApi
10
+ HAS_KAGGLE = True
11
+ except Exception:
12
+ HAS_KAGGLE = False
13
+
14
+
15
def _ensure_auth() -> Dict[str, Any]:
    """Authenticate against the Kaggle API.

    Credentials are sourced in priority order:
      1) Existing env vars
      2) secure local store (keyring or ~/.vesper/config.toml)
      3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()

    Returns {"ok": True, "api": <KaggleApi>} on success, otherwise an
    {"ok": False, "error": ...} dict with a user-actionable message.
    """
    if not HAS_KAGGLE:
        return {
            "ok": False,
            "error": "kaggle package not installed. Install with: pip install kaggle",
        }

    # Only consult the local store when the env vars are not already set.
    env_ready = os.getenv("KAGGLE_USERNAME") and os.getenv("KAGGLE_KEY")
    if not env_ready:
        stored = get_all()
        username = stored.get("kaggle_username")
        key = stored.get("kaggle_key")
        if username and key:
            os.environ["KAGGLE_USERNAME"] = username
            os.environ["KAGGLE_KEY"] = key

    api = KaggleApi()
    try:
        api.authenticate()
    except Exception as exc:
        # authenticate() also falls back to ~/.kaggle/kaggle.json; if it
        # still fails there are genuinely no usable credentials.
        return {
            "ok": False,
            "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
            "details": str(exc),
        }

    return {"ok": True, "api": api}
43
+
44
+
45
def _dataset_to_dict(ds) -> Dict[str, Any]:
    """Normalize a Kaggle dataset object into Vesper's common metadata dict.

    Kaggle API object fields differ by library version, so every attribute
    is read defensively via getattr. Fields Kaggle does not expose here
    (task, domain, columns, split counts, ...) are filled with conservative
    placeholders so downstream scoring treats all sources uniformly.
    (Fix: the original also computed an ``owner`` value that was never used.)
    """
    ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
    title = getattr(ds, "title", None) or ref
    subtitle = getattr(ds, "subtitle", None) or ""
    votes = int(getattr(ds, "voteCount", 0) or 0)
    downloads = int(getattr(ds, "downloadCount", 0) or 0)
    size = int(getattr(ds, "totalBytes", 0) or 0)
    last_updated = str(getattr(ds, "lastUpdated", ""))
    # Tag entries may be rich objects (with .name) or plain strings.
    raw_tags = getattr(ds, "tags", None) or []
    tags = [getattr(t, "name", str(t)) for t in raw_tags]

    return {
        "id": ref,
        "source": "kaggle",
        "name": title,
        "description": subtitle or title,
        "downloads": downloads,
        "likes": votes,
        "stars": 0,
        "tags": tags,
        "last_updated": last_updated,
        "task": "unknown",
        "domain": "unknown",
        "languages": [],
        "splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
        "license": {
            "id": "unknown",
            "name": "unknown",
            "category": "unknown",
            "usage_restrictions": [],
            "warnings": ["Kaggle license details may vary by dataset"],
        },
        "quality_score": 40,
        "quality_warnings": ["Review dataset card and competition rules before use"],
        "download_url": f"https://www.kaggle.com/datasets/{ref}",
        "format": None,
        "total_examples": 0,
        "total_size_bytes": size,
        "total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
        "columns": [],
        "is_structured": False,
        "has_target_column": False,
        "is_safe_source": True,
        "has_personal_data": False,
        "is_paywalled": False,
        "is_scraped_web_data": False,
        "uses_https": True,
        "has_train_split": False,
        "has_test_split": False,
        "has_validation_split": False,
        "description_length": len(subtitle or title),
        "has_readme": True,
    }
103
+
104
+
105
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Kaggle for datasets matching *query*.

    Returns {"ok": True, "results": [...], "count": N} with normalized
    metadata dicts, or an {"ok": False, "error": ...} dict on auth/API
    failure. *limit* is clamped to 1..100 (Kaggle's page-size ceiling).
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]
    # Clamp once and reuse for both page_size and the final slice: the
    # original clamped only page_size, so a zero/negative limit would
    # slice from the wrong end of the result list.
    capped = max(1, min(limit, 100))
    try:
        datasets = api.dataset_list(search=query, page_size=capped)
        items = [_dataset_to_dict(ds) for ds in list(datasets)[:capped]]
        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
117
+
118
+
119
def _pick_best_file(root: str) -> str:
    """Walk *root* recursively and return the most suitable data file.

    Preference order favours efficient/tabular formats: parquet, csv,
    jsonl, json, feather, arrow.

    Raises:
        RuntimeError: when no file with a recognized extension is found.
    """
    # Single source of truth for both candidate filtering and ranking
    # (the original duplicated this list in two places).
    priorities: List[str] = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
    recognized = tuple(priorities)

    candidates: List[str] = []
    for base, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(recognized):
                candidates.append(os.path.join(base, name))

    if not candidates:
        raise RuntimeError("No suitable data file found after download")

    # Return the first candidate in priority order of its extension.
    for ext in priorities:
        for c in candidates:
            if c.lower().endswith(ext):
                return c
    return candidates[0]
138
+
139
+
140
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download and unzip a Kaggle dataset into *target_dir*.

    *dataset_ref* may be an "owner/name" slug or a full kaggle.com URL.
    When *target_dir* is falsy a fresh temp directory is created.

    Returns {"ok": True, "dataset_id", "target_dir", "local_path"} on
    success, or an {"ok": False, "error": ...} dict with a user-actionable
    message for auth, rate-limit, and generic failures.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    if not target_dir:
        target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")

    os.makedirs(target_dir, exist_ok=True)

    try:
        if "kaggle.com/datasets/" in dataset_ref:
            dataset_ref = dataset_ref.split("kaggle.com/datasets/")[1].lstrip("/")
            # URLs copied from a browser often carry query/fragment parts
            # (e.g. "?select=train.csv") which the API rejects as part of
            # the owner/name slug — strip them.
            dataset_ref = dataset_ref.split("?")[0].split("#")[0].rstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
        best_file = _pick_best_file(target_dir)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": target_dir,
            "local_path": best_file,
        }
    except Exception as e:
        msg = str(e)
        if "401" in msg or "Unauthorized" in msg:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in msg or "Too Many Requests" in msg:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {msg}"}
172
+
173
+
174
def main():
    """CLI entry point. Emits exactly one JSON object on stdout.

    Usage:
        kaggle_engine.py discover <query> [limit]
        kaggle_engine.py download <dataset_ref> [target_dir]
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
        sys.exit(1)

    command = sys.argv[1]

    if command == "discover":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
            sys.exit(1)
        query = sys.argv[2]
        # A malformed limit must surface as JSON, not as a traceback,
        # because callers parse this script's stdout as JSON.
        try:
            limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit: {sys.argv[3]!r} (expected an integer)"}))
            sys.exit(1)
        print(json.dumps(discover(query, limit)))
        return

    if command == "download":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
            sys.exit(1)
        dataset_ref = sys.argv[2]
        target_dir = sys.argv[3] if len(sys.argv) > 3 else ""
        print(json.dumps(download(dataset_ref, target_dir)))
        return

    print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
    sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,54 @@
1
+ import sys
2
+ import json
3
+ import os
4
+
5
+ try:
6
+ import polars as pl
7
+ except Exception:
8
+ print(json.dumps({"ok": False, "error": "polars is required"}))
9
+ sys.exit(1)
10
+
11
+
12
def count_rows(path: str) -> int:
    """Return the number of rows in a data file.

    Uses polars lazy scans wherever the format allows, so large files are
    counted without being fully materialized in memory; falls back to an
    eager read for plain JSON and unknown extensions.
    """
    ext = os.path.splitext(path)[1].lower()

    # Formats with a lazy scanner: count via a streaming pl.len() query.
    lazy_scanners = {
        ".csv": lambda p: pl.scan_csv(p, ignore_errors=True),
        ".parquet": pl.scan_parquet,
        ".pq": pl.scan_parquet,
        ".feather": pl.scan_ipc,
        ".ftr": pl.scan_ipc,
        ".arrow": pl.scan_ipc,
        ".ipc": pl.scan_ipc,
        ".jsonl": pl.scan_ndjson,
        ".ndjson": pl.scan_ndjson,
    }

    scanner = lazy_scanners.get(ext)
    if scanner is not None:
        return int(scanner(path).select(pl.len()).collect().item())

    if ext == ".json":
        # Plain JSON arrays need an eager read; if that fails the file may
        # actually be line-delimited, so retry with an NDJSON scan.
        try:
            return int(pl.read_json(path).height)
        except Exception:
            return int(pl.scan_ndjson(path).select(pl.len()).collect().item())

    # Unknown extension: best-effort eager CSV parse.
    return int(pl.read_csv(path, ignore_errors=True).height)
33
+
34
+
35
def main():
    """CLI: print {"ok": true, "rows": N} for the given file, or a JSON error."""
    args = sys.argv
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    p = args[1]
    if not os.path.exists(p):
        print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
        sys.exit(1)

    try:
        print(json.dumps({"ok": True, "rows": count_rows(p)}))
    except Exception as e:
        # Any parse/IO failure is reported as JSON so callers never see a traceback.
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,89 @@
1
+ import os
2
+ import tempfile
3
+ import polars as pl
4
+ from fusion_engine import fuse_datasets
5
+
6
+
7
def run_basic_tests():
    """Smoke-test fuse_datasets end to end.

    Covers two scenarios:
      1. concat of two CSVs with an overlapping row (dedup on, leakage check on)
      2. inner join with conflicting column names (expects a conflict rename)

    Runs inside a private temporary directory so concurrent runs cannot
    collide on file names and all artifacts are removed afterwards (the
    original wrote fixed names into the shared temp dir and never cleaned up).
    """
    with tempfile.TemporaryDirectory(prefix="vesper_fusion_test_") as tmp:
        # ----- Test 1: concat -----
        p1 = os.path.join(tmp, "fuse_test_a.csv")
        p2 = os.path.join(tmp, "fuse_test_b.csv")
        out_concat = os.path.join(tmp, "fuse_test_concat.feather")

        df1 = pl.DataFrame({
            "id": [1, 2, 3],
            "text": ["a", "b", "c"],
            "price": [10.0, 20.0, 30.0],
        })
        # Row id=3 duplicates df1 so the dedup path is exercised.
        df2 = pl.DataFrame({
            "id": [4, 5, 3],
            "text": ["d", "e", "c"],
            "price": [40.0, 50.0, 30.0],
            "image_path": ["img1.jpg", "img2.jpg", "img3.jpg"],
        })

        df1.write_csv(p1)
        df2.write_csv(p2)

        concat_res = fuse_datasets(
            sources=[p1, p2],
            strategy="concat",
            dedup=True,
            run_quality_after=False,
            leakage_check=True,
            output_path=out_concat,
            output_format="feather",
            compression="lz4",
            preview=True,
            id_column="id",
        )

        assert concat_res.get("success") is True, f"Concat failed: {concat_res}"
        assert os.path.exists(out_concat), "Concat output file missing"

        # ----- Test 2: join with conflicting column names -----
        p3 = os.path.join(tmp, "fuse_test_c.csv")
        p4 = os.path.join(tmp, "fuse_test_d.csv")
        out_join = os.path.join(tmp, "fuse_test_join.parquet")

        # Both frames carry a "price" column, forcing a conflict rename.
        left = pl.DataFrame({
            "id": [1, 2, 3],
            "price": [100, 200, 300],
            "text": ["x", "y", "z"],
        })
        right = pl.DataFrame({
            "id": [2, 3, 4],
            "price": [999, 888, 777],
            "caption": ["two", "three", "four"],
        })

        left.write_csv(p3)
        right.write_csv(p4)

        join_res = fuse_datasets(
            sources=[p3, p4],
            strategy="join",
            join_on="id",
            how="inner",
            dedup=True,
            run_quality_after=False,
            leakage_check=False,
            output_path=out_join,
            output_format="parquet",
            compression="snappy",
            preview=True,
        )

        assert join_res.get("success") is True, f"Join failed: {join_res}"
        assert os.path.exists(out_join), "Join output file missing"
        assert len(join_res.get("stats", {}).get("conflict_renames", [])) >= 1, "Expected conflict rename for price column"

        print("✅ Fusion tests passed")
        print("Concat:", concat_res["stats"])
        print("Join:", join_res["stats"])


if __name__ == "__main__":
    run_basic_tests()
@@ -13,7 +13,7 @@ async function main() {
13
13
  // Filter to only new datasets
14
14
  const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
15
15
  console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
16
- const BATCH_SIZE = 50;
16
+ const BATCH_SIZE = 20;
17
17
  let processed = 0;
18
18
  for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
19
19
  const batch = toIndex.slice(i, i + BATCH_SIZE);
@@ -26,9 +26,9 @@ async function main() {
26
26
  `Languages: ${ds.languages?.join(", ") || ""}`,
27
27
  `Tags: ${ds.tags?.join(" ") || ""}`
28
28
  ].join(" ").slice(0, 1500));
29
- // Embed batch (Xenova supports array input)
30
- // Note: Parallelizing at the embed level is better for CPU utilization
31
- await Promise.all(batch.map(async (ds, idx) => {
29
+ // Memory-safe sequential embedding (avoids OOM on large libraries)
30
+ for (let idx = 0; idx < batch.length; idx++) {
31
+ const ds = batch[idx];
32
32
  try {
33
33
  const vector = await embedder.embed(texts[idx]);
34
34
  vectorStore.add(ds.id, vector);
@@ -36,7 +36,7 @@ async function main() {
36
36
  catch (err) {
37
37
  console.error(`Failed to index ${ds.id}:`, err);
38
38
  }
39
- }));
39
+ }
40
40
  processed += batch.length;
41
41
  if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
42
42
  console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
@@ -2,6 +2,19 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ // Common stop words to filter out for better search
6
+ const STOP_WORDS = new Set([
7
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
8
+ "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
9
+ "be", "have", "has", "had", "do", "does", "did", "will", "would",
10
+ "could", "should", "may", "might", "must", "shall", "can", "need",
11
+ "about", "into", "through", "during", "before", "after", "above",
12
+ "below", "between", "under", "again", "further", "then", "once",
13
+ "here", "there", "when", "where", "why", "how", "all", "each",
14
+ "few", "more", "most", "other", "some", "such", "no", "nor", "not",
15
+ "only", "own", "same", "so", "than", "too", "very", "just", "also",
16
+ "dataset", "datasets", "data", "find", "search", "get", "looking"
17
+ ]);
5
18
  /**
6
19
  * Just-In-Time Orchestrator
7
20
  * Automatically fetches and indexes new datasets when local search is insufficient
@@ -17,6 +30,24 @@ export class JITOrchestrator {
17
30
  this.vectorStore = vectorStore;
18
31
  this.embedder = embedder;
19
32
  }
33
+ /**
34
+ * Simplify a complex user query into keywords that work better with APIs
35
+ * HuggingFace and other APIs often fail on long multi-word queries
36
+ */
37
+ simplifyQuery(query) {
38
+ // Split into words, lowercase, remove punctuation
39
+ const words = query.toLowerCase()
40
+ .replace(/[^\w\s-]/g, "")
41
+ .split(/\s+/)
42
+ .filter(w => w.length > 2 && !STOP_WORDS.has(w));
43
+ // Return unique keywords (max 3 for API-friendly queries)
44
+ const unique = [...new Set(words)];
45
+ // If we have a lot of words, prioritize longer/more specific ones
46
+ if (unique.length > 3) {
47
+ unique.sort((a, b) => b.length - a.length);
48
+ }
49
+ return unique.slice(0, 3);
50
+ }
20
51
  /**
21
52
  * Check if JIT should be triggered based on rate limiting
22
53
  */
@@ -33,20 +64,37 @@ export class JITOrchestrator {
33
64
  async fetchAndIngest(query, limit = 10) {
34
65
  // Rate limiting check
35
66
  if (!this.canTrigger(query)) {
36
- console.error(`[JIT] Rate limit: Query "${query}" triggered too recently`);
67
+ console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
37
68
  return [];
38
69
  }
39
- console.error(`\n[JIT] Searching live sources for: "${query}"`);
70
+ console.error(`\n[JIT] 🔍 Searching live sources for: "${query}"`);
40
71
  this.lastTriggerTime.set(query, Date.now());
72
+ // Simplify query for better API results
73
+ const keywords = this.simplifyQuery(query);
74
+ if (keywords.length > 0) {
75
+ console.error(`[JIT] 🔑 Keywords extracted: ${keywords.join(", ")}`);
76
+ }
41
77
  const newDatasets = [];
42
78
  const existingIds = new Set();
79
+ const sourceResults = {};
43
80
  try {
44
81
  // Get existing dataset IDs to avoid duplicates
45
82
  const existing = this.metadataStore.getAllDatasets();
46
83
  existing.forEach(ds => existingIds.add(ds.id));
47
- // 1. Scrape HuggingFace (Open Access)
48
- const hfResults = await this.scrapeHuggingFace(query, limit);
49
- console.error(` HuggingFace: Found ${hfResults.length} datasets`);
84
+ // 1. Scrape HuggingFace - try each keyword separately for better results
85
+ let hfResults = [];
86
+ for (const keyword of keywords) {
87
+ if (hfResults.length >= limit)
88
+ break;
89
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
+ for (const ds of results) {
91
+ if (!hfResults.some(existing => existing.id === ds.id)) {
92
+ hfResults.push(ds);
93
+ }
94
+ }
95
+ }
96
+ sourceResults["HuggingFace"] = hfResults.length;
97
+ console.error(` 📦 HuggingFace: ${hfResults.length} datasets`);
50
98
  for (const ds of hfResults) {
51
99
  if (!existingIds.has(ds.id)) {
52
100
  newDatasets.push(ds);
@@ -55,7 +103,8 @@ export class JITOrchestrator {
55
103
  }
56
104
  // 2. Scrape UCI (Open Access)
57
105
  const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
58
- console.error(` UCI: Found ${uciResults.length} datasets`);
106
+ sourceResults["UCI"] = uciResults.length;
107
+ console.error(` 📦 UCI: ${uciResults.length} datasets`);
59
108
  for (const ds of uciResults) {
60
109
  if (!existingIds.has(ds.id)) {
61
110
  newDatasets.push(ds);
@@ -64,38 +113,49 @@ export class JITOrchestrator {
64
113
  }
65
114
  // 3. Scrape GitHub (Open Access)
66
115
  const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
67
- console.error(` GitHub: Found ${githubResults.length} datasets`);
116
+ sourceResults["GitHub"] = githubResults.length;
117
+ console.error(` 📦 GitHub: ${githubResults.length} datasets`);
68
118
  for (const ds of githubResults) {
69
119
  if (!existingIds.has(ds.id)) {
70
120
  newDatasets.push(ds);
71
121
  existingIds.add(ds.id);
72
122
  }
73
123
  }
74
- // 4. Scrape World Bank (Open Access)
124
+ // 4. Scrape World Bank (Open Access) - Economic/demographic data
75
125
  const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
76
- console.error(` World Bank: Found ${wbResults.length} datasets`);
126
+ sourceResults["WorldBank"] = wbResults.length;
127
+ console.error(` 📦 World Bank: ${wbResults.length} datasets`);
77
128
  for (const ds of wbResults) {
78
129
  if (!existingIds.has(ds.id)) {
79
130
  newDatasets.push(ds);
80
131
  existingIds.add(ds.id);
81
132
  }
82
133
  }
83
- // 5. Scrape NASA (Open Access)
134
+ // 5. Scrape NASA (Open Access) - Scientific/space data
84
135
  const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
85
- console.error(` NASA: Found ${nasaResults.length} datasets`);
136
+ sourceResults["NASA"] = nasaResults.length;
137
+ console.error(` 📦 NASA: ${nasaResults.length} datasets`);
86
138
  for (const ds of nasaResults) {
87
139
  if (!existingIds.has(ds.id)) {
88
140
  newDatasets.push(ds);
89
141
  existingIds.add(ds.id);
90
142
  }
91
143
  }
92
- // 3. Save and index new datasets
144
+ // Save and index new datasets
93
145
  if (newDatasets.length > 0) {
94
146
  await this.saveAndIndex(newDatasets);
95
- console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
147
+ console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
96
148
  }
97
149
  else {
98
- console.error(` [JIT] No new datasets found`);
150
+ // Provide helpful feedback when no results found
151
+ const allZero = Object.values(sourceResults).every(v => v === 0);
152
+ if (allZero) {
153
+ console.error(`[JIT] ⚠️ No datasets found across all sources.`);
154
+ console.error(`[JIT] 💡 Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
155
+ }
156
+ else {
157
+ console.error(`[JIT] ℹ️ All found datasets already in index`);
158
+ }
99
159
  }
100
160
  return newDatasets;
101
161
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -27,8 +27,14 @@
27
27
  "check-db": "tsx src/scripts/check-db.ts",
28
28
  "test-jit": "tsx src/scripts/test-jit.ts",
29
29
  "demo-ui": "tsx src/scripts/demo-ui.ts",
30
+ "fuse": "node build/index.js fuse",
31
+ "discover": "node build/index.js discover",
32
+ "download": "node build/index.js download",
33
+ "config": "node build/index.js config",
34
+ "test-fusion-engine": "py src/python/test_fusion_engine.py",
30
35
  "setup": "node build/index.js --setup",
31
36
  "setup:silent": "node build/index.js --setup --silent",
37
+ "refresh-index": "node scripts/refresh-index.cjs",
32
38
  "test": "vitest",
33
39
  "start": "node build/index.js"
34
40
  },
@@ -86,4 +92,4 @@
86
92
  "typescript": "^5.9.3",
87
93
  "vitest": "^4.0.17"
88
94
  }
89
- }
95
+ }
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { spawnSync } = require("child_process");
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+ const os = require("os");
7
+ const Database = require("better-sqlite3");
8
+
9
function runCommand(command, args, options = {}) {
  // Run a child process synchronously, inheriting stdio so progress is
  // visible in the terminal. Throws on any failure.
  const result = spawnSync(command, args, {
    stdio: "inherit",
    shell: process.platform === "win32",
    ...options,
  });

  if (result.error) {
    // spawnSync reports spawn-level failures (e.g. ENOENT: command not
    // found) via result.error with status === null; surface that message
    // explicitly instead of the misleading "exit null".
    throw new Error(`Command failed to start: ${command} ${args.join(" ")} (${result.error.message})`);
  }
  if (result.status !== 0) {
    throw new Error(`Command failed: ${command} ${args.join(" ")} (exit ${result.status})`);
  }
}
20
+
21
function countDatasets(dbPath) {
  // Count rows in the datasets table; "N/A" when the DB file is absent.
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Always release the handle, even when the table is missing/corrupt
    // and prepare()/get() throws (the original leaked it in that case).
    db.close();
  }
}
28
+
29
+ function countVectors(jsonPath) {
30
+ if (!fs.existsSync(jsonPath)) return "N/A";
31
+ const data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
32
+ if (typeof data.count === "number") return data.count;
33
+ if (Array.isArray(data.ids)) return data.ids.length;
34
+ return "N/A";
35
+ }
36
+
37
function syncRuntime(workspaceRoot) {
  // Copy the freshly built index artifacts from the workspace into the
  // runtime data directory under ~/.vesper, creating it if needed.
  // Throws when any expected artifact is missing from the workspace.
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  fs.mkdirSync(runtimeDir, { recursive: true });

  for (const file of ["metadata.db", "vectors.json", "vectors.bin"]) {
    const sourcePath = path.join(workspaceRoot, "data", file);
    if (!fs.existsSync(sourcePath)) {
      throw new Error(`Missing source file: ${sourcePath}`);
    }
    fs.copyFileSync(sourcePath, path.join(runtimeDir, file));
  }

  return runtimeDir;
}
53
+
54
function main() {
  // Orchestrate the full refresh: scrape -> index -> sync, then print a
  // summary of dataset/vector counts for both workspace and runtime copies.
  const workspaceRoot = process.cwd();
  const homeData = path.join(os.homedir(), ".vesper", "data");
  const workspaceData = path.join(workspaceRoot, "data");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // Indexing the full library needs a larger V8 heap than the default.
  runCommand("npm", ["run", "index"], {
    env: { ...process.env, NODE_OPTIONS: "--max-old-space-size=8192" },
  });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Gather all counts before declaring success, so a counting failure
  // aborts before the "Completed" banner is printed.
  const wsDb = countDatasets(path.join(workspaceData, "metadata.db"));
  const wsVec = countVectors(path.join(workspaceData, "vectors.json"));
  const rtDb = countDatasets(path.join(homeData, "metadata.db"));
  const rtVec = countVectors(path.join(homeData, "vectors.json"));

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
  console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}