@vespermcp/mcp-server 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ import sys
2
+ import os
3
+ import json
4
+ import tempfile
5
+ from typing import Dict, Any, List
6
+ from config import get_all
7
+
8
+ try:
9
+ from kaggle.api.kaggle_api_extended import KaggleApi
10
+ HAS_KAGGLE = True
11
+ except Exception:
12
+ HAS_KAGGLE = False
13
+
14
+
15
+ def _ensure_auth() -> Dict[str, Any]:
16
+ if not HAS_KAGGLE:
17
+ return {
18
+ "ok": False,
19
+ "error": "kaggle package not installed. Install with: pip install kaggle",
20
+ }
21
+
22
+ # Priority:
23
+ # 1) Existing env vars
24
+ # 2) secure local store (keyring or ~/.vesper/config.toml)
25
+ # 3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()
26
+ if not os.getenv("KAGGLE_USERNAME") or not os.getenv("KAGGLE_KEY"):
27
+ keys = get_all()
28
+ if keys.get("kaggle_username") and keys.get("kaggle_key"):
29
+ os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
30
+ os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
31
+
32
+ api = KaggleApi()
33
+ try:
34
+ api.authenticate()
35
+ except Exception as e:
36
+ return {
37
+ "ok": False,
38
+ "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
39
+ "details": str(e),
40
+ }
41
+
42
+ return {"ok": True, "api": api}
43
+
44
+
45
+ def _dataset_to_dict(ds) -> Dict[str, Any]:
46
+ # kaggle API object fields differ by version; use getattr defensively
47
+ ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
48
+ title = getattr(ds, "title", None) or ref
49
+ subtitle = getattr(ds, "subtitle", None) or ""
50
+ owner = getattr(ds, "creatorName", None) or getattr(ds, "ownerName", None) or ""
51
+ votes = int(getattr(ds, "voteCount", 0) or 0)
52
+ downloads = int(getattr(ds, "downloadCount", 0) or 0)
53
+ size = int(getattr(ds, "totalBytes", 0) or 0)
54
+ last_updated = str(getattr(ds, "lastUpdated", ""))
55
+ tags = []
56
+ raw_tags = getattr(ds, "tags", None)
57
+ if raw_tags:
58
+ for t in raw_tags:
59
+ tags.append(getattr(t, "name", str(t)))
60
+
61
+ return {
62
+ "id": ref,
63
+ "source": "kaggle",
64
+ "name": title,
65
+ "description": subtitle or title,
66
+ "downloads": downloads,
67
+ "likes": votes,
68
+ "stars": 0,
69
+ "tags": tags,
70
+ "last_updated": last_updated,
71
+ "task": "unknown",
72
+ "domain": "unknown",
73
+ "languages": [],
74
+ "splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
75
+ "license": {
76
+ "id": "unknown",
77
+ "name": "unknown",
78
+ "category": "unknown",
79
+ "usage_restrictions": [],
80
+ "warnings": ["Kaggle license details may vary by dataset"],
81
+ },
82
+ "quality_score": 40,
83
+ "quality_warnings": ["Review dataset card and competition rules before use"],
84
+ "download_url": f"https://www.kaggle.com/datasets/{ref}",
85
+ "format": None,
86
+ "total_examples": 0,
87
+ "total_size_bytes": size,
88
+ "total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
89
+ "columns": [],
90
+ "is_structured": False,
91
+ "has_target_column": False,
92
+ "is_safe_source": True,
93
+ "has_personal_data": False,
94
+ "is_paywalled": False,
95
+ "is_scraped_web_data": False,
96
+ "uses_https": True,
97
+ "has_train_split": False,
98
+ "has_test_split": False,
99
+ "has_validation_split": False,
100
+ "description_length": len(subtitle or title),
101
+ "has_readme": True,
102
+ }
103
+
104
+
105
+ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
106
+ auth = _ensure_auth()
107
+ if not auth.get("ok"):
108
+ return auth
109
+
110
+ api: KaggleApi = auth["api"]
111
+ try:
112
+ datasets = api.dataset_list(search=query, page_size=max(1, min(limit, 100)))
113
+ items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
114
+ return {"ok": True, "results": items, "count": len(items)}
115
+ except Exception as e:
116
+ return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
117
+
118
+
119
+ def _pick_best_file(root: str) -> str:
120
+ candidates: List[str] = []
121
+ for base, _, files in os.walk(root):
122
+ for name in files:
123
+ full = os.path.join(base, name)
124
+ lower = name.lower()
125
+ if lower.endswith((".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow")):
126
+ candidates.append(full)
127
+
128
+ if not candidates:
129
+ raise RuntimeError("No suitable data file found after download")
130
+
131
+ # prioritize common tabular formats
132
+ priorities = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
133
+ for ext in priorities:
134
+ for c in candidates:
135
+ if c.lower().endswith(ext):
136
+ return c
137
+ return candidates[0]
138
+
139
+
140
+ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
141
+ auth = _ensure_auth()
142
+ if not auth.get("ok"):
143
+ return auth
144
+
145
+ api: KaggleApi = auth["api"]
146
+
147
+ if not target_dir:
148
+ target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")
149
+
150
+ os.makedirs(target_dir, exist_ok=True)
151
+
152
+ try:
153
+ if "kaggle.com/datasets/" in dataset_ref:
154
+ dataset_ref = dataset_ref.split("kaggle.com/datasets/")[1].lstrip("/")
155
+
156
+ # unzip in place, remove zip for convenience
157
+ api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
158
+ best_file = _pick_best_file(target_dir)
159
+ return {
160
+ "ok": True,
161
+ "dataset_id": dataset_ref,
162
+ "target_dir": target_dir,
163
+ "local_path": best_file,
164
+ }
165
+ except Exception as e:
166
+ msg = str(e)
167
+ if "401" in msg or "Unauthorized" in msg:
168
+ return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config keys' again."}
169
+ if "429" in msg or "Too Many Requests" in msg:
170
+ return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
171
+ return {"ok": False, "error": f"Kaggle download failed: {msg}"}
172
+
173
+
174
+ def main():
175
+ if len(sys.argv) < 2:
176
+ print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
177
+ sys.exit(1)
178
+
179
+ command = sys.argv[1]
180
+
181
+ if command == "discover":
182
+ if len(sys.argv) < 3:
183
+ print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
184
+ sys.exit(1)
185
+ query = sys.argv[2]
186
+ limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20
187
+ print(json.dumps(discover(query, limit)))
188
+ return
189
+
190
+ if command == "download":
191
+ if len(sys.argv) < 3:
192
+ print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
193
+ sys.exit(1)
194
+ dataset_ref = sys.argv[2]
195
+ target_dir = sys.argv[3] if len(sys.argv) > 3 else ""
196
+ print(json.dumps(download(dataset_ref, target_dir)))
197
+ return
198
+
199
+ print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
200
+ sys.exit(1)
201
+
202
+
203
+ if __name__ == "__main__":
204
+ main()
@@ -0,0 +1,54 @@
1
+ import sys
2
+ import json
3
+ import os
4
+
5
+ try:
6
+ import polars as pl
7
+ except Exception:
8
+ print(json.dumps({"ok": False, "error": "polars is required"}))
9
+ sys.exit(1)
10
+
11
+
12
+ def count_rows(path: str) -> int:
13
+ ext = os.path.splitext(path)[1].lower()
14
+
15
+ if ext == ".csv":
16
+ # Faster than full read for large csv
17
+ return int(pl.scan_csv(path, ignore_errors=True).select(pl.len()).collect().item())
18
+ if ext in [".parquet", ".pq"]:
19
+ return int(pl.scan_parquet(path).select(pl.len()).collect().item())
20
+ if ext in [".feather", ".ftr", ".arrow", ".ipc"]:
21
+ return int(pl.scan_ipc(path).select(pl.len()).collect().item())
22
+ if ext in [".jsonl", ".ndjson"]:
23
+ return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
24
+ if ext == ".json":
25
+ # fallback to eager for plain JSON arrays
26
+ try:
27
+ return int(pl.read_json(path).height)
28
+ except Exception:
29
+ return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
30
+
31
+ # unknown extension fallback
32
+ return int(pl.read_csv(path, ignore_errors=True).height)
33
+
34
+
35
+ def main():
36
+ if len(sys.argv) < 2:
37
+ print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
38
+ sys.exit(1)
39
+
40
+ p = sys.argv[1]
41
+ if not os.path.exists(p):
42
+ print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
43
+ sys.exit(1)
44
+
45
+ try:
46
+ rows = count_rows(p)
47
+ print(json.dumps({"ok": True, "rows": rows}))
48
+ except Exception as e:
49
+ print(json.dumps({"ok": False, "error": str(e)}))
50
+ sys.exit(1)
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()
@@ -0,0 +1,89 @@
1
+ import os
2
+ import tempfile
3
+ import polars as pl
4
+ from fusion_engine import fuse_datasets
5
+
6
+
7
+ def run_basic_tests():
8
+ tmp = tempfile.gettempdir()
9
+
10
+ # ----- Test 1: concat -----
11
+ p1 = os.path.join(tmp, "fuse_test_a.csv")
12
+ p2 = os.path.join(tmp, "fuse_test_b.csv")
13
+ out_concat = os.path.join(tmp, "fuse_test_concat.feather")
14
+
15
+ df1 = pl.DataFrame({
16
+ "id": [1, 2, 3],
17
+ "text": ["a", "b", "c"],
18
+ "price": [10.0, 20.0, 30.0],
19
+ })
20
+ df2 = pl.DataFrame({
21
+ "id": [4, 5, 3],
22
+ "text": ["d", "e", "c"],
23
+ "price": [40.0, 50.0, 30.0],
24
+ "image_path": ["img1.jpg", "img2.jpg", "img3.jpg"],
25
+ })
26
+
27
+ df1.write_csv(p1)
28
+ df2.write_csv(p2)
29
+
30
+ concat_res = fuse_datasets(
31
+ sources=[p1, p2],
32
+ strategy="concat",
33
+ dedup=True,
34
+ run_quality_after=False,
35
+ leakage_check=True,
36
+ output_path=out_concat,
37
+ output_format="feather",
38
+ compression="lz4",
39
+ preview=True,
40
+ id_column="id",
41
+ )
42
+
43
+ assert concat_res.get("success") is True, f"Concat failed: {concat_res}"
44
+ assert os.path.exists(out_concat), "Concat output file missing"
45
+
46
+ # ----- Test 2: join with conflicting column names -----
47
+ p3 = os.path.join(tmp, "fuse_test_c.csv")
48
+ p4 = os.path.join(tmp, "fuse_test_d.csv")
49
+ out_join = os.path.join(tmp, "fuse_test_join.parquet")
50
+
51
+ left = pl.DataFrame({
52
+ "id": [1, 2, 3],
53
+ "price": [100, 200, 300],
54
+ "text": ["x", "y", "z"],
55
+ })
56
+ right = pl.DataFrame({
57
+ "id": [2, 3, 4],
58
+ "price": [999, 888, 777],
59
+ "caption": ["two", "three", "four"],
60
+ })
61
+
62
+ left.write_csv(p3)
63
+ right.write_csv(p4)
64
+
65
+ join_res = fuse_datasets(
66
+ sources=[p3, p4],
67
+ strategy="join",
68
+ join_on="id",
69
+ how="inner",
70
+ dedup=True,
71
+ run_quality_after=False,
72
+ leakage_check=False,
73
+ output_path=out_join,
74
+ output_format="parquet",
75
+ compression="snappy",
76
+ preview=True,
77
+ )
78
+
79
+ assert join_res.get("success") is True, f"Join failed: {join_res}"
80
+ assert os.path.exists(out_join), "Join output file missing"
81
+ assert len(join_res.get("stats", {}).get("conflict_renames", [])) >= 1, "Expected conflict rename for price column"
82
+
83
+ print("✅ Fusion tests passed")
84
+ print("Concat:", concat_res["stats"])
85
+ print("Join:", join_res["stats"])
86
+
87
+
88
+ if __name__ == "__main__":
89
+ run_basic_tests()
@@ -13,7 +13,7 @@ async function main() {
13
13
  // Filter to only new datasets
14
14
  const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
15
15
  console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
16
- const BATCH_SIZE = 50;
16
+ const BATCH_SIZE = 20;
17
17
  let processed = 0;
18
18
  for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
19
19
  const batch = toIndex.slice(i, i + BATCH_SIZE);
@@ -26,9 +26,9 @@ async function main() {
26
26
  `Languages: ${ds.languages?.join(", ") || ""}`,
27
27
  `Tags: ${ds.tags?.join(" ") || ""}`
28
28
  ].join(" ").slice(0, 1500));
29
- // Embed batch (Xenova supports array input)
30
- // Note: Parallelizing at the embed level is better for CPU utilization
31
- await Promise.all(batch.map(async (ds, idx) => {
29
+ // Memory-safe sequential embedding (avoids OOM on large libraries)
30
+ for (let idx = 0; idx < batch.length; idx++) {
31
+ const ds = batch[idx];
32
32
  try {
33
33
  const vector = await embedder.embed(texts[idx]);
34
34
  vectorStore.add(ds.id, vector);
@@ -36,7 +36,7 @@ async function main() {
36
36
  catch (err) {
37
37
  console.error(`Failed to index ${ds.id}:`, err);
38
38
  }
39
- }));
39
+ }
40
40
  processed += batch.length;
41
41
  if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
42
42
  console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
@@ -2,6 +2,19 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
2
  import { UCIScraper } from "../metadata/uci-scraper.js";
3
3
  import { GitHubScraper } from "../metadata/github-scraper.js";
4
4
  import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
5
+ // Common stop words to filter out for better search
6
+ const STOP_WORDS = new Set([
7
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
8
+ "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
9
+ "be", "have", "has", "had", "do", "does", "did", "will", "would",
10
+ "could", "should", "may", "might", "must", "shall", "can", "need",
11
+ "about", "into", "through", "during", "before", "after", "above",
12
+ "below", "between", "under", "again", "further", "then", "once",
13
+ "here", "there", "when", "where", "why", "how", "all", "each",
14
+ "few", "more", "most", "other", "some", "such", "no", "nor", "not",
15
+ "only", "own", "same", "so", "than", "too", "very", "just", "also",
16
+ "dataset", "datasets", "data", "find", "search", "get", "looking"
17
+ ]);
5
18
  /**
6
19
  * Just-In-Time Orchestrator
7
20
  * Automatically fetches and indexes new datasets when local search is insufficient
@@ -17,6 +30,24 @@ export class JITOrchestrator {
17
30
  this.vectorStore = vectorStore;
18
31
  this.embedder = embedder;
19
32
  }
33
+ /**
34
+ * Simplify a complex user query into keywords that work better with APIs
35
+ * HuggingFace and other APIs often fail on long multi-word queries
36
+ */
37
+ simplifyQuery(query) {
38
+ // Split into words, lowercase, remove punctuation
39
+ const words = query.toLowerCase()
40
+ .replace(/[^\w\s-]/g, "")
41
+ .split(/\s+/)
42
+ .filter(w => w.length > 2 && !STOP_WORDS.has(w));
43
+ // Return unique keywords (max 3 for API-friendly queries)
44
+ const unique = [...new Set(words)];
45
+ // If we have a lot of words, prioritize longer/more specific ones
46
+ if (unique.length > 3) {
47
+ unique.sort((a, b) => b.length - a.length);
48
+ }
49
+ return unique.slice(0, 3);
50
+ }
20
51
  /**
21
52
  * Check if JIT should be triggered based on rate limiting
22
53
  */
@@ -33,20 +64,37 @@ export class JITOrchestrator {
33
64
  async fetchAndIngest(query, limit = 10) {
34
65
  // Rate limiting check
35
66
  if (!this.canTrigger(query)) {
36
- console.error(`[JIT] Rate limit: Query "${query}" triggered too recently`);
67
+ console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
37
68
  return [];
38
69
  }
39
70
  console.error(`\n[JIT] Searching live sources for: "${query}"`);
40
71
  this.lastTriggerTime.set(query, Date.now());
72
+ // Simplify query for better API results
73
+ const keywords = this.simplifyQuery(query);
74
+ if (keywords.length > 0) {
75
+ console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
76
+ }
41
77
  const newDatasets = [];
42
78
  const existingIds = new Set();
79
+ const sourceResults = {};
43
80
  try {
44
81
  // Get existing dataset IDs to avoid duplicates
45
82
  const existing = this.metadataStore.getAllDatasets();
46
83
  existing.forEach(ds => existingIds.add(ds.id));
47
- // 1. Scrape HuggingFace (Open Access)
48
- const hfResults = await this.scrapeHuggingFace(query, limit);
49
- console.error(` HuggingFace: Found ${hfResults.length} datasets`);
84
+ // 1. Scrape HuggingFace - try each keyword separately for better results
85
+ let hfResults = [];
86
+ for (const keyword of keywords) {
87
+ if (hfResults.length >= limit)
88
+ break;
89
+ const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
90
+ for (const ds of results) {
91
+ if (!hfResults.some(existing => existing.id === ds.id)) {
92
+ hfResults.push(ds);
93
+ }
94
+ }
95
+ }
96
+ sourceResults["HuggingFace"] = hfResults.length;
97
+ console.error(` [source] HuggingFace: ${hfResults.length} datasets`);
50
98
  for (const ds of hfResults) {
51
99
  if (!existingIds.has(ds.id)) {
52
100
  newDatasets.push(ds);
@@ -55,7 +103,8 @@ export class JITOrchestrator {
55
103
  }
56
104
  // 2. Scrape UCI (Open Access)
57
105
  const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
58
- console.error(` UCI: Found ${uciResults.length} datasets`);
106
+ sourceResults["UCI"] = uciResults.length;
107
+ console.error(` [source] UCI: ${uciResults.length} datasets`);
59
108
  for (const ds of uciResults) {
60
109
  if (!existingIds.has(ds.id)) {
61
110
  newDatasets.push(ds);
@@ -64,38 +113,49 @@ export class JITOrchestrator {
64
113
  }
65
114
  // 3. Scrape GitHub (Open Access)
66
115
  const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
67
- console.error(` GitHub: Found ${githubResults.length} datasets`);
116
+ sourceResults["GitHub"] = githubResults.length;
117
+ console.error(` [source] GitHub: ${githubResults.length} datasets`);
68
118
  for (const ds of githubResults) {
69
119
  if (!existingIds.has(ds.id)) {
70
120
  newDatasets.push(ds);
71
121
  existingIds.add(ds.id);
72
122
  }
73
123
  }
74
- // 4. Scrape World Bank (Open Access)
124
+ // 4. Scrape World Bank (Open Access) - Economic/demographic data
75
125
  const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
76
- console.error(` World Bank: Found ${wbResults.length} datasets`);
126
+ sourceResults["WorldBank"] = wbResults.length;
127
+ console.error(` [source] World Bank: ${wbResults.length} datasets`);
77
128
  for (const ds of wbResults) {
78
129
  if (!existingIds.has(ds.id)) {
79
130
  newDatasets.push(ds);
80
131
  existingIds.add(ds.id);
81
132
  }
82
133
  }
83
- // 5. Scrape NASA (Open Access)
134
+ // 5. Scrape NASA (Open Access) - Scientific/space data
84
135
  const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
85
- console.error(` NASA: Found ${nasaResults.length} datasets`);
136
+ sourceResults["NASA"] = nasaResults.length;
137
+ console.error(` [source] NASA: ${nasaResults.length} datasets`);
86
138
  for (const ds of nasaResults) {
87
139
  if (!existingIds.has(ds.id)) {
88
140
  newDatasets.push(ds);
89
141
  existingIds.add(ds.id);
90
142
  }
91
143
  }
92
- // 3. Save and index new datasets
144
+ // Save and index new datasets
93
145
  if (newDatasets.length > 0) {
94
146
  await this.saveAndIndex(newDatasets);
95
147
  console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
96
148
  }
97
149
  else {
98
- console.error(` [JIT] No new datasets found`);
150
+ // Provide helpful feedback when no results found
151
+ const allZero = Object.values(sourceResults).every(v => v === 0);
152
+ if (allZero) {
153
+ console.error(`[JIT] No datasets found across all sources.`);
154
+ console.error(`[JIT] Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
155
+ }
156
+ else {
157
+ console.error(`[JIT] All found datasets already in index`);
158
+ }
99
159
  }
100
160
  return newDatasets;
101
161
  }
@@ -2,29 +2,29 @@
2
2
  * Format job status for visual representation
3
3
  */
4
4
  export function formatJobStatus(job) {
5
- const emojiMap = {
6
- "pending": "",
7
- "queued": "📋",
8
- "running": "🔄",
9
- "completed": "",
10
- "failed": "",
11
- "retrying": "🔁"
5
+ const statusMap = {
6
+ "pending": "PENDING",
7
+ "queued": "QUEUED",
8
+ "running": "RUNNING",
9
+ "completed": "COMPLETED",
10
+ "failed": "FAILED",
11
+ "retrying": "RETRYING"
12
12
  };
13
- const emoji = emojiMap[job.status] || "";
13
+ const statusText = statusMap[job.status] || "UNKNOWN";
14
14
  const barWidth = 20;
15
15
  const filledWidth = Math.round((job.progress / 100) * barWidth);
16
16
  const emptyWidth = barWidth - filledWidth;
17
17
  const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
18
18
  let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
19
19
  output += `ID: ${job.id}\n`;
20
- output += `Status: ${emoji} ${job.status.toUpperCase()}\n`;
20
+ output += `Status: ${statusText}\n`;
21
21
  output += `Progress: ${bar} ${job.progress}%\n`;
22
22
  output += `Activity: ${job.status_text}\n`;
23
23
  if (job.result_url) {
24
- output += `\n✅ Result: ${job.result_url}\n`;
24
+ output += `\nResult: ${job.result_url}\n`;
25
25
  }
26
26
  if (job.error) {
27
- output += `\n❌ ERROR:\n`;
27
+ output += `\nERROR:\n`;
28
28
  // Format multi-line errors nicely
29
29
  const errorLines = job.error.split('\n');
30
30
  errorLines.forEach(line => {
@@ -51,7 +51,7 @@ export function formatSearchResults(results) {
51
51
  const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
52
52
  const isOpen = openSources.includes(ds.source);
53
53
  const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
54
- const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
54
+ const accessBadge = isOpen ? "Open Access" : "Requires API Key";
55
55
  // Safety indicator
56
56
  let safetyIndicator = "";
57
57
  if (ds.license.category === "safe") {
@@ -128,7 +128,7 @@ export function formatDatasetInfo(ds) {
128
128
  const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
129
129
  const isOpen = openSources.includes(ds.source);
130
130
  const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
131
- const accessBadge = isOpen ? "🔓 Open Access" : "🔒 Requires API Key";
131
+ const accessBadge = isOpen ? "Open Access" : "Requires API Key";
132
132
  let safetyIndicator = "";
133
133
  if (ds.license.category === "safe") {
134
134
  safetyIndicator = "Safe for use";
@@ -143,7 +143,7 @@ export function formatDatasetInfo(ds) {
143
143
  output += `Safety: ${safetyIndicator}\n`;
144
144
  output += `ID: ${ds.id}\n\n`;
145
145
  if (!isOpen && ds.source === "kaggle") {
146
- output += `⚠️ NOTE: This dataset requires a Kaggle API key (KAGGE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
146
+ output += `NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
147
147
  }
148
148
  // Description
149
149
  if (ds.description) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.1.3",
3
+ "version": "1.2.1",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -17,7 +17,7 @@
17
17
  "mcp-config-template.json"
18
18
  ],
19
19
  "scripts": {
20
- "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('Copied Python scripts to build/python');\"",
20
+ "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('Copied Python scripts to build/python');\"",
21
21
  "dev": "tsx watch src/index.ts",
22
22
  "postinstall": "node scripts/postinstall.cjs",
23
23
  "scrape": "tsx src/scripts/scrape-metadata.ts",
@@ -27,8 +27,14 @@
27
27
  "check-db": "tsx src/scripts/check-db.ts",
28
28
  "test-jit": "tsx src/scripts/test-jit.ts",
29
29
  "demo-ui": "tsx src/scripts/demo-ui.ts",
30
+ "fuse": "node build/index.js fuse",
31
+ "discover": "node build/index.js discover",
32
+ "download": "node build/index.js download",
33
+ "config": "node build/index.js config",
34
+ "test-fusion-engine": "py src/python/test_fusion_engine.py",
30
35
  "setup": "node build/index.js --setup",
31
36
  "setup:silent": "node build/index.js --setup --silent",
37
+ "refresh-index": "node scripts/refresh-index.cjs",
32
38
  "test": "vitest",
33
39
  "start": "node build/index.js"
34
40
  },
@@ -86,4 +92,4 @@
86
92
  "typescript": "^5.9.3",
87
93
  "vitest": "^4.0.17"
88
94
  }
89
- }
95
+ }