@vespermcp/mcp-server 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,16 @@ export class HFDownloader {
8
8
  this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
9
9
  this.downloader = new RobustDownloader();
10
10
  }
11
+ getToken() {
12
+ return this.hfToken || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
13
+ }
11
14
  /**
12
15
  * Finds the most suitable data file in a repository
13
16
  * Returns the relative path within the repo
14
17
  */
15
18
  async findBestFile(repoId) {
16
19
  try {
20
+ const token = this.getToken();
17
21
  const files = [];
18
22
  const blacklist = [
19
23
  ".gitattributes",
@@ -28,7 +32,7 @@ export class HFDownloader {
28
32
  for await (const file of listFiles({
29
33
  repo: { type: "dataset", name: repoId },
30
34
  recursive: true,
31
- ...(this.hfToken ? { accessToken: this.hfToken } : {})
35
+ ...(token ? { accessToken: token } : {})
32
36
  })) {
33
37
  if (file.type === "file") {
34
38
  const fileName = path.basename(file.path);
@@ -62,7 +66,11 @@ export class HFDownloader {
62
66
  return fallback || null;
63
67
  }
64
68
  catch (error) {
65
- console.error(`[HF] Failed to list files for ${repoId}:`, error.message);
69
+ const msg = String(error?.message || error);
70
+ if (msg.includes("401") || msg.includes("403") || msg.toLowerCase().includes("unauthorized")) {
71
+ throw new Error("Hugging Face gated/private dataset requires token. Run 'vespermcp config keys' to set HF token.");
72
+ }
73
+ console.error(`[HF] Failed to list files for ${repoId}:`, msg);
66
74
  return null;
67
75
  }
68
76
  }
@@ -70,9 +78,10 @@ export class HFDownloader {
70
78
  * Downloads a file from HF to local path
71
79
  */
72
80
  async download(repoId, filePath, targetPath, onProgress) {
81
+ const token = this.getToken();
73
82
  const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${filePath}`;
74
83
  await this.downloader.download(url, targetPath, {
75
- headers: this.hfToken ? { 'Authorization': `Bearer ${this.hfToken}` } : {},
84
+ headers: token ? { 'Authorization': `Bearer ${token}` } : {},
76
85
  resume: true,
77
86
  onProgress: (bytes, total) => {
78
87
  if (total > 0 && onProgress) {
@@ -1,13 +1,15 @@
1
1
  import path from "path";
2
2
  import fs from "fs";
3
3
  import { HFDownloader } from "./hf-downloader.js";
4
- import { KaggleDownloader } from "./kaggle-downloader.js";
4
+ import { KaggleSource } from "../metadata/kaggle-source.js";
5
+ import { SecureKeysManager } from "../config/secure-keys.js";
5
6
  export class DataIngestor {
6
7
  projectRoot;
7
8
  store;
8
9
  rawDataDir;
9
10
  hfDownloader;
10
- kaggleDownloader;
11
+ kaggleSource;
12
+ secureKeys;
11
13
  constructor(projectRoot, store) {
12
14
  this.projectRoot = projectRoot;
13
15
  this.store = store;
@@ -16,19 +18,26 @@ export class DataIngestor {
16
18
  fs.mkdirSync(this.rawDataDir, { recursive: true });
17
19
  }
18
20
  this.hfDownloader = new HFDownloader();
19
- this.kaggleDownloader = new KaggleDownloader();
21
+ this.kaggleSource = new KaggleSource();
22
+ this.secureKeys = new SecureKeysManager();
20
23
  }
21
24
  /**
22
25
  * Check if Kaggle credentials are available
23
26
  */
24
27
  hasKaggleCredentials() {
25
- return this.kaggleDownloader.hasCredentials();
28
+ if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY)
29
+ return true;
30
+ const keys = this.secureKeys.getAll();
31
+ if (keys.kaggle_username && keys.kaggle_key)
32
+ return true;
33
+ const kaggleJsonPath = path.join(process.env.HOME || process.env.USERPROFILE || "", ".kaggle", "kaggle.json");
34
+ return !!(kaggleJsonPath && fs.existsSync(kaggleJsonPath));
26
35
  }
27
36
  /**
28
37
  * Get helpful error message if Kaggle credentials are missing
29
38
  */
30
39
  getKaggleCredentialError() {
31
- return this.kaggleDownloader.getCredentialError();
40
+ return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
32
41
  }
33
42
  /**
34
43
  * Ensures a dataset is available locally
@@ -67,10 +76,25 @@ export class DataIngestor {
67
76
  }
68
77
  }
69
78
  else if (source === "kaggle") {
70
- // Kaggle support has been disabled
71
- const errorMsg = "Kaggle datasets are no longer supported. Please use HuggingFace or other open-access sources.";
72
- this.failDownload(datasetId, errorMsg);
73
- throw new Error(errorMsg);
79
+ if (!this.hasKaggleCredentials()) {
80
+ const errorMsg = this.getKaggleCredentialError();
81
+ this.failDownload(datasetId, errorMsg);
82
+ throw new Error(errorMsg);
83
+ }
84
+ const targetDir = path.join(this.rawDataDir, datasetId.replace(/\//g, "_"));
85
+ this.store.registerDownload(datasetId, targetDir, "downloading");
86
+ try {
87
+ onProgress?.("Downloading from Kaggle...");
88
+ const result = await this.kaggleSource.download(datasetId, targetDir);
89
+ const stats = fs.statSync(result.local_path);
90
+ this.completeDownload(datasetId, result.local_path, stats.size);
91
+ onProgress?.("Kaggle download complete", 100);
92
+ return result.local_path;
93
+ }
94
+ catch (e) {
95
+ this.failDownload(datasetId, e.message);
96
+ throw e;
97
+ }
74
98
  }
75
99
  throw new Error(`Download logic for ${source} not yet implemented`);
76
100
  }
@@ -23,8 +23,8 @@ export class KaggleDownloader {
23
23
  getCredentialError() {
24
24
  if (!this.username && !this.key) {
25
25
  return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
26
- "💡 Tip: Get your API token from https://www.kaggle.com/settings API Create New Token\n" +
27
- "💡 Alternative: Download the dataset manually and use analyze_quality() on local files.";
26
+ "Tip: Get your API token from https://www.kaggle.com/settings -> API -> Create New Token\n" +
27
+ "Alternative: Download the dataset manually and use analyze_quality() on local files.";
28
28
  }
29
29
  if (!this.username) {
30
30
  return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";
@@ -0,0 +1,70 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
+ import os from "os";
5
/**
 * Bridge to the Python helper script `kaggle_engine.py`.
 *
 * Each operation launches the script as a child process, passes the
 * sub-command and its arguments on the command line, and parses the JSON
 * document the script prints on stdout.
 */
export class KaggleSource {
    /** Interpreter command used to launch the script ("py" on Windows, "python" elsewhere). */
    pythonPath = "python";
    /** Path of the kaggle_engine.py copy selected by the constructor. */
    scriptPath;
    /**
     * Locate kaggle_engine.py.
     *
     * Candidates are probed in order: `<home>/.vesper/python`, `<buildDir>/python`,
     * then `<buildDir>/../src/python`. When none exists the first candidate is
     * kept, so a later run reports the expected install location.
     *
     * @param buildDir directory used for the bundled-script lookups (defaults to the cwd)
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "kaggle_engine.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        if (process.platform === "win32")
            this.pythonPath = "py";
    }
    /**
     * Run the script's `discover` sub-command.
     *
     * @param query search text forwarded to the script
     * @param limit maximum number of results requested (sent as a string argument)
     * @returns the `results` array from the script's JSON reply, or [] when absent
     * @throws Error when the reply's `ok` flag is falsy
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "Kaggle discover failed");
        }
        return (result.results || []);
    }
    /**
     * Run the script's `download` sub-command.
     *
     * @param datasetRef dataset reference forwarded to the script
     * @param targetDir optional destination directory; omitted from the command line when falsy
     * @returns `{ local_path, target_dir }` copied from the script's JSON reply
     * @throws Error when the reply's `ok` flag is falsy
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir)
            args.push(targetDir);
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "Kaggle download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawn the helper script and resolve with its parsed JSON output.
     *
     * Rejects when the process cannot be started, exits with a non-zero
     * code, or prints something that is not valid JSON.
     *
     * @param args command-line arguments placed after the script path
     */
    async run(args) {
        return new Promise((resolve, reject) => {
            const processRef = spawn(this.pythonPath, [this.scriptPath, ...args]);
            // Without an 'error' listener a failed spawn (e.g. the interpreter is
            // not installed) is thrown as an uncaught exception and this promise
            // would never settle. Surface it as a normal rejection instead.
            processRef.on("error", (err) => {
                reject(new Error(`Failed to start kaggle_engine with '${this.pythonPath}': ${err.message}`));
            });
            let stdout = "";
            let stderr = "";
            processRef.stdout.on("data", (d) => (stdout += d.toString()));
            processRef.stderr.on("data", (d) => (stderr += d.toString()));
            processRef.on("close", (code) => {
                // If 'error' already rejected, the calls below are harmless no-ops.
                if (code !== 0) {
                    reject(new Error(stderr || stdout || `kaggle_engine exited with code ${code}`));
                    return;
                }
                try {
                    resolve(JSON.parse(stdout));
                }
                catch {
                    reject(new Error(`Failed to parse kaggle_engine output: ${stdout}`));
                }
            });
        });
    }
}
@@ -93,15 +93,23 @@ export class HuggingFaceScraper {
93
93
  const results = [];
94
94
  let processed = 0;
95
95
  let skippedMVP = 0;
96
+ let rateLimitHits = 0;
97
+ let otherErrors = 0;
96
98
  try {
97
99
  // Fetch more datasets to account for filtering
98
100
  const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
99
- const CONCURRENCY = 25; // Increased for high-volume indexing
100
- const queue = [];
101
101
  // Support HuggingFace token from environment variable
102
102
  const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
103
+ // CRITICAL: Low concurrency without token to avoid rate limits
104
+ // With token: 10 concurrent (HF allows more)
105
+ // Without token: 2 concurrent (stay under ~30 req/min limit)
106
+ const CONCURRENCY = hfToken ? 10 : 2;
107
+ const queue = [];
108
+ if (!hfToken) {
109
+ console.error(`[HF Scraper] ⚠️ No HF_TOKEN set - using conservative rate limits`);
110
+ }
103
111
  // Add delay between batches to avoid rate limits
104
- const BATCH_DELAY = 1000; // 1 second delay between batches
112
+ const BATCH_DELAY = hfToken ? 500 : 2000;
105
113
  for await (const ds of listDatasets({
106
114
  limit: fetchLimit,
107
115
  additionalFields: ["description", "tags"],
@@ -242,11 +250,20 @@ export class HuggingFaceScraper {
242
250
  results.push(metadata);
243
251
  }
244
252
  catch (e) {
245
- // Log rate limit errors, silently skip others
253
+ // Track all errors for user feedback
246
254
  if (e?.status === 429 || e?.message?.includes('rate limit')) {
247
- console.error(`[HF Scraper] Rate limit error for ${repoId}: ${e.message}`);
255
+ rateLimitHits++;
256
+ if (rateLimitHits <= 3) {
257
+ console.error(`[HF Scraper] Rate limit hit for ${repoId}`);
258
+ }
259
+ }
260
+ else {
261
+ otherErrors++;
262
+ // Log first few non-rate-limit errors for debugging
263
+ if (otherErrors <= 2) {
264
+ console.error(`[HF Scraper] Error for ${repoId}: ${e.message?.slice(0, 80)}`);
265
+ }
248
266
  }
249
- // Silently skip other errors
250
267
  }
251
268
  })();
252
269
  queue.push(processTask);
@@ -265,14 +282,21 @@ export class HuggingFaceScraper {
265
282
  catch (e) {
266
283
  // Handle rate limit errors with better messaging
267
284
  if (e?.status === 429 || e?.message?.includes('rate limit')) {
268
- console.error("Scraping failed due to rate limit:", e.message);
269
- console.error("Consider setting HF_TOKEN environment variable to increase rate limits");
285
+ console.error("[HF Scraper] ❌ Scraping failed due to rate limit:", e.message);
286
+ console.error("[HF Scraper] 💡 Set HF_TOKEN environment variable for unlimited access");
270
287
  }
271
288
  else {
272
- console.error("Scraping failed overall:", e.message);
289
+ console.error("[HF Scraper] ❌ Scraping failed:", e.message);
273
290
  }
274
291
  }
275
- console.error(`[HF Scraper] Complete: ${results.length} datasets scraped, ${skippedMVP} skipped`);
292
+ // User-friendly summary
293
+ console.error(`[HF Scraper] ✅ Complete: ${results.length} datasets found`);
294
+ if (rateLimitHits > 0) {
295
+ console.error(`[HF Scraper] ⚠️ ${rateLimitHits} requests rate-limited. Set HF_TOKEN for better results.`);
296
+ }
297
+ if (otherErrors > 0) {
298
+ console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
299
+ }
276
300
  // Sort by downloads descending
277
301
  return results.sort((a, b) => b.downloads - a.downloads);
278
302
  }
@@ -0,0 +1,259 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import base64
5
+ import hashlib
6
+ import secrets
7
+ from pathlib import Path
8
+ from typing import Dict, Optional
9
+
10
# Service identifier passed to the OS keyring when reading or storing a key.
SERVICE_NAME = "vesper"

# Logical key name -> environment variables consulted for it, in lookup order.
KEY_ALIASES = dict(
    hf_token=["HF_TOKEN", "HUGGINGFACE_TOKEN"],
    kaggle_username=["KAGGLE_USERNAME"],
    kaggle_key=["KAGGLE_KEY"],
)
17
+
18
+ try:
19
+ import keyring # type: ignore
20
+ HAS_KEYRING = True
21
+ except Exception:
22
+ HAS_KEYRING = False
23
+
24
+ try:
25
+ from cryptography.fernet import Fernet, InvalidToken # type: ignore
26
+ HAS_FERNET = True
27
+ except Exception:
28
+ HAS_FERNET = False
29
+
30
+
31
+ def _config_path() -> Path:
32
+ return Path.home() / ".vesper" / "config.toml"
33
+
34
+
35
+ def _secret_path() -> Path:
36
+ return Path.home() / ".vesper" / ".config_key"
37
+
38
+
39
+ def _ensure_parent(path: Path) -> None:
40
+ path.parent.mkdir(parents=True, exist_ok=True)
41
+
42
+
43
def _read_fallback_toml() -> Dict[str, str]:
    """Parse the fallback key store written by this module.

    Only the small TOML subset this module emits is understood: ``#`` comments,
    ``[section]`` headers and ``name = "value"`` lines.

    Returns:
        A mapping of every entry found under ``[keys]`` to its stored (still
        encrypted) string. When a ``method`` entry is present outside
        ``[keys]``, it is returned under the reserved name ``"__method__"``.
        An empty dict is returned when the file does not exist.
    """
    path = _config_path()
    if not path.exists():
        return {}

    values: Dict[str, str] = {}
    section = ""  # text inside the most recent [section] header; "" before the first one
    method = ""

    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("[") and line.endswith("]"):
            section = line[1:-1]
            continue
        if "=" not in line:
            continue

        key, val = line.split("=", 1)
        key = key.strip()
        val = val.strip().strip('"').strip("'")

        if section == "keys":
            # Everything under [keys] is a stored key, even one whose name
            # happens to start with "method"; it must never be mistaken for
            # the encryption-method setting.
            values[key] = val
        elif key == "method":
            method = val

    if method:
        values["__method__"] = method

    return values
74
+
75
+
76
def _get_or_create_local_secret() -> str:
    """Return the local secret used to encrypt fallback values, creating it if needed.

    The secret is 32 random bytes, URL-safe base64 encoded, stored in the file
    returned by ``_secret_path()``.

    Returns:
        The secret as text, with surrounding whitespace removed when read back.
    """
    secret_file = _secret_path()
    _ensure_parent(secret_file)

    if secret_file.exists():
        existing = secret_file.read_text(encoding="utf-8").strip()
        # An empty file (e.g. an interrupted earlier write) is not a usable
        # secret; fall through and generate a fresh one instead of returning "".
        if existing:
            return existing

    secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")

    # Request owner-only permissions at creation time so the secret is never
    # readable by others, not even briefly between the write and the chmod.
    fd = os.open(str(secret_file), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write(secret)

    # The mode passed to os.open only applies to newly created files, so tighten
    # a pre-existing file as well. Best effort: not every platform supports it.
    try:
        os.chmod(secret_file, 0o600)
    except Exception:
        pass
    return secret
90
+
91
+
92
+ def _xor_encrypt(plain: str, secret: str) -> str:
93
+ key = hashlib.sha256(secret.encode("utf-8")).digest()
94
+ data = plain.encode("utf-8")
95
+ out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
96
+ return base64.urlsafe_b64encode(out).decode("utf-8")
97
+
98
+
99
+ def _xor_decrypt(cipher_text: str, secret: str) -> str:
100
+ key = hashlib.sha256(secret.encode("utf-8")).digest()
101
+ data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
102
+ out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
103
+ return out.decode("utf-8")
104
+
105
+
106
def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
    """Encrypt ``value`` with the strongest scheme available.

    Returns:
        ``{"method": <"fernet" | "xor">, "value": <cipher text>}``.
    """
    if not HAS_FERNET:
        # fallback encryption (weaker than fernet, but still not plaintext)
        return {"method": "xor", "value": _xor_encrypt(value, secret)}

    cipher = Fernet(secret.encode("utf-8"))
    token = cipher.encrypt(value.encode("utf-8")).decode("utf-8")
    return {"method": "fernet", "value": token}
112
+
113
+
114
def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
    """Decrypt a stored value, returning ``None`` when that is not possible.

    Args:
        value: cipher text as stored in the fallback file.
        method: scheme the value was encrypted with ("fernet" or "xor").
        secret: local secret used as the key.

    Returns:
        The plain text, or ``None`` when the method is unknown, Fernet is
        required but unavailable, or decryption fails for any reason.
    """
    try:
        if method == "fernet" and HAS_FERNET:
            return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
        if method == "xor":
            return _xor_decrypt(value, secret)
    except Exception:
        # A single broad handler on purpose. A dedicated `except InvalidToken`
        # clause would raise NameError whenever the optional cryptography
        # import failed (the name is then undefined), and a bad Fernet token
        # is caught here anyway.
        return None
    return None
125
+
126
+
127
def _write_fallback_toml(values: Dict[str, str]) -> None:
    """Write ``values`` to the fallback key store, replacing its contents.

    The ``"__method__"`` entry (or a default based on whether Fernet is
    available) goes under ``[meta]``. Every other entry whose name does not
    start with ``"__"`` is written under ``[keys]`` in sorted order.
    """
    path = _config_path()
    _ensure_parent(path)

    default_method = "fernet" if HAS_FERNET else "xor"
    method = values.get("__method__", default_method)

    header = [
        "# Vesper optional API keys fallback storage",
        "# Encrypted fallback (keyring is preferred)",
        "[meta]",
        'method = "{}"'.format(method),
        "[keys]",
    ]
    entries = [
        '{} = "{}"'.format(name, str(values[name]).replace('"', '\\"'))
        for name in sorted(values)
        if not name.startswith("__")
    ]

    path.write_text("\n".join(header + entries) + "\n", encoding="utf-8")
146
+
147
+
148
def _get_from_env(name: str) -> Optional[str]:
    """Return the first non-empty environment variable registered for ``name``.

    The variables come from ``KEY_ALIASES[name]`` and are tried in order.
    ``None`` is returned when the name is unknown or none of them is set.
    """
    candidates = (os.getenv(env_key) for env_key in KEY_ALIASES.get(name, []))
    return next((val for val in candidates if val), None)
154
+
155
+
156
def get_key(name: str) -> Optional[str]:
    """Look up the secret stored for ``name``.

    Sources are tried in order: environment variables, the OS keyring (when
    the ``keyring`` package imported successfully), then the encrypted
    fallback file. ``None`` is returned when no source yields a value.
    """
    # 1) env vars (highest priority)
    from_env = _get_from_env(name)
    if from_env:
        return from_env

    # 2) keyring (secure); any keyring failure falls through to the next source
    if HAS_KEYRING:
        try:
            stored = keyring.get_password(SERVICE_NAME, name)
        except Exception:
            stored = None
        if stored:
            return stored

    # 3) encrypted fallback config.toml
    fallback = _read_fallback_toml()
    encrypted = fallback.get(name)
    if encrypted:
        secret = _get_or_create_local_secret()
        default_method = "fernet" if HAS_FERNET else "xor"
        return _decrypt_value(encrypted, fallback.get("__method__", default_method), secret)
    return None
179
+
180
+
181
def set_key(name: str, value: str) -> Dict[str, str]:
    """Store ``value`` under ``name``.

    The OS keyring is used when available. If it is missing or the write
    fails, the value is encrypted and saved to the fallback file instead.

    Returns:
        A dict of strings: ``"ok"`` is ``"true"`` or ``"false"``, ``"method"``
        names the storage used (``"keyring"``, ``"toml:<scheme>"`` or
        ``"none"``), and ``"error"`` is present only on failure.
    """
    if not value:
        return {"ok": "false", "method": "none", "error": "Empty value"}

    if HAS_KEYRING:
        try:
            keyring.set_password(SERVICE_NAME, name, value)
        except Exception:
            # keyring unusable on this machine; continue with the file fallback
            pass
        else:
            return {"ok": "true", "method": "keyring"}

    store = _read_fallback_toml()
    secret = _get_or_create_local_secret()
    encrypted = _encrypt_value(value, secret)
    store["__method__"] = encrypted["method"]
    store[name] = encrypted["value"]
    _write_fallback_toml(store)
    return {"ok": "true", "method": "toml:" + encrypted["method"]}
199
+
200
+
201
def has_key(name: str) -> bool:
    """Return True when ``get_key(name)`` yields a non-empty value."""
    return True if get_key(name) else False
203
+
204
+
205
def get_all() -> Dict[str, Optional[str]]:
    """Return the current value (or ``None``) of each of the three supported keys."""
    names = ("hf_token", "kaggle_username", "kaggle_key")
    return {key_name: get_key(key_name) for key_name in names}
211
+
212
+
213
+ def _print_json(data):
214
+ print(json.dumps(data))
215
+
216
+
217
def main() -> None:
    """Command-line entry point: ``config.py <get|set|has|all> [name] [value]``.

    Every outcome is reported as one JSON object on stdout. Usage errors and
    unknown commands also exit with status 1.
    """
    argv = sys.argv

    if len(argv) < 2:
        _print_json({
            "ok": False,
            "error": "Usage: config.py <get|set|has|all> [name] [value]",
        })
        sys.exit(1)

    cmd = argv[1].lower()

    # "all" is the only command that needs no key name.
    if cmd == "all":
        _print_json({"ok": True, "data": get_all()})
        return

    if len(argv) < 3:
        _print_json({"ok": False, "error": "Missing key name"})
        sys.exit(1)

    name = argv[2]

    if cmd in ("get", "has"):
        lookup = get_key if cmd == "get" else has_key
        _print_json({"ok": True, "name": name, "value": lookup(name)})
        return

    if cmd == "set":
        if len(argv) < 4:
            _print_json({"ok": False, "error": "Missing value for set"})
            sys.exit(1)
        # NOTE(review): the secret arrives as a command-line argument, which on
        # many systems is visible to other local users in process listings;
        # consider accepting it on stdin instead.
        outcome = set_key(name, argv[3])
        _print_json({
            "ok": outcome.get("ok") == "true",
            "name": name,
            "method": outcome.get("method"),
            "error": outcome.get("error"),
        })
        return

    _print_json({"ok": False, "error": f"Unknown command: {cmd}"})
    sys.exit(1)
256
+
257
+
258
+ if __name__ == "__main__":
259
+ main()