@vespermcp/mcp-server 1.2.21 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +49 -0
  2. package/build/cache/service.js +7 -0
  3. package/build/cloud/adapters/supabase.js +49 -0
  4. package/build/cloud/storage-manager.js +6 -0
  5. package/build/export/exporter.js +22 -9
  6. package/build/gateway/unified-dataset-gateway.js +441 -0
  7. package/build/index.js +1815 -839
  8. package/build/ingestion/ingestor.js +7 -4
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/arxiv-source.js +229 -0
  12. package/build/metadata/circuit-breaker.js +62 -0
  13. package/build/metadata/github-source.js +203 -0
  14. package/build/metadata/hackernews-source.js +123 -0
  15. package/build/metadata/quality.js +27 -0
  16. package/build/metadata/scraper.js +85 -14
  17. package/build/metadata/semantic-scholar-source.js +138 -0
  18. package/build/python/asset_downloader_engine.py +2 -0
  19. package/build/python/convert_engine.py +92 -0
  20. package/build/python/export_engine.py +45 -0
  21. package/build/python/kaggle_engine.py +77 -5
  22. package/build/python/normalize_engine.py +83 -0
  23. package/build/python/vesper/core/asset_downloader.py +5 -1
  24. package/build/scripts/test-phase1-webcore-quality.js +104 -0
  25. package/build/search/engine.js +45 -6
  26. package/build/search/jit-orchestrator.js +18 -14
  27. package/build/search/query-intent.js +509 -0
  28. package/build/tools/formatter.js +6 -3
  29. package/build/utils/python-runtime.js +130 -0
  30. package/build/web/extract-web.js +297 -0
  31. package/build/web/fusion-engine.js +457 -0
  32. package/build/web/types.js +1 -0
  33. package/build/web/web-core.js +242 -0
  34. package/package.json +12 -5
  35. package/scripts/postinstall.cjs +87 -31
  36. package/scripts/wizard.cjs +652 -0
  37. package/scripts/wizard.js +338 -12
  38. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  39. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  40. package/src/python/asset_downloader_engine.py +2 -0
  41. package/src/python/convert_engine.py +92 -0
  42. package/src/python/export_engine.py +45 -0
  43. package/src/python/kaggle_engine.py +77 -5
  44. package/src/python/normalize_engine.py +83 -0
  45. package/src/python/requirements.txt +12 -0
  46. package/src/python/vesper/core/asset_downloader.py +5 -1
  47. package/wizard.cjs +3 -0
@@ -46,6 +46,9 @@ export class DataIngestor {
46
46
  getKaggleCredentialError() {
47
47
  return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
48
48
  }
49
+ toSafeDatasetPath(datasetId) {
50
+ return datasetId.replace(/[:\/]/g, "_");
51
+ }
49
52
  /**
50
53
  * Ensures a dataset is available locally
51
54
  */
@@ -115,7 +118,7 @@ export class DataIngestor {
115
118
  this.failDownload(datasetId, errorMsg);
116
119
  throw new Error(errorMsg);
117
120
  }
118
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/\//g, "_"));
121
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
119
122
  this.store.registerDownload(datasetId, targetDir, "downloading");
120
123
  try {
121
124
  onProgress?.("Downloading from Kaggle...");
@@ -131,7 +134,7 @@ export class DataIngestor {
131
134
  }
132
135
  }
133
136
  else if (source === "openml") {
134
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/:/g, "_"));
137
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
135
138
  this.store.registerDownload(datasetId, targetDir, "downloading");
136
139
  try {
137
140
  onProgress?.("Downloading from OpenML...");
@@ -147,7 +150,7 @@ export class DataIngestor {
147
150
  }
148
151
  }
149
152
  else if (source === "dataworld") {
150
- const targetDir = path.join(this.rawDataDir, datasetId.replace(/[:\/]/g, "_"));
153
+ const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
151
154
  this.store.registerDownload(datasetId, targetDir, "downloading");
152
155
  try {
153
156
  onProgress?.("Downloading from data.world...");
@@ -181,7 +184,7 @@ export class DataIngestor {
181
184
  * Generates a safe local filename for a dataset ID
182
185
  */
183
186
  getTargetPath(datasetId, extension = "parquet") {
184
- const safeId = datasetId.replace(/\//g, "_").replace(/:/g, "_");
187
+ const safeId = this.toSafeDatasetPath(datasetId);
185
188
  return path.join(this.rawDataDir, `${safeId}.${extension}`);
186
189
  }
187
190
  /**
@@ -18,12 +18,15 @@ export class InstallService {
18
18
  throw new Error(`Source file not found for installation: ${sourcePath}`);
19
19
  }
20
20
  const dataset = this.metadataStore.getDataset(datasetId);
21
- if (!dataset) {
22
- throw new Error(`Dataset metadata not found for ${datasetId}`);
23
- }
24
21
  // Create target directory
25
- const sanitizedName = dataset.name.replace(/[^a-z0-9]/gi, "_").toLowerCase();
26
- const installDir = targetDir || path.join(this.projectRoot, "datasets", sanitizedName);
22
+ const installLabel = dataset?.name || datasetId;
23
+ const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
+ // If caller specified a target dir, use it directly
25
+ // Otherwise use the current working directory
26
+ const installDir = targetDir
27
+ ? path.resolve(targetDir)
28
+ : path.resolve(process.cwd(), sanitizedName);
29
+ console.error(`[InstallService] Resolved install directory: ${installDir}`);
27
30
  if (!fs.existsSync(installDir)) {
28
31
  fs.mkdirSync(installDir, { recursive: true });
29
32
  }
@@ -34,7 +37,9 @@ export class InstallService {
34
37
  fs.copyFileSync(sourcePath, targetPath);
35
38
  // Update metadata
36
39
  const absolutePath = path.resolve(targetPath);
37
- this.metadataStore.updateInstallPath(datasetId, absolutePath);
40
+ if (dataset) {
41
+ this.metadataStore.updateInstallPath(datasetId, absolutePath);
42
+ }
38
43
  console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
39
44
  return absolutePath;
40
45
  }
@@ -0,0 +1,3 @@
1
+ import { createClient } from '@supabase/supabase-js';
2
+ export const supabase = createClient(process.env.SUPABASE_URL, process.env.SUPABASE_SERVICE_ROLE_KEY // for MCP, use service_role, not anon
3
+ );
@@ -0,0 +1,229 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
+ export class ArxivSource {
5
+ cache;
6
+ baseUrl = "http://export.arxiv.org/api/query";
7
+ breaker = new CircuitBreaker("arxiv", {
8
+ failureThreshold: 5,
9
+ openDurationMs: 30_000,
10
+ halfOpenSuccessesToClose: 2,
11
+ });
12
+ constructor(cache) {
13
+ this.cache = cache;
14
+ }
15
+ async discover(query, limit = 20) {
16
+ const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
17
+ return out.results;
18
+ }
19
+ async discoverWithTelemetry(query, limit = 20, input = {}) {
20
+ const start = Date.now();
21
+ const cleanQuery = String(query || "").trim();
22
+ if (!cleanQuery) {
23
+ return { results: [], cacheHit: false, latencyMs: Date.now() - start };
24
+ }
25
+ const fullText = input.full_text === true;
26
+ const maxResults = Math.max(1, Math.min(100, Number(limit || 20)));
27
+ const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
28
+ const cached = await this.cache?.getJson(cacheKey);
29
+ if (cached) {
30
+ return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
31
+ }
32
+ if (!this.breaker.canAttempt()) {
33
+ throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
34
+ }
35
+ const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
36
+ const response = await rateLimitedFetch(url, {
37
+ headers: {
38
+ "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
39
+ }
40
+ }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 }).catch((e) => {
41
+ this.breaker.onFailure();
42
+ throw e;
43
+ });
44
+ const xml = await response.text();
45
+ const entries = this.parseEntries(xml);
46
+ let pdfExtractMsTotal = 0;
47
+ const result = [];
48
+ for (const entry of entries) {
49
+ if (fullText) {
50
+ const pdfStart = Date.now();
51
+ const pdfText = await this.extractPdfText(entry.id).catch(() => "");
52
+ pdfExtractMsTotal += Date.now() - pdfStart;
53
+ const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
54
+ result.push(this.toDatasetMetadata(entry, {
55
+ webcore_content: truncated,
56
+ contentDepth: truncated ? truncated.length : entry.summary.length,
57
+ }));
58
+ }
59
+ else {
60
+ result.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
61
+ }
62
+ }
63
+ this.breaker.onSuccess();
64
+ await this.cache?.setJson(cacheKey, result, 86400); // 24h
65
+ return { results: result, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
66
+ }
67
+ parseEntries(xml) {
68
+ const entries = [];
69
+ const entryMatches = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
70
+ for (const block of entryMatches) {
71
+ const idUrl = this.extractTag(block, "id");
72
+ const title = this.decodeXml(this.extractTag(block, "title"));
73
+ const summary = this.decodeXml(this.extractTag(block, "summary"));
74
+ const updated = this.extractTag(block, "updated");
75
+ const published = this.extractTag(block, "published");
76
+ const pdfUrl = this.extractPdfUrl(block) || (idUrl ? idUrl.replace("/abs/", "/pdf/") : "");
77
+ const authors = this.extractAllTags(block, "name").map((v) => this.decodeXml(v));
78
+ const categories = this.extractAllCategoryTerms(block);
79
+ if (!idUrl || !title)
80
+ continue;
81
+ const shortId = this.extractArxivId(idUrl);
82
+ entries.push({
83
+ id: shortId,
84
+ title: title.replace(/\s+/g, " ").trim(),
85
+ summary: summary.replace(/\s+/g, " ").trim(),
86
+ updated,
87
+ published,
88
+ authors,
89
+ categories,
90
+ pdfUrl,
91
+ });
92
+ }
93
+ return entries;
94
+ }
95
+ toDatasetMetadata(entry, input) {
96
+ const description = entry.summary || entry.title;
97
+ const publishedAt = entry.published || entry.updated || new Date().toISOString();
98
+ const qualityWarnings = [];
99
+ if (description.length < 120) {
100
+ qualityWarnings.push("Short abstract may reduce extraction confidence");
101
+ }
102
+ const abstractLength = description.length;
103
+ const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
104
+ const datePresent = !!(entry.published || entry.updated);
105
+ const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
106
+ const quality01 = estimateQualityScore({
107
+ abstractLength,
108
+ authorsPresent,
109
+ datePresent,
110
+ contentDepth,
111
+ });
112
+ return {
113
+ id: entry.id,
114
+ source: "arxiv",
115
+ name: entry.title,
116
+ description,
117
+ authors: entry.authors,
118
+ downloads: 0,
119
+ likes: 0,
120
+ stars: 0,
121
+ tags: entry.categories,
122
+ last_updated: entry.updated || publishedAt,
123
+ task: "research-paper",
124
+ languages: [],
125
+ domain: "research",
126
+ splits: [],
127
+ license: {
128
+ id: "unknown",
129
+ category: "unknown",
130
+ usage_restrictions: [],
131
+ warnings: [],
132
+ },
133
+ quality_score: Math.round(quality01 * 100),
134
+ quality_warnings: qualityWarnings,
135
+ download_url: entry.pdfUrl,
136
+ format: "PDF",
137
+ total_examples: 1,
138
+ total_size_bytes: undefined,
139
+ total_size_mb: undefined,
140
+ columns: [
141
+ { name: "title", type: "string" },
142
+ { name: "abstract", type: "string" },
143
+ { name: "authors", type: "string[]" },
144
+ { name: "categories", type: "string[]" },
145
+ { name: "published_at", type: "datetime" },
146
+ { name: "source_url", type: "string" },
147
+ ],
148
+ is_structured: true,
149
+ has_target_column: false,
150
+ is_safe_source: true,
151
+ has_personal_data: false,
152
+ is_paywalled: false,
153
+ is_scraped_web_data: false,
154
+ uses_https: true,
155
+ has_train_split: false,
156
+ has_test_split: false,
157
+ has_validation_split: false,
158
+ description_length: description.length,
159
+ has_readme: false,
160
+ metadata_url: `https://arxiv.org/abs/${entry.id}`,
161
+ ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
162
+ };
163
+ }
164
+ truncateTo50k(text) {
165
+ return String(text || "").slice(0, 50_000);
166
+ }
167
+ async extractPdfText(arxivId) {
168
+ // Lazy-load heavy dependency only when enabled.
169
+ const pdfParseMod = await import("pdf-parse");
170
+ const pdfParse = pdfParseMod.default || pdfParseMod;
171
+ const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
172
+ const start = Date.now();
173
+ const response = await rateLimitedFetch(pdfUrl, {
174
+ headers: {
175
+ "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
176
+ }
177
+ }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
178
+ const arrayBuf = await response.arrayBuffer();
179
+ const buffer = Buffer.from(arrayBuf);
180
+ const parsed = await pdfParse(buffer);
181
+ const text = String(parsed?.text || "");
182
+ // Soft truncate; later caller truncates too.
183
+ if (text.length > 200_000) {
184
+ // Avoid pathological PDFs.
185
+ return text.slice(0, 200_000);
186
+ }
187
+ void start;
188
+ return text;
189
+ }
190
+ extractTag(xml, tagName) {
191
+ const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
192
+ return (m?.[1] || "").trim();
193
+ }
194
+ extractAllTags(xml, tagName) {
195
+ const out = [];
196
+ const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
197
+ let m = null;
198
+ while ((m = rgx.exec(xml)) !== null) {
199
+ out.push((m[1] || "").trim());
200
+ }
201
+ return out;
202
+ }
203
+ extractAllCategoryTerms(xml) {
204
+ const out = [];
205
+ const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
206
+ let m = null;
207
+ while ((m = rgx.exec(xml)) !== null) {
208
+ out.push((m[1] || "").trim());
209
+ }
210
+ return Array.from(new Set(out));
211
+ }
212
+ extractPdfUrl(xml) {
213
+ const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
214
+ return (m?.[1] || "").trim();
215
+ }
216
+ extractArxivId(idUrl) {
217
+ const cleaned = idUrl.trim();
218
+ const match = cleaned.match(/\/abs\/([^/?#]+)/i);
219
+ return match?.[1] || cleaned;
220
+ }
221
+ decodeXml(input) {
222
+ return input
223
+ .replace(/&lt;/g, "<")
224
+ .replace(/&gt;/g, ">")
225
+ .replace(/&amp;/g, "&")
226
+ .replace(/&quot;/g, "\"")
227
+ .replace(/&#39;/g, "'");
228
+ }
229
+ }
@@ -0,0 +1,62 @@
1
+ export class CircuitBreaker {
2
+ name;
3
+ options;
4
+ state = "closed";
5
+ consecutiveFailures = 0;
6
+ openUntilMs = 0;
7
+ constructor(name, options) {
8
+ this.name = name;
9
+ this.options = options;
10
+ }
11
+ halfOpenSuccesses = 0;
12
+ canAttempt() {
13
+ if (this.state === "closed")
14
+ return true;
15
+ if (this.state === "open") {
16
+ if (Date.now() >= this.openUntilMs) {
17
+ this.state = "half_open";
18
+ this.halfOpenSuccesses = 0;
19
+ return true;
20
+ }
21
+ return false;
22
+ }
23
+ return true; // half_open
24
+ }
25
+ onSuccess() {
26
+ if (this.state === "half_open") {
27
+ this.halfOpenSuccesses++;
28
+ if (this.halfOpenSuccesses >= this.options.halfOpenSuccessesToClose) {
29
+ this.state = "closed";
30
+ this.consecutiveFailures = 0;
31
+ this.openUntilMs = 0;
32
+ }
33
+ return;
34
+ }
35
+ this.consecutiveFailures = 0;
36
+ this.state = "closed";
37
+ }
38
+ onFailure() {
39
+ if (this.state === "half_open") {
40
+ this.trip();
41
+ return;
42
+ }
43
+ this.consecutiveFailures++;
44
+ if (this.consecutiveFailures >= this.options.failureThreshold) {
45
+ this.trip();
46
+ }
47
+ }
48
+ trip() {
49
+ this.state = "open";
50
+ this.openUntilMs = Date.now() + this.options.openDurationMs;
51
+ this.consecutiveFailures = 0;
52
+ this.halfOpenSuccesses = 0;
53
+ console.error(`[CircuitBreaker] Opened: ${this.name} (until ${new Date(this.openUntilMs).toISOString()})`);
54
+ }
55
+ getStatus() {
56
+ return {
57
+ name: this.name,
58
+ state: this.state,
59
+ ...(this.state === "open" ? { open_until: new Date(this.openUntilMs).toISOString() } : {}),
60
+ };
61
+ }
62
+ }
@@ -0,0 +1,203 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
+ export class GithubSource {
5
+ cache;
6
+ breaker = new CircuitBreaker("github", {
7
+ failureThreshold: 5,
8
+ openDurationMs: 30_000,
9
+ halfOpenSuccessesToClose: 2,
10
+ });
11
+ constructor(cache) {
12
+ this.cache = cache;
13
+ }
14
+ async discover(query, limit = 20) {
15
+ const out = await this.discoverWithTelemetry(query, limit, { include_readme: false });
16
+ return out.results;
17
+ }
18
+ async discoverWithTelemetry(query, limit = 20, input = {}) {
19
+ const start = Date.now();
20
+ const cleanQuery = String(query || "").trim();
21
+ if (!cleanQuery) {
22
+ return { results: [], cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: 0 };
23
+ }
24
+ const includeReadme = input.include_readme === true;
25
+ const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
26
+ const cacheKey = `webcore:github:discover:${cleanQuery.toLowerCase()}:per_page=${perPage}:readme=${includeReadme ? 1 : 0}`;
27
+ const cached = await this.cache?.getJson(cacheKey);
28
+ if (cached) {
29
+ return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
30
+ }
31
+ if (!this.breaker.canAttempt()) {
32
+ throw new Error("GitHub connector is temporarily unavailable (circuit open).");
33
+ }
34
+ const refinedQuery = `${cleanQuery} in:name,description,readme`;
35
+ const url = `https://api.github.com/search/repositories?q=${encodeURIComponent(refinedQuery)}&sort=stars&order=desc&per_page=${perPage}`;
36
+ const headers = {
37
+ "Accept": "application/vnd.github+json",
38
+ "User-Agent": "vesper/2.0 (phase1-github-connector)",
39
+ "X-GitHub-Api-Version": "2022-11-28",
40
+ };
41
+ const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
42
+ if (token)
43
+ headers["Authorization"] = `Bearer ${token}`;
44
+ const response = await rateLimitedFetch(url, { headers }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 })
45
+ .catch((e) => {
46
+ // GitHub uses 403 for rate limiting; treat as breaker-worthy failure
47
+ this.breaker.onFailure();
48
+ if (String(e?.message || "").includes("403")) {
49
+ throw new Error("GitHub API rate limit exceeded (403). Set GITHUB_TOKEN for higher limits.");
50
+ }
51
+ throw e;
52
+ });
53
+ const data = (await response.json());
54
+ const items = Array.isArray(data?.items) ? data.items : [];
55
+ const repos = items.slice(0, perPage);
56
+ let readmeFetchMsTotal = 0;
57
+ const maxReadmes = includeReadme ? Math.min(5, repos.length) : 0;
58
+ const result = [];
59
+ for (let i = 0; i < repos.length; i++) {
60
+ const repo = repos[i];
61
+ if (includeReadme && i < maxReadmes) {
62
+ const fullName = String(repo.full_name || repo.name || "").trim();
63
+ const readmeKey = fullName ? `webcore:github:readme:${fullName}` : "webcore:github:readme:unknown";
64
+ const cachedReadme = await this.cache?.getJson(readmeKey);
65
+ if (cachedReadme) {
66
+ result.push(this.toDatasetMetadata(repo, { readmeText: cachedReadme }));
67
+ continue;
68
+ }
69
+ const t0 = Date.now();
70
+ const readmeText = await this.fetchReadme(repo).catch(() => undefined);
71
+ readmeFetchMsTotal += Date.now() - t0;
72
+ if (readmeText) {
73
+ await this.cache?.setJson(readmeKey, readmeText, 21600); // 6h
74
+ }
75
+ result.push(this.toDatasetMetadata(repo, { readmeText: readmeText ? this.truncate50k(readmeText) : undefined }));
76
+ }
77
+ else {
78
+ result.push(this.toDatasetMetadata(repo, { readmeText: undefined }));
79
+ }
80
+ }
81
+ this.breaker.onSuccess();
82
+ await this.cache?.setJson(cacheKey, result, 21600); // 6h
83
+ return { results: result, cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: readmeFetchMsTotal };
84
+ }
85
+ toDatasetMetadata(repo, input) {
86
+ const fullName = String(repo.full_name || repo.name || "unknown").trim();
87
+ const description = String(repo.description || "").trim() || "No description provided.";
88
+ const ownerRepo = this.parseOwnerRepo(fullName);
89
+ const owner = ownerRepo ? ownerRepo.split("/")[0] : "";
90
+ const stars = Number(repo.stargazers_count || 0);
91
+ const forks = Number(repo.forks_count || 0);
92
+ const updatedAt = repo.updated_at || new Date().toISOString();
93
+ const topics = Array.isArray(repo.topics) ? repo.topics.filter(Boolean).map(String) : [];
94
+ const language = repo.language ? [String(repo.language)] : [];
95
+ const licenseId = repo.license?.spdx_id && repo.license.spdx_id !== "NOASSERTION"
96
+ ? String(repo.license.spdx_id)
97
+ : "unknown";
98
+ const licenseName = repo.license?.name ? String(repo.license.name) : undefined;
99
+ const qualityWarnings = [];
100
+ if (stars < 5)
101
+ qualityWarnings.push("Low star count; may be low-signal");
102
+ if (description.length < 80)
103
+ qualityWarnings.push("Short description; relevance may be weaker");
104
+ const abstractLength = input.readmeText ? input.readmeText.length : description.length;
105
+ const authorsPresent = !!owner;
106
+ const datePresent = !!updatedAt;
107
+ const contentDepth = Math.max(abstractLength, input.readmeText ? input.readmeText.length : description.length);
108
+ const quality01 = estimateQualityScore({
109
+ abstractLength,
110
+ authorsPresent,
111
+ datePresent,
112
+ contentDepth,
113
+ });
114
+ return {
115
+ id: fullName,
116
+ source: "github",
117
+ name: fullName.split("/").pop() || fullName,
118
+ description,
119
+ ...(owner ? { authors: [owner] } : {}),
120
+ downloads: forks * 10,
121
+ likes: stars,
122
+ stars,
123
+ tags: topics,
124
+ last_updated: updatedAt,
125
+ task: "code",
126
+ languages: language,
127
+ domain: "research",
128
+ splits: [],
129
+ license: {
130
+ id: licenseId,
131
+ name: licenseName,
132
+ category: "unknown",
133
+ usage_restrictions: [],
134
+ warnings: [],
135
+ },
136
+ quality_score: Math.round(quality01 * 100),
137
+ quality_warnings: qualityWarnings,
138
+ download_url: String(repo.html_url || `https://github.com/${fullName}`),
139
+ format: "GIT",
140
+ total_examples: 1,
141
+ is_structured: false,
142
+ has_target_column: false,
143
+ is_safe_source: true,
144
+ has_personal_data: false,
145
+ is_paywalled: false,
146
+ is_scraped_web_data: false,
147
+ uses_https: true,
148
+ has_train_split: false,
149
+ has_test_split: false,
150
+ has_validation_split: false,
151
+ description_length: description.length,
152
+ has_readme: false,
153
+ metadata_url: String(repo.html_url || `https://github.com/${fullName}`),
154
+ ...(input.readmeText
155
+ ? { webcore_content: this.truncate50k(input.readmeText), webcore_content_kind: "readme_text" }
156
+ : {}),
157
+ };
158
+ }
159
+ truncate50k(text) {
160
+ return String(text || "").slice(0, 50_000);
161
+ }
162
+ parseOwnerRepo(fullName) {
163
+ const trimmed = String(fullName || "").trim();
164
+ if (trimmed.includes("/"))
165
+ return trimmed;
166
+ return null;
167
+ }
168
+ async fetchReadme(repo) {
169
+ const fullName = String(repo.full_name || repo.name || "").trim();
170
+ const ownerRepo = this.parseOwnerRepo(fullName);
171
+ if (!ownerRepo)
172
+ return undefined;
173
+ const [owner, name] = ownerRepo.split("/");
174
+ const candidates = [];
175
+ if (repo.default_branch)
176
+ candidates.push(String(repo.default_branch));
177
+ candidates.push("main", "master");
178
+ const uniq = Array.from(new Set(candidates.filter(Boolean)));
179
+ const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
180
+ const headers = {
181
+ "User-Agent": "vesper/2.0 (phase1-github-readme)",
182
+ "Accept": "text/plain",
183
+ };
184
+ if (token)
185
+ headers["Authorization"] = `Bearer ${token}`;
186
+ for (const branch of uniq) {
187
+ const url = `https://raw.githubusercontent.com/${owner}/${name}/${branch}/README.md`;
188
+ const res = await fetch(url, { headers }).catch(() => null);
189
+ if (!res)
190
+ continue;
191
+ if (res.status === 404)
192
+ continue;
193
+ if (res.status === 429 || res.status === 403) {
194
+ this.breaker.onFailure();
195
+ throw new Error(`GitHub README fetch failed with status ${res.status}`);
196
+ }
197
+ if (!res.ok)
198
+ continue;
199
+ return await res.text();
200
+ }
201
+ return undefined;
202
+ }
203
+ }