@vespermcp/mcp-server 1.2.21 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cache/service.js +7 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +441 -0
- package/build/index.js +1815 -839
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +203 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/scraper.js +85 -14
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +45 -6
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +457 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +12 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +652 -0
- package/scripts/wizard.js +338 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
|
@@ -46,6 +46,9 @@ export class DataIngestor {
|
|
|
46
46
|
getKaggleCredentialError() {
|
|
47
47
|
return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
|
|
48
48
|
}
|
|
49
|
+
toSafeDatasetPath(datasetId) {
|
|
50
|
+
return datasetId.replace(/[:\/]/g, "_");
|
|
51
|
+
}
|
|
49
52
|
/**
|
|
50
53
|
* Ensures a dataset is available locally
|
|
51
54
|
*/
|
|
@@ -115,7 +118,7 @@ export class DataIngestor {
|
|
|
115
118
|
this.failDownload(datasetId, errorMsg);
|
|
116
119
|
throw new Error(errorMsg);
|
|
117
120
|
}
|
|
118
|
-
const targetDir = path.join(this.rawDataDir,
|
|
121
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
119
122
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
120
123
|
try {
|
|
121
124
|
onProgress?.("Downloading from Kaggle...");
|
|
@@ -131,7 +134,7 @@ export class DataIngestor {
|
|
|
131
134
|
}
|
|
132
135
|
}
|
|
133
136
|
else if (source === "openml") {
|
|
134
|
-
const targetDir = path.join(this.rawDataDir,
|
|
137
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
135
138
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
136
139
|
try {
|
|
137
140
|
onProgress?.("Downloading from OpenML...");
|
|
@@ -147,7 +150,7 @@ export class DataIngestor {
|
|
|
147
150
|
}
|
|
148
151
|
}
|
|
149
152
|
else if (source === "dataworld") {
|
|
150
|
-
const targetDir = path.join(this.rawDataDir,
|
|
153
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
151
154
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
152
155
|
try {
|
|
153
156
|
onProgress?.("Downloading from data.world...");
|
|
@@ -181,7 +184,7 @@ export class DataIngestor {
|
|
|
181
184
|
* Generates a safe local filename for a dataset ID
|
|
182
185
|
*/
|
|
183
186
|
getTargetPath(datasetId, extension = "parquet") {
|
|
184
|
-
const safeId =
|
|
187
|
+
const safeId = this.toSafeDatasetPath(datasetId);
|
|
185
188
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
186
189
|
}
|
|
187
190
|
/**
|
|
@@ -18,12 +18,15 @@ export class InstallService {
|
|
|
18
18
|
throw new Error(`Source file not found for installation: ${sourcePath}`);
|
|
19
19
|
}
|
|
20
20
|
const dataset = this.metadataStore.getDataset(datasetId);
|
|
21
|
-
if (!dataset) {
|
|
22
|
-
throw new Error(`Dataset metadata not found for ${datasetId}`);
|
|
23
|
-
}
|
|
24
21
|
// Create target directory
|
|
25
|
-
const
|
|
26
|
-
const
|
|
22
|
+
const installLabel = dataset?.name || datasetId;
|
|
23
|
+
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
+
// If caller specified a target dir, use it directly
|
|
25
|
+
// Otherwise use the current working directory
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
+
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
27
30
|
if (!fs.existsSync(installDir)) {
|
|
28
31
|
fs.mkdirSync(installDir, { recursive: true });
|
|
29
32
|
}
|
|
@@ -34,7 +37,9 @@ export class InstallService {
|
|
|
34
37
|
fs.copyFileSync(sourcePath, targetPath);
|
|
35
38
|
// Update metadata
|
|
36
39
|
const absolutePath = path.resolve(targetPath);
|
|
37
|
-
|
|
40
|
+
if (dataset) {
|
|
41
|
+
this.metadataStore.updateInstallPath(datasetId, absolutePath);
|
|
42
|
+
}
|
|
38
43
|
console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
|
|
39
44
|
return absolutePath;
|
|
40
45
|
}
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
|
+
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
|
+
import { estimateQualityScore } from "./quality.js";
|
|
4
|
+
/**
 * Discovery connector for the arXiv Atom API (Phase 1 WebCore).
 *
 * Search results are cached for 24h through the injected cache, requests are
 * guarded by a CircuitBreaker, and each Atom entry is mapped into the shared
 * dataset-metadata shape. When `full_text` is requested, the PDF is fetched
 * and its extracted text is attached as `webcore_content` (capped at 50k chars).
 */
export class ArxivSource {
  // Optional JSON cache with getJson/setJson — every use is `this.cache?.`.
  cache;
  baseUrl = "http://export.arxiv.org/api/query";
  // Trips after 5 consecutive failures, stays open 30s, closes after 2
  // successful half-open probes.
  breaker = new CircuitBreaker("arxiv", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  constructor(cache) {
    this.cache = cache;
  }

  /** Convenience wrapper: metadata-only discovery, returns just the results array. */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
    return out.results;
  }

  /**
   * Searches arXiv and returns { results, cacheHit, latencyMs[, pdf_extract_ms_total] }.
   * @param {string} query free-text query; blank queries short-circuit to [].
   * @param {number} limit clamped to [1, 100].
   * @param {{full_text?: boolean}} input when true, PDF text is extracted per entry.
   * @throws when the circuit is open, or the HTTP fetch ultimately fails.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start };
    }
    const fullText = input.full_text === true;
    const maxResults = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
    }
    const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const response = await rateLimitedFetch(url, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
      }
    }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 }).catch((e) => {
      // Feed the breaker only on the outer fetch failure; rethrow unchanged.
      this.breaker.onFailure();
      throw e;
    });
    const xml = await response.text();
    const entries = this.parseEntries(xml);
    let pdfExtractMsTotal = 0;
    const result = [];
    for (const entry of entries) {
      if (fullText) {
        const pdfStart = Date.now();
        // PDF extraction is best-effort: failures degrade to abstract-only.
        const pdfText = await this.extractPdfText(entry.id).catch(() => "");
        pdfExtractMsTotal += Date.now() - pdfStart;
        const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
        result.push(this.toDatasetMetadata(entry, {
          webcore_content: truncated,
          contentDepth: truncated ? truncated.length : entry.summary.length,
        }));
      }
      else {
        result.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 86400); // 24h
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
  }

  /**
   * Lightweight regex-based Atom parsing (not a full XML parser); entries
   * missing an id or title are skipped.
   */
  parseEntries(xml) {
    const entries = [];
    const entryMatches = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
    for (const block of entryMatches) {
      const idUrl = this.extractTag(block, "id");
      const title = this.decodeXml(this.extractTag(block, "title"));
      const summary = this.decodeXml(this.extractTag(block, "summary"));
      const updated = this.extractTag(block, "updated");
      const published = this.extractTag(block, "published");
      // Prefer the explicit pdf <link>; fall back to rewriting the abs URL.
      const pdfUrl = this.extractPdfUrl(block) || (idUrl ? idUrl.replace("/abs/", "/pdf/") : "");
      const authors = this.extractAllTags(block, "name").map((v) => this.decodeXml(v));
      const categories = this.extractAllCategoryTerms(block);
      if (!idUrl || !title)
        continue;
      const shortId = this.extractArxivId(idUrl);
      entries.push({
        id: shortId,
        title: title.replace(/\s+/g, " ").trim(),
        summary: summary.replace(/\s+/g, " ").trim(),
        updated,
        published,
        authors,
        categories,
        pdfUrl,
      });
    }
    return entries;
  }

  /** Maps a parsed Atom entry to the shared dataset-metadata shape. */
  toDatasetMetadata(entry, input) {
    const description = entry.summary || entry.title;
    const publishedAt = entry.published || entry.updated || new Date().toISOString();
    const qualityWarnings = [];
    if (description.length < 120) {
      qualityWarnings.push("Short abstract may reduce extraction confidence");
    }
    const abstractLength = description.length;
    const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
    const datePresent = !!(entry.published || entry.updated);
    const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: entry.id,
      source: "arxiv",
      name: entry.title,
      description,
      authors: entry.authors,
      downloads: 0,
      likes: 0,
      stars: 0,
      tags: entry.categories,
      last_updated: entry.updated || publishedAt,
      task: "research-paper",
      languages: [],
      domain: "research",
      splits: [],
      license: {
        id: "unknown",
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: entry.pdfUrl,
      format: "PDF",
      total_examples: 1,
      total_size_bytes: undefined,
      total_size_mb: undefined,
      columns: [
        { name: "title", type: "string" },
        { name: "abstract", type: "string" },
        { name: "authors", type: "string[]" },
        { name: "categories", type: "string[]" },
        { name: "published_at", type: "datetime" },
        { name: "source_url", type: "string" },
      ],
      is_structured: true,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      has_readme: false,
      metadata_url: `https://arxiv.org/abs/${entry.id}`,
      ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
    };
  }

  /** Caps free text at 50k chars; null/undefined become "". */
  truncateTo50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /**
   * Downloads and extracts text from an arXiv PDF.
   * @param {string} arxivId short id such as "2101.00001v2".
   * @returns {Promise<string>} extracted text, hard-capped at 200k chars.
   */
  async extractPdfText(arxivId) {
    // Lazy-load heavy dependency only when enabled.
    const pdfParseMod = await import("pdf-parse");
    const pdfParse = pdfParseMod.default || pdfParseMod;
    const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
    const response = await rateLimitedFetch(pdfUrl, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
      }
    }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
    const arrayBuf = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuf);
    const parsed = await pdfParse(buffer);
    const text = String(parsed?.text || "");
    // Soft truncate; later caller truncates too.
    if (text.length > 200_000) {
      // Avoid pathological PDFs.
      return text.slice(0, 200_000);
    }
    return text;
  }

  /** First occurrence of <tagName>…</tagName>, trimmed; "" when absent. */
  extractTag(xml, tagName) {
    const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
    return (m?.[1] || "").trim();
  }

  /** All occurrences of <tagName>…</tagName>, trimmed. */
  extractAllTags(xml, tagName) {
    const out = [];
    const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return out;
  }

  /** Unique `term` attributes of all <category> elements. */
  extractAllCategoryTerms(xml) {
    const out = [];
    const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return Array.from(new Set(out));
  }

  /** href of the <link title="pdf"> element, or "". */
  extractPdfUrl(xml) {
    const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
    return (m?.[1] || "").trim();
  }

  /** "http://arxiv.org/abs/XXXX" -> "XXXX"; falls back to the raw input. */
  extractArxivId(idUrl) {
    const cleaned = idUrl.trim();
    const match = cleaned.match(/\/abs\/([^/?#]+)/i);
    return match?.[1] || cleaned;
  }

  /**
   * Decodes the predefined XML entities. "&amp;" is decoded LAST so a
   * double-escaped sequence like "&amp;lt;" yields the literal text "&lt;"
   * instead of being decoded twice into "<".
   */
  decodeXml(input) {
    return input
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, "\"")
      .replace(/&#39;/g, "'")
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&");
  }
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Minimal three-state circuit breaker: "closed" -> "open" after
 * `failureThreshold` consecutive failures, "open" -> "half_open" once
 * `openDurationMs` elapses, and "half_open" -> "closed" after
 * `halfOpenSuccessesToClose` successes (a half-open failure re-trips).
 */
export class CircuitBreaker {
  name;
  options;
  state = "closed";
  consecutiveFailures = 0;
  openUntilMs = 0;
  halfOpenSuccesses = 0;

  constructor(name, options) {
    this.name = name;
    this.options = options;
  }

  /** Whether a call may be attempted; transitions open -> half_open on expiry. */
  canAttempt() {
    if (this.state !== "open") {
      // "closed" and "half_open" both allow attempts.
      return true;
    }
    if (Date.now() < this.openUntilMs) {
      return false;
    }
    // Cooldown elapsed: allow a probe request.
    this.state = "half_open";
    this.halfOpenSuccesses = 0;
    return true;
  }

  /** Records a success; may close a half-open circuit. */
  onSuccess() {
    if (this.state !== "half_open") {
      // Normal path: reset the failure streak and stay/return to closed.
      this.consecutiveFailures = 0;
      this.state = "closed";
      return;
    }
    this.halfOpenSuccesses += 1;
    if (this.halfOpenSuccesses >= this.options.halfOpenSuccessesToClose) {
      this.state = "closed";
      this.consecutiveFailures = 0;
      this.openUntilMs = 0;
    }
  }

  /** Records a failure; a half-open failure trips immediately. */
  onFailure() {
    if (this.state === "half_open") {
      this.trip();
      return;
    }
    this.consecutiveFailures += 1;
    if (this.consecutiveFailures >= this.options.failureThreshold) {
      this.trip();
    }
  }

  /** Opens the circuit for `openDurationMs` and resets the counters. */
  trip() {
    this.state = "open";
    this.openUntilMs = Date.now() + this.options.openDurationMs;
    this.consecutiveFailures = 0;
    this.halfOpenSuccesses = 0;
    console.error(`[CircuitBreaker] Opened: ${this.name} (until ${new Date(this.openUntilMs).toISOString()})`);
  }

  /** Snapshot for telemetry; includes `open_until` only while open. */
  getStatus() {
    const status = { name: this.name, state: this.state };
    if (this.state === "open") {
      status.open_until = new Date(this.openUntilMs).toISOString();
    }
    return status;
  }
}
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
|
+
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
|
+
import { estimateQualityScore } from "./quality.js";
|
|
4
|
+
/**
 * Discovery connector for the GitHub repository search API (Phase 1 WebCore).
 *
 * Search results are cached for 6h through the injected cache, requests are
 * guarded by a CircuitBreaker, and repositories are mapped into the shared
 * dataset-metadata shape. When `include_readme` is requested, READMEs are
 * fetched (for at most the first 5 repos) and attached as `webcore_content`.
 */
export class GithubSource {
  // Optional JSON cache with getJson/setJson — every use is `this.cache?.`.
  cache;
  // Trips after 5 consecutive failures, stays open 30s, closes after 2
  // successful half-open probes.
  breaker = new CircuitBreaker("github", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  constructor(cache) {
    this.cache = cache;
  }

  /** Convenience wrapper: metadata-only discovery, returns just the results array. */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { include_readme: false });
    return out.results;
  }

  /**
   * Searches GitHub repositories (sorted by stars) and returns
   * { results, cacheHit, latencyMs[, readme_fetch_ms_total] }.
   * @param {string} query free-text query; blank queries short-circuit to [].
   * @param {number} limit clamped to [1, 100].
   * @param {{include_readme?: boolean}} input when true, fetch READMEs for up to 5 repos.
   * @throws when the circuit is open, on rate limiting (403), or on final fetch failure.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: 0 };
    }
    const includeReadme = input.include_readme === true;
    const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:github:discover:${cleanQuery.toLowerCase()}:per_page=${perPage}:readme=${includeReadme ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("GitHub connector is temporarily unavailable (circuit open).");
    }
    const refinedQuery = `${cleanQuery} in:name,description,readme`;
    const url = `https://api.github.com/search/repositories?q=${encodeURIComponent(refinedQuery)}&sort=stars&order=desc&per_page=${perPage}`;
    const headers = {
      "Accept": "application/vnd.github+json",
      "User-Agent": "vesper/2.0 (phase1-github-connector)",
      "X-GitHub-Api-Version": "2022-11-28",
    };
    const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
    if (token)
      headers["Authorization"] = `Bearer ${token}`;
    const response = await rateLimitedFetch(url, { headers }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 })
      .catch((e) => {
        // GitHub uses 403 for rate limiting; treat as breaker-worthy failure
        this.breaker.onFailure();
        if (String(e?.message || "").includes("403")) {
          throw new Error("GitHub API rate limit exceeded (403). Set GITHUB_TOKEN for higher limits.");
        }
        throw e;
      });
    const data = (await response.json());
    const items = Array.isArray(data?.items) ? data.items : [];
    const repos = items.slice(0, perPage);
    let readmeFetchMsTotal = 0;
    const maxReadmes = includeReadme ? Math.min(5, repos.length) : 0;
    const result = [];
    for (let i = 0; i < repos.length; i++) {
      const repo = repos[i];
      if (includeReadme && i < maxReadmes) {
        const fullName = String(repo.full_name || repo.name || "").trim();
        const readmeKey = fullName ? `webcore:github:readme:${fullName}` : "webcore:github:readme:unknown";
        const cachedReadme = await this.cache?.getJson(readmeKey);
        if (cachedReadme) {
          result.push(this.toDatasetMetadata(repo, { readmeText: cachedReadme }));
          continue;
        }
        const t0 = Date.now();
        // README fetch is best-effort: failures degrade to description-only.
        const readmeText = await this.fetchReadme(repo).catch(() => undefined);
        readmeFetchMsTotal += Date.now() - t0;
        if (readmeText) {
          // NOTE(review): the cached value is the UNtruncated text while the
          // result uses the 50k-truncated form — confirm this is intentional.
          await this.cache?.setJson(readmeKey, readmeText, 21600); // 6h
        }
        result.push(this.toDatasetMetadata(repo, { readmeText: readmeText ? this.truncate50k(readmeText) : undefined }));
      }
      else {
        result.push(this.toDatasetMetadata(repo, { readmeText: undefined }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 21600); // 6h
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: readmeFetchMsTotal };
  }

  /** Maps a GitHub repository object to the shared dataset-metadata shape. */
  toDatasetMetadata(repo, input) {
    const fullName = String(repo.full_name || repo.name || "unknown").trim();
    const description = String(repo.description || "").trim() || "No description provided.";
    const ownerRepo = this.parseOwnerRepo(fullName);
    const owner = ownerRepo ? ownerRepo.split("/")[0] : "";
    const stars = Number(repo.stargazers_count || 0);
    const forks = Number(repo.forks_count || 0);
    const updatedAt = repo.updated_at || new Date().toISOString();
    const topics = Array.isArray(repo.topics) ? repo.topics.filter(Boolean).map(String) : [];
    const language = repo.language ? [String(repo.language)] : [];
    const licenseId = repo.license?.spdx_id && repo.license.spdx_id !== "NOASSERTION"
      ? String(repo.license.spdx_id)
      : "unknown";
    const licenseName = repo.license?.name ? String(repo.license.name) : undefined;
    const qualityWarnings = [];
    if (stars < 5)
      qualityWarnings.push("Low star count; may be low-signal");
    if (description.length < 80)
      qualityWarnings.push("Short description; relevance may be weaker");
    const abstractLength = input.readmeText ? input.readmeText.length : description.length;
    const authorsPresent = !!owner;
    const datePresent = !!updatedAt;
    const contentDepth = Math.max(abstractLength, input.readmeText ? input.readmeText.length : description.length);
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: fullName,
      source: "github",
      name: fullName.split("/").pop() || fullName,
      description,
      ...(owner ? { authors: [owner] } : {}),
      // Heuristic popularity proxy: GitHub has no download counter for repos.
      downloads: forks * 10,
      likes: stars,
      stars,
      tags: topics,
      last_updated: updatedAt,
      task: "code",
      languages: language,
      domain: "research",
      splits: [],
      license: {
        id: licenseId,
        name: licenseName,
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: String(repo.html_url || `https://github.com/${fullName}`),
      format: "GIT",
      total_examples: 1,
      is_structured: false,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      // Fixed: report README presence truthfully instead of hard-coded false,
      // so it agrees with the attached webcore_content below.
      has_readme: !!input.readmeText,
      metadata_url: String(repo.html_url || `https://github.com/${fullName}`),
      ...(input.readmeText
        ? { webcore_content: this.truncate50k(input.readmeText), webcore_content_kind: "readme_text" }
        : {}),
    };
  }

  /** Caps free text at 50k chars; null/undefined become "". */
  truncate50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /** Returns "owner/repo" when the input contains a slash, else null. */
  parseOwnerRepo(fullName) {
    const trimmed = String(fullName || "").trim();
    if (trimmed.includes("/"))
      return trimmed;
    return null;
  }

  /**
   * Fetches a repo's README.md from raw.githubusercontent.com, trying the
   * default branch then "main"/"master". Returns undefined when not found.
   * @throws on 403/429 (feeds the circuit breaker).
   */
  async fetchReadme(repo) {
    const fullName = String(repo.full_name || repo.name || "").trim();
    const ownerRepo = this.parseOwnerRepo(fullName);
    if (!ownerRepo)
      return undefined;
    const [owner, name] = ownerRepo.split("/");
    const candidates = [];
    if (repo.default_branch)
      candidates.push(String(repo.default_branch));
    candidates.push("main", "master");
    const uniq = Array.from(new Set(candidates.filter(Boolean)));
    const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
    const headers = {
      "User-Agent": "vesper/2.0 (phase1-github-readme)",
      "Accept": "text/plain",
    };
    if (token)
      headers["Authorization"] = `Bearer ${token}`;
    for (const branch of uniq) {
      const url = `https://raw.githubusercontent.com/${owner}/${name}/${branch}/README.md`;
      const res = await fetch(url, { headers }).catch(() => null);
      if (!res)
        continue;
      if (res.status === 404)
        continue;
      if (res.status === 429 || res.status === 403) {
        this.breaker.onFailure();
        throw new Error(`GitHub README fetch failed with status ${res.status}`);
      }
      if (!res.ok)
        continue;
      return await res.text();
    }
    return undefined;
  }
}
|