@vespermcp/mcp-server 1.2.22 → 1.2.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
/**
 * Minimal three-state circuit breaker: closed -> open -> half_open -> closed.
 *
 * While closed, failures are counted; once `options.failureThreshold`
 * consecutive failures occur the breaker opens for `options.openDurationMs`.
 * After that window a probe is allowed (half_open); the breaker closes again
 * once `options.halfOpenSuccessesToClose` consecutive probe successes occur,
 * and re-opens on any probe failure.
 */
export class CircuitBreaker {
    name;
    options;
    state = "closed";
    consecutiveFailures = 0;
    openUntilMs = 0;
    halfOpenSuccesses = 0;
    constructor(name, options) {
        this.name = name;
        this.options = options;
    }
    /**
     * Whether a call may be attempted right now.
     * Side effect: an expired "open" window transitions the breaker to
     * "half_open" (probe mode) before returning true.
     */
    canAttempt() {
        switch (this.state) {
            case "open": {
                if (Date.now() < this.openUntilMs)
                    return false;
                this.state = "half_open";
                this.halfOpenSuccesses = 0;
                return true;
            }
            default:
                // Both "closed" and "half_open" permit attempts.
                return true;
        }
    }
    /** Record a successful call; may close a half-open breaker. */
    onSuccess() {
        if (this.state !== "half_open") {
            this.consecutiveFailures = 0;
            this.state = "closed";
            return;
        }
        this.halfOpenSuccesses += 1;
        if (this.halfOpenSuccesses >= this.options.halfOpenSuccessesToClose) {
            this.state = "closed";
            this.consecutiveFailures = 0;
            this.openUntilMs = 0;
        }
    }
    /** Record a failed call; trips immediately in half_open, or on threshold. */
    onFailure() {
        if (this.state === "half_open") {
            this.trip();
            return;
        }
        this.consecutiveFailures += 1;
        if (this.consecutiveFailures >= this.options.failureThreshold) {
            this.trip();
        }
    }
    /** Open the breaker for `openDurationMs` and reset all counters. */
    trip() {
        this.state = "open";
        this.openUntilMs = Date.now() + this.options.openDurationMs;
        this.consecutiveFailures = 0;
        this.halfOpenSuccesses = 0;
        console.error(`[CircuitBreaker] Opened: ${this.name} (until ${new Date(this.openUntilMs).toISOString()})`);
    }
    /** Snapshot of breaker identity/state; includes open_until only when open. */
    getStatus() {
        const status = { name: this.name, state: this.state };
        if (this.state === "open") {
            status.open_until = new Date(this.openUntilMs).toISOString();
        }
        return status;
    }
}
@@ -0,0 +1,228 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
/**
 * GitHub repository search connector for Phase 1 Web Core.
 *
 * Queries the GitHub REST search API, optionally enriches the top hits with
 * README text from raw.githubusercontent.com, and maps each repository into
 * the shared dataset-metadata shape. Results are cached (6h TTL) and remote
 * calls are guarded by a per-connector circuit breaker.
 */
export class GithubSource {
    // Optional cache (duck-typed: getJson(key) / setJson(key, value, ttlSeconds)).
    cache;
    // Opens after 5 consecutive failures, stays open 30s, closes after
    // 2 successful half-open probes.
    breaker = new CircuitBreaker("github", {
        failureThreshold: 5,
        openDurationMs: 30_000,
        halfOpenSuccessesToClose: 2,
    });
    constructor(cache) {
        this.cache = cache;
    }
    /**
     * Convenience wrapper: search without README enrichment and return only
     * the result list (telemetry discarded).
     */
    async discover(query, limit = 20) {
        const out = await this.discoverWithTelemetry(query, limit, { include_readme: false });
        return out.results;
    }
    /**
     * Search repositories and return results plus telemetry.
     *
     * @param query Free-text query; blank input short-circuits to [].
     * @param limit Max results, clamped to [1, 100].
     * @param input Options; `include_readme: true` fetches README text for up
     *              to the first 5 hits.
     * @returns { results, cacheHit, latencyMs, readme_fetch_ms_total }
     * @throws when the circuit is open, or when the search request ultimately
     *         fails after retries.
     */
    async discoverWithTelemetry(query, limit = 20, input = {}) {
        const start = Date.now();
        const cleanQuery = String(query || "").trim();
        if (!cleanQuery) {
            return { results: [], cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: 0 };
        }
        const includeReadme = input.include_readme === true;
        const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
        const cacheKey = `webcore:github:discover:${cleanQuery.toLowerCase()}:per_page=${perPage}:readme=${includeReadme ? 1 : 0}`;
        const cached = await this.cache?.getJson(cacheKey);
        if (cached) {
            // Fix: report readme_fetch_ms_total on the cached path too, so the
            // telemetry shape is consistent with the other return paths
            // (serving from cache performs no README fetches).
            return { results: cached, cacheHit: true, latencyMs: Date.now() - start, readme_fetch_ms_total: 0 };
        }
        if (!this.breaker.canAttempt()) {
            throw new Error("GitHub connector is temporarily unavailable (circuit open).");
        }
        // Widen recall: match query terms in name, description and readme.
        const refinedQuery = `${cleanQuery} in:name,description,readme`;
        const url = `https://api.github.com/search/repositories?q=${encodeURIComponent(refinedQuery)}&sort=stars&order=desc&per_page=${perPage}`;
        const headers = {
            "Accept": "application/vnd.github+json",
            "User-Agent": "vesper/2.0 (phase1-github-connector)",
            "X-GitHub-Api-Version": "2022-11-28",
        };
        const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
        if (token)
            headers["Authorization"] = `Bearer ${token}`;
        const response = await rateLimitedFetch(url, { headers }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 })
            .catch((e) => {
            // GitHub uses 403 for rate limiting; treat as breaker-worthy failure
            this.breaker.onFailure();
            if (String(e?.message || "").includes("403")) {
                throw new Error("GitHub API rate limit exceeded (403). Set GITHUB_TOKEN for higher limits.");
            }
            throw e;
        });
        const data = (await response.json());
        const items = Array.isArray(data?.items) ? data.items : [];
        const repos = items.slice(0, perPage);
        let readmeFetchMsTotal = 0;
        // README enrichment is capped at the first 5 repos to bound latency.
        const maxReadmes = includeReadme ? Math.min(5, repos.length) : 0;
        const result = [];
        for (let i = 0; i < repos.length; i++) {
            const repo = repos[i];
            if (includeReadme && i < maxReadmes) {
                const fullName = String(repo.full_name || repo.name || "").trim();
                const readmeKey = fullName ? `webcore:github:readme:${fullName}` : "webcore:github:readme:unknown";
                const cachedReadme = await this.cache?.getJson(readmeKey);
                if (cachedReadme) {
                    result.push(this.toDatasetMetadata(repo, { readmeText: cachedReadme }));
                    continue;
                }
                const t0 = Date.now();
                // Best effort: a README failure never fails the whole search.
                const readmeText = await this.fetchReadme(repo).catch(() => undefined);
                readmeFetchMsTotal += Date.now() - t0;
                if (readmeText) {
                    await this.cache?.setJson(readmeKey, readmeText, 21600); // 6h
                }
                result.push(this.toDatasetMetadata(repo, { readmeText: readmeText ? this.truncate50k(readmeText) : undefined }));
            }
            else {
                result.push(this.toDatasetMetadata(repo, { readmeText: undefined }));
            }
        }
        this.breaker.onSuccess();
        await this.cache?.setJson(cacheKey, result, 21600); // 6h
        return { results: result, cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: readmeFetchMsTotal };
    }
    /**
     * Map one GitHub search item to the shared dataset-metadata shape.
     * `input.readmeText`, when present, is attached as webcore_content and
     * increases the content-depth signal of the quality estimate.
     */
    toDatasetMetadata(repo, input) {
        const fullName = String(repo.full_name || repo.name || "unknown").trim();
        const description = String(repo.description || "").trim() || "No description provided.";
        const ownerRepo = this.parseOwnerRepo(fullName);
        const owner = ownerRepo ? ownerRepo.split("/")[0] : "";
        const stars = Number(repo.stargazers_count || 0);
        const forks = Number(repo.forks_count || 0);
        const updatedAt = repo.updated_at || new Date().toISOString();
        const topics = Array.isArray(repo.topics) ? repo.topics.filter(Boolean).map(String) : [];
        const language = repo.language ? [String(repo.language)] : [];
        // SPDX "NOASSERTION" means GitHub could not detect a license.
        const licenseId = repo.license?.spdx_id && repo.license.spdx_id !== "NOASSERTION"
            ? String(repo.license.spdx_id)
            : "unknown";
        const licenseName = repo.license?.name ? String(repo.license.name) : undefined;
        const qualityWarnings = [];
        if (stars < 5)
            qualityWarnings.push("Low star count; may be low-signal");
        if (description.length < 80)
            qualityWarnings.push("Short description; relevance may be weaker");
        // Heuristic: "awesome"/"list"-style repos are link collections rather
        // than primary content.
        const lowSignalPatterns = [
            /\bawesome\b/i,
            /\bresources?\b/i,
            /\bcurated\b/i,
            /\blist\b/i,
            /\bcollection\b/i,
        ];
        const lowSignalText = `${fullName} ${description}`;
        const looksResourceList = lowSignalPatterns.some((rx) => rx.test(lowSignalText));
        if (looksResourceList) {
            qualityWarnings.push("Repository appears to be a resource/list collection; relevance may be indirect.");
        }
        const abstractLength = input.readmeText ? input.readmeText.length : description.length;
        const authorsPresent = !!owner;
        const datePresent = !!updatedAt;
        const contentDepth = Math.max(abstractLength, input.readmeText ? input.readmeText.length : description.length);
        const quality01 = estimateQualityScore({
            abstractLength,
            authorsPresent,
            datePresent,
            contentDepth,
        });
        let adjustedQuality01 = quality01;
        // Calibrate GitHub quality so resource-list repos don't dominate.
        // Keep penalty moderate; readme-rich/long-form repos still score well.
        if (looksResourceList) {
            adjustedQuality01 -= 0.14;
        }
        if (!input.readmeText && description.length < 140) {
            adjustedQuality01 -= 0.08;
        }
        if (stars < 50) {
            adjustedQuality01 -= 0.04;
        }
        adjustedQuality01 = Math.max(0.3, Math.min(1.0, adjustedQuality01));
        return {
            id: fullName,
            source: "github",
            name: fullName.split("/").pop() || fullName,
            description,
            ...(owner ? { authors: [owner] } : {}),
            // Proxy popularity signal: GitHub has no download counter.
            downloads: forks * 10,
            likes: stars,
            stars,
            tags: topics,
            last_updated: updatedAt,
            task: "code",
            languages: language,
            domain: "research",
            splits: [],
            license: {
                id: licenseId,
                name: licenseName,
                category: "unknown",
                usage_restrictions: [],
                warnings: [],
            },
            quality_score: Math.round(adjustedQuality01 * 100),
            quality_warnings: qualityWarnings,
            download_url: String(repo.html_url || `https://github.com/${fullName}`),
            format: "GIT",
            total_examples: 1,
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            // NOTE(review): stays false even when webcore_content carries the
            // README — confirm whether downstream relies on this flag.
            has_readme: false,
            metadata_url: String(repo.html_url || `https://github.com/${fullName}`),
            ...(input.readmeText
                ? { webcore_content: this.truncate50k(input.readmeText), webcore_content_kind: "readme_text" }
                : {}),
        };
    }
    /** Hard cap on stored/attached text: first 50k characters. */
    truncate50k(text) {
        return String(text || "").slice(0, 50_000);
    }
    /** Returns the trimmed "owner/repo" string when it contains a slash, else null. */
    parseOwnerRepo(fullName) {
        const trimmed = String(fullName || "").trim();
        if (trimmed.includes("/"))
            return trimmed;
        return null;
    }
    /**
     * Fetch the raw README.md via raw.githubusercontent.com, trying the repo's
     * default branch first, then "main" and "master".
     *
     * Rate-limit style responses (403/429) feed the circuit breaker and throw;
     * 404s and other non-OK statuses fall through to the next branch candidate.
     * @returns README text, or undefined when none could be fetched.
     */
    async fetchReadme(repo) {
        const fullName = String(repo.full_name || repo.name || "").trim();
        const ownerRepo = this.parseOwnerRepo(fullName);
        if (!ownerRepo)
            return undefined;
        const [owner, name] = ownerRepo.split("/");
        const candidates = [];
        if (repo.default_branch)
            candidates.push(String(repo.default_branch));
        candidates.push("main", "master");
        const uniq = Array.from(new Set(candidates.filter(Boolean)));
        const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
        const headers = {
            "User-Agent": "vesper/2.0 (phase1-github-readme)",
            "Accept": "text/plain",
        };
        if (token)
            headers["Authorization"] = `Bearer ${token}`;
        for (const branch of uniq) {
            const url = `https://raw.githubusercontent.com/${owner}/${name}/${branch}/README.md`;
            const res = await fetch(url, { headers }).catch(() => null);
            if (!res)
                continue;
            if (res.status === 404)
                continue;
            if (res.status === 429 || res.status === 403) {
                this.breaker.onFailure();
                throw new Error(`GitHub README fetch failed with status ${res.status}`);
            }
            if (!res.ok)
                continue;
            return await res.text();
        }
        return undefined;
    }
}
@@ -0,0 +1,123 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
/**
 * Hacker News connector (Algolia search API) for Phase 1 Web Core.
 *
 * Searches hn.algolia.com and maps each hit into the shared dataset-metadata
 * shape. Results are cached for 2h; remote calls are guarded by a
 * per-connector circuit breaker.
 */
export class HackerNewsSource {
    // Optional cache (duck-typed: getJson(key) / setJson(key, value, ttlSeconds)).
    cache;
    breaker = new CircuitBreaker("hackernews", {
        failureThreshold: 5,
        openDurationMs: 30_000,
        halfOpenSuccessesToClose: 2,
    });
    constructor(cache) {
        this.cache = cache;
    }
    /** Result-only wrapper around discoverWithTelemetry(). */
    async discover(query, limit = 20) {
        const { results } = await this.discoverWithTelemetry(query, limit);
        return results;
    }
    /**
     * Search HN stories and return results plus cache/latency telemetry.
     * Blank queries short-circuit to []; an open circuit throws.
     */
    async discoverWithTelemetry(query, limit = 20) {
        const startedAt = Date.now();
        const trimmedQuery = String(query || "").trim();
        if (!trimmedQuery) {
            return { results: [], cacheHit: false, latencyMs: Date.now() - startedAt };
        }
        const hitsPerPage = Math.max(1, Math.min(100, Number(limit || 20)));
        const cacheKey = `webcore:hackernews:discover:${trimmedQuery.toLowerCase()}:hitsPerPage=${hitsPerPage}`;
        const cachedResults = await this.cache?.getJson(cacheKey);
        if (cachedResults) {
            return { results: cachedResults, cacheHit: true, latencyMs: Date.now() - startedAt };
        }
        if (!this.breaker.canAttempt()) {
            throw new Error("Hacker News connector is temporarily unavailable (circuit open).");
        }
        const searchUrl = new URL("https://hn.algolia.com/api/v1/search");
        searchUrl.searchParams.set("query", trimmedQuery);
        searchUrl.searchParams.set("hitsPerPage", String(hitsPerPage));
        searchUrl.searchParams.set("page", "0");
        let response;
        try {
            response = await rateLimitedFetch(searchUrl.toString(), {
                headers: {
                    "Accept": "application/json",
                    "User-Agent": "vesper/2.0 (phase1-hackernews-connector)",
                },
            }, { maxRetries: 5, initialDelay: 750, maxDelay: 15000 });
        }
        catch (e) {
            this.breaker.onFailure();
            throw e;
        }
        let data;
        try {
            data = await response.json();
        }
        catch (e) {
            this.breaker.onFailure();
            throw new Error(`Hacker News JSON parse failed: ${e?.message || String(e)}`);
        }
        const hits = Array.isArray(data?.hits) ? data.hits : [];
        const results = hits.slice(0, hitsPerPage).map((hit) => this.toDatasetMetadata(hit));
        this.breaker.onSuccess();
        await this.cache?.setJson(cacheKey, results, 7200); // 2h
        return { results, cacheHit: false, latencyMs: Date.now() - startedAt };
    }
    /** Map one Algolia hit to the shared dataset-metadata shape. */
    toDatasetMetadata(hit) {
        const itemId = String(hit.objectID ?? "").trim();
        const headline = String(hit.title || "").trim() || `HN item ${itemId}`;
        const createdAt = hit.created_at ? String(hit.created_at) : new Date().toISOString();
        const submitter = hit.author ? String(hit.author).trim() : "";
        const points = Number(hit.points || 0);
        const commentCount = Number(hit.num_comments || 0);
        const storyText = String(hit.story_text || "").trim();
        const qualityWarnings = storyText ? [] : ["Missing story text in Hacker News response"];
        const textLength = (storyText || headline).length;
        const quality01 = estimateQualityScore({
            abstractLength: textLength,
            authorsPresent: !!submitter,
            datePresent: !!createdAt,
            contentDepth: textLength,
        });
        const itemUrl = hit.url ? String(hit.url) : (itemId ? `https://news.ycombinator.com/item?id=${itemId}` : "");
        return {
            id: itemId || headline,
            source: "hackernews",
            name: headline,
            description: storyText || headline,
            authors: submitter ? [submitter] : undefined,
            // Comment count stands in for downloads; points for likes/stars.
            downloads: commentCount,
            likes: points,
            stars: points,
            tags: submitter ? ["hackernews", `author:${submitter}`] : ["hackernews"],
            last_updated: createdAt,
            task: "thread",
            languages: [],
            domain: "tech",
            splits: [],
            license: {
                id: "unknown",
                category: "unknown",
                usage_restrictions: [],
                warnings: [],
            },
            quality_score: Math.round(quality01 * 100),
            quality_warnings: qualityWarnings,
            download_url: itemUrl,
            format: "TEXT",
            total_examples: 1,
            total_size_bytes: undefined,
            total_size_mb: undefined,
            columns: [
                { name: "title", type: "string" },
                { name: "text", type: "string" },
            ],
            is_structured: true,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: textLength,
            has_readme: false,
            metadata_url: itemUrl,
        };
    }
}
@@ -46,3 +46,30 @@ export function calculateQualityScore(data) {
46
46
  score += 2;
47
47
  return Math.min(100, score);
48
48
  }
49
/**
 * Phase 1 Web Core quality estimator (document-first).
 *
 * Produces a score in [0.3, 1.0] (never 0) from four weighted signals:
 * - abstract length (text richness, logistic squash centred at 200 chars)
 * - content depth (full text / README length, centred at 1200 chars)
 * - authors presence (provenance)
 * - date presence (freshness / completeness)
 *
 * The 0.3 base is deliberate: complete records cluster near 1.0, and no
 * record is scored below the floor.
 */
export function estimateQualityScore(input) {
    // Logistic squash centred at `mid`, scaled by `width`; approaches 1 once
    // `value` is moderately past the midpoint.
    const squash = (value, mid, width) => 1 / (1 + Math.exp(-(value - mid) / width));
    const abstractLen = Math.max(0, Number(input.abstractLength || 0));
    const depthLen = Math.max(0, Number(input.contentDepth || 0));
    const weighted = 0.25 * squash(abstractLen, 200, 120)
        + 0.25 * squash(depthLen, 1200, 500)
        + 0.1 * (input.authorsPresent ? 1 : 0)
        + 0.1 * (input.datePresent ? 1 : 0);
    // Clamp to [0.3, 1.0]
    return Math.max(0.3, Math.min(1.0, 0.3 + weighted));
}
@@ -0,0 +1,138 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
/**
 * Semantic Scholar (Graph API) paper-search connector for Phase 1 Web Core.
 *
 * Searches the paper/search endpoint with a fixed field projection and maps
 * each paper into the shared dataset-metadata shape. Results are cached for
 * 24h; remote calls are guarded by a per-connector circuit breaker.
 */
export class SemanticScholarSource {
    // Optional cache (duck-typed: getJson(key) / setJson(key, value, ttlSeconds)).
    cache;
    breaker = new CircuitBreaker("semantic_scholar", {
        failureThreshold: 5,
        openDurationMs: 30_000,
        halfOpenSuccessesToClose: 2,
    });
    constructor(cache) {
        this.cache = cache;
    }
    /** Result-only wrapper around discoverWithTelemetry(). */
    async discover(query, limit = 20) {
        const { results } = await this.discoverWithTelemetry(query, limit);
        return results;
    }
    /**
     * Search papers and return results plus cache/latency telemetry.
     * Blank queries short-circuit to []; an open circuit throws.
     */
    async discoverWithTelemetry(query, limit = 20) {
        const startedAt = Date.now();
        const trimmedQuery = String(query || "").trim();
        if (!trimmedQuery) {
            return { results: [], cacheHit: false, latencyMs: Date.now() - startedAt };
        }
        const pageSize = Math.max(1, Math.min(100, Number(limit || 20)));
        const cacheKey = `webcore:semantic_scholar:discover:${trimmedQuery.toLowerCase()}:limit=${pageSize}`;
        const cachedResults = await this.cache?.getJson(cacheKey);
        if (cachedResults) {
            return { results: cachedResults, cacheHit: true, latencyMs: Date.now() - startedAt };
        }
        if (!this.breaker.canAttempt()) {
            throw new Error("Semantic Scholar connector is temporarily unavailable (circuit open).");
        }
        const searchUrl = new URL("https://api.semanticscholar.org/graph/v1/paper/search");
        searchUrl.searchParams.set("query", trimmedQuery);
        searchUrl.searchParams.set("limit", String(pageSize));
        // Explicit field projection keeps the response small and stable.
        const fields = [
            "paperId",
            "title",
            "abstract",
            "url",
            "venue",
            "year",
            "citationCount",
            "authors",
            "publicationTypes",
            "openAccessPdf",
        ];
        searchUrl.searchParams.set("fields", fields.join(","));
        let response;
        try {
            response = await rateLimitedFetch(searchUrl.toString(), {
                headers: {
                    "Accept": "application/json",
                    "User-Agent": "vesper/2.0 (phase1-semantic-scholar-connector)",
                },
            }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 });
        }
        catch (e) {
            this.breaker.onFailure();
            throw e;
        }
        let payload;
        try {
            payload = await response.json();
        }
        catch (e) {
            this.breaker.onFailure();
            throw new Error(`Semantic Scholar JSON parse failed: ${e?.message || String(e)}`);
        }
        const papers = Array.isArray(payload?.data) ? payload.data : [];
        const results = papers.map((paper) => this.toDatasetMetadata(paper)).filter(Boolean);
        this.breaker.onSuccess();
        await this.cache?.setJson(cacheKey, results, 86400); // 24h
        return { results, cacheHit: false, latencyMs: Date.now() - startedAt };
    }
    /** Map one Graph API paper record to the shared dataset-metadata shape. */
    toDatasetMetadata(paper) {
        const paperId = String(paper.paperId || paper.externalIds?.DOI || "").trim();
        const title = String(paper.title || "").trim();
        const abstract = String(paper.abstract || "").trim();
        const pageUrl = String(paper.url || "").trim();
        const authorNames = Array.isArray(paper.authors)
            ? paper.authors.map((a) => String(a.name || "").trim()).filter(Boolean)
            : [];
        const tags = [
            ...(Array.isArray(paper.publicationTypes) ? paper.publicationTypes.filter(Boolean).map(String) : []),
            ...(paper.venue ? [String(paper.venue)] : []),
        ];
        const citations = Number(paper.citationCount || 0);
        const qualityWarnings = abstract ? [] : ["Missing abstract from Semantic Scholar response"];
        const textLength = (abstract || title).length;
        const quality01 = estimateQualityScore({
            abstractLength: textLength,
            authorsPresent: authorNames.length > 0,
            datePresent: paper.year !== undefined && paper.year !== null,
            contentDepth: textLength,
        });
        // Prefer the open-access PDF, then the paper page, then a synthesized link.
        const openAccessPdfUrl = paper.openAccessPdf?.url ? String(paper.openAccessPdf.url).trim() : undefined;
        const downloadUrl = openAccessPdfUrl || pageUrl || (paperId ? `https://www.semanticscholar.org/paper/${paperId}` : "");
        return {
            id: paperId || pageUrl || title,
            source: "semantic_scholar",
            name: title || "Untitled",
            description: abstract || title,
            authors: authorNames,
            downloads: 0,
            // Citation count stands in for likes/stars.
            likes: citations,
            stars: citations,
            tags,
            last_updated: new Date().toISOString(),
            task: "research-paper",
            languages: [],
            domain: "research",
            splits: [],
            license: {
                id: "unknown",
                category: "unknown",
                usage_restrictions: [],
                warnings: [],
            },
            quality_score: Math.round(quality01 * 100),
            quality_warnings: qualityWarnings,
            download_url: downloadUrl,
            format: "PDF",
            total_examples: 1,
            total_size_bytes: undefined,
            total_size_mb: undefined,
            columns: [
                { name: "title", type: "string" },
                { name: "abstract", type: "string" },
            ],
            is_structured: true,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: textLength,
            has_readme: false,
            metadata_url: pageUrl || (paperId ? `https://www.semanticscholar.org/paper/${paperId}` : ""),
        };
    }
}
@@ -0,0 +1,104 @@
1
+ import { CacheService, MockRedisProvider } from "../cache/service.js";
2
+ import { ArxivSource } from "../metadata/arxiv-source.js";
3
+ import { GithubSource } from "../metadata/github-source.js";
4
+ import { SemanticScholarSource } from "../metadata/semantic-scholar-source.js";
5
+ import { HackerNewsSource } from "../metadata/hackernews-source.js";
6
+ import { WebCoreEngine } from "../web/web-core.js";
7
/**
 * Nearest-rank percentile (p in [0, 1]) of a numeric sample.
 * Returns 0 for an empty sample; does not mutate the input array.
 */
function percentile(values, p) {
    if (!values.length)
        return 0;
    const ascending = values.slice().sort((a, b) => a - b);
    const rank = Math.ceil(p * ascending.length) - 1;
    return ascending[Math.max(0, rank)] ?? 0;
}
14
/**
 * Build the fixed list of 100 validation queries by crossing 10 topic
 * phrases with 10 qualifier suffixes (topics cycle fastest, so every
 * topic/suffix pair appears exactly once).
 */
function makeQueries() {
    const topics = [
        "agentic RAG evaluation",
        "tool-augmented retrieval",
        "LLM dataset quality scoring",
        "semantic deduplication embeddings",
        "cross-source dataset fusion",
        "retrieval augmented generation metrics",
        "dataset export parquet arrow jsonl",
        "data safety dataset provenance",
        "synthetic data generation alignment",
        "multi-source corpus building",
    ];
    const suffixes = [
        "paper",
        "benchmark",
        "latency",
        "quality",
        "dedup",
        "provenance",
        "evaluation",
        "pipeline",
        "MCP",
        "agents",
    ];
    return Array.from({ length: 100 }, (_, i) => {
        const topic = topics[i % topics.length];
        const suffix = suffixes[Math.floor(i / topics.length) % suffixes.length];
        return `${topic} ${suffix}`.trim();
    });
}
47
/**
 * Phase 1 Web Core validation harness.
 *
 * Wires the WebCoreEngine to all four connectors (backed by a mock Redis
 * cache), checks cache-hit telemetry by repeating one query, then runs 100
 * queries over arxiv+github and reports the p95 quality score and p95 arxiv
 * PDF-extraction time against the phase success criteria.
 */
async function main() {
    const cache = new CacheService(new MockRedisProvider());
    const engine = new WebCoreEngine({
        arxivSource: new ArxivSource(cache),
        githubSource: new GithubSource(cache),
        semanticScholarSource: new SemanticScholarSource(cache),
        hackerNewsSource: new HackerNewsSource(cache),
    });
    const baseQuery = "agentic RAG evaluation";
    const flags = { arxiv_full_text: true, github_include_readme: true };
    const sources = ["arxiv", "github"];
    console.log("Phase 1 Web Core validation:");
    console.log("- Checking cached latency regression on a single query...");
    // First call warms the cache; the second should be served from it.
    await engine.find({ query: baseQuery, sources: [...sources], limit: 2, ...flags });
    const run2 = await engine.find({ query: baseQuery, sources: [...sources], limit: 2, ...flags });
    const run2Arxiv = run2.telemetry?.per_source.find((t) => t.source === "arxiv");
    const run2Github = run2.telemetry?.per_source.find((t) => t.source === "github");
    console.log("Cached telemetry (run2):", {
        arxiv: run2Arxiv ? { cache_hit: run2Arxiv.cache_hit, latency_ms: run2Arxiv.latency_ms } : null,
        github: run2Github ? { cache_hit: run2Github.cache_hit, latency_ms: run2Github.latency_ms } : null,
    });
    // 100 query quality distribution test
    console.log("- Running 100 test queries (quality distribution + extraction latency)...");
    const queries = makeQueries();
    const qualityScores = [];
    const pdfExtractMs = [];
    for (let i = 0; i < queries.length; i++) {
        const q = queries[i];
        const res = await engine.find({ query: q, sources: [...sources], limit: 2, ...flags });
        for (const doc of res.results) {
            qualityScores.push(Number(doc.quality_score));
        }
        const arxivTelemetry = res.telemetry?.per_source.find((t) => t.source === "arxiv");
        if (arxivTelemetry) {
            pdfExtractMs.push(Number(arxivTelemetry.pdf_extract_ms_total || 0));
        }
        if ((i + 1) % 10 === 0) {
            console.log(`  progress: ${i + 1}/100`);
        }
    }
    const q95 = percentile(qualityScores, 0.95);
    const pdfP95 = percentile(pdfExtractMs, 0.95);
    console.log("\nResults:");
    console.log(`- Quality score p95: ${q95}`);
    console.log(`- PDF extract ms p95 (arxiv): ${pdfP95}`);
    // Fix: quality_score is on a 0-100 scale (sources emit Math.round(q * 100)),
    // so the p95 threshold must be 90, not 0.9 — the old comparison passed
    // trivially for any non-degenerate distribution.
    const okQuality = q95 > 90;
    const okPdfLatency = pdfP95 < 5000;
    console.log(`\nSuccess criteria:`);
    console.log(`- Quality p95 > 90: ${okQuality ? "PASS" : "FAIL"}`);
    console.log(`- PDF extract p95 < 5000ms additional: ${okPdfLatency ? "PASS" : "FAIL"}`);
    if (!okQuality) {
        console.log("Tip: adjust estimateQualityScore() weights/thresholds in src/metadata/quality.ts then rerun.");
    }
}
101
// Entry point: surface any unhandled failure and exit non-zero.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});