@vespermcp/mcp-server 1.2.22 → 1.2.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,483 @@
1
+ import { S3Client, ListObjectsV2Command, GetObjectCommand } from "@aws-sdk/client-s3";
2
+ import { fromEnv, fromTemporaryCredentials } from "@aws-sdk/credential-providers";
3
+ import crypto from "crypto";
4
+ import { estimateQualityScore } from "../metadata/quality.js";
5
+ ;
6
/**
 * Derive a stable cross-source identity key for a collected document.
 * Each source type has its own canonical identifier in metadata_json;
 * returns null when no canonical identity can be determined.
 * @param {object} doc - Document with `source_type`, optional `metadata_json` and `source_url`.
 * @returns {string|null} Canonical identity string, or null if unknown.
 */
function identityKey(doc) {
    const meta = doc.metadata_json || {};
    if (doc.source_type === "arxiv") {
        return typeof meta.arxiv_id === "string" ? meta.arxiv_id : null;
    }
    if (doc.source_type === "github") {
        return typeof meta.full_name === "string" ? meta.full_name : null;
    }
    if (doc.source_type === "semantic_scholar") {
        return typeof meta.paper_id === "string" ? meta.paper_id : null;
    }
    if (doc.source_type === "hackernews") {
        // item_id may be numeric; stringify for a uniform key space.
        return typeof meta.item_id !== "undefined" ? String(meta.item_id) : null;
    }
    if (doc.source_type === "s3") {
        if (typeof meta.bucket === "string" && typeof meta.key === "string") {
            return `s3://${meta.bucket}/${meta.key}`;
        }
        // Fall back to the recorded URL when bucket/key metadata is missing.
        return typeof doc.source_url === "string" ? doc.source_url : null;
    }
    return null;
}
26
/**
 * Normalize document text before embedding: collapse whitespace runs to
 * single spaces, trim, and cap at 4000 characters so embedding calls stay
 * cheap and consistent across sources.
 * @param {*} content - Raw document content (any falsy value becomes "").
 * @returns {string} Normalized, length-capped text.
 */
function safeContentForEmbedding(content) {
    const MAX_EMBED_CHARS = 4000;
    const collapsed = String(content || "").replace(/\s+/g, " ").trim();
    return collapsed.slice(0, MAX_EMBED_CHARS);
}
31
/**
 * SHA-256 hex digest of a case- and whitespace-normalized form of the
 * content, so trivially reformatted copies of the same text collapse into
 * one exact-duplicate bucket.
 * @param {*} content - Raw content (any falsy value hashes as "").
 * @returns {string} 64-char lowercase hex digest.
 */
function normalizedHash(content) {
    let canonical = String(content || "");
    canonical = canonical.toLowerCase();
    canonical = canonical.replace(/\s+/g, " ").trim();
    const hasher = crypto.createHash("sha256");
    hasher.update(canonical);
    return hasher.digest("hex");
}
38
/**
 * Build the set of lowercase alphanumeric words (length >= 3) found in the
 * content. Punctuation and other symbols are treated as separators.
 * @param {*} content - Raw content (any falsy value yields an empty set).
 * @returns {Set<string>} Unique tokens of length 3 or more.
 */
function tokenize(content) {
    const cleaned = String(content || "")
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, " ");
    const kept = [];
    for (const word of cleaned.split(/\s+/)) {
        // Drop very short words ("a", "of", ...) — too noisy for overlap checks.
        if (word.length >= 3) {
            kept.push(word);
        }
    }
    return new Set(kept);
}
46
/**
 * Token set for a document's title. Documents without a string title in
 * metadata_json produce an empty set.
 * @param {object} doc - Document with optional `metadata_json.title`.
 * @returns {Set<string>} Title tokens (see tokenize()).
 */
function titleTokens(doc) {
    const meta = doc.metadata_json || {};
    const title = typeof meta.title === "string" ? meta.title : "";
    return tokenize(title);
}
51
/**
 * Cheap lexical prefilter deciding whether a document pair deserves the
 * (expensive) embedding-similarity check. Runs four stages:
 *   1. reject wildly different content lengths;
 *   2. fast-accept when one normalized 140-char prefix contains the other;
 *   3. accept on strong title-token overlap (cross-source papers);
 *   4. otherwise accept on modest full-content token overlap.
 * @param {object} a - Document with string `content` (and optional title metadata).
 * @param {object} b - Document with string `content` (and optional title metadata).
 * @returns {boolean} true when the pair should go to the semantic stage.
 */
function isSuspiciousPair(a, b) {
    // Jaccard similarity of two sets (0 when both are empty).
    const jaccardOf = (setA, setB) => {
        let shared = 0;
        for (const item of setA) {
            if (setB.has(item)) {
                shared++;
            }
        }
        const unionSize = setA.size + setB.size - shared;
        return unionSize > 0 ? shared / unionSize : 0;
    };
    // Stage 1: length gate. Loosened to 0.8 to allow abstract-vs-summary
    // style comparisons.
    const lenA = a.content.length;
    const lenB = b.content.length;
    const lengthGap = Math.abs(lenA - lenB) / Math.max(lenA, lenB, 1);
    if (lengthGap > 0.8) {
        return false;
    }
    // Stage 2: same normalized title-like prefix often indicates the same
    // research object.
    const normalizePrefix = (text) => text.slice(0, 140).toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
    const prefixA = normalizePrefix(a.content);
    const prefixB = normalizePrefix(b.content);
    if (prefixA && prefixB && (prefixA.includes(prefixB) || prefixB.includes(prefixA))) {
        return true;
    }
    // Stage 3: cross-source papers often have close titles even if their
    // abstracts differ.
    const titlesA = titleTokens(a);
    const titlesB = titleTokens(b);
    if (titlesA.size > 0 && titlesB.size > 0 && jaccardOf(titlesA, titlesB) >= 0.25) {
        return true;
    }
    // Stage 4: full-content token overlap. Threshold loosened from 0.12 to
    // 0.08 so the semantic stage inspects more borderline matches.
    const tokensA = tokenize(a.content);
    const tokensB = tokenize(b.content);
    if (tokensA.size === 0 || tokensB.size === 0) {
        return false;
    }
    return jaccardOf(tokensA, tokensB) >= 0.08;
}
92
/**
 * Unified popularity signal across sources: GitHub stars, paper citation
 * counts, or HackerNews points — first one present wins.
 * @param {object} doc - Document with optional `metadata_json`.
 * @returns {number} Finite non-negative-ish popularity value, 0 when absent or unparsable.
 */
function normalizeStars(doc) {
    const meta = doc.metadata_json || {};
    let signal = meta.stars;
    if (signal == null) {
        signal = meta.citation_count;
    }
    if (signal == null) {
        signal = meta.points;
    }
    const parsed = Number(signal || 0);
    if (!Number.isFinite(parsed)) {
        return 0;
    }
    return parsed;
}
98
/**
 * Build a single-entry provenance chain describing where a document came
 * from. Used when flattening a dedup group into one merged result.
 * @param {object} doc - Document with source_type, source_url, collected_at, quality_score.
 * @returns {Array<object>} One-element array with the provenance entry.
 */
function asChain(doc) {
    const score = Number(doc.quality_score || 0);
    const entry = {
        source_type: doc.source_type,
        source_url: doc.source_url,
        collected_at: doc.collected_at,
        quality_score: score,
    };
    return [entry];
}
109
/**
 * Exact deduplication: collapse docs sharing an identity key (or, lacking
 * one, identical raw content) down to the highest-quality representative.
 * @param {Array<object>} docs - Documents to deduplicate.
 * @returns {Map<string, object>} key -> best document for that key.
 */
function dedupExactKeepBest(docs) {
    const bestByKey = new Map();
    for (const doc of docs) {
        // Docs with no canonical identity fall back to raw content as the key.
        const key = identityKey(doc) || `content:${doc.content}`;
        const incumbent = bestByKey.get(key);
        const incomingScore = doc.quality_score || 0;
        if (!incumbent || incomingScore > (incumbent.quality_score || 0)) {
            bestByKey.set(key, doc);
        }
    }
    return bestByKey;
}
120
/**
 * Return a new array of docs ordered by quality_score, highest first.
 * The input array is not mutated.
 * @param {Array<object>} docs - Documents with optional quality_score.
 * @returns {Array<object>} Sorted copy.
 */
function sortByQualityDesc(docs) {
    const copy = docs.slice();
    copy.sort((left, right) => (right.quality_score || 0) - (left.quality_score || 0));
    return copy;
}
123
/**
 * Fuses documents collected from multiple web sources (arxiv, github,
 * semantic_scholar, hackernews, S3) into a deduplicated result set with
 * provenance chains. Deduplication is two-stage: exact normalized-content
 * hashing, then an embedding-similarity pass restricted to "suspicious"
 * cross-source pairs (see isSuspiciousPair).
 */
export class WebFusionEngine {
    // Injected collaborators: webCoreEngine (search), embedder (embed() ->
    // vector), and an optional cache exposing getJson/setJson.
    deps;
    /**
     * @param {object} deps - Dependency bundle; see field comment above.
     */
    constructor(deps) {
        this.deps = deps;
    }
    /**
     * Collect from every requested source in parallel, deduplicate per the
     * requested strategy, and merge each group down to its best-quality
     * representative while preserving provenance of all group members.
     *
     * @param {object} input - { sources: Array<spec>, merge_strategy?, deduplication? }.
     *   Each spec has { type, query?, max_results?, min_stars?, bucket?, path?,
     *   region?, credentials? } (S3 fields only for type === "s3").
     * @returns {Promise<object>} { input, results, stats, telemetry }.
     * @throws {Error} when fewer than 2 sources are supplied.
     */
    async fuse(input) {
        if (!Array.isArray(input?.sources) || input.sources.length < 2) {
            throw new Error("vesper.fuse: sources must contain at least 2 entries.");
        }
        const mergeStrategy = input.merge_strategy || "union";
        const deduplication = input.deduplication || "semantic";
        const sourcesRequested = input.sources;
        // NOTE(review): requestedTypes is computed but never read below.
        const requestedTypes = Array.from(new Set(sourcesRequested.map(s => s.type)));
        // For better semantic dedup accuracy, use richer content when enabled.
        const arxivFullText = deduplication === "semantic";
        const githubIncludeReadme = deduplication === "semantic";
        const telemetry = { per_source: [] };
        const perSourceDocs = {};
        // NOTE(review): error messages are collected here but not included in
        // the returned payload; telemetry rows carry them instead.
        const perSourceErrors = [];
        // Collect from all sources concurrently; each spec records its own
        // telemetry row and failures never reject the overall fuse.
        await Promise.all(sourcesRequested.map(async (spec) => {
            const start = Date.now();
            try {
                let docs = [];
                let cacheHit = false;
                let latencyMs = 0;
                if (spec.type === "s3") {
                    const out = await this.collectFromS3(spec);
                    docs = out.docs;
                    cacheHit = out.cacheHit;
                    latencyMs = out.latencyMs;
                }
                else {
                    // Non-S3 sources go through the shared web search engine.
                    const res = await this.deps.webCoreEngine.find({
                        query: spec.query,
                        sources: [spec.type],
                        limit: Math.max(1, Number(spec.max_results || 10)),
                        arxiv_full_text: arxivFullText,
                        github_include_readme: githubIncludeReadme,
                    });
                    docs = res.results || [];
                    // Prefer the engine's own telemetry when it reports this source.
                    const perSrcTel = res.telemetry?.per_source?.find((t) => t.source === spec.type);
                    cacheHit = perSrcTel ? !!perSrcTel.cache_hit : false;
                    latencyMs = perSrcTel ? Number(perSrcTel.latency_ms) : Date.now() - start;
                }
                // Optional popularity floor (stars / citations / points).
                const filtered = spec.min_stars !== undefined
                    ? docs.filter((d) => normalizeStars(d) >= Number(spec.min_stars))
                    : docs;
                perSourceDocs[spec.type] = (perSourceDocs[spec.type] || []).concat(filtered);
                telemetry.per_source.push({
                    source_type: spec.type,
                    cache_hit: cacheHit,
                    // Falls back to wall-clock when the reported latency is 0/NaN.
                    latency_ms: latencyMs || (Date.now() - start),
                    result_count: filtered.length,
                });
            }
            catch (e) {
                const errMsg = e?.message || String(e);
                perSourceErrors.push(errMsg);
                if (spec.type === "s3") {
                    // Graceful S3 error row for downstream agents.
                    const now = new Date().toISOString();
                    const errorDoc = {
                        source_type: "s3",
                        source_url: spec.bucket ? `s3://${spec.bucket}/${spec.path || ""}` : "s3://",
                        content: `S3 error: ${errMsg}`,
                        metadata_json: {
                            error: true,
                            message: errMsg,
                            bucket: spec.bucket,
                            path: spec.path,
                            region: spec.region,
                        },
                        quality_score: 0,
                        collected_at: now,
                        content_type: "text",
                        source_chain: [
                            { source_type: "s3", source_url: spec.bucket ? `s3://${spec.bucket}/${spec.path || ""}` : "s3://", collected_at: now, quality_score: 0 },
                        ],
                    };
                    perSourceDocs[spec.type] = (perSourceDocs[spec.type] || []).concat([errorDoc]);
                }
                telemetry.per_source.push({
                    source_type: spec.type,
                    cache_hit: false,
                    latency_ms: Date.now() - start,
                    // S3 failures still emit the one synthetic error doc above.
                    result_count: spec.type === "s3" ? 1 : 0,
                    error: errMsg,
                });
            }
        }));
        const allDocs = Object.values(perSourceDocs).flat();
        const inputDocuments = allDocs.length;
        if (inputDocuments === 0) {
            // Nothing collected anywhere: return an empty but well-formed result.
            return {
                input: { merge_strategy: mergeStrategy, deduplication, sources_requested: sourcesRequested },
                results: [],
                stats: { input_documents: 0, output_documents: 0, groups: 0, duplicates_removed: 0 },
                telemetry,
            };
        }
        // 1) Group/dedup
        let groups = [];
        let duplicatesRemoved = 0;
        const shouldDedup = mergeStrategy === "dedup" || deduplication !== "none";
        if (!shouldDedup) {
            // No dedup requested: every doc is its own singleton group.
            groups = allDocs.map((d) => [d]);
        }
        else if (deduplication === "exact") {
            const map = dedupExactKeepBest(allDocs);
            groups = Array.from(map.values()).map((d) => [d]);
        }
        else {
            // Hash-first grouping (exact normalized-content duplicates).
            const hashBuckets = new Map();
            for (const d of allDocs) {
                const h = normalizedHash(d.content);
                const bucket = hashBuckets.get(h) || [];
                bucket.push(d);
                hashBuckets.set(h, bucket);
            }
            const hashRepresentatives = [];
            // NOTE(review): seededGroups is populated but never used afterwards;
            // as written, only the best doc per hash bucket enters the semantic
            // stage, so other bucket members are dropped from group provenance.
            const seededGroups = [];
            for (const bucket of hashBuckets.values()) {
                const best = bucket.slice().sort((a, b) => (b.quality_score || 0) - (a.quality_score || 0))[0];
                hashRepresentatives.push(best);
                seededGroups.push(bucket);
            }
            // Semantic fallback on suspicious pairs only.
            const threshold = 0.93;
            const docsSorted = sortByQualityDesc(hashRepresentatives);
            const accepted = [];
            let candidatesChecked = 0;
            let suspiciousPairsChecked = 0;
            let suspiciousPrefilterRejected = 0;
            for (const d of docsSorted) {
                // Embed every representative once, up front, for pair comparisons.
                const dContent = safeContentForEmbedding(d.content);
                const dVec = await this.deps.embedder.embed(dContent);
                let matchedIndex = null;
                // Try match against existing representatives first.
                for (let i = 0; i < accepted.length; i++) {
                    // Cross-source focus: keep intra-source docs separate unless exact hash matched.
                    if (accepted[i].rep.source_type === d.source_type) {
                        suspiciousPrefilterRejected++;
                        continue;
                    }
                    if (!isSuspiciousPair(d, accepted[i].rep)) {
                        suspiciousPrefilterRejected++;
                        continue;
                    }
                    suspiciousPairsChecked++;
                    candidatesChecked++;
                    const sim = dotProduct(dVec, accepted[i].repVec);
                    if (sim >= threshold) {
                        // If identity keys disagree and similarity is only moderately high, be conservative.
                        const dk = identityKey(d);
                        const ak = identityKey(accepted[i].rep);
                        if (dk && ak && dk !== ak)
                            continue;
                        matchedIndex = i;
                        break;
                    }
                }
                if (matchedIndex === null) {
                    // New group: this doc becomes the representative.
                    accepted.push({ rep: d, repVec: dVec, members: [d] });
                }
                else {
                    accepted[matchedIndex].members.push(d);
                }
            }
            groups = accepted.map(a => a.members);
            // NOTE(review): the reduce below is just groups.length.
            duplicatesRemoved = inputDocuments - groups.reduce((sum, g) => sum + 1, 0);
            telemetry.fusion_dedup_semantic = {
                threshold,
                // NOTE(review): embedding_dim is a placeholder; the dimension is
                // never read off the embedder's vectors here.
                embedding_dim: undefined,
                candidates_checked: candidatesChecked,
                hash_groups: hashBuckets.size,
                suspicious_pairs_checked: suspiciousPairsChecked,
                suspicious_prefilter_rejected: suspiciousPrefilterRejected,
            };
        }
        // Non-semantic paths compute duplicates_removed from the group count.
        if (deduplication !== "semantic" || !shouldDedup) {
            duplicatesRemoved = inputDocuments - groups.length;
        }
        // 2) Merge strategy
        const results = [];
        for (const g of groups) {
            const chainEntries = g.flatMap(asChain);
            // union / dedup: one per group (best representative), always preserve provenance.
            const best = g.slice().sort((a, b) => (b.quality_score || 0) - (a.quality_score || 0))[0];
            results.push({
                ...best,
                source_chain: chainEntries,
                metadata_json: {
                    ...best.metadata_json,
                    // Record which docs were collapsed into this result.
                    fused_from: g.map(d => ({
                        source_type: d.source_type,
                        source_url: d.source_url,
                        quality_score: d.quality_score,
                    })),
                },
            });
        }
        const outputDocuments = results.length;
        return {
            input: { merge_strategy: mergeStrategy, deduplication, sources_requested: sourcesRequested },
            results,
            stats: {
                input_documents: inputDocuments,
                output_documents: outputDocuments,
                groups: groups.length,
                duplicates_removed: duplicatesRemoved,
            },
            telemetry,
        };
    }
    /**
     * List objects under a bucket/prefix and read likely-text ones into docs.
     * Listings and per-object text are cached for 6 hours when a cache dep
     * is available. Requires pass-through or environment AWS credentials.
     *
     * @param {object} spec - { bucket, path?, region?, max_results?, credentials? }.
     * @returns {Promise<{docs: Array<object>, cacheHit: boolean, latencyMs: number}>}
     * @throws {Error} when bucket is missing or no credentials are available.
     */
    async collectFromS3(spec) {
        const started = Date.now();
        const bucket = String(spec.bucket || "").trim();
        // Strip leading slashes: S3 prefixes are not rooted paths.
        const prefix = String(spec.path || "").trim().replace(/^\/+/, "");
        const region = String(spec.region || process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || "us-east-1");
        // Clamp to 1..50 objects per request.
        const max = Math.max(1, Math.min(50, Number(spec.max_results || 10)));
        if (!bucket)
            throw new Error("s3 source requires bucket");
        const hasPassThroughCreds = !!(spec.credentials?.accessKeyId && spec.credentials?.secretAccessKey) || !!spec.credentials?.roleArn;
        const hasEnvCreds = !!(process.env.AWS_ACCESS_KEY_ID && process.env.AWS_SECRET_ACCESS_KEY);
        if (!hasPassThroughCreds && !hasEnvCreds) {
            throw new Error("S3 fusion requires AWS credentials for bucket listing. Provide sources[].credentials (accessKeyId/secretAccessKey or roleArn) or set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY.");
        }
        // Cache the fully-built doc list keyed by listing parameters.
        const cacheKey = `vesper:fuse:s3:list:${bucket}:${prefix}:max=${max}:region=${region}`;
        const cached = await this.deps.cache?.getJson(cacheKey);
        if (cached) {
            return { docs: cached, cacheHit: true, latencyMs: Date.now() - started };
        }
        const client = this.makeS3Client(region, spec.credentials);
        const listed = [];
        let continuationToken = undefined;
        // Paginate ListObjectsV2 until we have `max` keys or the listing ends.
        while (listed.length < max) {
            const listResp = await client.send(new ListObjectsV2Command({
                Bucket: bucket,
                Prefix: prefix || undefined,
                MaxKeys: Math.min(1000, max - listed.length),
                ContinuationToken: continuationToken,
            }));
            const contents = listResp.Contents || [];
            for (const obj of contents) {
                if (!obj.Key)
                    continue;
                listed.push({
                    key: obj.Key,
                    size: obj.Size,
                    etag: obj.ETag,
                    last_modified: obj.LastModified ? obj.LastModified.toISOString() : undefined,
                });
                if (listed.length >= max)
                    break;
            }
            if (!listResp.IsTruncated || !listResp.NextContinuationToken)
                break;
            continuationToken = listResp.NextContinuationToken;
        }
        const docs = [];
        for (const obj of listed) {
            const key = obj.key;
            // Per-object text cache so re-listing doesn't re-download bodies.
            const objCacheKey = `vesper:fuse:s3:obj:${bucket}:${key}`;
            const cachedText = await this.deps.cache?.getJson(objCacheKey);
            let text = cachedText || "";
            if (!cachedText) {
                text = await this.getS3ObjectText(client, bucket, key);
                if (text) {
                    await this.deps.cache?.setJson(objCacheKey, text, 21600); // 6h
                }
            }
            // Skip non-text / unreadable / empty objects entirely.
            if (!text)
                continue;
            const truncated = text.slice(0, 50_000);
            const datePresent = !!obj.last_modified;
            // S3 objects carry no author metadata; score on length + recency only.
            const quality = estimateQualityScore({
                abstractLength: truncated.length,
                authorsPresent: false,
                datePresent,
                contentDepth: truncated.length,
            });
            const collectedAt = new Date().toISOString();
            docs.push({
                source_type: "s3",
                source_url: `s3://${bucket}/${key}`,
                content: truncated,
                metadata_json: {
                    bucket,
                    key,
                    size: obj.size,
                    etag: obj.etag,
                    last_modified: obj.last_modified,
                    prefix,
                    region,
                },
                quality_score: quality,
                collected_at: collectedAt,
                content_type: "text",
                source_chain: [
                    { source_type: "s3", source_url: `s3://${bucket}/${key}`, collected_at: collectedAt, quality_score: quality },
                ],
            });
        }
        await this.deps.cache?.setJson(cacheKey, docs, 21600); // 6h
        return { docs, cacheHit: false, latencyMs: Date.now() - started };
    }
    /**
     * Build an S3 client for the given region. Credential precedence:
     * roleArn (assumed via STS) > explicit access keys > environment.
     * @param {string} region - AWS region for the client.
     * @param {object|undefined} credentials - Optional pass-through credentials.
     * @returns {S3Client}
     */
    makeS3Client(region, credentials) {
        // Pass-through only: use explicit credentials if provided, otherwise env/instance.
        if (credentials?.roleArn) {
            return new S3Client({
                region,
                credentials: fromTemporaryCredentials({
                    params: { RoleArn: credentials.roleArn, RoleSessionName: "vesper-fuse" },
                    clientConfig: { region },
                }),
            });
        }
        if (credentials?.accessKeyId && credentials?.secretAccessKey) {
            return new S3Client({
                region,
                credentials: {
                    accessKeyId: credentials.accessKeyId,
                    secretAccessKey: credentials.secretAccessKey,
                    sessionToken: credentials.sessionToken,
                },
            });
        }
        return new S3Client({ region, credentials: fromEnv() });
    }
    /**
     * Download an object's body as UTF-8 text, capped at ~2MB. Returns ""
     * for extensions that don't look like text or when the body is empty.
     * @param {S3Client} client - Client from makeS3Client().
     * @param {string} bucket - Bucket name.
     * @param {string} key - Object key.
     * @returns {Promise<string>} Decoded text, possibly truncated mid-read at the byte cap.
     */
    async getS3ObjectText(client, bucket, key) {
        // Only attempt to read likely-text files.
        const lower = key.toLowerCase();
        const isLikelyText = /\.(txt|md|json|jsonl|csv|tsv|yaml|yml|log)$/i.test(lower);
        if (!isLikelyText)
            return "";
        const resp = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
        const body = resp.Body;
        if (!body)
            return "";
        const chunks = [];
        let total = 0;
        // Stream the body, stopping once we've buffered past the cap.
        for await (const chunk of body) {
            const buf = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
            chunks.push(buf);
            total += buf.length;
            if (total > 2_000_000)
                break; // cap read at ~2MB
        }
        const buffer = Buffer.concat(chunks);
        return buffer.toString("utf-8");
    }
}
476
/**
 * Dot product of two numeric vectors, truncated to the shorter length.
 * The embedder normalizes its vectors, so this approximates cosine
 * similarity for embedding comparisons.
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Sum of pairwise products over the common length.
 */
function dotProduct(a, b) {
    const commonLength = Math.min(a.length, b.length);
    let accumulator = 0;
    for (let index = 0; index < commonLength; index++) {
        accumulator += a[index] * b[index];
    }
    return accumulator;
}
@@ -0,0 +1 @@
1
// Intentionally empty export: marks this file as an ES module with no exports.
export {};