@vespermcp/mcp-server 1.2.22 → 1.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/cache/service.js +7 -0
- package/build/gateway/unified-dataset-gateway.js +34 -3
- package/build/index.js +238 -4
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +228 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +2 -1
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +483 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +6 -1
- package/scripts/wizard.cjs +61 -10
- package/scripts/wizard.js +34 -2
- package/wizard.cjs +1 -1
package/build/cache/service.js
CHANGED
|
@@ -27,6 +27,13 @@ export class CacheService {
|
|
|
27
27
|
constructor(provider) {
|
|
28
28
|
this.provider = provider;
|
|
29
29
|
}
|
|
30
|
+
async getJson(key) {
|
|
31
|
+
const data = await this.provider.get(key);
|
|
32
|
+
return data ? JSON.parse(data) : null;
|
|
33
|
+
}
|
|
34
|
+
async setJson(key, value, ttlSeconds) {
|
|
35
|
+
await this.provider.set(key, JSON.stringify(value), ttlSeconds);
|
|
36
|
+
}
|
|
30
37
|
/**
|
|
31
38
|
* Caches quality reports (TTL: 24h)
|
|
32
39
|
*/
|
|
@@ -57,6 +57,27 @@ export class UnifiedDatasetGateway {
|
|
|
57
57
|
? ["data.world is available through server-managed credentials."]
|
|
58
58
|
: ["data.world support exists, but no server-managed token is configured yet."],
|
|
59
59
|
},
|
|
60
|
+
{
|
|
61
|
+
source: "arxiv",
|
|
62
|
+
display_name: "ArXiv",
|
|
63
|
+
available: true,
|
|
64
|
+
auth_mode: "public",
|
|
65
|
+
supported_operations: ["discover", "info"],
|
|
66
|
+
requires_end_user_key: false,
|
|
67
|
+
notes: ["ArXiv papers are fetched through the official public API with no user key required."],
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
source: "github",
|
|
71
|
+
display_name: "GitHub",
|
|
72
|
+
available: true,
|
|
73
|
+
auth_mode: "public-or-server-managed",
|
|
74
|
+
supported_operations: ["discover", "info"],
|
|
75
|
+
requires_end_user_key: false,
|
|
76
|
+
notes: [
|
|
77
|
+
"Repository search is available without a user key, but unauthenticated requests are heavily rate limited.",
|
|
78
|
+
"Set GITHUB_TOKEN on the server for higher limits and reliability."
|
|
79
|
+
],
|
|
80
|
+
},
|
|
60
81
|
{
|
|
61
82
|
source: "s3",
|
|
62
83
|
display_name: "Amazon S3",
|
|
@@ -244,6 +265,10 @@ export class UnifiedDatasetGateway {
|
|
|
244
265
|
return await this.deps.kaggleSource.discover(query, limit);
|
|
245
266
|
case "dataworld":
|
|
246
267
|
return await this.deps.dataworldSource.discover(query, limit);
|
|
268
|
+
case "arxiv":
|
|
269
|
+
return await this.deps.arxivSource.discover(query, limit);
|
|
270
|
+
case "github":
|
|
271
|
+
return await this.deps.githubSource.discover(query, limit);
|
|
247
272
|
case "s3":
|
|
248
273
|
throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
|
|
249
274
|
case "bigquery":
|
|
@@ -265,7 +290,7 @@ export class UnifiedDatasetGateway {
|
|
|
265
290
|
}
|
|
266
291
|
return [source];
|
|
267
292
|
}
|
|
268
|
-
const providers = ["huggingface", "openml"];
|
|
293
|
+
const providers = ["arxiv", "huggingface", "openml"];
|
|
269
294
|
if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
|
|
270
295
|
providers.push("kaggle");
|
|
271
296
|
}
|
|
@@ -299,8 +324,14 @@ export class UnifiedDatasetGateway {
|
|
|
299
324
|
return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
|
|
300
325
|
if (/^dataworld:/i.test(trimmed))
|
|
301
326
|
return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
|
|
327
|
+
if (/^arxiv:/i.test(trimmed))
|
|
328
|
+
return { source: "arxiv", datasetId: trimmed.replace(/^arxiv:/i, "") };
|
|
329
|
+
if (/^github:/i.test(trimmed))
|
|
330
|
+
return { source: "github", datasetId: trimmed.replace(/^github:/i, "") };
|
|
302
331
|
if (/^bigquery:/i.test(trimmed))
|
|
303
332
|
return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
|
|
333
|
+
if (/^\d{4}\.\d{4,5}(v\d+)?$/i.test(trimmed))
|
|
334
|
+
return { source: "arxiv", datasetId: trimmed };
|
|
304
335
|
if (/^\d+$/.test(trimmed))
|
|
305
336
|
return { source: "openml", datasetId: trimmed };
|
|
306
337
|
if (trimmed.includes("/") && !trimmed.includes(":"))
|
|
@@ -316,7 +347,7 @@ export class UnifiedDatasetGateway {
|
|
|
316
347
|
lookupKnownDataset(datasetId) {
|
|
317
348
|
const candidates = new Set([
|
|
318
349
|
datasetId,
|
|
319
|
-
datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
|
|
350
|
+
datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, ""),
|
|
320
351
|
]);
|
|
321
352
|
for (const candidate of candidates) {
|
|
322
353
|
const dataset = this.deps.metadataStore.getDataset(candidate);
|
|
@@ -326,7 +357,7 @@ export class UnifiedDatasetGateway {
|
|
|
326
357
|
return undefined;
|
|
327
358
|
}
|
|
328
359
|
matchesDatasetReference(dataset, requested) {
|
|
329
|
-
const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
|
|
360
|
+
const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, "").toLowerCase();
|
|
330
361
|
const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
|
|
331
362
|
return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
|
|
332
363
|
}
|
package/build/index.js
CHANGED
|
@@ -248,7 +248,7 @@ export function hasStep(datasetId, step) {
|
|
|
248
248
|
// --- Dataset ID Auto-Detection ---
|
|
249
249
|
export function parseDatasetId(id) {
|
|
250
250
|
const trimmed = id.trim();
|
|
251
|
-
if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
|
|
251
|
+
if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
|
|
252
252
|
return trimmed;
|
|
253
253
|
if (trimmed.includes("/") && !trimmed.includes(":"))
|
|
254
254
|
return `kaggle:${trimmed}`;
|
|
@@ -270,7 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
|
270
270
|
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
271
271
|
import { OpenMLSource } from "./metadata/openml-source.js";
|
|
272
272
|
import { DataWorldSource } from "./metadata/dataworld-source.js";
|
|
273
|
+
import { ArxivSource } from "./metadata/arxiv-source.js";
|
|
274
|
+
import { GithubSource } from "./metadata/github-source.js";
|
|
273
275
|
import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
|
|
276
|
+
import { WebCoreEngine } from "./web/web-core.js";
|
|
277
|
+
import { WebFusionEngine } from "./web/fusion-engine.js";
|
|
278
|
+
import { WebExtractorEngine } from "./web/extract-web.js";
|
|
279
|
+
import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
|
|
280
|
+
import { HackerNewsSource } from "./metadata/hackernews-source.js";
|
|
274
281
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
275
282
|
import { JobManager } from "./jobs/manager.js";
|
|
276
283
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -648,7 +655,14 @@ const fusionEngine = new DataFusionEngine(__dirname);
|
|
|
648
655
|
const kaggleSource = new KaggleSource(__dirname);
|
|
649
656
|
const openmlSource = new OpenMLSource(__dirname);
|
|
650
657
|
const dataworldSource = new DataWorldSource(__dirname);
|
|
658
|
+
const arxivSource = new ArxivSource(cacheService);
|
|
659
|
+
const githubSource = new GithubSource(cacheService);
|
|
651
660
|
const secureKeys = new SecureKeysManager(__dirname);
|
|
661
|
+
const semanticScholarSource = new SemanticScholarSource(cacheService);
|
|
662
|
+
const hackerNewsSource = new HackerNewsSource(cacheService);
|
|
663
|
+
const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
|
|
664
|
+
const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
|
|
665
|
+
const webExtractorEngine = new WebExtractorEngine(cacheService);
|
|
652
666
|
function hydrateExternalKeys() {
|
|
653
667
|
const keys = secureKeys.getAll();
|
|
654
668
|
if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
|
|
@@ -674,6 +688,8 @@ const unifiedDatasetGateway = new UnifiedDatasetGateway({
|
|
|
674
688
|
kaggleSource,
|
|
675
689
|
openmlSource,
|
|
676
690
|
dataworldSource,
|
|
691
|
+
arxivSource,
|
|
692
|
+
githubSource,
|
|
677
693
|
hasDataWorldToken,
|
|
678
694
|
});
|
|
679
695
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
@@ -757,7 +773,7 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
|
757
773
|
let datasetIdForDownload = "";
|
|
758
774
|
let source;
|
|
759
775
|
const parsedQuery = parseDatasetId(query);
|
|
760
|
-
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
776
|
+
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
761
777
|
if (isExplicitDatasetRef) {
|
|
762
778
|
let explicitId = parsedQuery;
|
|
763
779
|
if (/^hf:/i.test(explicitId)) {
|
|
@@ -779,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
|
779
795
|
source = "dataworld";
|
|
780
796
|
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
781
797
|
}
|
|
798
|
+
else if (/^arxiv:/i.test(explicitId)) {
|
|
799
|
+
throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
|
|
800
|
+
}
|
|
801
|
+
else if (/^github:/i.test(explicitId)) {
|
|
802
|
+
throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
|
|
803
|
+
}
|
|
782
804
|
else {
|
|
783
805
|
// Default to HuggingFace for ambiguous refs (user/dataset without prefix)
|
|
784
806
|
source = "huggingface";
|
|
@@ -803,12 +825,22 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
|
803
825
|
const hasDwToken = hasDataWorldToken();
|
|
804
826
|
selectedDataset = results.find(r => {
|
|
805
827
|
const s = (r.source || "").toLowerCase();
|
|
828
|
+
if (s === "arxiv")
|
|
829
|
+
return false; // Phase 1: discover/info only, no direct download yet
|
|
830
|
+
if (s === "github")
|
|
831
|
+
return false; // Phase 1: discover/info only, no direct download yet
|
|
806
832
|
if (s === "kaggle" && !hasKaggleCreds)
|
|
807
833
|
return false;
|
|
808
834
|
if (s === "dataworld" && !hasDwToken)
|
|
809
835
|
return false;
|
|
810
836
|
return true;
|
|
811
837
|
}) || results[0]; // Fallback to first if all require credentials
|
|
838
|
+
if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
|
|
839
|
+
throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
|
|
840
|
+
}
|
|
841
|
+
if ((selectedDataset.source || "").toLowerCase() === "github") {
|
|
842
|
+
throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
|
|
843
|
+
}
|
|
812
844
|
datasetIdForDownload = selectedDataset.id;
|
|
813
845
|
source = selectedDataset.source;
|
|
814
846
|
update({
|
|
@@ -1103,7 +1135,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1103
1135
|
},
|
|
1104
1136
|
source: {
|
|
1105
1137
|
type: "string",
|
|
1106
|
-
enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
|
|
1138
|
+
enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
|
|
1107
1139
|
description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
|
|
1108
1140
|
},
|
|
1109
1141
|
query: {
|
|
@@ -1138,6 +1170,95 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1138
1170
|
required: ["operation"],
|
|
1139
1171
|
},
|
|
1140
1172
|
},
|
|
1173
|
+
{
|
|
1174
|
+
name: "vesper_web_find",
|
|
1175
|
+
description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
|
|
1176
|
+
inputSchema: {
|
|
1177
|
+
type: "object",
|
|
1178
|
+
properties: {
|
|
1179
|
+
query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
|
|
1180
|
+
sources: {
|
|
1181
|
+
type: "array",
|
|
1182
|
+
items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
|
|
1183
|
+
description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
|
|
1184
|
+
},
|
|
1185
|
+
limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
|
|
1186
|
+
arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
|
|
1187
|
+
github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
|
|
1188
|
+
},
|
|
1189
|
+
required: ["query"],
|
|
1190
|
+
},
|
|
1191
|
+
},
|
|
1192
|
+
{
|
|
1193
|
+
name: "vesper.fuse",
|
|
1194
|
+
description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
|
|
1195
|
+
inputSchema: {
|
|
1196
|
+
type: "object",
|
|
1197
|
+
properties: {
|
|
1198
|
+
sources: {
|
|
1199
|
+
type: "array",
|
|
1200
|
+
description: "Web sources to collect from, each with its own query.",
|
|
1201
|
+
items: {
|
|
1202
|
+
type: "object",
|
|
1203
|
+
properties: {
|
|
1204
|
+
type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
|
|
1205
|
+
query: { type: "string", description: "Query for this source." },
|
|
1206
|
+
max_results: { type: "number", description: "Max results for this source (optional)." },
|
|
1207
|
+
min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
|
|
1208
|
+
bucket: { type: "string", description: "S3 bucket (for type='s3')." },
|
|
1209
|
+
path: { type: "string", description: "S3 prefix/path (for type='s3')." },
|
|
1210
|
+
region: { type: "string", description: "AWS region (for type='s3')." },
|
|
1211
|
+
credentials: {
|
|
1212
|
+
type: "object",
|
|
1213
|
+
description: "Pass-through AWS credentials (optional; not persisted).",
|
|
1214
|
+
properties: {
|
|
1215
|
+
accessKeyId: { type: "string" },
|
|
1216
|
+
secretAccessKey: { type: "string" },
|
|
1217
|
+
sessionToken: { type: "string" },
|
|
1218
|
+
roleArn: { type: "string" },
|
|
1219
|
+
}
|
|
1220
|
+
},
|
|
1221
|
+
},
|
|
1222
|
+
required: ["type", "query"],
|
|
1223
|
+
},
|
|
1224
|
+
},
|
|
1225
|
+
merge_strategy: {
|
|
1226
|
+
type: "string",
|
|
1227
|
+
enum: ["union", "dedup"],
|
|
1228
|
+
description: "How to merge collected documents.",
|
|
1229
|
+
},
|
|
1230
|
+
deduplication: {
|
|
1231
|
+
type: "string",
|
|
1232
|
+
enum: ["semantic", "exact", "none"],
|
|
1233
|
+
description: "How to deduplicate across sources.",
|
|
1234
|
+
},
|
|
1235
|
+
},
|
|
1236
|
+
required: ["sources"],
|
|
1237
|
+
},
|
|
1238
|
+
},
|
|
1239
|
+
{
|
|
1240
|
+
name: "vesper.extract_web",
|
|
1241
|
+
description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
|
|
1242
|
+
inputSchema: {
|
|
1243
|
+
type: "object",
|
|
1244
|
+
properties: {
|
|
1245
|
+
url: { type: "string", description: "Target URL from approved whitelist domains." },
|
|
1246
|
+
mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
|
|
1247
|
+
strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
|
|
1248
|
+
schema: {
|
|
1249
|
+
type: "object",
|
|
1250
|
+
properties: {
|
|
1251
|
+
required_fields: {
|
|
1252
|
+
type: "array",
|
|
1253
|
+
items: { type: "string" },
|
|
1254
|
+
description: "Optional required top-level fields in extracted data payload."
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1258
|
+
},
|
|
1259
|
+
required: ["url"],
|
|
1260
|
+
},
|
|
1261
|
+
},
|
|
1141
1262
|
{
|
|
1142
1263
|
name: "discover_datasets",
|
|
1143
1264
|
description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
|
|
@@ -1150,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1150
1271
|
},
|
|
1151
1272
|
source: {
|
|
1152
1273
|
type: "string",
|
|
1153
|
-
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
1274
|
+
enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
|
|
1154
1275
|
description: "Data source to discover from.",
|
|
1155
1276
|
},
|
|
1156
1277
|
limit: {
|
|
@@ -1589,6 +1710,119 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1589
1710
|
markStepComplete(String(datasetId), String(step));
|
|
1590
1711
|
}
|
|
1591
1712
|
switch (request.params.name) {
|
|
1713
|
+
case "vesper_web_find": {
|
|
1714
|
+
hydrateExternalKeys();
|
|
1715
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1716
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
1717
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
1718
|
+
? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
|
|
1719
|
+
: undefined;
|
|
1720
|
+
try {
|
|
1721
|
+
const result = await webCoreEngine.find({
|
|
1722
|
+
query,
|
|
1723
|
+
sources: sources,
|
|
1724
|
+
limit,
|
|
1725
|
+
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
1726
|
+
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
1727
|
+
});
|
|
1728
|
+
return {
|
|
1729
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1730
|
+
};
|
|
1731
|
+
}
|
|
1732
|
+
catch (error) {
|
|
1733
|
+
return {
|
|
1734
|
+
content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
|
|
1735
|
+
isError: true,
|
|
1736
|
+
};
|
|
1737
|
+
}
|
|
1738
|
+
}
|
|
1739
|
+
case "vesper.fuse": {
|
|
1740
|
+
hydrateExternalKeys();
|
|
1741
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
1742
|
+
? request.params.arguments?.sources
|
|
1743
|
+
: undefined;
|
|
1744
|
+
if (!sources || !Array.isArray(sources)) {
|
|
1745
|
+
return {
|
|
1746
|
+
content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
|
|
1747
|
+
isError: true,
|
|
1748
|
+
};
|
|
1749
|
+
}
|
|
1750
|
+
try {
|
|
1751
|
+
const mergeStrategyRaw = request.params.arguments?.merge_strategy
|
|
1752
|
+
? String(request.params.arguments?.merge_strategy).toLowerCase()
|
|
1753
|
+
: undefined;
|
|
1754
|
+
const dedupRaw = request.params.arguments?.deduplication
|
|
1755
|
+
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
1756
|
+
: undefined;
|
|
1757
|
+
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
1758
|
+
? mergeStrategyRaw
|
|
1759
|
+
: undefined;
|
|
1760
|
+
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
1761
|
+
? dedupRaw
|
|
1762
|
+
: undefined;
|
|
1763
|
+
const result = await webFusionEngine.fuse({
|
|
1764
|
+
sources: sources.map((s) => ({
|
|
1765
|
+
type: String(s?.type || "").trim().toLowerCase(),
|
|
1766
|
+
query: String(s?.query || "").trim(),
|
|
1767
|
+
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
1768
|
+
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
1769
|
+
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
1770
|
+
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
1771
|
+
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
1772
|
+
credentials: s?.credentials ? {
|
|
1773
|
+
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
1774
|
+
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
1775
|
+
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
1776
|
+
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
1777
|
+
} : undefined,
|
|
1778
|
+
})),
|
|
1779
|
+
merge_strategy,
|
|
1780
|
+
deduplication,
|
|
1781
|
+
});
|
|
1782
|
+
return {
|
|
1783
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1784
|
+
};
|
|
1785
|
+
}
|
|
1786
|
+
catch (error) {
|
|
1787
|
+
return {
|
|
1788
|
+
content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
|
|
1789
|
+
isError: true,
|
|
1790
|
+
};
|
|
1791
|
+
}
|
|
1792
|
+
}
|
|
1793
|
+
case "vesper.extract_web": {
|
|
1794
|
+
hydrateExternalKeys();
|
|
1795
|
+
const url = String(request.params.arguments?.url || "").trim();
|
|
1796
|
+
const mode = request.params.arguments?.mode
|
|
1797
|
+
? String(request.params.arguments?.mode).trim().toLowerCase()
|
|
1798
|
+
: "auto";
|
|
1799
|
+
const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
|
|
1800
|
+
? request.params.arguments.schema
|
|
1801
|
+
: undefined;
|
|
1802
|
+
if (!url) {
|
|
1803
|
+
return {
|
|
1804
|
+
content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
|
|
1805
|
+
isError: true,
|
|
1806
|
+
};
|
|
1807
|
+
}
|
|
1808
|
+
try {
|
|
1809
|
+
const out = await webExtractorEngine.extract({
|
|
1810
|
+
url,
|
|
1811
|
+
mode: mode,
|
|
1812
|
+
strict_schema: request.params.arguments?.strict_schema !== false,
|
|
1813
|
+
schema: schema,
|
|
1814
|
+
});
|
|
1815
|
+
return {
|
|
1816
|
+
content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
|
|
1817
|
+
};
|
|
1818
|
+
}
|
|
1819
|
+
catch (error) {
|
|
1820
|
+
return {
|
|
1821
|
+
content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
|
|
1822
|
+
isError: true,
|
|
1823
|
+
};
|
|
1824
|
+
}
|
|
1825
|
+
}
|
|
1592
1826
|
case "unified_dataset_api": {
|
|
1593
1827
|
hydrateExternalKeys();
|
|
1594
1828
|
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
|
+
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
|
+
import { estimateQualityScore } from "./quality.js";
|
|
4
|
+
/**
 * Connector that searches the arXiv Atom API and maps each returned paper onto
 * the dataset-metadata record shape produced by `toDatasetMetadata`.
 *
 * Search results are cached through the injected cache (when one is given) and
 * outbound search calls are guarded by a circuit breaker.
 */
export class ArxivSource {
    /** Optional cache exposing `getJson(key)` and `setJson(key, value, ttlSeconds)`. */
    cache;
    /** arXiv query endpoint used for searches. */
    baseUrl = "http://export.arxiv.org/api/query";
    /** Circuit breaker consulted before each search request. */
    breaker = new CircuitBreaker("arxiv", {
        failureThreshold: 5,
        openDurationMs: 30_000,
        halfOpenSuccessesToClose: 2,
    });

    /**
     * @param {object} [cache] - Optional JSON cache; when omitted, caching is skipped.
     */
    constructor(cache) {
        this.cache = cache;
    }

    /**
     * Searches arXiv and returns only the result records (abstracts only, no PDF text).
     *
     * @param {string} query - Free-text search query.
     * @param {number} [limit=20] - Maximum number of papers (clamped to 1..100).
     * @returns {Promise<object[]>} Dataset-metadata records, one per paper.
     */
    async discover(query, limit = 20) {
        const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
        return out.results;
    }

    /**
     * Searches arXiv and returns the records together with timing/cache telemetry.
     *
     * @param {string} query - Free-text search query; blank queries return no results.
     * @param {number} [limit=20] - Maximum number of papers (clamped to 1..100;
     *   a non-numeric value falls back to 20).
     * @param {{ full_text?: boolean }} [input] - When `full_text` is exactly `true`,
     *   each paper's PDF is fetched and its text attached (truncated to 50,000 chars).
     * @returns {Promise<{results: object[], cacheHit: boolean, latencyMs: number, pdf_extract_ms_total?: number}>}
     * @throws {Error} When the circuit breaker is open, the request fails, or the
     *   API answers with an HTTP error status.
     */
    async discoverWithTelemetry(query, limit = 20, input = {}) {
        const start = Date.now();
        const cleanQuery = String(query || "").trim();
        if (!cleanQuery) {
            return { results: [], cacheHit: false, latencyMs: Date.now() - start };
        }
        const fullText = input?.full_text === true;
        // A non-numeric limit would otherwise end up as "max_results=NaN" in the URL.
        const requestedLimit = Number(limit || 20);
        const maxResults = Number.isNaN(requestedLimit)
            ? 20
            : Math.max(1, Math.min(100, Math.trunc(requestedLimit)));
        const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
        const cached = await this.cache?.getJson(cacheKey);
        if (cached) {
            return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
        }
        if (!this.breaker.canAttempt()) {
            throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
        }
        const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
        let xml;
        try {
            const response = await rateLimitedFetch(url, {
                headers: {
                    "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
                }
            }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 });
            // Without this check an HTTP error page parses as "zero entries" and the
            // empty list would be cached for 24h.
            if (response.ok === false) {
                throw new Error(`ArXiv API request failed with HTTP ${response.status}`);
            }
            xml = await response.text();
        }
        catch (error) {
            this.breaker.onFailure();
            throw error;
        }
        const entries = this.parseEntries(xml);
        let pdfExtractMsTotal = 0;
        const results = [];
        for (const entry of entries) {
            if (!fullText) {
                results.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
                continue;
            }
            const pdfStart = Date.now();
            // Best effort: a paper whose PDF cannot be fetched or parsed is still
            // returned, just without full text.
            const pdfText = await this.extractPdfText(entry.id).catch(() => "");
            pdfExtractMsTotal += Date.now() - pdfStart;
            const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
            results.push(this.toDatasetMetadata(entry, {
                webcore_content: truncated,
                contentDepth: truncated ? truncated.length : entry.summary.length,
            }));
        }
        this.breaker.onSuccess();
        await this.cache?.setJson(cacheKey, results, 86400); // 24h
        return { results, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
    }

    /**
     * Extracts paper entries from an arXiv Atom feed using regular expressions.
     * Entries lacking an id or a title are dropped.
     *
     * @param {string} xml - Raw Atom XML.
     * @returns {Array<{id: string, title: string, summary: string, updated: string, published: string, authors: string[], categories: string[], pdfUrl: string}>}
     */
    parseEntries(xml) {
        const entries = [];
        const entryBlocks = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
        for (const block of entryBlocks) {
            const idUrl = this.extractTag(block, "id");
            const title = this.decodeXml(this.extractTag(block, "title"));
            if (!idUrl || !title) {
                continue;
            }
            const summary = this.decodeXml(this.extractTag(block, "summary"));
            entries.push({
                id: this.extractArxivId(idUrl),
                // Feed text is hard-wrapped; collapse runs of whitespace.
                title: title.replace(/\s+/g, " ").trim(),
                summary: summary.replace(/\s+/g, " ").trim(),
                updated: this.extractTag(block, "updated"),
                published: this.extractTag(block, "published"),
                authors: this.extractAllTags(block, "name").map((v) => this.decodeXml(v)),
                categories: this.extractAllCategoryTerms(block),
                // Fall back to deriving the PDF link from the abstract URL.
                pdfUrl: this.extractPdfUrl(block) || idUrl.replace("/abs/", "/pdf/"),
            });
        }
        return entries;
    }

    /**
     * Converts a parsed entry into a dataset-metadata record.
     *
     * @param {object} entry - Entry as produced by `parseEntries`.
     * @param {{ contentDepth?: number, webcore_content?: string }} [input] - Optional
     *   content length hint and extracted PDF text to attach.
     * @returns {object} Metadata record with `source: "arxiv"`.
     */
    toDatasetMetadata(entry, input = {}) {
        const description = entry.summary || entry.title;
        const publishedAt = entry.published || entry.updated || new Date().toISOString();
        const qualityWarnings = [];
        if (description.length < 120) {
            qualityWarnings.push("Short abstract may reduce extraction confidence");
        }
        const abstractLength = description.length;
        const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
        const datePresent = !!(entry.published || entry.updated);
        const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
        // NOTE(review): presumably returns a score in 0..1 (it is scaled by 100
        // below) - confirm against quality.js.
        const quality01 = estimateQualityScore({
            abstractLength,
            authorsPresent,
            datePresent,
            contentDepth,
        });
        return {
            id: entry.id,
            source: "arxiv",
            name: entry.title,
            description,
            authors: entry.authors,
            downloads: 0,
            likes: 0,
            stars: 0,
            tags: entry.categories,
            last_updated: entry.updated || publishedAt,
            task: "research-paper",
            languages: [],
            domain: "research",
            splits: [],
            license: {
                id: "unknown",
                category: "unknown",
                usage_restrictions: [],
                warnings: [],
            },
            quality_score: Math.round(quality01 * 100),
            quality_warnings: qualityWarnings,
            download_url: entry.pdfUrl,
            format: "PDF",
            total_examples: 1,
            total_size_bytes: undefined,
            total_size_mb: undefined,
            columns: [
                { name: "title", type: "string" },
                { name: "abstract", type: "string" },
                { name: "authors", type: "string[]" },
                { name: "categories", type: "string[]" },
                { name: "published_at", type: "datetime" },
                { name: "source_url", type: "string" },
            ],
            is_structured: true,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: false,
            metadata_url: `https://arxiv.org/abs/${entry.id}`,
            ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
        };
    }

    /**
     * @param {string} text - Text to cap.
     * @returns {string} At most the first 50,000 characters of `text`.
     */
    truncateTo50k(text) {
        return String(text || "").slice(0, 50_000);
    }

    /**
     * Downloads a paper's PDF from arxiv.org and extracts its text.
     *
     * @param {string} arxivId - arXiv identifier, e.g. "2401.01234v2".
     * @returns {Promise<string>} Extracted text, capped at 200,000 characters.
     */
    async extractPdfText(arxivId) {
        // Lazy-load heavy dependency only when enabled.
        const pdfParseMod = await import("pdf-parse");
        const pdfParse = pdfParseMod.default || pdfParseMod;
        const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
        const response = await rateLimitedFetch(pdfUrl, {
            headers: {
                "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
            }
        }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
        const buffer = Buffer.from(await response.arrayBuffer());
        const parsed = await pdfParse(buffer);
        // Soft cap against pathological PDFs; the caller truncates further.
        return String(parsed?.text || "").slice(0, 200_000);
    }

    /**
     * @param {string} xml - XML fragment to search.
     * @param {string} tagName - Attribute-less tag name to look for.
     * @returns {string} Trimmed inner text of the first such tag, or "".
     */
    extractTag(xml, tagName) {
        const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
        return (m?.[1] || "").trim();
    }

    /**
     * @param {string} xml - XML fragment to search.
     * @param {string} tagName - Attribute-less tag name to look for.
     * @returns {string[]} Trimmed inner text of every such tag, in document order.
     */
    extractAllTags(xml, tagName) {
        const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
        return Array.from(xml.matchAll(rgx), (m) => (m[1] || "").trim());
    }

    /**
     * @param {string} xml - XML fragment to search.
     * @returns {string[]} Distinct `term` attribute values of `<category>` tags.
     */
    extractAllCategoryTerms(xml) {
        const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
        const terms = Array.from(xml.matchAll(rgx), (m) => (m[1] || "").trim());
        return Array.from(new Set(terms));
    }

    /**
     * @param {string} xml - XML fragment to search.
     * @returns {string} `href` of the first `<link title="pdf" ...>` tag, or "".
     */
    extractPdfUrl(xml) {
        const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
        return (m?.[1] || "").trim();
    }

    /**
     * Pulls the arXiv identifier out of an abstract URL.
     *
     * @param {string} idUrl - e.g. "http://arxiv.org/abs/2401.01234v2".
     * @returns {string} Everything after "/abs/" up to a query or fragment, or the
     *   trimmed input when it contains no "/abs/" segment.
     */
    extractArxivId(idUrl) {
        const cleaned = idUrl.trim();
        // Old-style identifiers contain a slash ("hep-th/9901001v1"), so only a
        // query string or fragment may end the id.
        const match = cleaned.match(/\/abs\/([^?#]+)/i);
        return match?.[1] || cleaned;
    }

    /**
     * Decodes the predefined XML entities (lt, gt, amp, quot, apos) plus the
     * numeric apostrophe reference.
     *
     * @param {string} input - Text possibly containing XML entities.
     * @returns {string} Text with those entities replaced by their characters.
     */
    decodeXml(input) {
        const entityChars = { lt: "<", gt: ">", quot: "\"", apos: "'", "#39": "'", amp: "&" };
        // One pass, so output of one replacement is never decoded a second time.
        return String(input ?? "").replace(/&(lt|gt|quot|apos|#39|amp);/g, (_, name) => entityChars[name]);
    }
}
|