@vespermcp/mcp-server 1.2.22 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,13 @@ export class CacheService {
27
27
  constructor(provider) {
28
28
  this.provider = provider;
29
29
  }
30
+ async getJson(key) {
31
+ const data = await this.provider.get(key);
32
+ return data ? JSON.parse(data) : null;
33
+ }
34
+ async setJson(key, value, ttlSeconds) {
35
+ await this.provider.set(key, JSON.stringify(value), ttlSeconds);
36
+ }
30
37
  /**
31
38
  * Caches quality reports (TTL: 24h)
32
39
  */
@@ -57,6 +57,27 @@ export class UnifiedDatasetGateway {
57
57
  ? ["data.world is available through server-managed credentials."]
58
58
  : ["data.world support exists, but no server-managed token is configured yet."],
59
59
  },
60
+ {
61
+ source: "arxiv",
62
+ display_name: "ArXiv",
63
+ available: true,
64
+ auth_mode: "public",
65
+ supported_operations: ["discover", "info"],
66
+ requires_end_user_key: false,
67
+ notes: ["ArXiv papers are fetched through the official public API with no user key required."],
68
+ },
69
+ {
70
+ source: "github",
71
+ display_name: "GitHub",
72
+ available: true,
73
+ auth_mode: "public-or-server-managed",
74
+ supported_operations: ["discover", "info"],
75
+ requires_end_user_key: false,
76
+ notes: [
77
+ "Repository search is available without a user key, but unauthenticated requests are heavily rate limited.",
78
+ "Set GITHUB_TOKEN on the server for higher limits and reliability."
79
+ ],
80
+ },
60
81
  {
61
82
  source: "s3",
62
83
  display_name: "Amazon S3",
@@ -244,6 +265,10 @@ export class UnifiedDatasetGateway {
244
265
  return await this.deps.kaggleSource.discover(query, limit);
245
266
  case "dataworld":
246
267
  return await this.deps.dataworldSource.discover(query, limit);
268
+ case "arxiv":
269
+ return await this.deps.arxivSource.discover(query, limit);
270
+ case "github":
271
+ return await this.deps.githubSource.discover(query, limit);
247
272
  case "s3":
248
273
  throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
249
274
  case "bigquery":
@@ -265,7 +290,7 @@ export class UnifiedDatasetGateway {
265
290
  }
266
291
  return [source];
267
292
  }
268
- const providers = ["huggingface", "openml"];
293
+ const providers = ["arxiv", "huggingface", "openml"];
269
294
  if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
270
295
  providers.push("kaggle");
271
296
  }
@@ -299,8 +324,14 @@ export class UnifiedDatasetGateway {
299
324
  return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
300
325
  if (/^dataworld:/i.test(trimmed))
301
326
  return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
327
+ if (/^arxiv:/i.test(trimmed))
328
+ return { source: "arxiv", datasetId: trimmed.replace(/^arxiv:/i, "") };
329
+ if (/^github:/i.test(trimmed))
330
+ return { source: "github", datasetId: trimmed.replace(/^github:/i, "") };
302
331
  if (/^bigquery:/i.test(trimmed))
303
332
  return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
333
+ if (/^\d{4}\.\d{4,5}(v\d+)?$/i.test(trimmed))
334
+ return { source: "arxiv", datasetId: trimmed };
304
335
  if (/^\d+$/.test(trimmed))
305
336
  return { source: "openml", datasetId: trimmed };
306
337
  if (trimmed.includes("/") && !trimmed.includes(":"))
@@ -316,7 +347,7 @@ export class UnifiedDatasetGateway {
316
347
  lookupKnownDataset(datasetId) {
317
348
  const candidates = new Set([
318
349
  datasetId,
319
- datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
350
+ datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, ""),
320
351
  ]);
321
352
  for (const candidate of candidates) {
322
353
  const dataset = this.deps.metadataStore.getDataset(candidate);
@@ -326,7 +357,7 @@ export class UnifiedDatasetGateway {
326
357
  return undefined;
327
358
  }
328
359
  matchesDatasetReference(dataset, requested) {
329
- const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
360
+ const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, "").toLowerCase();
330
361
  const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
331
362
  return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
332
363
  }
package/build/index.js CHANGED
@@ -248,7 +248,7 @@ export function hasStep(datasetId, step) {
248
248
  // --- Dataset ID Auto-Detection ---
249
249
  export function parseDatasetId(id) {
250
250
  const trimmed = id.trim();
251
- if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
251
+ if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
252
252
  return trimmed;
253
253
  if (trimmed.includes("/") && !trimmed.includes(":"))
254
254
  return `kaggle:${trimmed}`;
@@ -270,7 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
270
270
  import { KaggleSource } from "./metadata/kaggle-source.js";
271
271
  import { OpenMLSource } from "./metadata/openml-source.js";
272
272
  import { DataWorldSource } from "./metadata/dataworld-source.js";
273
+ import { ArxivSource } from "./metadata/arxiv-source.js";
274
+ import { GithubSource } from "./metadata/github-source.js";
273
275
  import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
276
+ import { WebCoreEngine } from "./web/web-core.js";
277
+ import { WebFusionEngine } from "./web/fusion-engine.js";
278
+ import { WebExtractorEngine } from "./web/extract-web.js";
279
+ import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
280
+ import { HackerNewsSource } from "./metadata/hackernews-source.js";
274
281
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
275
282
  import { JobManager } from "./jobs/manager.js";
276
283
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -648,7 +655,14 @@ const fusionEngine = new DataFusionEngine(__dirname);
648
655
  const kaggleSource = new KaggleSource(__dirname);
649
656
  const openmlSource = new OpenMLSource(__dirname);
650
657
  const dataworldSource = new DataWorldSource(__dirname);
658
+ const arxivSource = new ArxivSource(cacheService);
659
+ const githubSource = new GithubSource(cacheService);
651
660
  const secureKeys = new SecureKeysManager(__dirname);
661
+ const semanticScholarSource = new SemanticScholarSource(cacheService);
662
+ const hackerNewsSource = new HackerNewsSource(cacheService);
663
+ const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
664
+ const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
665
+ const webExtractorEngine = new WebExtractorEngine(cacheService);
652
666
  function hydrateExternalKeys() {
653
667
  const keys = secureKeys.getAll();
654
668
  if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
@@ -674,6 +688,8 @@ const unifiedDatasetGateway = new UnifiedDatasetGateway({
674
688
  kaggleSource,
675
689
  openmlSource,
676
690
  dataworldSource,
691
+ arxivSource,
692
+ githubSource,
677
693
  hasDataWorldToken,
678
694
  });
679
695
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
@@ -757,7 +773,7 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
757
773
  let datasetIdForDownload = "";
758
774
  let source;
759
775
  const parsedQuery = parseDatasetId(query);
760
- const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
776
+ const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
761
777
  if (isExplicitDatasetRef) {
762
778
  let explicitId = parsedQuery;
763
779
  if (/^hf:/i.test(explicitId)) {
@@ -779,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
779
795
  source = "dataworld";
780
796
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
781
797
  }
798
+ else if (/^arxiv:/i.test(explicitId)) {
799
+ throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
800
+ }
801
+ else if (/^github:/i.test(explicitId)) {
802
+ throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
803
+ }
782
804
  else {
783
805
  // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
784
806
  source = "huggingface";
@@ -803,12 +825,22 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
803
825
  const hasDwToken = hasDataWorldToken();
804
826
  selectedDataset = results.find(r => {
805
827
  const s = (r.source || "").toLowerCase();
828
+ if (s === "arxiv")
829
+ return false; // Phase 1: discover/info only, no direct download yet
830
+ if (s === "github")
831
+ return false; // Phase 1: discover/info only, no direct download yet
806
832
  if (s === "kaggle" && !hasKaggleCreds)
807
833
  return false;
808
834
  if (s === "dataworld" && !hasDwToken)
809
835
  return false;
810
836
  return true;
811
837
  }) || results[0]; // Fallback to first if all require credentials
838
+ if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
839
+ throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
840
+ }
841
+ if ((selectedDataset.source || "").toLowerCase() === "github") {
842
+ throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
843
+ }
812
844
  datasetIdForDownload = selectedDataset.id;
813
845
  source = selectedDataset.source;
814
846
  update({
@@ -1103,7 +1135,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1103
1135
  },
1104
1136
  source: {
1105
1137
  type: "string",
1106
- enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
1138
+ enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
1107
1139
  description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
1108
1140
  },
1109
1141
  query: {
@@ -1138,6 +1170,95 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1138
1170
  required: ["operation"],
1139
1171
  },
1140
1172
  },
1173
+ {
1174
+ name: "vesper_web_find",
1175
+ description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
1176
+ inputSchema: {
1177
+ type: "object",
1178
+ properties: {
1179
+ query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
1180
+ sources: {
1181
+ type: "array",
1182
+ items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
1183
+ description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
1184
+ },
1185
+ limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
1186
+ arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
1187
+ github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
1188
+ },
1189
+ required: ["query"],
1190
+ },
1191
+ },
1192
+ {
1193
+ name: "vesper.fuse",
1194
+ description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
1195
+ inputSchema: {
1196
+ type: "object",
1197
+ properties: {
1198
+ sources: {
1199
+ type: "array",
1200
+ description: "Web sources to collect from, each with its own query.",
1201
+ items: {
1202
+ type: "object",
1203
+ properties: {
1204
+ type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1205
+ query: { type: "string", description: "Query for this source." },
1206
+ max_results: { type: "number", description: "Max results for this source (optional)." },
1207
+ min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
1208
+ bucket: { type: "string", description: "S3 bucket (for type='s3')." },
1209
+ path: { type: "string", description: "S3 prefix/path (for type='s3')." },
1210
+ region: { type: "string", description: "AWS region (for type='s3')." },
1211
+ credentials: {
1212
+ type: "object",
1213
+ description: "Pass-through AWS credentials (optional; not persisted).",
1214
+ properties: {
1215
+ accessKeyId: { type: "string" },
1216
+ secretAccessKey: { type: "string" },
1217
+ sessionToken: { type: "string" },
1218
+ roleArn: { type: "string" },
1219
+ }
1220
+ },
1221
+ },
1222
+ required: ["type", "query"],
1223
+ },
1224
+ },
1225
+ merge_strategy: {
1226
+ type: "string",
1227
+ enum: ["union", "dedup"],
1228
+ description: "How to merge collected documents.",
1229
+ },
1230
+ deduplication: {
1231
+ type: "string",
1232
+ enum: ["semantic", "exact", "none"],
1233
+ description: "How to deduplicate across sources.",
1234
+ },
1235
+ },
1236
+ required: ["sources"],
1237
+ },
1238
+ },
1239
+ {
1240
+ name: "vesper.extract_web",
1241
+ description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
1242
+ inputSchema: {
1243
+ type: "object",
1244
+ properties: {
1245
+ url: { type: "string", description: "Target URL from approved whitelist domains." },
1246
+ mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
1247
+ strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
1248
+ schema: {
1249
+ type: "object",
1250
+ properties: {
1251
+ required_fields: {
1252
+ type: "array",
1253
+ items: { type: "string" },
1254
+ description: "Optional required top-level fields in extracted data payload."
1255
+ }
1256
+ }
1257
+ }
1258
+ },
1259
+ required: ["url"],
1260
+ },
1261
+ },
1141
1262
  {
1142
1263
  name: "discover_datasets",
1143
1264
  description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
@@ -1150,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1150
1271
  },
1151
1272
  source: {
1152
1273
  type: "string",
1153
- enum: ["huggingface", "kaggle", "openml", "dataworld"],
1274
+ enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
1154
1275
  description: "Data source to discover from.",
1155
1276
  },
1156
1277
  limit: {
@@ -1589,6 +1710,119 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1589
1710
  markStepComplete(String(datasetId), String(step));
1590
1711
  }
1591
1712
  switch (request.params.name) {
1713
+ case "vesper_web_find": {
1714
+ hydrateExternalKeys();
1715
+ const query = String(request.params.arguments?.query || "").trim();
1716
+ const limit = Number(request.params.arguments?.limit || 10);
1717
+ const sources = Array.isArray(request.params.arguments?.sources)
1718
+ ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
1719
+ : undefined;
1720
+ try {
1721
+ const result = await webCoreEngine.find({
1722
+ query,
1723
+ sources: sources,
1724
+ limit,
1725
+ arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
1726
+ github_include_readme: request.params.arguments?.github_include_readme === true,
1727
+ });
1728
+ return {
1729
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1730
+ };
1731
+ }
1732
+ catch (error) {
1733
+ return {
1734
+ content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
1735
+ isError: true,
1736
+ };
1737
+ }
1738
+ }
1739
+ case "vesper.fuse": {
1740
+ hydrateExternalKeys();
1741
+ const sources = Array.isArray(request.params.arguments?.sources)
1742
+ ? request.params.arguments?.sources
1743
+ : undefined;
1744
+ if (!sources || !Array.isArray(sources)) {
1745
+ return {
1746
+ content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
1747
+ isError: true,
1748
+ };
1749
+ }
1750
+ try {
1751
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
1752
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
1753
+ : undefined;
1754
+ const dedupRaw = request.params.arguments?.deduplication
1755
+ ? String(request.params.arguments?.deduplication).toLowerCase()
1756
+ : undefined;
1757
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
1758
+ ? mergeStrategyRaw
1759
+ : undefined;
1760
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
1761
+ ? dedupRaw
1762
+ : undefined;
1763
+ const result = await webFusionEngine.fuse({
1764
+ sources: sources.map((s) => ({
1765
+ type: String(s?.type || "").trim().toLowerCase(),
1766
+ query: String(s?.query || "").trim(),
1767
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
1768
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
1769
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
1770
+ path: s?.path !== undefined ? String(s.path) : undefined,
1771
+ region: s?.region !== undefined ? String(s.region) : undefined,
1772
+ credentials: s?.credentials ? {
1773
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
1774
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
1775
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
1776
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
1777
+ } : undefined,
1778
+ })),
1779
+ merge_strategy,
1780
+ deduplication,
1781
+ });
1782
+ return {
1783
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1784
+ };
1785
+ }
1786
+ catch (error) {
1787
+ return {
1788
+ content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
1789
+ isError: true,
1790
+ };
1791
+ }
1792
+ }
1793
+ case "vesper.extract_web": {
1794
+ hydrateExternalKeys();
1795
+ const url = String(request.params.arguments?.url || "").trim();
1796
+ const mode = request.params.arguments?.mode
1797
+ ? String(request.params.arguments?.mode).trim().toLowerCase()
1798
+ : "auto";
1799
+ const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
1800
+ ? request.params.arguments.schema
1801
+ : undefined;
1802
+ if (!url) {
1803
+ return {
1804
+ content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
1805
+ isError: true,
1806
+ };
1807
+ }
1808
+ try {
1809
+ const out = await webExtractorEngine.extract({
1810
+ url,
1811
+ mode: mode,
1812
+ strict_schema: request.params.arguments?.strict_schema !== false,
1813
+ schema: schema,
1814
+ });
1815
+ return {
1816
+ content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
1817
+ };
1818
+ }
1819
+ catch (error) {
1820
+ return {
1821
+ content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
1822
+ isError: true,
1823
+ };
1824
+ }
1825
+ }
1592
1826
  case "unified_dataset_api": {
1593
1827
  hydrateExternalKeys();
1594
1828
  const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
@@ -0,0 +1,229 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
export class ArxivSource {
  // Optional JSON cache (CacheService-style: getJson/setJson); may be undefined.
  cache;
  // Official ArXiv Atom API endpoint.
  // NOTE(review): served over plain HTTP here; export.arxiv.org also answers
  // on HTTPS — confirm before switching.
  baseUrl = "http://export.arxiv.org/api/query";
  // Trips open after 5 consecutive failures; stays open 30s, then needs
  // 2 half-open successes to close again.
  breaker = new CircuitBreaker("arxiv", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  /** @param {object} cache - Optional JSON cache with getJson/setJson. */
  constructor(cache) {
    this.cache = cache;
  }

  /**
   * Searches ArXiv and returns dataset-shaped metadata records.
   * @param {string} query - Free-text search query.
   * @param {number} [limit=20] - Max results (clamped to 1..100 downstream).
   * @returns {Promise<object[]>} Metadata records (see toDatasetMetadata).
   */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
    return out.results;
  }

  /**
   * Same as discover() but also reports cache-hit and latency telemetry.
   * Results are cached for 24h keyed by query, limit and full-text flag.
   * @param {string} query - Free-text search query.
   * @param {number} [limit=20] - Max results (clamped to 1..100).
   * @param {{full_text?: boolean}} [input] - When full_text is true, each
   *   paper's PDF is fetched and parsed (slow, best-effort).
   * @returns {Promise<{results: object[], cacheHit: boolean, latencyMs: number,
   *   pdf_extract_ms_total?: number}>}
   * @throws {Error} When the circuit breaker is open or the search fetch fails.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start };
    }
    const fullText = input.full_text === true;
    const maxResults = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
    }
    const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const response = await rateLimitedFetch(url, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
      }
    }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 }).catch((e) => {
      // Count the failure toward the breaker before propagating.
      this.breaker.onFailure();
      throw e;
    });
    const xml = await response.text();
    const entries = this.parseEntries(xml);
    let pdfExtractMsTotal = 0;
    const result = [];
    for (const entry of entries) {
      if (fullText) {
        const pdfStart = Date.now();
        // Best-effort: a failed PDF download/parse degrades to abstract-only metadata.
        const pdfText = await this.extractPdfText(entry.id).catch(() => "");
        pdfExtractMsTotal += Date.now() - pdfStart;
        const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
        result.push(this.toDatasetMetadata(entry, {
          webcore_content: truncated,
          contentDepth: truncated ? truncated.length : entry.summary.length,
        }));
      }
      else {
        result.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 86400); // 24h TTL
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
  }

  /**
   * Parses ArXiv Atom XML into plain entry objects via lightweight regex
   * extraction (no XML dependency). Entries missing an id or title are skipped.
   * @param {string} xml - Raw Atom feed body.
   * @returns {Array<{id: string, title: string, summary: string, updated: string,
   *   published: string, authors: string[], categories: string[], pdfUrl: string}>}
   */
  parseEntries(xml) {
    const entries = [];
    const entryMatches = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
    for (const block of entryMatches) {
      const idUrl = this.extractTag(block, "id");
      const title = this.decodeXml(this.extractTag(block, "title"));
      const summary = this.decodeXml(this.extractTag(block, "summary"));
      const updated = this.extractTag(block, "updated");
      const published = this.extractTag(block, "published");
      // Prefer the explicit pdf <link>; fall back to rewriting the abs URL.
      const pdfUrl = this.extractPdfUrl(block) || (idUrl ? idUrl.replace("/abs/", "/pdf/") : "");
      const authors = this.extractAllTags(block, "name").map((v) => this.decodeXml(v));
      const categories = this.extractAllCategoryTerms(block);
      if (!idUrl || !title)
        continue;
      const shortId = this.extractArxivId(idUrl);
      entries.push({
        id: shortId,
        title: title.replace(/\s+/g, " ").trim(),
        summary: summary.replace(/\s+/g, " ").trim(),
        updated,
        published,
        authors,
        categories,
        pdfUrl,
      });
    }
    return entries;
  }

  /**
   * Maps a parsed ArXiv entry onto the gateway's unified dataset-metadata shape.
   * @param {object} entry - Entry from parseEntries().
   * @param {{webcore_content?: string, contentDepth?: number}} input - Optional
   *   full-text payload and its length for quality scoring.
   * @returns {object} Unified metadata record (source: "arxiv").
   */
  toDatasetMetadata(entry, input) {
    const description = entry.summary || entry.title;
    const publishedAt = entry.published || entry.updated || new Date().toISOString();
    const qualityWarnings = [];
    if (description.length < 120) {
      qualityWarnings.push("Short abstract may reduce extraction confidence");
    }
    const abstractLength = description.length;
    const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
    const datePresent = !!(entry.published || entry.updated);
    const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
    // estimateQualityScore returns a 0..1 score; surfaced as 0..100 below.
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: entry.id,
      source: "arxiv",
      name: entry.title,
      description,
      authors: entry.authors,
      downloads: 0,
      likes: 0,
      stars: 0,
      tags: entry.categories,
      last_updated: entry.updated || publishedAt,
      task: "research-paper",
      languages: [],
      domain: "research",
      splits: [],
      license: {
        // ArXiv licensing varies per paper; unknown until fetched individually.
        id: "unknown",
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: entry.pdfUrl,
      format: "PDF",
      total_examples: 1,
      total_size_bytes: undefined,
      total_size_mb: undefined,
      columns: [
        { name: "title", type: "string" },
        { name: "abstract", type: "string" },
        { name: "authors", type: "string[]" },
        { name: "categories", type: "string[]" },
        { name: "published_at", type: "datetime" },
        { name: "source_url", type: "string" },
      ],
      is_structured: true,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      has_readme: false,
      metadata_url: `https://arxiv.org/abs/${entry.id}`,
      ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
    };
  }

  /** Caps document content at 50k characters (coerces non-strings to ""). */
  truncateTo50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /**
   * Downloads an ArXiv PDF and extracts its text. The heavy pdf-parse
   * dependency is lazy-loaded so its cost is only paid when full-text
   * extraction is requested.
   * @param {string} arxivId - Short id, e.g. "2401.01234v2".
   * @returns {Promise<string>} Extracted text, hard-capped at 200k chars
   *   (callers truncate further to 50k).
   */
  async extractPdfText(arxivId) {
    const pdfParseMod = await import("pdf-parse");
    const pdfParse = pdfParseMod.default || pdfParseMod;
    const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
    const response = await rateLimitedFetch(pdfUrl, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
      }
    }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
    const arrayBuf = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuf);
    const parsed = await pdfParse(buffer);
    const text = String(parsed?.text || "");
    // Hard cap to avoid pathological PDFs.
    return text.length > 200_000 ? text.slice(0, 200_000) : text;
  }

  /** Returns the trimmed inner text of the first <tagName>…</tagName>, or "". */
  extractTag(xml, tagName) {
    const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
    return (m?.[1] || "").trim();
  }

  /** Returns the trimmed inner text of every <tagName>…</tagName> occurrence. */
  extractAllTags(xml, tagName) {
    const out = [];
    const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return out;
  }

  /** Collects unique term="…" values from <category> elements, in order. */
  extractAllCategoryTerms(xml) {
    const out = [];
    const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return Array.from(new Set(out));
  }

  /** Returns the href of the <link title="pdf"> element, or "". */
  extractPdfUrl(xml) {
    const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
    return (m?.[1] || "").trim();
  }

  /** Extracts the short id from an abs URL (e.g. ".../abs/2401.01234v2"). */
  extractArxivId(idUrl) {
    const cleaned = idUrl.trim();
    const match = cleaned.match(/\/abs\/([^/?#]+)/i);
    return match?.[1] || cleaned;
  }

  /**
   * Decodes the five predefined XML entities. `&amp;` is decoded LAST so
   * that double-encoded sequences such as `&amp;quot;` correctly yield the
   * literal text `&quot;` instead of being collapsed all the way to `"`.
   */
  decodeXml(input) {
    return input
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, "\"")
      .replace(/&#39;/g, "'")
      .replace(/&amp;/g, "&");
  }
}