@vespermcp/mcp-server 1.2.22 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,6 +27,13 @@ export class CacheService {
27
27
  constructor(provider) {
28
28
  this.provider = provider;
29
29
  }
30
+ async getJson(key) {
31
+ const data = await this.provider.get(key);
32
+ return data ? JSON.parse(data) : null;
33
+ }
34
+ async setJson(key, value, ttlSeconds) {
35
+ await this.provider.set(key, JSON.stringify(value), ttlSeconds);
36
+ }
30
37
  /**
31
38
  * Caches quality reports (TTL: 24h)
32
39
  */
@@ -57,6 +57,27 @@ export class UnifiedDatasetGateway {
57
57
  ? ["data.world is available through server-managed credentials."]
58
58
  : ["data.world support exists, but no server-managed token is configured yet."],
59
59
  },
60
+ {
61
+ source: "arxiv",
62
+ display_name: "ArXiv",
63
+ available: true,
64
+ auth_mode: "public",
65
+ supported_operations: ["discover", "info"],
66
+ requires_end_user_key: false,
67
+ notes: ["ArXiv papers are fetched through the official public API with no user key required."],
68
+ },
69
+ {
70
+ source: "github",
71
+ display_name: "GitHub",
72
+ available: true,
73
+ auth_mode: "public-or-server-managed",
74
+ supported_operations: ["discover", "info"],
75
+ requires_end_user_key: false,
76
+ notes: [
77
+ "Repository search is available without a user key, but unauthenticated requests are heavily rate limited.",
78
+ "Set GITHUB_TOKEN on the server for higher limits and reliability."
79
+ ],
80
+ },
60
81
  {
61
82
  source: "s3",
62
83
  display_name: "Amazon S3",
@@ -244,6 +265,10 @@ export class UnifiedDatasetGateway {
244
265
  return await this.deps.kaggleSource.discover(query, limit);
245
266
  case "dataworld":
246
267
  return await this.deps.dataworldSource.discover(query, limit);
268
+ case "arxiv":
269
+ return await this.deps.arxivSource.discover(query, limit);
270
+ case "github":
271
+ return await this.deps.githubSource.discover(query, limit);
247
272
  case "s3":
248
273
  throw new Error("S3 does not support search/discovery in the unified gateway. Use a direct s3://bucket/key object reference.");
249
274
  case "bigquery":
@@ -265,7 +290,7 @@ export class UnifiedDatasetGateway {
265
290
  }
266
291
  return [source];
267
292
  }
268
- const providers = ["huggingface", "openml"];
293
+ const providers = ["arxiv", "huggingface", "openml"];
269
294
  if (!publicOnly && this.deps.dataIngestor.hasKaggleCredentials()) {
270
295
  providers.push("kaggle");
271
296
  }
@@ -299,8 +324,14 @@ export class UnifiedDatasetGateway {
299
324
  return { source: "openml", datasetId: trimmed.replace(/^openml:/i, "") };
300
325
  if (/^dataworld:/i.test(trimmed))
301
326
  return { source: "dataworld", datasetId: trimmed.replace(/^dataworld:/i, "") };
327
+ if (/^arxiv:/i.test(trimmed))
328
+ return { source: "arxiv", datasetId: trimmed.replace(/^arxiv:/i, "") };
329
+ if (/^github:/i.test(trimmed))
330
+ return { source: "github", datasetId: trimmed.replace(/^github:/i, "") };
302
331
  if (/^bigquery:/i.test(trimmed))
303
332
  return { source: "bigquery", datasetId: trimmed.replace(/^bigquery:/i, "") };
333
+ if (/^\d{4}\.\d{4,5}(v\d+)?$/i.test(trimmed))
334
+ return { source: "arxiv", datasetId: trimmed };
304
335
  if (/^\d+$/.test(trimmed))
305
336
  return { source: "openml", datasetId: trimmed };
306
337
  if (trimmed.includes("/") && !trimmed.includes(":"))
@@ -316,7 +347,7 @@ export class UnifiedDatasetGateway {
316
347
  lookupKnownDataset(datasetId) {
317
348
  const candidates = new Set([
318
349
  datasetId,
319
- datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, ""),
350
+ datasetId.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, ""),
320
351
  ]);
321
352
  for (const candidate of candidates) {
322
353
  const dataset = this.deps.metadataStore.getDataset(candidate);
@@ -326,7 +357,7 @@ export class UnifiedDatasetGateway {
326
357
  return undefined;
327
358
  }
328
359
  matchesDatasetReference(dataset, requested) {
329
- const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|bigquery):/i, "").toLowerCase();
360
+ const normalizedRequested = requested.replace(/^(huggingface|hf|kaggle|openml|dataworld|arxiv|github|bigquery):/i, "").toLowerCase();
330
361
  const fullId = `${dataset.source}:${dataset.id}`.toLowerCase();
331
362
  return dataset.id.toLowerCase() === normalizedRequested || fullId === requested.toLowerCase();
332
363
  }
package/build/index.js CHANGED
@@ -248,7 +248,7 @@ export function hasStep(datasetId, step) {
248
248
  // --- Dataset ID Auto-Detection ---
249
249
  export function parseDatasetId(id) {
250
250
  const trimmed = id.trim();
251
- if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
251
+ if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
252
252
  return trimmed;
253
253
  if (trimmed.includes("/") && !trimmed.includes(":"))
254
254
  return `kaggle:${trimmed}`;
@@ -270,7 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
270
270
  import { KaggleSource } from "./metadata/kaggle-source.js";
271
271
  import { OpenMLSource } from "./metadata/openml-source.js";
272
272
  import { DataWorldSource } from "./metadata/dataworld-source.js";
273
+ import { ArxivSource } from "./metadata/arxiv-source.js";
274
+ import { GithubSource } from "./metadata/github-source.js";
273
275
  import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
276
+ import { WebCoreEngine } from "./web/web-core.js";
277
+ import { WebFusionEngine } from "./web/fusion-engine.js";
278
+ import { WebExtractorEngine } from "./web/extract-web.js";
279
+ import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
280
+ import { HackerNewsSource } from "./metadata/hackernews-source.js";
274
281
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
275
282
  import { JobManager } from "./jobs/manager.js";
276
283
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -648,7 +655,14 @@ const fusionEngine = new DataFusionEngine(__dirname);
648
655
  const kaggleSource = new KaggleSource(__dirname);
649
656
  const openmlSource = new OpenMLSource(__dirname);
650
657
  const dataworldSource = new DataWorldSource(__dirname);
658
+ const arxivSource = new ArxivSource(cacheService);
659
+ const githubSource = new GithubSource(cacheService);
651
660
  const secureKeys = new SecureKeysManager(__dirname);
661
+ const semanticScholarSource = new SemanticScholarSource(cacheService);
662
+ const hackerNewsSource = new HackerNewsSource(cacheService);
663
+ const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
664
+ const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
665
+ const webExtractorEngine = new WebExtractorEngine(cacheService);
652
666
  function hydrateExternalKeys() {
653
667
  const keys = secureKeys.getAll();
654
668
  if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
@@ -674,6 +688,8 @@ const unifiedDatasetGateway = new UnifiedDatasetGateway({
674
688
  kaggleSource,
675
689
  openmlSource,
676
690
  dataworldSource,
691
+ arxivSource,
692
+ githubSource,
677
693
  hasDataWorldToken,
678
694
  });
679
695
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
@@ -757,7 +773,7 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
757
773
  let datasetIdForDownload = "";
758
774
  let source;
759
775
  const parsedQuery = parseDatasetId(query);
760
- const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
776
+ const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
761
777
  if (isExplicitDatasetRef) {
762
778
  let explicitId = parsedQuery;
763
779
  if (/^hf:/i.test(explicitId)) {
@@ -779,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
779
795
  source = "dataworld";
780
796
  datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
781
797
  }
798
+ else if (/^arxiv:/i.test(explicitId)) {
799
+ throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
800
+ }
801
+ else if (/^github:/i.test(explicitId)) {
802
+ throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
803
+ }
782
804
  else {
783
805
  // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
784
806
  source = "huggingface";
@@ -803,12 +825,22 @@ async function handlePrepareJob(jobId, query, requirements, outputDir) {
803
825
  const hasDwToken = hasDataWorldToken();
804
826
  selectedDataset = results.find(r => {
805
827
  const s = (r.source || "").toLowerCase();
828
+ if (s === "arxiv")
829
+ return false; // Phase 1: discover/info only, no direct download yet
830
+ if (s === "github")
831
+ return false; // Phase 1: discover/info only, no direct download yet
806
832
  if (s === "kaggle" && !hasKaggleCreds)
807
833
  return false;
808
834
  if (s === "dataworld" && !hasDwToken)
809
835
  return false;
810
836
  return true;
811
837
  }) || results[0]; // Fallback to first if all require credentials
838
+ if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
839
+ throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
840
+ }
841
+ if ((selectedDataset.source || "").toLowerCase() === "github") {
842
+ throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
843
+ }
812
844
  datasetIdForDownload = selectedDataset.id;
813
845
  source = selectedDataset.source;
814
846
  update({
@@ -1103,7 +1135,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1103
1135
  },
1104
1136
  source: {
1105
1137
  type: "string",
1106
- enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
1138
+ enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
1107
1139
  description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
1108
1140
  },
1109
1141
  query: {
@@ -1138,6 +1170,95 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1138
1170
  required: ["operation"],
1139
1171
  },
1140
1172
  },
1173
+ {
1174
+ name: "vesper_web_find",
1175
+ description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
1176
+ inputSchema: {
1177
+ type: "object",
1178
+ properties: {
1179
+ query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
1180
+ sources: {
1181
+ type: "array",
1182
+ items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
1183
+ description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
1184
+ },
1185
+ limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
1186
+ arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
1187
+ github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
1188
+ },
1189
+ required: ["query"],
1190
+ },
1191
+ },
1192
+ {
1193
+ name: "vesper.fuse",
1194
+ description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
1195
+ inputSchema: {
1196
+ type: "object",
1197
+ properties: {
1198
+ sources: {
1199
+ type: "array",
1200
+ description: "Web sources to collect from, each with its own query.",
1201
+ items: {
1202
+ type: "object",
1203
+ properties: {
1204
+ type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
1205
+ query: { type: "string", description: "Query for this source." },
1206
+ max_results: { type: "number", description: "Max results for this source (optional)." },
1207
+ min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
1208
+ bucket: { type: "string", description: "S3 bucket (for type='s3')." },
1209
+ path: { type: "string", description: "S3 prefix/path (for type='s3')." },
1210
+ region: { type: "string", description: "AWS region (for type='s3')." },
1211
+ credentials: {
1212
+ type: "object",
1213
+ description: "Pass-through AWS credentials (optional; not persisted).",
1214
+ properties: {
1215
+ accessKeyId: { type: "string" },
1216
+ secretAccessKey: { type: "string" },
1217
+ sessionToken: { type: "string" },
1218
+ roleArn: { type: "string" },
1219
+ }
1220
+ },
1221
+ },
1222
+ required: ["type", "query"],
1223
+ },
1224
+ },
1225
+ merge_strategy: {
1226
+ type: "string",
1227
+ enum: ["union", "dedup"],
1228
+ description: "How to merge collected documents.",
1229
+ },
1230
+ deduplication: {
1231
+ type: "string",
1232
+ enum: ["semantic", "exact", "none"],
1233
+ description: "How to deduplicate across sources.",
1234
+ },
1235
+ },
1236
+ required: ["sources"],
1237
+ },
1238
+ },
1239
+ {
1240
+ name: "vesper.extract_web",
1241
+ description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
1242
+ inputSchema: {
1243
+ type: "object",
1244
+ properties: {
1245
+ url: { type: "string", description: "Target URL from approved whitelist domains." },
1246
+ mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
1247
+ strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
1248
+ schema: {
1249
+ type: "object",
1250
+ properties: {
1251
+ required_fields: {
1252
+ type: "array",
1253
+ items: { type: "string" },
1254
+ description: "Optional required top-level fields in extracted data payload."
1255
+ }
1256
+ }
1257
+ }
1258
+ },
1259
+ required: ["url"],
1260
+ },
1261
+ },
1141
1262
  {
1142
1263
  name: "discover_datasets",
1143
1264
  description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
@@ -1150,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1150
1271
  },
1151
1272
  source: {
1152
1273
  type: "string",
1153
- enum: ["huggingface", "kaggle", "openml", "dataworld"],
1274
+ enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
1154
1275
  description: "Data source to discover from.",
1155
1276
  },
1156
1277
  limit: {
@@ -1589,6 +1710,119 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1589
1710
  markStepComplete(String(datasetId), String(step));
1590
1711
  }
1591
1712
  switch (request.params.name) {
1713
+ case "vesper_web_find": {
1714
+ hydrateExternalKeys();
1715
+ const query = String(request.params.arguments?.query || "").trim();
1716
+ const limit = Number(request.params.arguments?.limit || 10);
1717
+ const sources = Array.isArray(request.params.arguments?.sources)
1718
+ ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
1719
+ : undefined;
1720
+ try {
1721
+ const result = await webCoreEngine.find({
1722
+ query,
1723
+ sources: sources,
1724
+ limit,
1725
+ arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
1726
+ github_include_readme: request.params.arguments?.github_include_readme === true,
1727
+ });
1728
+ return {
1729
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1730
+ };
1731
+ }
1732
+ catch (error) {
1733
+ return {
1734
+ content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
1735
+ isError: true,
1736
+ };
1737
+ }
1738
+ }
1739
+ case "vesper.fuse": {
1740
+ hydrateExternalKeys();
1741
+ const sources = Array.isArray(request.params.arguments?.sources)
1742
+ ? request.params.arguments?.sources
1743
+ : undefined;
1744
+ if (!sources || !Array.isArray(sources)) {
1745
+ return {
1746
+ content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
1747
+ isError: true,
1748
+ };
1749
+ }
1750
+ try {
1751
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
1752
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
1753
+ : undefined;
1754
+ const dedupRaw = request.params.arguments?.deduplication
1755
+ ? String(request.params.arguments?.deduplication).toLowerCase()
1756
+ : undefined;
1757
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
1758
+ ? mergeStrategyRaw
1759
+ : undefined;
1760
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
1761
+ ? dedupRaw
1762
+ : undefined;
1763
+ const result = await webFusionEngine.fuse({
1764
+ sources: sources.map((s) => ({
1765
+ type: String(s?.type || "").trim().toLowerCase(),
1766
+ query: String(s?.query || "").trim(),
1767
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
1768
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
1769
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
1770
+ path: s?.path !== undefined ? String(s.path) : undefined,
1771
+ region: s?.region !== undefined ? String(s.region) : undefined,
1772
+ credentials: s?.credentials ? {
1773
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
1774
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
1775
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
1776
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
1777
+ } : undefined,
1778
+ })),
1779
+ merge_strategy,
1780
+ deduplication,
1781
+ });
1782
+ return {
1783
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1784
+ };
1785
+ }
1786
+ catch (error) {
1787
+ return {
1788
+ content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
1789
+ isError: true,
1790
+ };
1791
+ }
1792
+ }
1793
+ case "vesper.extract_web": {
1794
+ hydrateExternalKeys();
1795
+ const url = String(request.params.arguments?.url || "").trim();
1796
+ const mode = request.params.arguments?.mode
1797
+ ? String(request.params.arguments?.mode).trim().toLowerCase()
1798
+ : "auto";
1799
+ const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
1800
+ ? request.params.arguments.schema
1801
+ : undefined;
1802
+ if (!url) {
1803
+ return {
1804
+ content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
1805
+ isError: true,
1806
+ };
1807
+ }
1808
+ try {
1809
+ const out = await webExtractorEngine.extract({
1810
+ url,
1811
+ mode: mode,
1812
+ strict_schema: request.params.arguments?.strict_schema !== false,
1813
+ schema: schema,
1814
+ });
1815
+ return {
1816
+ content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
1817
+ };
1818
+ }
1819
+ catch (error) {
1820
+ return {
1821
+ content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
1822
+ isError: true,
1823
+ };
1824
+ }
1825
+ }
1592
1826
  case "unified_dataset_api": {
1593
1827
  hydrateExternalKeys();
1594
1828
  const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
@@ -0,0 +1,229 @@
1
+ import { rateLimitedFetch } from "./rate-limiter.js";
2
+ import { CircuitBreaker } from "./circuit-breaker.js";
3
+ import { estimateQualityScore } from "./quality.js";
4
export class ArxivSource {
  // Optional JSON cache (CacheService-style: getJson/setJson); may be undefined.
  cache;
  // Official ArXiv Atom API endpoint.
  // NOTE(review): served over plain HTTP here; export.arxiv.org also answers
  // on HTTPS — confirm before switching.
  baseUrl = "http://export.arxiv.org/api/query";
  // Trips open after 5 consecutive failures; stays open 30s, then needs
  // 2 half-open successes to close again.
  breaker = new CircuitBreaker("arxiv", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  /** @param {object} cache - Optional JSON cache with getJson/setJson. */
  constructor(cache) {
    this.cache = cache;
  }

  /**
   * Searches ArXiv and returns dataset-shaped metadata records.
   * @param {string} query - Free-text search query.
   * @param {number} [limit=20] - Max results (clamped to 1..100 downstream).
   * @returns {Promise<object[]>} Metadata records (see toDatasetMetadata).
   */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
    return out.results;
  }

  /**
   * Same as discover() but also reports cache-hit and latency telemetry.
   * Results are cached for 24h keyed by query, limit and full-text flag.
   * @param {string} query - Free-text search query.
   * @param {number} [limit=20] - Max results (clamped to 1..100).
   * @param {{full_text?: boolean}} [input] - When full_text is true, each
   *   paper's PDF is fetched and parsed (slow, best-effort).
   * @returns {Promise<{results: object[], cacheHit: boolean, latencyMs: number,
   *   pdf_extract_ms_total?: number}>}
   * @throws {Error} When the circuit breaker is open or the search fetch fails.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start };
    }
    const fullText = input.full_text === true;
    const maxResults = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
    }
    const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const response = await rateLimitedFetch(url, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
      }
    }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 }).catch((e) => {
      // Count the failure toward the breaker before propagating.
      this.breaker.onFailure();
      throw e;
    });
    const xml = await response.text();
    const entries = this.parseEntries(xml);
    let pdfExtractMsTotal = 0;
    const result = [];
    for (const entry of entries) {
      if (fullText) {
        const pdfStart = Date.now();
        // Best-effort: a failed PDF download/parse degrades to abstract-only metadata.
        const pdfText = await this.extractPdfText(entry.id).catch(() => "");
        pdfExtractMsTotal += Date.now() - pdfStart;
        const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
        result.push(this.toDatasetMetadata(entry, {
          webcore_content: truncated,
          contentDepth: truncated ? truncated.length : entry.summary.length,
        }));
      }
      else {
        result.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 86400); // 24h TTL
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
  }

  /**
   * Parses ArXiv Atom XML into plain entry objects via lightweight regex
   * extraction (no XML dependency). Entries missing an id or title are skipped.
   * @param {string} xml - Raw Atom feed body.
   * @returns {Array<{id: string, title: string, summary: string, updated: string,
   *   published: string, authors: string[], categories: string[], pdfUrl: string}>}
   */
  parseEntries(xml) {
    const entries = [];
    const entryMatches = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
    for (const block of entryMatches) {
      const idUrl = this.extractTag(block, "id");
      const title = this.decodeXml(this.extractTag(block, "title"));
      const summary = this.decodeXml(this.extractTag(block, "summary"));
      const updated = this.extractTag(block, "updated");
      const published = this.extractTag(block, "published");
      // Prefer the explicit pdf <link>; fall back to rewriting the abs URL.
      const pdfUrl = this.extractPdfUrl(block) || (idUrl ? idUrl.replace("/abs/", "/pdf/") : "");
      const authors = this.extractAllTags(block, "name").map((v) => this.decodeXml(v));
      const categories = this.extractAllCategoryTerms(block);
      if (!idUrl || !title)
        continue;
      const shortId = this.extractArxivId(idUrl);
      entries.push({
        id: shortId,
        title: title.replace(/\s+/g, " ").trim(),
        summary: summary.replace(/\s+/g, " ").trim(),
        updated,
        published,
        authors,
        categories,
        pdfUrl,
      });
    }
    return entries;
  }

  /**
   * Maps a parsed ArXiv entry onto the gateway's unified dataset-metadata shape.
   * @param {object} entry - Entry from parseEntries().
   * @param {{webcore_content?: string, contentDepth?: number}} input - Optional
   *   full-text payload and its length for quality scoring.
   * @returns {object} Unified metadata record (source: "arxiv").
   */
  toDatasetMetadata(entry, input) {
    const description = entry.summary || entry.title;
    const publishedAt = entry.published || entry.updated || new Date().toISOString();
    const qualityWarnings = [];
    if (description.length < 120) {
      qualityWarnings.push("Short abstract may reduce extraction confidence");
    }
    const abstractLength = description.length;
    const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
    const datePresent = !!(entry.published || entry.updated);
    const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
    // estimateQualityScore returns a 0..1 score; surfaced as 0..100 below.
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: entry.id,
      source: "arxiv",
      name: entry.title,
      description,
      authors: entry.authors,
      downloads: 0,
      likes: 0,
      stars: 0,
      tags: entry.categories,
      last_updated: entry.updated || publishedAt,
      task: "research-paper",
      languages: [],
      domain: "research",
      splits: [],
      license: {
        // ArXiv licensing varies per paper; unknown until fetched individually.
        id: "unknown",
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: entry.pdfUrl,
      format: "PDF",
      total_examples: 1,
      total_size_bytes: undefined,
      total_size_mb: undefined,
      columns: [
        { name: "title", type: "string" },
        { name: "abstract", type: "string" },
        { name: "authors", type: "string[]" },
        { name: "categories", type: "string[]" },
        { name: "published_at", type: "datetime" },
        { name: "source_url", type: "string" },
      ],
      is_structured: true,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      has_readme: false,
      metadata_url: `https://arxiv.org/abs/${entry.id}`,
      ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
    };
  }

  /** Caps document content at 50k characters (coerces non-strings to ""). */
  truncateTo50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /**
   * Downloads an ArXiv PDF and extracts its text. The heavy pdf-parse
   * dependency is lazy-loaded so its cost is only paid when full-text
   * extraction is requested.
   * @param {string} arxivId - Short id, e.g. "2401.01234v2".
   * @returns {Promise<string>} Extracted text, hard-capped at 200k chars
   *   (callers truncate further to 50k).
   */
  async extractPdfText(arxivId) {
    const pdfParseMod = await import("pdf-parse");
    const pdfParse = pdfParseMod.default || pdfParseMod;
    const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
    const response = await rateLimitedFetch(pdfUrl, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
      }
    }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
    const arrayBuf = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuf);
    const parsed = await pdfParse(buffer);
    const text = String(parsed?.text || "");
    // Hard cap to avoid pathological PDFs.
    return text.length > 200_000 ? text.slice(0, 200_000) : text;
  }

  /** Returns the trimmed inner text of the first <tagName>…</tagName>, or "". */
  extractTag(xml, tagName) {
    const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
    return (m?.[1] || "").trim();
  }

  /** Returns the trimmed inner text of every <tagName>…</tagName> occurrence. */
  extractAllTags(xml, tagName) {
    const out = [];
    const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return out;
  }

  /** Collects unique term="…" values from <category> elements, in order. */
  extractAllCategoryTerms(xml) {
    const out = [];
    const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return Array.from(new Set(out));
  }

  /** Returns the href of the <link title="pdf"> element, or "". */
  extractPdfUrl(xml) {
    const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
    return (m?.[1] || "").trim();
  }

  /** Extracts the short id from an abs URL (e.g. ".../abs/2401.01234v2"). */
  extractArxivId(idUrl) {
    const cleaned = idUrl.trim();
    const match = cleaned.match(/\/abs\/([^/?#]+)/i);
    return match?.[1] || cleaned;
  }

  /**
   * Decodes the five predefined XML entities. `&amp;` is decoded LAST so
   * that double-encoded sequences such as `&amp;quot;` correctly yield the
   * literal text `&quot;` instead of being collapsed all the way to `"`.
   */
  decodeXml(input) {
    return input
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, "\"")
      .replace(/&#39;/g, "'")
      .replace(/&amp;/g, "&");
  }
}