@vespermcp/mcp-server 1.2.21 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cache/service.js +7 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +441 -0
- package/build/index.js +1815 -839
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +203 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/scraper.js +85 -14
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +45 -6
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +457 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +12 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +652 -0
- package/scripts/wizard.js +338 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
|
@@ -46,6 +46,9 @@ export class DataIngestor {
|
|
|
46
46
|
getKaggleCredentialError() {
|
|
47
47
|
return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
|
|
48
48
|
}
|
|
49
|
+
toSafeDatasetPath(datasetId) {
|
|
50
|
+
return datasetId.replace(/[:\/]/g, "_");
|
|
51
|
+
}
|
|
49
52
|
/**
|
|
50
53
|
* Ensures a dataset is available locally
|
|
51
54
|
*/
|
|
@@ -115,7 +118,7 @@ export class DataIngestor {
|
|
|
115
118
|
this.failDownload(datasetId, errorMsg);
|
|
116
119
|
throw new Error(errorMsg);
|
|
117
120
|
}
|
|
118
|
-
const targetDir = path.join(this.rawDataDir,
|
|
121
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
119
122
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
120
123
|
try {
|
|
121
124
|
onProgress?.("Downloading from Kaggle...");
|
|
@@ -131,7 +134,7 @@ export class DataIngestor {
|
|
|
131
134
|
}
|
|
132
135
|
}
|
|
133
136
|
else if (source === "openml") {
|
|
134
|
-
const targetDir = path.join(this.rawDataDir,
|
|
137
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
135
138
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
136
139
|
try {
|
|
137
140
|
onProgress?.("Downloading from OpenML...");
|
|
@@ -147,7 +150,7 @@ export class DataIngestor {
|
|
|
147
150
|
}
|
|
148
151
|
}
|
|
149
152
|
else if (source === "dataworld") {
|
|
150
|
-
const targetDir = path.join(this.rawDataDir,
|
|
153
|
+
const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
|
|
151
154
|
this.store.registerDownload(datasetId, targetDir, "downloading");
|
|
152
155
|
try {
|
|
153
156
|
onProgress?.("Downloading from data.world...");
|
|
@@ -181,7 +184,7 @@ export class DataIngestor {
|
|
|
181
184
|
* Generates a safe local filename for a dataset ID
|
|
182
185
|
*/
|
|
183
186
|
getTargetPath(datasetId, extension = "parquet") {
|
|
184
|
-
const safeId =
|
|
187
|
+
const safeId = this.toSafeDatasetPath(datasetId);
|
|
185
188
|
return path.join(this.rawDataDir, `${safeId}.${extension}`);
|
|
186
189
|
}
|
|
187
190
|
/**
|
|
@@ -18,12 +18,15 @@ export class InstallService {
|
|
|
18
18
|
throw new Error(`Source file not found for installation: ${sourcePath}`);
|
|
19
19
|
}
|
|
20
20
|
const dataset = this.metadataStore.getDataset(datasetId);
|
|
21
|
-
if (!dataset) {
|
|
22
|
-
throw new Error(`Dataset metadata not found for ${datasetId}`);
|
|
23
|
-
}
|
|
24
21
|
// Create target directory
|
|
25
|
-
const
|
|
26
|
-
const
|
|
22
|
+
const installLabel = dataset?.name || datasetId;
|
|
23
|
+
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
+
// If caller specified a target dir, use it directly
|
|
25
|
+
// Otherwise use the current working directory
|
|
26
|
+
const installDir = targetDir
|
|
27
|
+
? path.resolve(targetDir)
|
|
28
|
+
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
+
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
27
30
|
if (!fs.existsSync(installDir)) {
|
|
28
31
|
fs.mkdirSync(installDir, { recursive: true });
|
|
29
32
|
}
|
|
@@ -34,7 +37,9 @@ export class InstallService {
|
|
|
34
37
|
fs.copyFileSync(sourcePath, targetPath);
|
|
35
38
|
// Update metadata
|
|
36
39
|
const absolutePath = path.resolve(targetPath);
|
|
37
|
-
|
|
40
|
+
if (dataset) {
|
|
41
|
+
this.metadataStore.updateInstallPath(datasetId, absolutePath);
|
|
42
|
+
}
|
|
38
43
|
console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
|
|
39
44
|
return absolutePath;
|
|
40
45
|
}
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
|
+
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
|
+
import { estimateQualityScore } from "./quality.js";
|
|
4
|
+
/**
 * Discovery connector for the arXiv Atom API (Phase 1 WebCore).
 *
 * Search results are cached for 24h through the injected cache, requests are
 * guarded by a CircuitBreaker, and each Atom entry is mapped into the shared
 * dataset-metadata shape. When `full_text` is requested, the PDF is fetched
 * and its extracted text is attached as `webcore_content` (capped at 50k chars).
 */
export class ArxivSource {
  // Optional JSON cache with getJson/setJson — every use is `this.cache?.`.
  cache;
  baseUrl = "http://export.arxiv.org/api/query";
  // Trips after 5 consecutive failures, stays open 30s, closes after 2
  // successful half-open probes.
  breaker = new CircuitBreaker("arxiv", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  constructor(cache) {
    this.cache = cache;
  }

  /** Convenience wrapper: metadata-only discovery, returns just the results array. */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { full_text: false });
    return out.results;
  }

  /**
   * Searches arXiv and returns { results, cacheHit, latencyMs[, pdf_extract_ms_total] }.
   * @param {string} query free-text query; blank queries short-circuit to [].
   * @param {number} limit clamped to [1, 100].
   * @param {{full_text?: boolean}} input when true, PDF text is extracted per entry.
   * @throws when the circuit is open, or the HTTP fetch ultimately fails.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start };
    }
    const fullText = input.full_text === true;
    const maxResults = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:arxiv:discover:${cleanQuery.toLowerCase()}:limit=${maxResults}:full_text=${fullText ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("ArXiv connector is temporarily unavailable (circuit open).");
    }
    const url = `${this.baseUrl}?search_query=all:${encodeURIComponent(cleanQuery)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
    const response = await rateLimitedFetch(url, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-connector)"
      }
    }, { maxRetries: 5, initialDelay: 1000, maxDelay: 15000 }).catch((e) => {
      // Feed the breaker only on the outer fetch failure; rethrow unchanged.
      this.breaker.onFailure();
      throw e;
    });
    const xml = await response.text();
    const entries = this.parseEntries(xml);
    let pdfExtractMsTotal = 0;
    const result = [];
    for (const entry of entries) {
      if (fullText) {
        const pdfStart = Date.now();
        // PDF extraction is best-effort: failures degrade to abstract-only.
        const pdfText = await this.extractPdfText(entry.id).catch(() => "");
        pdfExtractMsTotal += Date.now() - pdfStart;
        const truncated = pdfText ? this.truncateTo50k(pdfText) : undefined;
        result.push(this.toDatasetMetadata(entry, {
          webcore_content: truncated,
          contentDepth: truncated ? truncated.length : entry.summary.length,
        }));
      }
      else {
        result.push(this.toDatasetMetadata(entry, { contentDepth: entry.summary.length }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 86400); // 24h
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, pdf_extract_ms_total: pdfExtractMsTotal };
  }

  /**
   * Lightweight regex-based Atom parsing (not a full XML parser); entries
   * missing an id or title are skipped.
   */
  parseEntries(xml) {
    const entries = [];
    const entryMatches = xml.match(/<entry>([\s\S]*?)<\/entry>/g) || [];
    for (const block of entryMatches) {
      const idUrl = this.extractTag(block, "id");
      const title = this.decodeXml(this.extractTag(block, "title"));
      const summary = this.decodeXml(this.extractTag(block, "summary"));
      const updated = this.extractTag(block, "updated");
      const published = this.extractTag(block, "published");
      // Prefer the explicit pdf <link>; fall back to rewriting the abs URL.
      const pdfUrl = this.extractPdfUrl(block) || (idUrl ? idUrl.replace("/abs/", "/pdf/") : "");
      const authors = this.extractAllTags(block, "name").map((v) => this.decodeXml(v));
      const categories = this.extractAllCategoryTerms(block);
      if (!idUrl || !title)
        continue;
      const shortId = this.extractArxivId(idUrl);
      entries.push({
        id: shortId,
        title: title.replace(/\s+/g, " ").trim(),
        summary: summary.replace(/\s+/g, " ").trim(),
        updated,
        published,
        authors,
        categories,
        pdfUrl,
      });
    }
    return entries;
  }

  /** Maps a parsed Atom entry to the shared dataset-metadata shape. */
  toDatasetMetadata(entry, input) {
    const description = entry.summary || entry.title;
    const publishedAt = entry.published || entry.updated || new Date().toISOString();
    const qualityWarnings = [];
    if (description.length < 120) {
      qualityWarnings.push("Short abstract may reduce extraction confidence");
    }
    const abstractLength = description.length;
    const authorsPresent = Array.isArray(entry.authors) && entry.authors.length > 0;
    const datePresent = !!(entry.published || entry.updated);
    const contentDepth = Math.max(abstractLength, input.contentDepth || abstractLength);
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: entry.id,
      source: "arxiv",
      name: entry.title,
      description,
      authors: entry.authors,
      downloads: 0,
      likes: 0,
      stars: 0,
      tags: entry.categories,
      last_updated: entry.updated || publishedAt,
      task: "research-paper",
      languages: [],
      domain: "research",
      splits: [],
      license: {
        id: "unknown",
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: entry.pdfUrl,
      format: "PDF",
      total_examples: 1,
      total_size_bytes: undefined,
      total_size_mb: undefined,
      columns: [
        { name: "title", type: "string" },
        { name: "abstract", type: "string" },
        { name: "authors", type: "string[]" },
        { name: "categories", type: "string[]" },
        { name: "published_at", type: "datetime" },
        { name: "source_url", type: "string" },
      ],
      is_structured: true,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      has_readme: false,
      metadata_url: `https://arxiv.org/abs/${entry.id}`,
      ...(input.webcore_content ? { webcore_content: input.webcore_content, webcore_content_kind: "pdf_text" } : {}),
    };
  }

  /** Caps free text at 50k chars; null/undefined become "". */
  truncateTo50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /**
   * Downloads and extracts text from an arXiv PDF.
   * @param {string} arxivId short id such as "2101.00001v2".
   * @returns {Promise<string>} extracted text, hard-capped at 200k chars.
   */
  async extractPdfText(arxivId) {
    // Lazy-load heavy dependency only when enabled.
    const pdfParseMod = await import("pdf-parse");
    const pdfParse = pdfParseMod.default || pdfParseMod;
    const pdfUrl = `https://arxiv.org/pdf/${arxivId}.pdf`;
    const response = await rateLimitedFetch(pdfUrl, {
      headers: {
        "User-Agent": "vesper/2.0 (phase1-arxiv-pdf-extract)"
      }
    }, { maxRetries: 3, initialDelay: 1000, maxDelay: 8000 });
    const arrayBuf = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuf);
    const parsed = await pdfParse(buffer);
    const text = String(parsed?.text || "");
    // Soft truncate; later caller truncates too.
    if (text.length > 200_000) {
      // Avoid pathological PDFs.
      return text.slice(0, 200_000);
    }
    return text;
  }

  /** First occurrence of <tagName>…</tagName>, trimmed; "" when absent. */
  extractTag(xml, tagName) {
    const m = xml.match(new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "i"));
    return (m?.[1] || "").trim();
  }

  /** All occurrences of <tagName>…</tagName>, trimmed. */
  extractAllTags(xml, tagName) {
    const out = [];
    const rgx = new RegExp(`<${tagName}>([\\s\\S]*?)<\\/${tagName}>`, "gi");
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return out;
  }

  /** Unique `term` attributes of all <category> elements. */
  extractAllCategoryTerms(xml) {
    const out = [];
    const rgx = /<category[^>]*term="([^"]+)"[^>]*\/?>/gi;
    let m = null;
    while ((m = rgx.exec(xml)) !== null) {
      out.push((m[1] || "").trim());
    }
    return Array.from(new Set(out));
  }

  /** href of the <link title="pdf"> element, or "". */
  extractPdfUrl(xml) {
    const m = xml.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"[^>]*\/?>/i);
    return (m?.[1] || "").trim();
  }

  /** "http://arxiv.org/abs/XXXX" -> "XXXX"; falls back to the raw input. */
  extractArxivId(idUrl) {
    const cleaned = idUrl.trim();
    const match = cleaned.match(/\/abs\/([^/?#]+)/i);
    return match?.[1] || cleaned;
  }

  /**
   * Decodes the predefined XML entities. "&amp;" is decoded LAST so a
   * double-escaped sequence like "&amp;lt;" yields the literal text "&lt;"
   * instead of being decoded twice into "<".
   */
  decodeXml(input) {
    return input
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, "\"")
      .replace(/&#39;/g, "'")
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&");
  }
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Minimal three-state circuit breaker: "closed" -> "open" after
 * `failureThreshold` consecutive failures, "open" -> "half_open" once
 * `openDurationMs` elapses, and "half_open" -> "closed" after
 * `halfOpenSuccessesToClose` successes (a half-open failure re-trips).
 */
export class CircuitBreaker {
  name;
  options;
  state = "closed";
  consecutiveFailures = 0;
  openUntilMs = 0;
  halfOpenSuccesses = 0;

  constructor(name, options) {
    this.name = name;
    this.options = options;
  }

  /** Whether a call may be attempted; transitions open -> half_open on expiry. */
  canAttempt() {
    if (this.state !== "open") {
      // "closed" and "half_open" both allow attempts.
      return true;
    }
    if (Date.now() < this.openUntilMs) {
      return false;
    }
    // Cooldown elapsed: allow a probe request.
    this.state = "half_open";
    this.halfOpenSuccesses = 0;
    return true;
  }

  /** Records a success; may close a half-open circuit. */
  onSuccess() {
    if (this.state !== "half_open") {
      // Normal path: reset the failure streak and stay/return to closed.
      this.consecutiveFailures = 0;
      this.state = "closed";
      return;
    }
    this.halfOpenSuccesses += 1;
    if (this.halfOpenSuccesses >= this.options.halfOpenSuccessesToClose) {
      this.state = "closed";
      this.consecutiveFailures = 0;
      this.openUntilMs = 0;
    }
  }

  /** Records a failure; a half-open failure trips immediately. */
  onFailure() {
    if (this.state === "half_open") {
      this.trip();
      return;
    }
    this.consecutiveFailures += 1;
    if (this.consecutiveFailures >= this.options.failureThreshold) {
      this.trip();
    }
  }

  /** Opens the circuit for `openDurationMs` and resets the counters. */
  trip() {
    this.state = "open";
    this.openUntilMs = Date.now() + this.options.openDurationMs;
    this.consecutiveFailures = 0;
    this.halfOpenSuccesses = 0;
    console.error(`[CircuitBreaker] Opened: ${this.name} (until ${new Date(this.openUntilMs).toISOString()})`);
  }

  /** Snapshot for telemetry; includes `open_until` only while open. */
  getStatus() {
    const status = { name: this.name, state: this.state };
    if (this.state === "open") {
      status.open_until = new Date(this.openUntilMs).toISOString();
    }
    return status;
  }
}
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import { rateLimitedFetch } from "./rate-limiter.js";
|
|
2
|
+
import { CircuitBreaker } from "./circuit-breaker.js";
|
|
3
|
+
import { estimateQualityScore } from "./quality.js";
|
|
4
|
+
/**
 * Discovery connector for the GitHub repository search API (Phase 1 WebCore).
 *
 * Search results are cached for 6h through the injected cache, requests are
 * guarded by a CircuitBreaker, and repositories are mapped into the shared
 * dataset-metadata shape. When `include_readme` is requested, READMEs are
 * fetched (for at most the first 5 repos) and attached as `webcore_content`.
 */
export class GithubSource {
  // Optional JSON cache with getJson/setJson — every use is `this.cache?.`.
  cache;
  // Trips after 5 consecutive failures, stays open 30s, closes after 2
  // successful half-open probes.
  breaker = new CircuitBreaker("github", {
    failureThreshold: 5,
    openDurationMs: 30_000,
    halfOpenSuccessesToClose: 2,
  });

  constructor(cache) {
    this.cache = cache;
  }

  /** Convenience wrapper: metadata-only discovery, returns just the results array. */
  async discover(query, limit = 20) {
    const out = await this.discoverWithTelemetry(query, limit, { include_readme: false });
    return out.results;
  }

  /**
   * Searches GitHub repositories (sorted by stars) and returns
   * { results, cacheHit, latencyMs[, readme_fetch_ms_total] }.
   * @param {string} query free-text query; blank queries short-circuit to [].
   * @param {number} limit clamped to [1, 100].
   * @param {{include_readme?: boolean}} input when true, fetch READMEs for up to 5 repos.
   * @throws when the circuit is open, on rate limiting (403), or on final fetch failure.
   */
  async discoverWithTelemetry(query, limit = 20, input = {}) {
    const start = Date.now();
    const cleanQuery = String(query || "").trim();
    if (!cleanQuery) {
      return { results: [], cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: 0 };
    }
    const includeReadme = input.include_readme === true;
    const perPage = Math.max(1, Math.min(100, Number(limit || 20)));
    const cacheKey = `webcore:github:discover:${cleanQuery.toLowerCase()}:per_page=${perPage}:readme=${includeReadme ? 1 : 0}`;
    const cached = await this.cache?.getJson(cacheKey);
    if (cached) {
      return { results: cached, cacheHit: true, latencyMs: Date.now() - start };
    }
    if (!this.breaker.canAttempt()) {
      throw new Error("GitHub connector is temporarily unavailable (circuit open).");
    }
    const refinedQuery = `${cleanQuery} in:name,description,readme`;
    const url = `https://api.github.com/search/repositories?q=${encodeURIComponent(refinedQuery)}&sort=stars&order=desc&per_page=${perPage}`;
    const headers = {
      "Accept": "application/vnd.github+json",
      "User-Agent": "vesper/2.0 (phase1-github-connector)",
      "X-GitHub-Api-Version": "2022-11-28",
    };
    const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
    if (token)
      headers["Authorization"] = `Bearer ${token}`;
    const response = await rateLimitedFetch(url, { headers }, { maxRetries: 5, initialDelay: 1000, maxDelay: 20000 })
      .catch((e) => {
        // GitHub uses 403 for rate limiting; treat as breaker-worthy failure
        this.breaker.onFailure();
        if (String(e?.message || "").includes("403")) {
          throw new Error("GitHub API rate limit exceeded (403). Set GITHUB_TOKEN for higher limits.");
        }
        throw e;
      });
    const data = (await response.json());
    const items = Array.isArray(data?.items) ? data.items : [];
    const repos = items.slice(0, perPage);
    let readmeFetchMsTotal = 0;
    const maxReadmes = includeReadme ? Math.min(5, repos.length) : 0;
    const result = [];
    for (let i = 0; i < repos.length; i++) {
      const repo = repos[i];
      if (includeReadme && i < maxReadmes) {
        const fullName = String(repo.full_name || repo.name || "").trim();
        const readmeKey = fullName ? `webcore:github:readme:${fullName}` : "webcore:github:readme:unknown";
        const cachedReadme = await this.cache?.getJson(readmeKey);
        if (cachedReadme) {
          result.push(this.toDatasetMetadata(repo, { readmeText: cachedReadme }));
          continue;
        }
        const t0 = Date.now();
        // README fetch is best-effort: failures degrade to description-only.
        const readmeText = await this.fetchReadme(repo).catch(() => undefined);
        readmeFetchMsTotal += Date.now() - t0;
        if (readmeText) {
          // NOTE(review): the cached value is the UNtruncated text while the
          // result uses the 50k-truncated form — confirm this is intentional.
          await this.cache?.setJson(readmeKey, readmeText, 21600); // 6h
        }
        result.push(this.toDatasetMetadata(repo, { readmeText: readmeText ? this.truncate50k(readmeText) : undefined }));
      }
      else {
        result.push(this.toDatasetMetadata(repo, { readmeText: undefined }));
      }
    }
    this.breaker.onSuccess();
    await this.cache?.setJson(cacheKey, result, 21600); // 6h
    return { results: result, cacheHit: false, latencyMs: Date.now() - start, readme_fetch_ms_total: readmeFetchMsTotal };
  }

  /** Maps a GitHub repository object to the shared dataset-metadata shape. */
  toDatasetMetadata(repo, input) {
    const fullName = String(repo.full_name || repo.name || "unknown").trim();
    const description = String(repo.description || "").trim() || "No description provided.";
    const ownerRepo = this.parseOwnerRepo(fullName);
    const owner = ownerRepo ? ownerRepo.split("/")[0] : "";
    const stars = Number(repo.stargazers_count || 0);
    const forks = Number(repo.forks_count || 0);
    const updatedAt = repo.updated_at || new Date().toISOString();
    const topics = Array.isArray(repo.topics) ? repo.topics.filter(Boolean).map(String) : [];
    const language = repo.language ? [String(repo.language)] : [];
    const licenseId = repo.license?.spdx_id && repo.license.spdx_id !== "NOASSERTION"
      ? String(repo.license.spdx_id)
      : "unknown";
    const licenseName = repo.license?.name ? String(repo.license.name) : undefined;
    const qualityWarnings = [];
    if (stars < 5)
      qualityWarnings.push("Low star count; may be low-signal");
    if (description.length < 80)
      qualityWarnings.push("Short description; relevance may be weaker");
    const abstractLength = input.readmeText ? input.readmeText.length : description.length;
    const authorsPresent = !!owner;
    const datePresent = !!updatedAt;
    const contentDepth = Math.max(abstractLength, input.readmeText ? input.readmeText.length : description.length);
    const quality01 = estimateQualityScore({
      abstractLength,
      authorsPresent,
      datePresent,
      contentDepth,
    });
    return {
      id: fullName,
      source: "github",
      name: fullName.split("/").pop() || fullName,
      description,
      ...(owner ? { authors: [owner] } : {}),
      // Heuristic popularity proxy: GitHub has no download counter for repos.
      downloads: forks * 10,
      likes: stars,
      stars,
      tags: topics,
      last_updated: updatedAt,
      task: "code",
      languages: language,
      domain: "research",
      splits: [],
      license: {
        id: licenseId,
        name: licenseName,
        category: "unknown",
        usage_restrictions: [],
        warnings: [],
      },
      quality_score: Math.round(quality01 * 100),
      quality_warnings: qualityWarnings,
      download_url: String(repo.html_url || `https://github.com/${fullName}`),
      format: "GIT",
      total_examples: 1,
      is_structured: false,
      has_target_column: false,
      is_safe_source: true,
      has_personal_data: false,
      is_paywalled: false,
      is_scraped_web_data: false,
      uses_https: true,
      has_train_split: false,
      has_test_split: false,
      has_validation_split: false,
      description_length: description.length,
      // Fixed: report README presence truthfully instead of hard-coded false,
      // so it agrees with the attached webcore_content below.
      has_readme: !!input.readmeText,
      metadata_url: String(repo.html_url || `https://github.com/${fullName}`),
      ...(input.readmeText
        ? { webcore_content: this.truncate50k(input.readmeText), webcore_content_kind: "readme_text" }
        : {}),
    };
  }

  /** Caps free text at 50k chars; null/undefined become "". */
  truncate50k(text) {
    return String(text || "").slice(0, 50_000);
  }

  /** Returns "owner/repo" when the input contains a slash, else null. */
  parseOwnerRepo(fullName) {
    const trimmed = String(fullName || "").trim();
    if (trimmed.includes("/"))
      return trimmed;
    return null;
  }

  /**
   * Fetches a repo's README.md from raw.githubusercontent.com, trying the
   * default branch then "main"/"master". Returns undefined when not found.
   * @throws on 403/429 (feeds the circuit breaker).
   */
  async fetchReadme(repo) {
    const fullName = String(repo.full_name || repo.name || "").trim();
    const ownerRepo = this.parseOwnerRepo(fullName);
    if (!ownerRepo)
      return undefined;
    const [owner, name] = ownerRepo.split("/");
    const candidates = [];
    if (repo.default_branch)
      candidates.push(String(repo.default_branch));
    candidates.push("main", "master");
    const uniq = Array.from(new Set(candidates.filter(Boolean)));
    const token = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
    const headers = {
      "User-Agent": "vesper/2.0 (phase1-github-readme)",
      "Accept": "text/plain",
    };
    if (token)
      headers["Authorization"] = `Bearer ${token}`;
    for (const branch of uniq) {
      const url = `https://raw.githubusercontent.com/${owner}/${name}/${branch}/README.md`;
      const res = await fetch(url, { headers }).catch(() => null);
      if (!res)
        continue;
      if (res.status === 404)
        continue;
      if (res.status === 429 || res.status === 403) {
        this.breaker.onFailure();
        throw new Error(`GitHub README fetch failed with status ${res.status}`);
      }
      if (!res.ok)
        continue;
      return await res.text();
    }
    return undefined;
  }
}
|