scholar-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import { basename, resolve } from 'node:path';
3
+ import { tmpdir } from 'node:os';
4
+ import { randomUUID } from 'node:crypto';
5
+ import { PDFParse } from 'pdf-parse';
6
+ import { IngestionError, DocumentNotFoundError, JobNotFoundError } from './errors.js';
7
+ import { makeStableId, nowIso, normalizeWhitespace, parseYear } from './utils.js';
8
// Matches a DOI such as "10.1234/abc-def"; case-insensitive so scraped text
// with upper- or lower-case suffixes is captured either way.
const DOI_REGEX = /10\.\d{4,9}\/[\-._;()/:A-Z0-9]+/i;
// Resolve a user-supplied path against the current working directory.
// `path.resolve` already returns absolute inputs as-is (on every platform,
// including Windows drive paths) and normalizes the result, so no manual
// leading-slash check is needed.
const toAbsolutePath = (value) => resolve(value);
// Split raw text into trimmed lines, accepting both LF and CRLF endings.
const splitLines = (text) => text.split(/\r?\n/).map((line) => line.trim());
// Heuristic: does this line look like a common scholarly-paper section
// heading ("Abstract", "Methods", "References", ...)?
const isLikelyHeading = (line) => /^(abstract|introduction|background|related work|method(?:s)?|materials|results|discussion|conclusion|limitations|references)\b/i.test(line.trim());
12
// Partition extracted text into heading-delimited sections. Lines that look
// like scholarly headings start a new section; everything else accumulates
// into the body of the current one (defaulting to "Body" before the first
// recognized heading). Sections whose normalized text is empty are dropped.
const splitIntoSections = (text) => {
  const lines = splitLines(text).filter((line) => line.length > 0);
  if (lines.length === 0) {
    return [];
  }
  const sections = [];
  let heading = 'Body';
  let buffer = [];
  // Flush the buffered lines as one section, skipping empty bodies.
  const flush = () => {
    const body = normalizeWhitespace(buffer.join(' '));
    if (body.length > 0) {
      sections.push({
        id: makeStableId([heading, body.slice(0, 120)], 'section'),
        heading,
        text: body,
        pageStart: null,
        pageEnd: null
      });
    }
  };
  for (const line of lines) {
    if (isLikelyHeading(line)) {
      // A new heading closes the current section (if it had any content)
      // and becomes the heading for subsequent lines.
      if (buffer.length > 0) {
        flush();
        buffer = [];
      }
      heading = line;
    } else {
      buffer.push(line);
    }
  }
  if (buffer.length > 0) {
    flush();
  }
  return sections;
};
51
// Pull candidate bibliography entries out of raw text. Prefers everything
// after an explicit "References" heading; otherwise falls back to the last
// 120 lines, where references usually live. Keeps at most 60 lines long
// enough (> 30 chars) to plausibly be citations.
const extractReferences = (text) => {
  const lines = splitLines(text);
  const headingAt = lines.findIndex((line) => /^references$/i.test(line));
  const candidates = headingAt >= 0 ? lines.slice(headingAt + 1) : lines.slice(-120);
  const entries = [];
  for (const line of candidates) {
    if (line.length <= 30) {
      continue; // too short to be a citation line
    }
    if (entries.length === 60) {
      break; // cap the number of extracted references
    }
    entries.push({
      rawText: line,
      doi: line.match(DOI_REGEX)?.[0]?.toLowerCase() ?? null,
      title: null,
      year: parseYear(line),
      authors: []
    });
  }
  return entries;
};
70
// Heuristic title/abstract recovery for plain-text extraction: the first
// non-empty line is taken as the title, and the abstract is a handful of
// lines starting at an "Abstract" marker (standalone word or prefix).
const extractTitleAndAbstract = (text) => {
  const lines = splitLines(text).filter((line) => line.length > 0);
  const title = lines[0] ?? null;
  const isAbstractMarker = (line) => /^abstract$/i.test(line) || /^abstract[:\s]/i.test(line);
  const markerAt = lines.findIndex(isAbstractMarker);
  // Includes the marker line itself plus up to five following lines.
  const abstract = markerAt >= 0
    ? normalizeWhitespace(lines.slice(markerAt, markerAt + 6).join(' '))
    : null;
  return { title, abstract };
};
83
// Convert a GROBID TEI-XML response into the internal parse-result shape
// using lightweight regex scraping (no XML parser dependency): main title,
// tag-stripped body text, heuristic sections, and up to 120 <biblStruct>
// reference entries.
const parseGrobidXml = (xml) => {
  // Drop all XML tags from a fragment and collapse the whitespace left behind.
  const stripTags = (fragment) => normalizeWhitespace(fragment.replace(/<[^>]+>/g, ' '));
  const rawTitle = xml.match(/<title[^>]*type="main"[^>]*>([\s\S]*?)<\/title>/i)?.[1] ?? null;
  const bodyXml = xml.match(/<body>([\s\S]*?)<\/body>/i)?.[1] ?? '';
  const text = stripTags(bodyXml);
  const references = [...xml.matchAll(/<biblStruct[\s\S]*?<\/biblStruct>/gim)]
    .slice(0, 120)
    .map(([entry]) => {
      const raw = stripTags(entry);
      const refTitle = entry.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1] ?? null;
      return {
        rawText: raw,
        doi: entry.match(DOI_REGEX)?.[0]?.toLowerCase() ?? null,
        title: refTitle ? stripTags(refTitle) : null,
        year: parseYear(raw),
        authors: []
      };
    });
  return {
    parserName: 'grobid',
    parserVersion: 'service',
    // Lower confidence when no body text could be extracted at all.
    confidence: text.length > 0 ? 0.85 : 0.65,
    title: rawTitle ? stripTags(rawTitle) : null,
    abstract: null,
    fullText: text,
    sections: splitIntoSections(text),
    references
  };
};
114
/**
 * Coordinates PDF ingestion jobs: resolves a source (local file, DOI, or
 * remote URL), downloads the PDF when needed, runs a parser cascade
 * (GROBID -> Python sidecar -> pdf-parse), and stores the resulting
 * structured document in memory.
 *
 * Jobs and documents live in in-memory Maps keyed by stable IDs, so state
 * does not survive a process restart.
 */
export class IngestionService {
  config;
  logger;
  literatureService;
  // jobId -> job record (status, source, parser info, provenance).
  jobs = new Map();
  // documentId -> fully parsed document.
  documents = new Map();
  constructor(config, logger, literatureService) {
    this.config = config;
    this.logger = logger;
    this.literatureService = literatureService;
  }
  /**
   * Creates a job record and starts background processing immediately.
   * Returns the queued job; poll getJob() to observe progress.
   *
   * @param {object} input - source descriptor: optional doi, paperUrl,
   *   pdfUrl, localPdfPath, and parseMode ('auto'|'grobid'|'sidecar'|other).
   * @returns {object} the newly created job record (status 'queued').
   */
  enqueueIngestion(input) {
    // documentId is deterministic for a given source so repeated ingestions
    // of the same paper converge on one document; jobId includes a UUID so
    // every enqueue gets its own job.
    const sourceSeed = [input.doi ?? null, input.paperUrl ?? null, input.pdfUrl ?? null, input.localPdfPath ?? null];
    const documentId = makeStableId(sourceSeed, 'doc');
    const jobId = makeStableId([...sourceSeed, randomUUID()], 'job');
    const job = {
      jobId,
      documentId,
      status: 'queued',
      createdAt: nowIso(),
      startedAt: null,
      completedAt: null,
      source: {
        doi: input.doi ?? null,
        paperUrl: input.paperUrl ?? null,
        pdfUrl: input.pdfUrl ?? null,
        localPdfPath: input.localPdfPath ?? null
      },
      parserName: null,
      parserConfidence: null,
      licenseState: 'unknown',
      error: null,
      warnings: [],
      provenance: []
    };
    this.jobs.set(jobId, job);
    // Fire-and-forget: processing runs in the background, and failures are
    // recorded on the job record rather than thrown to the caller.
    void this.processJob(jobId, input).catch((error) => {
      const current = this.jobs.get(jobId);
      if (!current) {
        return;
      }
      current.status = 'failed';
      current.completedAt = nowIso();
      current.error = error instanceof Error ? error.message : String(error);
      this.jobs.set(jobId, current);
    });
    return job;
  }
  /**
   * Looks up a job by ID.
   * @throws {JobNotFoundError} when no such job exists.
   */
  getJob(jobId) {
    const job = this.jobs.get(jobId);
    if (!job) {
      throw new JobNotFoundError(jobId);
    }
    return job;
  }
  /**
   * Looks up a parsed document by ID.
   * @throws {DocumentNotFoundError} when no such document exists.
   */
  getDocument(documentId) {
    const document = this.documents.get(documentId);
    if (!document) {
      throw new DocumentNotFoundError(documentId);
    }
    return document;
  }
  /**
   * Background worker for one job: resolves the source, parses the PDF,
   * stores the document, and marks the job succeeded. Any throw here is
   * caught by enqueueIngestion's handler, which marks the job failed.
   */
  async processJob(jobId, input) {
    const job = this.jobs.get(jobId);
    if (!job) {
      return;
    }
    job.status = 'running';
    job.startedAt = nowIso();
    this.jobs.set(jobId, job);
    const resolved = await this.resolveSource(input);
    // Record the fully resolved source on the job for observability.
    job.source = {
      doi: resolved.doi,
      paperUrl: resolved.paperUrl,
      pdfUrl: resolved.pdfUrl,
      localPdfPath: resolved.localPdfPath
    };
    job.licenseState = resolved.licenseState;
    const parserMode = input.parseMode ?? 'auto';
    const parseResult = await this.parseSourcePdf(resolved, parserMode);
    const document = {
      documentId: job.documentId,
      source: {
        doi: resolved.doi,
        url: resolved.paperUrl ?? resolved.pdfUrl,
        localPath: resolved.localPdfPath
      },
      parser: {
        parserName: parseResult.parserName,
        parserVersion: parseResult.parserVersion,
        confidence: parseResult.confidence
      },
      title: parseResult.title,
      abstract: parseResult.abstract,
      fullText: parseResult.fullText,
      sections: parseResult.sections,
      references: parseResult.references,
      tables: [],
      equations: [],
      figures: [],
      createdAt: nowIso(),
      provenance: [
        {
          // Sources resolved through the literature service carry OpenAlex
          // provenance; everything else is attributed to scraping.
          provider: resolved.provenanceWork ? 'openalex' : 'scholar_scrape',
          sourceUrl: resolved.paperUrl ?? resolved.pdfUrl,
          fetchedAt: nowIso(),
          confidence: parseResult.confidence,
          notes: `${parseResult.parserName}:${parseResult.parserVersion}`
        }
      ]
    };
    this.documents.set(document.documentId, document);
    job.status = 'succeeded';
    job.completedAt = nowIso();
    job.parserName = parseResult.parserName;
    job.parserConfidence = parseResult.confidence;
    job.provenance = document.provenance;
    this.jobs.set(jobId, job);
  }
  /**
   * Turns user input into a concrete PDF source. Local paths take priority
   * (subject to configuration); otherwise a remote PDF URL is resolved from
   * the explicit pdfUrl, an OpenAlex lookup by DOI, or a paperUrl that
   * already points at a .pdf file.
   *
   * @throws {IngestionError} when ingestion is disabled by configuration or
   *   no downloadable PDF URL can be determined.
   */
  async resolveSource(input) {
    if (input.localPdfPath) {
      if (!this.config.researchAllowLocalPdfs) {
        throw new IngestionError('Local PDF ingestion is disabled by configuration.');
      }
      const absolutePath = toAbsolutePath(input.localPdfPath);
      // Fail fast if the file is missing or unreadable.
      await fs.access(absolutePath);
      return {
        doi: input.doi ?? null,
        paperUrl: input.paperUrl ?? null,
        pdfUrl: input.pdfUrl ?? null,
        localPdfPath: absolutePath,
        licenseState: 'user_provided',
        provenanceWork: null
      };
    }
    if (!this.config.researchAllowRemotePdfs) {
      throw new IngestionError('Remote PDF ingestion is disabled by configuration.');
    }
    let resolvedWork = null;
    if (input.doi) {
      resolvedWork = await this.literatureService.resolveByDoi(input.doi);
    }
    const resolvedPdfUrl = input.pdfUrl ??
      resolvedWork?.openAccess.pdfUrl ??
      (input.paperUrl?.toLowerCase().endsWith('.pdf') ? input.paperUrl : null);
    if (!resolvedPdfUrl) {
      throw new IngestionError('Unable to resolve a downloadable PDF URL from input.');
    }
    return {
      doi: input.doi ?? resolvedWork?.doi ?? null,
      paperUrl: input.paperUrl ?? resolvedWork?.url ?? null,
      pdfUrl: resolvedPdfUrl,
      localPdfPath: null,
      licenseState: 'open_access',
      provenanceWork: resolvedWork
    };
  }
  /**
   * Obtains the PDF file (local or downloaded to a temp path) and tries
   * each parser in the order given by resolveParserOrder, falling through
   * on failure. The temp file is always cleaned up.
   *
   * @throws {IngestionError} when every parser strategy fails.
   */
  async parseSourcePdf(source, parseMode) {
    const { filePath, cleanup } = await this.obtainPdfFile(source);
    try {
      const modes = this.resolveParserOrder(parseMode);
      for (const mode of modes) {
        try {
          switch (mode) {
            case 'grobid': {
              if (!this.config.researchGrobidUrl) {
                // Not configured: skip to the next mode without logging.
                continue;
              }
              return await this.parseWithGrobid(filePath);
            }
            case 'sidecar': {
              if (!this.config.researchPythonSidecarUrl) {
                continue;
              }
              return await this.parseWithSidecar(filePath);
            }
            case 'simple': {
              return await this.parseWithSimplePdf(filePath);
            }
          }
        }
        catch (error) {
          // A parser failure is not fatal while fallbacks remain.
          this.logger.warn('Parser mode failed, trying fallback', {
            mode,
            filePath,
            error: error instanceof Error ? error.message : String(error)
          });
          continue;
        }
      }
      throw new IngestionError('All parser strategies failed for this PDF source.');
    }
    finally {
      await cleanup();
    }
  }
  /**
   * Maps a requested parse mode to an ordered list of strategies to try.
   * 'auto' and 'grobid' share the same preference order; any unrecognized
   * mode falls back to the simple text extractor only.
   */
  resolveParserOrder(parseMode) {
    if (parseMode === 'auto') {
      return ['grobid', 'sidecar', 'simple'];
    }
    if (parseMode === 'grobid') {
      return ['grobid', 'sidecar', 'simple'];
    }
    if (parseMode === 'sidecar') {
      return ['sidecar', 'grobid', 'simple'];
    }
    return ['simple'];
  }
  /**
   * Returns a local file path for the PDF plus a cleanup callback.
   * Local sources are used in place (no-op cleanup); remote sources are
   * downloaded into the OS temp directory and removed by cleanup.
   *
   * @throws {IngestionError} on a missing URL or a non-2xx download.
   */
  async obtainPdfFile(source) {
    if (source.localPdfPath) {
      return {
        filePath: source.localPdfPath,
        cleanup: async () => undefined
      };
    }
    if (!source.pdfUrl) {
      throw new IngestionError('Missing PDF URL after source resolution.');
    }
    const response = await fetch(source.pdfUrl, {
      headers: {
        accept: 'application/pdf,*/*'
      }
    });
    if (!response.ok) {
      throw new IngestionError(`Failed to download PDF. HTTP ${response.status}`);
    }
    const bytes = await response.arrayBuffer();
    const tempPath = resolve(tmpdir(), `scholar-mcp-${Date.now()}-${randomUUID()}.pdf`);
    await fs.writeFile(tempPath, Buffer.from(bytes));
    return {
      filePath: tempPath,
      cleanup: async () => {
        // Best effort: ignore unlink failures (e.g. already removed).
        await fs.unlink(tempPath).catch(() => undefined);
      }
    };
  }
  /**
   * Last-resort parser: plain text extraction via pdf-parse with heuristic
   * recovery of title, abstract, sections, and references.
   *
   * @throws {IngestionError} when no text could be extracted.
   */
  async parseWithSimplePdf(filePath) {
    const buffer = await fs.readFile(filePath);
    const parser = new PDFParse({ data: buffer });
    let parsed;
    try {
      parsed = await parser.getText();
    } finally {
      // Always release parser resources, even when extraction throws
      // (previously destroy() was skipped on error, leaking the handle).
      await parser.destroy();
    }
    const rawText = parsed.text ?? '';
    const text = normalizeWhitespace(rawText);
    if (!text) {
      throw new IngestionError('Simple PDF parser returned empty text.');
    }
    // Heuristics run on the raw (line-structured) text, not the
    // whitespace-normalized version, because they rely on line breaks.
    const sections = splitIntoSections(rawText);
    const references = extractReferences(rawText);
    const { title, abstract } = extractTitleAndAbstract(rawText);
    return {
      parserName: 'pdf-parse',
      parserVersion: '2.x',
      confidence: 0.62,
      title,
      abstract,
      fullText: text,
      sections,
      references
    };
  }
  /**
   * Parses via a GROBID service (multipart upload of the PDF to
   * /api/processFulltextDocument, TEI-XML response).
   *
   * @throws {IngestionError} when GROBID is unconfigured, returns a non-2xx
   *   status, or yields no extractable body text.
   */
  async parseWithGrobid(filePath) {
    if (!this.config.researchGrobidUrl) {
      throw new IngestionError('GROBID URL is not configured.');
    }
    // NOTE(review): new URL('/path', base) drops any path prefix on the
    // configured base URL — assumes researchGrobidUrl is an origin.
    const url = new URL('/api/processFulltextDocument', this.config.researchGrobidUrl);
    const buffer = await fs.readFile(filePath);
    const formData = new FormData();
    formData.set('input', new Blob([buffer], { type: 'application/pdf' }), basename(filePath));
    formData.set('consolidateHeader', '1');
    formData.set('consolidateCitations', '1');
    const response = await fetch(url, {
      method: 'POST',
      body: formData
    });
    if (!response.ok) {
      throw new IngestionError(`GROBID returned HTTP ${response.status}`);
    }
    const xml = await response.text();
    const parsed = parseGrobidXml(xml);
    if (!parsed.fullText) {
      throw new IngestionError('GROBID response did not include extractable body text.');
    }
    return parsed;
  }
  /**
   * Parses via a Python sidecar service (JSON POST of the file path to
   * /parse). The sidecar's fields are used where present, with local
   * heuristics filling in missing sections/references.
   *
   * @throws {IngestionError} when the sidecar is unconfigured, returns a
   *   non-2xx status, or yields empty full text.
   */
  async parseWithSidecar(filePath) {
    if (!this.config.researchPythonSidecarUrl) {
      throw new IngestionError('Python sidecar URL is not configured.');
    }
    const url = new URL('/parse', this.config.researchPythonSidecarUrl);
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'content-type': 'application/json'
      },
      body: JSON.stringify({
        filePath
      })
    });
    if (!response.ok) {
      throw new IngestionError(`Python sidecar returned HTTP ${response.status}`);
    }
    const payload = (await response.json());
    const fullText = normalizeWhitespace(payload.fullText ?? '');
    if (!fullText) {
      throw new IngestionError('Python sidecar returned empty full text.');
    }
    return {
      parserName: payload.parserName ?? 'python-sidecar',
      parserVersion: payload.parserVersion ?? 'unknown',
      confidence: payload.confidence ?? 0.74,
      title: payload.title ?? null,
      abstract: payload.abstract ?? null,
      fullText,
      sections: payload.sections ?? splitIntoSections(fullText),
      references: payload.references ?? extractReferences(fullText)
    };
  }
}