scholar-mcp 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -24,7 +24,7 @@ Use this if you want Claude Code, Codex, or any MCP-compatible coding agent to r
 
  - Transports: `stdio` (recommended) and HTTP (`/mcp`)
  - Research providers: Google Scholar, OpenAlex, Crossref, Semantic Scholar
- - Full-text parsing pipeline: `grobid -> sidecar -> simple`
+ - Full-text parsing pipeline: `grobid -> simple`
  - Tooling for thesis/paper workflows: ingestion, extraction, references, validation
 
  ## Quick Start
@@ -158,24 +158,6 @@ RESEARCH_ALLOW_LOCAL_PDFS = "true"
  - "Given this draft section, suggest citations in IEEE style and generate BibTeX."
  - "Validate my manuscript citations against this reference list and show missing citations."
 
- ## Optional Python Sidecar (better parsing fallback)
-
- Run sidecar:
-
- ```bash
- cd ../../services/python-sidecar
- python -m venv .venv
- source .venv/bin/activate
- pip install -r requirements.txt
- uvicorn app:app --host 127.0.0.1 --port 8090
- ```
-
- Then set:
-
- ```bash
- RESEARCH_PYTHON_SIDECAR_URL=http://127.0.0.1:8090
- ```
-
  ## Configuration
 
  Most users only need these:
@@ -186,7 +168,6 @@ Most users only need these:
  - `RESEARCH_ALLOW_LOCAL_PDFS`: allow local PDF ingestion (default: `true`)
  - `SCHOLAR_MCP_API_KEY`: optional bearer token for HTTP mode
  - `RESEARCH_GROBID_URL`: optional GROBID endpoint
- - `RESEARCH_PYTHON_SIDECAR_URL`: optional sidecar endpoint
 
  The CLI loads `.env` from the current working directory automatically at startup.
 
@@ -197,7 +178,7 @@ Advanced options exist in `src/config.ts` for timeouts, retries, HTTP session ca
  - `Invalid environment variable format` in `claude mcp add`:
  - Add `--` before the MCP server name (see Claude setup command above).
  - `Unable to resolve a downloadable PDF URL from input` on DOI ingestion:
- - The DOI landing page may not expose a downloadable PDF.
+ - The DOI and landing page may not expose an accessible PDF URL.
  - Retry with `pdf_url` (direct PDF) or `local_pdf_path`.
  - Too many Scholar failures or throttling:
  - Increase `SCHOLAR_REQUEST_DELAY_MS` (for example `500` to `1000`).
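For anyone acting on the troubleshooting advice above, here is an illustrative set of arguments for the `ingest_paper_fulltext` tool when a DOI's landing page exposes no PDF. The field names come from the tool's input schema shown later in this diff; the values and the surrounding MCP client call are hypothetical.

```js
// Hypothetical retry: pass a direct pdf_url instead of relying on DOI resolution.
// Field names match the ingest_paper_fulltext input schema in the dist diff below.
const ingestArgs = {
  doi: '10.1234/example.doi',                      // placeholder DOI
  pdf_url: 'https://example.org/files/paper.pdf',  // a direct PDF wins over landing-page discovery
  parse_mode: 'auto',                              // in 1.0.7 this falls back grobid -> simple
  ocr_enabled: true
};
```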
package/dist/config.js CHANGED
@@ -56,7 +56,6 @@ const envSchema = z.object({
  RESEARCH_ALLOW_REMOTE_PDFS: booleanFromEnv(true),
  RESEARCH_ALLOW_LOCAL_PDFS: booleanFromEnv(true),
  RESEARCH_GROBID_URL: z.string().url().optional(),
- RESEARCH_PYTHON_SIDECAR_URL: z.string().url().optional(),
  RESEARCH_SEMANTIC_ENGINE: z.enum(['cloud-llm', 'none']).default('cloud-llm'),
  RESEARCH_CLOUD_MODEL: z.string().default('gpt-4.1-mini'),
  RESEARCH_GRAPH_CACHE_TTL_MS: numberFromEnv(5 * 60 * 1000, 0, 24 * 60 * 60 * 1000),
@@ -120,7 +119,6 @@ export const parseConfig = (overrides) => {
  researchAllowRemotePdfs: env.RESEARCH_ALLOW_REMOTE_PDFS,
  researchAllowLocalPdfs: env.RESEARCH_ALLOW_LOCAL_PDFS,
  researchGrobidUrl: env.RESEARCH_GROBID_URL,
- researchPythonSidecarUrl: env.RESEARCH_PYTHON_SIDECAR_URL,
  researchSemanticEngine: env.RESEARCH_SEMANTIC_ENGINE,
  researchCloudModel: env.RESEARCH_CLOUD_MODEL,
  researchGraphCacheTtlMs: env.RESEARCH_GRAPH_CACHE_TTL_MS,
@@ -124,7 +124,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
  });
  server.registerTool('ingest_paper_fulltext', {
  title: 'Ingest Full-Text Paper',
- description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/sidecar/simple fallback pipeline.',
+ description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/simple fallback pipeline.',
  annotations: {
  readOnlyHint: false,
  openWorldHint: true
@@ -134,7 +134,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
  paper_url: z.string().url().optional().describe('Landing page URL for the paper.'),
  pdf_url: z.string().url().optional().describe('Direct PDF URL.'),
  local_pdf_path: z.string().optional().describe('Local absolute or workspace-relative PDF path.'),
- parse_mode: z.enum(['auto', 'grobid', 'sidecar', 'simple']).default('auto'),
+ parse_mode: z.enum(['auto', 'grobid', 'simple']).default('auto'),
  ocr_enabled: z.boolean().default(true).describe('Reserved for OCR-capable parser modes.')
  }
  }, async ({ doi, paper_url, pdf_url, local_pdf_path, parse_mode, ocr_enabled }) => {
@@ -6,6 +6,7 @@ import { PDFParse } from 'pdf-parse';
  import { IngestionError, DocumentNotFoundError, JobNotFoundError } from './errors.js';
  import { makeStableId, nowIso, normalizeWhitespace, parseYear } from './utils.js';
  const DOI_REGEX = /10\.\d{4,9}\/[\-._;()/:A-Z0-9]+/i;
+ const PDF_LINK_REGEX = /href=["']([^"']+\.pdf(?:\?[^"']*)?)["']/i;
  const toAbsolutePath = (value) => (value.startsWith('/') ? value : resolve(process.cwd(), value));
  const splitLines = (text) => text.split(/\r?\n/).map((line) => line.trim());
  const isLikelyHeading = (line) => /^(abstract|introduction|background|related work|method(?:s)?|materials|results|discussion|conclusion|limitations|references)\b/i.test(line.trim());
@@ -111,6 +112,14 @@ const parseGrobidXml = (xml) => {
  references
  };
  };
+ const resolveUrlCandidate = (candidate, baseUrl) => {
+ try {
+ return new URL(candidate, baseUrl).toString();
+ }
+ catch {
+ return null;
+ }
+ };
  export class IngestionService {
  config;
  logger;
@@ -253,9 +262,13 @@ export class IngestionService {
  if (input.doi) {
  resolvedWork = await this.literatureService.resolveByDoi(input.doi);
  }
+ const paperUrlCandidate = input.paperUrl ?? resolvedWork?.url ?? null;
+ const paperUrlPdfCandidate = paperUrlCandidate?.toLowerCase().endsWith('.pdf') ? paperUrlCandidate : null;
+ const discoveredPdfFromLanding = await this.resolvePdfUrlFromLandingPages([paperUrlCandidate, resolvedWork?.url]);
  const resolvedPdfUrl = input.pdfUrl ??
  resolvedWork?.openAccess.pdfUrl ??
- (input.paperUrl?.toLowerCase().endsWith('.pdf') ? input.paperUrl : null);
+ paperUrlPdfCandidate ??
+ discoveredPdfFromLanding;
  if (!resolvedPdfUrl) {
  throw new IngestionError('Unable to resolve a downloadable PDF URL from input.');
  }
@@ -281,12 +294,6 @@ export class IngestionService {
  }
  return await this.parseWithGrobid(filePath);
  }
- case 'sidecar': {
- if (!this.config.researchPythonSidecarUrl) {
- continue;
- }
- return await this.parseWithSidecar(filePath);
- }
  case 'simple': {
  return await this.parseWithSimplePdf(filePath);
  }
@@ -309,13 +316,10 @@ export class IngestionService {
  }
  resolveParserOrder(parseMode) {
  if (parseMode === 'auto') {
- return ['grobid', 'sidecar', 'simple'];
+ return ['grobid', 'simple'];
  }
  if (parseMode === 'grobid') {
- return ['grobid', 'sidecar', 'simple'];
- }
- if (parseMode === 'sidecar') {
- return ['sidecar', 'grobid', 'simple'];
+ return ['grobid', 'simple'];
  }
  return ['simple'];
  }
@@ -331,15 +335,22 @@ export class IngestionService {
  }
  const response = await fetch(source.pdfUrl, {
  headers: {
- accept: 'application/pdf,*/*'
+ accept: 'application/pdf,*/*',
+ 'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
  }
  });
  if (!response.ok) {
  throw new IngestionError(`Failed to download PDF. HTTP ${response.status}`);
  }
  const bytes = await response.arrayBuffer();
+ const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
+ const buffer = Buffer.from(bytes);
+ const looksLikePdf = buffer.length >= 4 && buffer.subarray(0, 4).toString('utf8') === '%PDF';
+ if (!contentType.includes('application/pdf') && !looksLikePdf) {
+ throw new IngestionError(`Downloaded content is not a PDF (content-type: ${contentType || 'unknown'}).`);
+ }
  const tempPath = resolve(tmpdir(), `scholar-mcp-${Date.now()}-${randomUUID()}.pdf`);
- await fs.writeFile(tempPath, Buffer.from(bytes));
+ await fs.writeFile(tempPath, buffer);
  return {
  filePath: tempPath,
  cleanup: async () => {
@@ -394,37 +405,83 @@ export class IngestionService {
  }
  return parsed;
  }
- async parseWithSidecar(filePath) {
- if (!this.config.researchPythonSidecarUrl) {
- throw new IngestionError('Python sidecar URL is not configured.');
+ async resolvePdfUrlFromLandingPages(urls) {
+ const seen = new Set();
+ for (const candidate of urls) {
+ if (!candidate) {
+ continue;
+ }
+ const normalized = candidate.trim();
+ if (!normalized || seen.has(normalized)) {
+ continue;
+ }
+ seen.add(normalized);
+ try {
+ const discovered = await this.resolvePdfUrlFromLandingPage(normalized);
+ if (discovered) {
+ return discovered;
+ }
+ }
+ catch (error) {
+ this.logger.debug('Landing page PDF discovery failed', {
+ paperUrl: normalized,
+ error: error instanceof Error ? error.message : String(error)
+ });
+ }
  }
- const url = new URL('/parse', this.config.researchPythonSidecarUrl);
- const response = await fetch(url, {
- method: 'POST',
+ return null;
+ }
+ async resolvePdfUrlFromLandingPage(paperUrl) {
+ const response = await fetch(paperUrl, {
  headers: {
- 'content-type': 'application/json'
- },
- body: JSON.stringify({
- filePath
- })
+ accept: 'text/html,application/pdf,*/*',
+ 'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
+ }
  });
  if (!response.ok) {
- throw new IngestionError(`Python sidecar returned HTTP ${response.status}`);
+ return null;
  }
- const payload = (await response.json());
- const fullText = normalizeWhitespace(payload.fullText ?? '');
- if (!fullText) {
- throw new IngestionError('Python sidecar returned empty full text.');
+ const finalUrl = response.url || paperUrl;
+ const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
+ if (contentType.includes('application/pdf')) {
+ return finalUrl;
  }
- return {
- parserName: payload.parserName ?? 'python-sidecar',
- parserVersion: payload.parserVersion ?? 'unknown',
- confidence: payload.confidence ?? 0.74,
- title: payload.title ?? null,
- abstract: payload.abstract ?? null,
- fullText,
- sections: payload.sections ?? splitIntoSections(fullText),
- references: payload.references ?? extractReferences(fullText)
- };
+ const html = await response.text();
+ if (!html) {
+ return null;
+ }
+ const metaPatterns = [
+ /<meta[^>]+name=["']citation_pdf_url["'][^>]+content=["']([^"']+)["'][^>]*>/i,
+ /<meta[^>]+content=["']([^"']+)["'][^>]+name=["']citation_pdf_url["'][^>]*>/i,
+ /<meta[^>]+property=["']og:pdf["'][^>]+content=["']([^"']+)["'][^>]*>/i,
+ /<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:pdf["'][^>]*>/i
+ ];
+ for (const pattern of metaPatterns) {
+ const match = html.match(pattern);
+ if (match?.[1]) {
+ const resolved = resolveUrlCandidate(match[1], finalUrl);
+ if (resolved) {
+ return resolved;
+ }
+ }
+ }
+ const linkPatterns = [
+ /<link[^>]+type=["']application\/pdf["'][^>]+href=["']([^"']+)["'][^>]*>/i,
+ /<link[^>]+href=["']([^"']+)["'][^>]+type=["']application\/pdf["'][^>]*>/i
+ ];
+ for (const pattern of linkPatterns) {
+ const match = html.match(pattern);
+ if (match?.[1]) {
+ const resolved = resolveUrlCandidate(match[1], finalUrl);
+ if (resolved) {
+ return resolved;
+ }
+ }
+ }
+ const anchorMatch = html.match(PDF_LINK_REGEX);
+ if (anchorMatch?.[1]) {
+ return resolveUrlCandidate(anchorMatch[1], finalUrl);
+ }
+ return null;
  }
  }
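Taken together, the replacement code above turns landing pages into a PDF source: fetch the page, honor a redirect that lands directly on a PDF, then look for `citation_pdf_url`/`og:pdf` metadata, a `<link type="application/pdf">`, and finally any anchor whose href ends in `.pdf`, resolving relative hrefs against the final response URL. A condensed standalone sketch of that technique (hypothetical helper name, not the package's exported API):

```js
// Sketch of landing-page PDF discovery under the same assumptions as the diff above.
const CITATION_PDF_META = /<meta[^>]+name=["']citation_pdf_url["'][^>]+content=["']([^"']+)["']/i;
const PDF_ANCHOR = /href=["']([^"']+\.pdf(?:\?[^"']*)?)["']/i;

async function discoverPdfUrl(landingPageUrl) {
  const response = await fetch(landingPageUrl, {
    headers: { accept: 'text/html,application/pdf,*/*' }
  });
  if (!response.ok) return null;

  // Redirects may land directly on a PDF; keep the final URL as the base for relative links.
  const finalUrl = response.url || landingPageUrl;
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
  if (contentType.includes('application/pdf')) return finalUrl;

  const html = await response.text();
  const match = html.match(CITATION_PDF_META) ?? html.match(PDF_ANCHOR);
  if (!match?.[1]) return null;

  try {
    // Resolve relative hrefs (e.g. "/pdf/paper.pdf") against the landing page.
    return new URL(match[1], finalUrl).toString();
  } catch {
    return null;
  }
}

// Example: discoverPdfUrl('https://example.org/article/123').then(console.log);
```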
@@ -1,5 +1,6 @@
  import { normalizeDoi, normalizeWhitespace, parseYear, tokenizeForRanking } from './utils.js';
  import { ResearchHttpClient } from './http-client.js';
+ import { ResearchProviderError } from './errors.js';
  import { OpenAlexClient } from './providers/openalex-client.js';
  import { CrossrefClient } from './providers/crossref-client.js';
  import { SemanticScholarClient } from './providers/semantic-scholar-client.js';
@@ -292,9 +293,51 @@ export class LiteratureService {
  if (!normalized) {
  return null;
  }
+ try {
+ const openAlexExact = await this.openAlexClient.getWorkByDoi(normalized);
+ if (openAlexExact) {
+ return {
+ title: openAlexExact.title,
+ abstract: openAlexExact.abstract,
+ year: openAlexExact.year,
+ venue: openAlexExact.venue,
+ doi: openAlexExact.doi,
+ url: openAlexExact.url,
+ paperId: openAlexExact.providerId,
+ citationCount: openAlexExact.citationCount,
+ influentialCitationCount: openAlexExact.influentialCitationCount,
+ referenceCount: openAlexExact.referenceCount,
+ authors: openAlexExact.authors,
+ openAccess: {
+ isOpenAccess: openAlexExact.openAccess.isOpenAccess,
+ pdfUrl: openAlexExact.openAccess.pdfUrl,
+ license: openAlexExact.openAccess.license
+ },
+ externalIds: openAlexExact.externalIds,
+ fieldsOfStudy: openAlexExact.fieldsOfStudy,
+ score: openAlexExact.score,
+ provenance: [
+ {
+ provider: 'openalex',
+ sourceUrl: openAlexExact.sourceUrl,
+ fetchedAt: new Date().toISOString(),
+ confidence: providerWeight.openalex
+ }
+ ]
+ };
+ }
+ }
+ catch (error) {
+ if (!(error instanceof ResearchProviderError) || error.status !== 404) {
+ this.logger.warn('OpenAlex DOI resolve failed', {
+ doi: normalized,
+ error: error instanceof Error ? error.message : String(error)
+ });
+ }
+ }
  const result = await this.searchGraph({
  query: normalized,
- limit: 10,
+ limit: 50,
  sources: ['openalex', 'crossref', 'semantic_scholar']
  });
  return (result.results.find((item) => normalizeDoi(item.doi) === normalized) ??
@@ -38,43 +38,57 @@ export class OpenAlexClient {
  provider: 'openalex',
  url
  });
- return (payload.results ?? []).map((item) => {
- const doi = normalizeDoi(item.ids?.doi ?? null);
- return {
- provider: 'openalex',
- providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
- title: item.display_name ?? 'Untitled',
- abstract: decodeInvertedAbstract(item.abstract_inverted_index),
- year: parseYear(item.publication_year),
- venue: item.primary_location?.source?.display_name ?? null,
- doi,
- url: item.primary_location?.landing_page_url ?? item.id ?? null,
- citationCount: item.cited_by_count ?? 0,
- influentialCitationCount: 0,
- referenceCount: item.referenced_works_count ?? 0,
- authors: (item.authorships ?? [])
- .map((auth) => ({
- name: auth.author?.display_name ?? '',
- authorId: auth.author?.id ?? null
- }))
- .filter((author) => author.name.length > 0),
- openAccess: {
- isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
- pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
- license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
- },
- externalIds: {
- ...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
- ...(doi ? { doi } : {}),
- ...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
- ...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
- },
- fieldsOfStudy: (item.concepts ?? [])
- .map((concept) => concept.display_name ?? '')
- .filter((value) => value.length > 0),
- score: item.relevance_score ?? 0.5,
- sourceUrl: url.toString()
- };
+ return (payload.results ?? []).map((item) => this.mapWork(item, url.toString()));
+ }
+ async getWorkByDoi(doi) {
+ const normalizedDoi = normalizeDoi(doi);
+ if (!normalizedDoi) {
+ return null;
+ }
+ const encodedDoiUrl = encodeURIComponent(`https://doi.org/${normalizedDoi}`);
+ const url = new URL(`/works/${encodedDoiUrl}`, this.config.researchOpenAlexBaseUrl);
+ const payload = await this.httpClient.fetchJson({
+ provider: 'openalex',
+ url
  });
+ return this.mapWork(payload, url.toString());
+ }
+ mapWork(item, sourceUrl) {
+ const doi = normalizeDoi(item.ids?.doi ?? null);
+ return {
+ provider: 'openalex',
+ providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
+ title: item.display_name ?? 'Untitled',
+ abstract: decodeInvertedAbstract(item.abstract_inverted_index),
+ year: parseYear(item.publication_year),
+ venue: item.primary_location?.source?.display_name ?? null,
+ doi,
+ url: item.primary_location?.landing_page_url ?? item.id ?? null,
+ citationCount: item.cited_by_count ?? 0,
+ influentialCitationCount: 0,
+ referenceCount: item.referenced_works_count ?? 0,
+ authors: (item.authorships ?? [])
+ .map((auth) => ({
+ name: auth.author?.display_name ?? '',
+ authorId: auth.author?.id ?? null
+ }))
+ .filter((author) => author.name.length > 0),
+ openAccess: {
+ isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
+ pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
+ license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
+ },
+ externalIds: {
+ ...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
+ ...(doi ? { doi } : {}),
+ ...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
+ ...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
+ },
+ fieldsOfStudy: (item.concepts ?? [])
+ .map((concept) => concept.display_name ?? '')
+ .filter((value) => value.length > 0),
+ score: item.relevance_score ?? 0.5,
+ sourceUrl
+ };
  }
  }
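The new `getWorkByDoi` path relies on OpenAlex's single-work endpoint, which accepts a DOI expressed as an `https://doi.org/...` URL in the path. A minimal sketch of that request outside the package (the public base URL `https://api.openalex.org` is assumed here; the package routes the call through its own `ResearchHttpClient` and configured base URL):

```js
// Minimal sketch of an exact-DOI lookup against OpenAlex (assumed public base URL).
// The path is built with encodeURIComponent, mirroring the hunk above.
async function fetchOpenAlexWorkByDoi(doi) {
  const url = `https://api.openalex.org/works/${encodeURIComponent(`https://doi.org/${doi}`)}`;
  const response = await fetch(url, { headers: { accept: 'application/json' } });
  if (!response.ok) {
    throw new Error(`OpenAlex returned HTTP ${response.status}`); // 404 => DOI not indexed
  }
  return response.json();
}

// fetchOpenAlexWorkByDoi('10.7717/peerj.4375').then((work) => console.log(work.display_name));
```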
@@ -7,9 +7,10 @@ export class SemanticScholarClient {
  this.httpClient = httpClient;
  }
  async searchWorks(query, limit) {
- const url = new URL('/paper/search', this.config.researchSemanticScholarBaseUrl.endsWith('/')
+ const baseUrl = this.config.researchSemanticScholarBaseUrl.endsWith('/')
  ? this.config.researchSemanticScholarBaseUrl
- : `${this.config.researchSemanticScholarBaseUrl}/`);
+ : `${this.config.researchSemanticScholarBaseUrl}/`;
+ const url = new URL('paper/search', baseUrl);
  url.searchParams.set('query', query);
  url.searchParams.set('limit', String(limit));
  url.searchParams.set('fields', 'paperId,title,abstract,year,venue,externalIds,url,citationCount,influentialCitationCount,referenceCount,isOpenAccess,openAccessPdf,fieldsOfStudy,authors');
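This small change matters because of how the WHATWG `URL` constructor resolves its first argument: a path starting with `/` replaces the base URL's entire path, while a relative path resolves against a base ending in `/`, so a base URL carrying a path prefix would otherwise lose that prefix. A quick illustration (the base URL shown is an example, not necessarily the package's default):

```js
// Absolute path: the base URL's path prefix is discarded.
new URL('/paper/search', 'https://api.semanticscholar.org/graph/v1/').toString();
// => 'https://api.semanticscholar.org/paper/search'

// Relative path against a base ending in '/': the prefix is preserved.
new URL('paper/search', 'https://api.semanticscholar.org/graph/v1/').toString();
// => 'https://api.semanticscholar.org/graph/v1/paper/search'
```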
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "scholar-mcp",
- "version": "1.0.6",
+ "version": "1.0.7",
  "description": "MCP Server for researchers",
  "license": "MIT",
  "type": "module",