scholar-mcp 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -24,7 +24,7 @@ Use this if you want Claude Code, Codex, or any MCP-compatible coding agent to r
 
  - Transports: `stdio` (recommended) and HTTP (`/mcp`)
  - Research providers: Google Scholar, OpenAlex, Crossref, Semantic Scholar
- - Full-text parsing pipeline: `grobid -> sidecar -> simple`
+ - Full-text parsing pipeline: `grobid -> simple`
  - Tooling for thesis/paper workflows: ingestion, extraction, references, validation
 
  ## Quick Start
@@ -158,24 +158,6 @@ RESEARCH_ALLOW_LOCAL_PDFS = "true"
  - "Given this draft section, suggest citations in IEEE style and generate BibTeX."
  - "Validate my manuscript citations against this reference list and show missing citations."
 
- ## Optional Python Sidecar (better parsing fallback)
-
- Run sidecar:
-
- ```bash
- cd ../../services/python-sidecar
- python -m venv .venv
- source .venv/bin/activate
- pip install -r requirements.txt
- uvicorn app:app --host 127.0.0.1 --port 8090
- ```
-
- Then set:
-
- ```bash
- RESEARCH_PYTHON_SIDECAR_URL=http://127.0.0.1:8090
- ```
-
  ## Configuration
 
  Most users only need these:
@@ -186,7 +168,6 @@ Most users only need these:
  - `RESEARCH_ALLOW_LOCAL_PDFS`: allow local PDF ingestion (default: `true`)
  - `SCHOLAR_MCP_API_KEY`: optional bearer token for HTTP mode
  - `RESEARCH_GROBID_URL`: optional GROBID endpoint
- - `RESEARCH_PYTHON_SIDECAR_URL`: optional sidecar endpoint
 
  The CLI loads `.env` from the current working directory automatically at startup.
 
@@ -197,7 +178,7 @@ Advanced options exist in `src/config.ts` for timeouts, retries, HTTP session ca
  - `Invalid environment variable format` in `claude mcp add`:
  - Add `--` before the MCP server name (see Claude setup command above).
  - `Unable to resolve a downloadable PDF URL from input` on DOI ingestion:
- - The DOI landing page may not expose a downloadable PDF.
+ - The DOI and landing page may not expose an accessible PDF URL.
  - Retry with `pdf_url` (direct PDF) or `local_pdf_path`.
  - Too many Scholar failures or throttling:
  - Increase `SCHOLAR_REQUEST_DELAY_MS` (for example `500` to `1000`).
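For anyone acting on the troubleshooting advice above, here is an illustrative set of arguments for the `ingest_paper_fulltext` tool when a DOI's landing page exposes no PDF. The field names come from the tool's input schema shown later in this diff; the values and the surrounding MCP client call are hypothetical.

```js
// Hypothetical retry: pass a direct pdf_url instead of relying on DOI resolution.
// Field names match the ingest_paper_fulltext input schema in the dist diff below.
const ingestArgs = {
  doi: '10.1234/example.doi',                      // placeholder DOI
  pdf_url: 'https://example.org/files/paper.pdf',  // a direct PDF wins over landing-page discovery
  parse_mode: 'auto',                              // in 1.0.7 this falls back grobid -> simple
  ocr_enabled: true
};
```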
package/dist/config.js CHANGED
@@ -56,7 +56,6 @@ const envSchema = z.object({
  RESEARCH_ALLOW_REMOTE_PDFS: booleanFromEnv(true),
  RESEARCH_ALLOW_LOCAL_PDFS: booleanFromEnv(true),
  RESEARCH_GROBID_URL: z.string().url().optional(),
- RESEARCH_PYTHON_SIDECAR_URL: z.string().url().optional(),
  RESEARCH_SEMANTIC_ENGINE: z.enum(['cloud-llm', 'none']).default('cloud-llm'),
  RESEARCH_CLOUD_MODEL: z.string().default('gpt-4.1-mini'),
  RESEARCH_GRAPH_CACHE_TTL_MS: numberFromEnv(5 * 60 * 1000, 0, 24 * 60 * 60 * 1000),
@@ -120,7 +119,6 @@ export const parseConfig = (overrides) => {
  researchAllowRemotePdfs: env.RESEARCH_ALLOW_REMOTE_PDFS,
  researchAllowLocalPdfs: env.RESEARCH_ALLOW_LOCAL_PDFS,
  researchGrobidUrl: env.RESEARCH_GROBID_URL,
- researchPythonSidecarUrl: env.RESEARCH_PYTHON_SIDECAR_URL,
  researchSemanticEngine: env.RESEARCH_SEMANTIC_ENGINE,
  researchCloudModel: env.RESEARCH_CLOUD_MODEL,
  researchGraphCacheTtlMs: env.RESEARCH_GRAPH_CACHE_TTL_MS,
@@ -124,7 +124,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
  });
  server.registerTool('ingest_paper_fulltext', {
  title: 'Ingest Full-Text Paper',
- description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/sidecar/simple fallback pipeline.',
+ description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/simple fallback pipeline.',
  annotations: {
  readOnlyHint: false,
  openWorldHint: true
@@ -134,7 +134,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
  paper_url: z.string().url().optional().describe('Landing page URL for the paper.'),
  pdf_url: z.string().url().optional().describe('Direct PDF URL.'),
  local_pdf_path: z.string().optional().describe('Local absolute or workspace-relative PDF path.'),
- parse_mode: z.enum(['auto', 'grobid', 'sidecar', 'simple']).default('auto'),
+ parse_mode: z.enum(['auto', 'grobid', 'simple']).default('auto'),
  ocr_enabled: z.boolean().default(true).describe('Reserved for OCR-capable parser modes.')
  }
  }, async ({ doi, paper_url, pdf_url, local_pdf_path, parse_mode, ocr_enabled }) => {
@@ -6,6 +6,7 @@ import { PDFParse } from 'pdf-parse';
  import { IngestionError, DocumentNotFoundError, JobNotFoundError } from './errors.js';
  import { makeStableId, nowIso, normalizeWhitespace, parseYear } from './utils.js';
  const DOI_REGEX = /10\.\d{4,9}\/[\-._;()/:A-Z0-9]+/i;
+ const PDF_LINK_REGEX = /href=["']([^"']+\.pdf(?:\?[^"']*)?)["']/i;
  const toAbsolutePath = (value) => (value.startsWith('/') ? value : resolve(process.cwd(), value));
  const splitLines = (text) => text.split(/\r?\n/).map((line) => line.trim());
  const isLikelyHeading = (line) => /^(abstract|introduction|background|related work|method(?:s)?|materials|results|discussion|conclusion|limitations|references)\b/i.test(line.trim());
@@ -111,6 +112,14 @@ const parseGrobidXml = (xml) => {
  references
  };
  };
+ const resolveUrlCandidate = (candidate, baseUrl) => {
+ try {
+ return new URL(candidate, baseUrl).toString();
+ }
+ catch {
+ return null;
+ }
+ };
  export class IngestionService {
  config;
  logger;
@@ -253,9 +262,13 @@ export class IngestionService {
  if (input.doi) {
  resolvedWork = await this.literatureService.resolveByDoi(input.doi);
  }
+ const paperUrlCandidate = input.paperUrl ?? resolvedWork?.url ?? null;
+ const paperUrlPdfCandidate = paperUrlCandidate?.toLowerCase().endsWith('.pdf') ? paperUrlCandidate : null;
+ const discoveredPdfFromLanding = await this.resolvePdfUrlFromLandingPages([paperUrlCandidate, resolvedWork?.url]);
  const resolvedPdfUrl = input.pdfUrl ??
  resolvedWork?.openAccess.pdfUrl ??
- (input.paperUrl?.toLowerCase().endsWith('.pdf') ? input.paperUrl : null);
+ paperUrlPdfCandidate ??
+ discoveredPdfFromLanding;
  if (!resolvedPdfUrl) {
  throw new IngestionError('Unable to resolve a downloadable PDF URL from input.');
  }
@@ -281,12 +294,6 @@ export class IngestionService {
  }
  return await this.parseWithGrobid(filePath);
  }
- case 'sidecar': {
- if (!this.config.researchPythonSidecarUrl) {
- continue;
- }
- return await this.parseWithSidecar(filePath);
- }
  case 'simple': {
  return await this.parseWithSimplePdf(filePath);
  }
@@ -309,13 +316,10 @@ export class IngestionService {
  }
  resolveParserOrder(parseMode) {
  if (parseMode === 'auto') {
- return ['grobid', 'sidecar', 'simple'];
+ return ['grobid', 'simple'];
  }
  if (parseMode === 'grobid') {
- return ['grobid', 'sidecar', 'simple'];
- }
- if (parseMode === 'sidecar') {
- return ['sidecar', 'grobid', 'simple'];
+ return ['grobid', 'simple'];
  }
  return ['simple'];
  }
@@ -331,15 +335,22 @@ export class IngestionService {
  }
  const response = await fetch(source.pdfUrl, {
  headers: {
- accept: 'application/pdf,*/*'
+ accept: 'application/pdf,*/*',
+ 'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
  }
  });
  if (!response.ok) {
  throw new IngestionError(`Failed to download PDF. HTTP ${response.status}`);
  }
  const bytes = await response.arrayBuffer();
+ const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
+ const buffer = Buffer.from(bytes);
+ const looksLikePdf = buffer.length >= 4 && buffer.subarray(0, 4).toString('utf8') === '%PDF';
+ if (!contentType.includes('application/pdf') && !looksLikePdf) {
+ throw new IngestionError(`Downloaded content is not a PDF (content-type: ${contentType || 'unknown'}).`);
+ }
  const tempPath = resolve(tmpdir(), `scholar-mcp-${Date.now()}-${randomUUID()}.pdf`);
- await fs.writeFile(tempPath, Buffer.from(bytes));
+ await fs.writeFile(tempPath, buffer);
  return {
  filePath: tempPath,
  cleanup: async () => {
@@ -394,37 +405,83 @@ export class IngestionService {
  }
  return parsed;
  }
- async parseWithSidecar(filePath) {
- if (!this.config.researchPythonSidecarUrl) {
- throw new IngestionError('Python sidecar URL is not configured.');
+ async resolvePdfUrlFromLandingPages(urls) {
+ const seen = new Set();
+ for (const candidate of urls) {
+ if (!candidate) {
+ continue;
+ }
+ const normalized = candidate.trim();
+ if (!normalized || seen.has(normalized)) {
+ continue;
+ }
+ seen.add(normalized);
+ try {
+ const discovered = await this.resolvePdfUrlFromLandingPage(normalized);
+ if (discovered) {
+ return discovered;
+ }
+ }
+ catch (error) {
+ this.logger.debug('Landing page PDF discovery failed', {
+ paperUrl: normalized,
+ error: error instanceof Error ? error.message : String(error)
+ });
+ }
  }
- const url = new URL('/parse', this.config.researchPythonSidecarUrl);
- const response = await fetch(url, {
- method: 'POST',
+ return null;
+ }
+ async resolvePdfUrlFromLandingPage(paperUrl) {
+ const response = await fetch(paperUrl, {
  headers: {
- 'content-type': 'application/json'
- },
- body: JSON.stringify({
- filePath
- })
+ accept: 'text/html,application/pdf,*/*',
+ 'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
+ }
  });
  if (!response.ok) {
- throw new IngestionError(`Python sidecar returned HTTP ${response.status}`);
+ return null;
  }
- const payload = (await response.json());
- const fullText = normalizeWhitespace(payload.fullText ?? '');
- if (!fullText) {
- throw new IngestionError('Python sidecar returned empty full text.');
+ const finalUrl = response.url || paperUrl;
+ const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
+ if (contentType.includes('application/pdf')) {
+ return finalUrl;
  }
- return {
- parserName: payload.parserName ?? 'python-sidecar',
- parserVersion: payload.parserVersion ?? 'unknown',
- confidence: payload.confidence ?? 0.74,
- title: payload.title ?? null,
- abstract: payload.abstract ?? null,
- fullText,
- sections: payload.sections ?? splitIntoSections(fullText),
- references: payload.references ?? extractReferences(fullText)
- };
+ const html = await response.text();
+ if (!html) {
+ return null;
+ }
+ const metaPatterns = [
+ /<meta[^>]+name=["']citation_pdf_url["'][^>]+content=["']([^"']+)["'][^>]*>/i,
+ /<meta[^>]+content=["']([^"']+)["'][^>]+name=["']citation_pdf_url["'][^>]*>/i,
+ /<meta[^>]+property=["']og:pdf["'][^>]+content=["']([^"']+)["'][^>]*>/i,
+ /<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:pdf["'][^>]*>/i
+ ];
+ for (const pattern of metaPatterns) {
+ const match = html.match(pattern);
+ if (match?.[1]) {
+ const resolved = resolveUrlCandidate(match[1], finalUrl);
+ if (resolved) {
+ return resolved;
+ }
+ }
+ }
+ const linkPatterns = [
+ /<link[^>]+type=["']application\/pdf["'][^>]+href=["']([^"']+)["'][^>]*>/i,
+ /<link[^>]+href=["']([^"']+)["'][^>]+type=["']application\/pdf["'][^>]*>/i
+ ];
+ for (const pattern of linkPatterns) {
+ const match = html.match(pattern);
+ if (match?.[1]) {
+ const resolved = resolveUrlCandidate(match[1], finalUrl);
+ if (resolved) {
+ return resolved;
+ }
+ }
+ }
+ const anchorMatch = html.match(PDF_LINK_REGEX);
+ if (anchorMatch?.[1]) {
+ return resolveUrlCandidate(anchorMatch[1], finalUrl);
+ }
+ return null;
  }
  }
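Taken together, the replacement code above turns landing pages into a PDF source: fetch the page, honor a redirect that lands directly on a PDF, then look for `citation_pdf_url`/`og:pdf` metadata, a `<link type="application/pdf">`, and finally any anchor whose href ends in `.pdf`, resolving relative hrefs against the final response URL. A condensed standalone sketch of that technique (hypothetical helper name, not the package's exported API):

```js
// Sketch of landing-page PDF discovery under the same assumptions as the diff above.
const CITATION_PDF_META = /<meta[^>]+name=["']citation_pdf_url["'][^>]+content=["']([^"']+)["']/i;
const PDF_ANCHOR = /href=["']([^"']+\.pdf(?:\?[^"']*)?)["']/i;

async function discoverPdfUrl(landingPageUrl) {
  const response = await fetch(landingPageUrl, {
    headers: { accept: 'text/html,application/pdf,*/*' }
  });
  if (!response.ok) return null;

  // Redirects may land directly on a PDF; keep the final URL as the base for relative links.
  const finalUrl = response.url || landingPageUrl;
  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
  if (contentType.includes('application/pdf')) return finalUrl;

  const html = await response.text();
  const match = html.match(CITATION_PDF_META) ?? html.match(PDF_ANCHOR);
  if (!match?.[1]) return null;

  try {
    // Resolve relative hrefs (e.g. "/pdf/paper.pdf") against the landing page.
    return new URL(match[1], finalUrl).toString();
  } catch {
    return null;
  }
}

// Example: discoverPdfUrl('https://example.org/article/123').then(console.log);
```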
@@ -1,5 +1,6 @@
  import { normalizeDoi, normalizeWhitespace, parseYear, tokenizeForRanking } from './utils.js';
  import { ResearchHttpClient } from './http-client.js';
+ import { ResearchProviderError } from './errors.js';
  import { OpenAlexClient } from './providers/openalex-client.js';
  import { CrossrefClient } from './providers/crossref-client.js';
  import { SemanticScholarClient } from './providers/semantic-scholar-client.js';
@@ -292,9 +293,51 @@ export class LiteratureService {
  if (!normalized) {
  return null;
  }
+ try {
+ const openAlexExact = await this.openAlexClient.getWorkByDoi(normalized);
+ if (openAlexExact) {
+ return {
+ title: openAlexExact.title,
+ abstract: openAlexExact.abstract,
+ year: openAlexExact.year,
+ venue: openAlexExact.venue,
+ doi: openAlexExact.doi,
+ url: openAlexExact.url,
+ paperId: openAlexExact.providerId,
+ citationCount: openAlexExact.citationCount,
+ influentialCitationCount: openAlexExact.influentialCitationCount,
+ referenceCount: openAlexExact.referenceCount,
+ authors: openAlexExact.authors,
+ openAccess: {
+ isOpenAccess: openAlexExact.openAccess.isOpenAccess,
+ pdfUrl: openAlexExact.openAccess.pdfUrl,
+ license: openAlexExact.openAccess.license
+ },
+ externalIds: openAlexExact.externalIds,
+ fieldsOfStudy: openAlexExact.fieldsOfStudy,
+ score: openAlexExact.score,
+ provenance: [
+ {
+ provider: 'openalex',
+ sourceUrl: openAlexExact.sourceUrl,
+ fetchedAt: new Date().toISOString(),
+ confidence: providerWeight.openalex
+ }
+ ]
+ };
+ }
+ }
+ catch (error) {
+ if (!(error instanceof ResearchProviderError) || error.status !== 404) {
+ this.logger.warn('OpenAlex DOI resolve failed', {
+ doi: normalized,
+ error: error instanceof Error ? error.message : String(error)
+ });
+ }
+ }
  const result = await this.searchGraph({
  query: normalized,
- limit: 10,
+ limit: 50,
  sources: ['openalex', 'crossref', 'semantic_scholar']
  });
  return (result.results.find((item) => normalizeDoi(item.doi) === normalized) ??
@@ -38,43 +38,57 @@ export class OpenAlexClient {
  provider: 'openalex',
  url
  });
- return (payload.results ?? []).map((item) => {
- const doi = normalizeDoi(item.ids?.doi ?? null);
- return {
- provider: 'openalex',
- providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
- title: item.display_name ?? 'Untitled',
- abstract: decodeInvertedAbstract(item.abstract_inverted_index),
- year: parseYear(item.publication_year),
- venue: item.primary_location?.source?.display_name ?? null,
- doi,
- url: item.primary_location?.landing_page_url ?? item.id ?? null,
- citationCount: item.cited_by_count ?? 0,
- influentialCitationCount: 0,
- referenceCount: item.referenced_works_count ?? 0,
- authors: (item.authorships ?? [])
- .map((auth) => ({
- name: auth.author?.display_name ?? '',
- authorId: auth.author?.id ?? null
- }))
- .filter((author) => author.name.length > 0),
- openAccess: {
- isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
- pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
- license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
- },
- externalIds: {
- ...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
- ...(doi ? { doi } : {}),
- ...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
- ...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
- },
- fieldsOfStudy: (item.concepts ?? [])
- .map((concept) => concept.display_name ?? '')
- .filter((value) => value.length > 0),
- score: item.relevance_score ?? 0.5,
- sourceUrl: url.toString()
- };
+ return (payload.results ?? []).map((item) => this.mapWork(item, url.toString()));
+ }
+ async getWorkByDoi(doi) {
+ const normalizedDoi = normalizeDoi(doi);
+ if (!normalizedDoi) {
+ return null;
+ }
+ const encodedDoiUrl = encodeURIComponent(`https://doi.org/${normalizedDoi}`);
+ const url = new URL(`/works/${encodedDoiUrl}`, this.config.researchOpenAlexBaseUrl);
+ const payload = await this.httpClient.fetchJson({
+ provider: 'openalex',
+ url
  });
+ return this.mapWork(payload, url.toString());
+ }
+ mapWork(item, sourceUrl) {
+ const doi = normalizeDoi(item.ids?.doi ?? null);
+ return {
+ provider: 'openalex',
+ providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
+ title: item.display_name ?? 'Untitled',
+ abstract: decodeInvertedAbstract(item.abstract_inverted_index),
+ year: parseYear(item.publication_year),
+ venue: item.primary_location?.source?.display_name ?? null,
+ doi,
+ url: item.primary_location?.landing_page_url ?? item.id ?? null,
+ citationCount: item.cited_by_count ?? 0,
+ influentialCitationCount: 0,
+ referenceCount: item.referenced_works_count ?? 0,
+ authors: (item.authorships ?? [])
+ .map((auth) => ({
+ name: auth.author?.display_name ?? '',
+ authorId: auth.author?.id ?? null
+ }))
+ .filter((author) => author.name.length > 0),
+ openAccess: {
+ isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
+ pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
+ license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
+ },
+ externalIds: {
+ ...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
+ ...(doi ? { doi } : {}),
+ ...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
+ ...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
+ },
+ fieldsOfStudy: (item.concepts ?? [])
+ .map((concept) => concept.display_name ?? '')
+ .filter((value) => value.length > 0),
+ score: item.relevance_score ?? 0.5,
+ sourceUrl
+ };
  }
  }
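The new `getWorkByDoi` path relies on OpenAlex's single-work endpoint, which accepts a DOI expressed as an `https://doi.org/...` URL in the path. A minimal sketch of that request outside the package (the public base URL `https://api.openalex.org` is assumed here; the package routes the call through its own `ResearchHttpClient` and configured base URL):

```js
// Minimal sketch of an exact-DOI lookup against OpenAlex (assumed public base URL).
// The path is built with encodeURIComponent, mirroring the hunk above.
async function fetchOpenAlexWorkByDoi(doi) {
  const url = `https://api.openalex.org/works/${encodeURIComponent(`https://doi.org/${doi}`)}`;
  const response = await fetch(url, { headers: { accept: 'application/json' } });
  if (!response.ok) {
    throw new Error(`OpenAlex returned HTTP ${response.status}`); // 404 => DOI not indexed
  }
  return response.json();
}

// fetchOpenAlexWorkByDoi('10.7717/peerj.4375').then((work) => console.log(work.display_name));
```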
@@ -7,9 +7,10 @@ export class SemanticScholarClient {
  this.httpClient = httpClient;
  }
  async searchWorks(query, limit) {
- const url = new URL('/paper/search', this.config.researchSemanticScholarBaseUrl.endsWith('/')
+ const baseUrl = this.config.researchSemanticScholarBaseUrl.endsWith('/')
  ? this.config.researchSemanticScholarBaseUrl
- : `${this.config.researchSemanticScholarBaseUrl}/`);
+ : `${this.config.researchSemanticScholarBaseUrl}/`;
+ const url = new URL('paper/search', baseUrl);
  url.searchParams.set('query', query);
  url.searchParams.set('limit', String(limit));
  url.searchParams.set('fields', 'paperId,title,abstract,year,venue,externalIds,url,citationCount,influentialCitationCount,referenceCount,isOpenAccess,openAccessPdf,fieldsOfStudy,authors');
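This small change matters because of how the WHATWG `URL` constructor resolves its first argument: a path starting with `/` replaces the base URL's entire path, while a relative path resolves against a base ending in `/`, so a base URL carrying a path prefix would otherwise lose that prefix. A quick illustration (the base URL shown is an example, not necessarily the package's default):

```js
// Absolute path: the base URL's path prefix is discarded.
new URL('/paper/search', 'https://api.semanticscholar.org/graph/v1/').toString();
// => 'https://api.semanticscholar.org/paper/search'

// Relative path against a base ending in '/': the prefix is preserved.
new URL('paper/search', 'https://api.semanticscholar.org/graph/v1/').toString();
// => 'https://api.semanticscholar.org/graph/v1/paper/search'
```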
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "scholar-mcp",
- "version": "1.0.6",
+ "version": "1.0.7",
  "description": "MCP Server for researchers",
  "license": "MIT",
  "type": "module",