scholar-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,583 @@
1
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import { z } from 'zod';
3
+ import { ScholarError } from '../scholar/errors.js';
4
/**
 * Re-keys a scraped paper record into the PascalCase layout returned by the
 * Google Scholar search tools.
 *
 * @param {object} paper - Paper record; the eleven properties listed below are read from it.
 * @returns {object} New object with the PascalCase keys; absent source properties stay `undefined`.
 */
const paperToLegacyShape = (paper) => {
    const {
        title,
        authorsLine,
        abstract,
        url,
        year,
        citedByCount,
        citedByUrl,
        relatedArticlesUrl,
        versionsCount,
        versionsUrl,
        pdfUrl
    } = paper;
    return {
        Title: title,
        Authors: authorsLine,
        Abstract: abstract,
        URL: url,
        Year: year,
        CitedBy: citedByCount,
        CitedByURL: citedByUrl,
        RelatedArticlesURL: relatedArticlesUrl,
        Versions: versionsCount,
        VersionsURL: versionsUrl,
        PDFURL: pdfUrl
    };
};
17
/**
 * Converts any thrown value into an MCP tool error result.
 *
 * - `ScholarError` instances report their name, message and `details`.
 * - Other `Error` instances report their name and message.
 * - A thrown non-blank string is reported verbatim under the name `UnknownError`,
 *   so the client sees the same text the call sites log via `String(error)`.
 * - Anything else, and any error whose message is empty, falls back to a
 *   generic message so the text content is never blank.
 *
 * @param {unknown} error - Value caught by a tool handler.
 * @returns {{isError: true, content: Array<{type: 'text', text: string}>, structuredContent: object}}
 */
const toToolError = (error) => {
    const fallbackMessage = 'Unknown ScholarMCP error.';
    // Single place that defines the result layout shared by every branch.
    const buildResult = (name, message, extra = {}) => ({
        isError: true,
        content: [{ type: 'text', text: message }],
        structuredContent: {
            error: name,
            message,
            ...extra
        }
    });
    if (error instanceof ScholarError) {
        return buildResult(error.name, error.message || fallbackMessage, { details: error.details });
    }
    if (error instanceof Error) {
        return buildResult(error.name, error.message || fallbackMessage);
    }
    if (typeof error === 'string' && error.trim().length > 0) {
        return buildResult('UnknownError', error);
    }
    return buildResult('UnknownError', fallbackMessage);
};
49
/**
 * Zod schema for a caller-supplied work passed to `build_reference_list`.
 * Only `title` is required (non-empty); every other field may be omitted.
 */
const manualWorkSchema = (() => {
    // Fresh schema instance per field; the two optional shapes repeat below.
    const optionalText = () => z.string().optional();
    const optionalInteger = () => z.number().int().optional();
    return z.object({
        title: z.string().min(1),
        abstract: optionalText(),
        year: optionalInteger(),
        venue: optionalText(),
        doi: optionalText(),
        url: optionalText(),
        citation_count: optionalInteger(),
        authors: z.array(z.string()).optional()
    });
})();
59
/**
 * Produces a loggable message for any thrown value.
 */
const describeThrownValue = (error) => (error instanceof Error ? error.message : String(error));

/**
 * Normalizes a `year_range` argument given as `[start, end]` or `{ start, end }`
 * into a `[start, end]` pair; returns `undefined` when no range was supplied.
 */
const normalizeYearRange = (yearRange) => {
    if (!yearRange) {
        return undefined;
    }
    if (Array.isArray(yearRange)) {
        return yearRange;
    }
    return [yearRange.start, yearRange.end];
};

/**
 * Builds the optional `year_range` input schema (tuple or object form).
 */
const yearRangeSchema = () => z
    .union([
        z.tuple([z.number().int(), z.number().int()]),
        z.object({ start: z.number().int(), end: z.number().int() })
    ])
    .optional();

/**
 * Builds the enum schema of supported citation styles.
 */
const citationStyleSchema = () => z.enum(['apa', 'ieee', 'chicago', 'vancouver']);

/**
 * Wraps a payload as a tool result: pretty-printed JSON text plus the same
 * object as structured content.
 */
const toJsonToolResult = (payload) => ({
    content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }],
    structuredContent: payload
});

/**
 * Shapes a Google Scholar search result for the legacy search tools. The text
 * block uses snake_case keys while the structured content uses camelCase keys.
 */
const toLegacySearchToolResult = (result) => {
    const legacyResults = result.papers.map(paperToLegacyShape);
    return {
        content: [
            {
                type: 'text',
                text: JSON.stringify({
                    query: result.query,
                    total_results_text: result.totalResultsText,
                    next_page_start: result.nextPageStart,
                    results: legacyResults
                }, null, 2)
            }
        ],
        structuredContent: {
            query: result.query,
            totalResultsText: result.totalResultsText,
            nextPageStart: result.nextPageStart,
            results: legacyResults
        }
    };
};

/**
 * Expands a caller-supplied work (see `manualWorkSchema`) into the work record
 * passed to `researchService.buildReferenceList`, filling absent fields with
 * null / zero / empty defaults.
 */
const manualWorkToCanonicalWork = (work) => ({
    title: work.title,
    abstract: work.abstract ?? null,
    year: work.year ?? null,
    venue: work.venue ?? null,
    doi: work.doi ?? null,
    url: work.url ?? null,
    paperId: work.doi ?? work.url ?? work.title,
    citationCount: work.citation_count ?? 0,
    influentialCitationCount: 0,
    referenceCount: 0,
    authors: (work.authors ?? []).map((name) => ({ name })),
    openAccess: {
        isOpenAccess: false,
        pdfUrl: null,
        license: null
    },
    externalIds: {},
    fieldsOfStudy: [],
    score: 0.5,
    provenance: []
});

/**
 * Expands a caller-supplied reference into the entry passed to
 * `researchService.validateManuscriptCitations`. A missing id becomes
 * `ref-<1-based position>`.
 */
const referenceToValidationEntry = (reference, index) => {
    const id = reference.id ?? `ref-${index + 1}`;
    return {
        id,
        csl: {},
        formatted: reference.formatted,
        bibtex: reference.bibtex ?? '',
        sourceWork: {
            title: reference.formatted,
            abstract: null,
            year: null,
            venue: null,
            doi: null,
            url: null,
            paperId: id,
            citationCount: 0,
            influentialCitationCount: 0,
            referenceCount: 0,
            authors: [],
            openAccess: {
                isOpenAccess: false,
                license: null,
                pdfUrl: null
            },
            externalIds: {},
            fieldsOfStudy: [],
            score: 0,
            provenance: []
        }
    };
};

/**
 * Creates the ScholarMCP server and registers its ten tools.
 *
 * Every handler runs inside a shared guard: a thrown value is logged with
 * `logger.warn` and returned to the client as an error result via
 * `toToolError`, so handlers never reject.
 *
 * @param {object} config - Reads `serverName`, `serverVersion` and `scholarLanguage`.
 * @param {object} service - Google Scholar service; `searchKeywords`, `searchAdvanced` and `getAuthorInfo` are called.
 * @param {object} researchService - Research service; `searchLiteratureGraph`, `ingestPaperFullText`,
 *   `getIngestionStatus`, `getParsedDocument`, `extractGranularPaperDetails`, `suggestContextualCitations`,
 *   `buildReferenceList` and `validateManuscriptCitations` are called.
 * @param {object} logger - Logger; only `warn(message, metadata)` is used here.
 * @returns {McpServer} The configured, not yet connected, server.
 */
export const createScholarMcpServer = (config, service, researchService, logger) => {
    const server = new McpServer({
        name: config.serverName,
        version: config.serverVersion,
        title: 'ScholarMCP',
        description: 'Google Scholar research tools exposed over MCP'
    }, {
        capabilities: {
            logging: {}
        }
    });

    // Registers a tool whose handler is wrapped in the shared try/log/convert scaffold.
    // `failure.context`, when given, selects the arguments echoed into the warning log.
    const registerGuardedTool = (name, definition, failure, handler) => {
        server.registerTool(name, definition, async (args) => {
            try {
                return await handler(args);
            }
            catch (error) {
                logger.warn(failure.message, {
                    tool: name,
                    ...failure.context?.(args),
                    error: describeThrownValue(error)
                });
                return toToolError(error);
            }
        });
    };

    registerGuardedTool('search_literature_graph', {
        title: 'Search Federated Literature Graph',
        description: 'Search multiple scholarly metadata providers (OpenAlex, Crossref, Semantic Scholar, optional Scholar scrape) and return canonicalized paper records.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            query: z.string().min(1).describe('Research query string.'),
            year_range: yearRangeSchema()
                .describe('Optional publication year range as [start, end] or {start, end}.'),
            fields_of_study: z.array(z.string().min(1)).optional().describe('Optional field-of-study filters.'),
            limit: z.number().int().min(1).max(50).default(10).describe('Maximum number of merged results.'),
            sources: z
                .array(z.enum(['openalex', 'crossref', 'semantic_scholar', 'scholar_scrape']))
                .optional()
                .describe('Optional source allow-list.')
        }
    }, {
        message: 'Federated literature search failed',
        context: (args) => ({ query: args.query })
    }, async (args) => {
        const result = await researchService.searchLiteratureGraph({
            query: args.query,
            yearRange: normalizeYearRange(args.year_range),
            fieldsOfStudy: args.fields_of_study,
            limit: args.limit,
            sources: args.sources
        });
        return toJsonToolResult(result);
    });

    registerGuardedTool('ingest_paper_fulltext', {
        title: 'Ingest Full-Text Paper',
        description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/sidecar/simple fallback pipeline.',
        annotations: {
            readOnlyHint: false,
            openWorldHint: true
        },
        inputSchema: {
            doi: z.string().optional().describe('DOI (recommended for OA PDF discovery).'),
            paper_url: z.string().url().optional().describe('Landing page URL for the paper.'),
            pdf_url: z.string().url().optional().describe('Direct PDF URL.'),
            local_pdf_path: z.string().optional().describe('Local absolute or workspace-relative PDF path.'),
            parse_mode: z.enum(['auto', 'grobid', 'sidecar', 'simple']).default('auto'),
            ocr_enabled: z.boolean().default(true).describe('Reserved for OCR-capable parser modes.')
        }
    }, {
        message: 'Full-text ingestion start failed',
        context: ({ doi, paper_url, pdf_url, local_pdf_path }) => ({ doi, paper_url, pdf_url, local_pdf_path })
    }, async ({ doi, paper_url, pdf_url, local_pdf_path, parse_mode, ocr_enabled }) => {
        if (!doi && !paper_url && !pdf_url && !local_pdf_path) {
            throw new Error('Provide at least one source: doi, paper_url, pdf_url, or local_pdf_path.');
        }
        // Awaited so a promise-returning implementation is serialized correctly and its
        // rejection reaches the guard; awaiting a plain value is a no-op.
        const job = await researchService.ingestPaperFullText({
            doi,
            paperUrl: paper_url,
            pdfUrl: pdf_url,
            localPdfPath: local_pdf_path,
            parseMode: parse_mode,
            ocrEnabled: ocr_enabled
        });
        return toJsonToolResult(job);
    });

    registerGuardedTool('get_ingestion_status', {
        title: 'Get Full-Text Ingestion Status',
        description: 'Get the status of a previously started ingest_paper_fulltext job.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: false
        },
        inputSchema: {
            job_id: z.string().min(1).describe('Ingestion job id returned by ingest_paper_fulltext.')
        }
    }, {
        message: 'Ingestion status lookup failed',
        context: ({ job_id }) => ({ job_id })
    }, async ({ job_id }) => {
        const job = await researchService.getIngestionStatus(job_id);
        // Copy the job so the summary is not written onto the service's own record.
        const payload = {
            ...job
        };
        if (job.status === 'succeeded') {
            const document = await researchService.getParsedDocument(job.documentId);
            payload.document_summary = {
                documentId: document.documentId,
                title: document.title,
                abstract: document.abstract,
                parser: document.parser,
                sections: document.sections.length,
                references: document.references.length,
                createdAt: document.createdAt
            };
        }
        return toJsonToolResult(payload);
    });

    registerGuardedTool('extract_granular_paper_details', {
        title: 'Extract Granular Paper Details',
        description: 'Extract claims, methods, limitations, datasets, metrics, and section-aware summaries from a parsed document.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: false
        },
        inputSchema: {
            document_id: z.string().min(1),
            sections: z.array(z.string().min(1)).optional(),
            include_references: z.boolean().default(true)
        }
    }, {
        message: 'Granular extraction failed',
        context: ({ document_id }) => ({ document_id })
    }, async ({ document_id, sections, include_references }) => {
        const result = await researchService.extractGranularPaperDetails(document_id, {
            sections,
            includeReferences: include_references
        });
        return toJsonToolResult(result);
    });

    registerGuardedTool('suggest_contextual_citations', {
        title: 'Suggest Context-Aware Citations',
        description: 'Recommend citations from the federated literature graph based on manuscript context.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            manuscript_text: z.string().min(20),
            cursor_context: z.string().optional(),
            style: citationStyleSchema().default('apa'),
            k: z.number().int().min(1).max(30).default(10),
            recency_bias: z.number().min(0).max(1).default(0.5)
        }
    }, {
        message: 'Citation suggestion failed'
    }, async ({ manuscript_text, cursor_context, style, k, recency_bias }) => {
        const result = await researchService.suggestContextualCitations({
            manuscriptText: manuscript_text,
            cursorContext: cursor_context,
            style,
            k,
            recencyBias: recency_bias
        });
        return toJsonToolResult(result);
    });

    registerGuardedTool('build_reference_list', {
        title: 'Build Reference List',
        description: 'Generate CSL-formatted bibliography and BibTeX entries from manuscript context or explicit works.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            style: citationStyleSchema().default('apa'),
            locale: z.string().default('en-US'),
            manuscript_text: z.string().optional(),
            works: z.array(manualWorkSchema).optional()
        }
    }, {
        message: 'Reference list build failed'
    }, async ({ style, locale, manuscript_text, works }) => {
        const hasManuscript = Boolean(manuscript_text) && manuscript_text.trim().length > 0;
        const hasWorks = Boolean(works) && works.length > 0;
        if (!hasManuscript && !hasWorks) {
            throw new Error('Provide either manuscript_text or works.');
        }
        const result = await researchService.buildReferenceList({
            style,
            locale,
            manuscriptText: manuscript_text,
            works: (works ?? []).map((work) => manualWorkToCanonicalWork(work))
        });
        return toJsonToolResult(result);
    });

    registerGuardedTool('validate_manuscript_citations', {
        title: 'Validate Manuscript Citations',
        description: 'Validate inline citations against reference entries and detect missing or uncited references.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: false
        },
        inputSchema: {
            manuscript_text: z.string().min(20),
            style: citationStyleSchema().optional(),
            references: z.array(z.object({
                id: z.string().optional(),
                formatted: z.string().min(1),
                bibtex: z.string().optional()
            }))
        }
    }, {
        message: 'Citation validation failed'
    }, async ({ manuscript_text, references, style }) => {
        const normalizedReferences = references.map(referenceToValidationEntry);
        const result = await researchService.validateManuscriptCitations(manuscript_text, normalizedReferences, {
            style
        });
        return toJsonToolResult(result);
    });

    registerGuardedTool('search_google_scholar_key_words', {
        title: 'Search Google Scholar by Keywords',
        description: 'Search Google Scholar using keywords and return paper metadata.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            query: z.string().min(1).describe('Search query string'),
            num_results: z.number().int().min(1).max(20).default(5).describe('Number of results to return'),
            start: z.number().int().min(0).default(0).describe('Offset for pagination (0, 10, 20, ...)'),
            language: z.string().default(config.scholarLanguage).describe('Google Scholar language code (e.g., en)')
        }
    }, {
        message: 'Keyword search tool failed',
        context: ({ query }) => ({ query })
    }, async ({ query, num_results, start, language }) => {
        const result = await service.searchKeywords({
            query,
            numResults: num_results,
            start,
            language
        });
        return toLegacySearchToolResult(result);
    });

    registerGuardedTool('search_google_scholar_advanced', {
        title: 'Search Google Scholar with Advanced Filters',
        description: 'Search Google Scholar using keyword, author, year-range, phrase, and exclusion filters.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            query: z.string().min(1).describe('General search query'),
            author: z.string().optional().describe('Author filter value'),
            year_range: yearRangeSchema()
                .describe('Year range as [start, end] or { start, end }'),
            exact_phrase: z.string().optional().describe('Exact phrase that must appear in results'),
            exclude_words: z.string().optional().describe('Words that should be excluded from results'),
            title_only: z.boolean().default(false).describe('Restrict search terms to title only'),
            num_results: z.number().int().min(1).max(20).default(5),
            start: z.number().int().min(0).default(0),
            language: z.string().default(config.scholarLanguage)
        }
    }, {
        message: 'Advanced search tool failed',
        // Logged for parity with the keyword search tool's failure log.
        context: (args) => ({ query: args.query })
    }, async (args) => {
        const result = await service.searchAdvanced({
            query: args.query,
            author: args.author,
            yearRange: normalizeYearRange(args.year_range),
            exactPhrase: args.exact_phrase,
            excludeWords: args.exclude_words,
            titleOnly: args.title_only,
            numResults: args.num_results,
            start: args.start,
            language: args.language
        });
        return toLegacySearchToolResult(result);
    });

    registerGuardedTool('get_author_info', {
        title: 'Get Author Info',
        description: 'Retrieve a Google Scholar author profile and top publications by author name.',
        annotations: {
            readOnlyHint: true,
            openWorldHint: true
        },
        inputSchema: {
            author_name: z.string().min(1).describe('Full author name to resolve in Google Scholar'),
            max_publications: z.number().int().min(1).max(20).default(5),
            language: z.string().default(config.scholarLanguage)
        }
    }, {
        message: 'Author info tool failed',
        context: ({ author_name }) => ({ author_name })
    }, async ({ author_name, max_publications, language }) => {
        const author = await service.getAuthorInfo(author_name, max_publications, language);
        // snake_case payload; the "since" metrics are published under fixed *_since_2021 keys.
        const pythonCompatibilityPayload = {
            name: author.authorName,
            affiliation: author.affiliation,
            interests: author.interests,
            citedby: author.metrics.citationsAll ?? 0,
            author_id: author.authorId,
            profile_url: author.profileUrl,
            verified_email: author.verifiedEmail,
            homepage: author.homepageUrl,
            metrics: {
                citations_all: author.metrics.citationsAll,
                citations_since_2021: author.metrics.citationsSince,
                h_index_all: author.metrics.hIndexAll,
                h_index_since_2021: author.metrics.hIndexSince,
                i10_index_all: author.metrics.i10IndexAll,
                i10_index_since_2021: author.metrics.i10IndexSince
            },
            publications: author.publications.map((publication) => ({
                title: publication.title,
                year: publication.year,
                citations: publication.citations,
                authors: publication.authors,
                venue: publication.venue,
                url: publication.detailUrl
            }))
        };
        return toJsonToolResult(pythonCompatibilityPayload);
    });

    return server;
};
@@ -0,0 +1,8 @@
1
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
2
+ import { createScholarMcpServer } from './create-scholar-mcp-server.js';
3
/**
 * Builds the ScholarMCP server, connects it to a new stdio transport, and
 * logs once the connection call has resolved.
 *
 * @param {object} config - Forwarded unchanged to `createScholarMcpServer`.
 * @param {object} service - Forwarded unchanged to `createScholarMcpServer`.
 * @param {object} researchService - Forwarded unchanged to `createScholarMcpServer`.
 * @param {object} logger - Forwarded to `createScholarMcpServer`; `info` is called here.
 * @returns {Promise<void>} Resolves after the readiness message has been logged.
 */
export const startStdioServer = async (config, service, researchService, logger) => {
    const mcpServer = createScholarMcpServer(config, service, researchService, logger);
    await mcpServer.connect(new StdioServerTransport());
    logger.info('ScholarMCP stdio transport ready');
};