scholar-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import { basename, resolve } from 'node:path';
3
+ import { tmpdir } from 'node:os';
4
+ import { randomUUID } from 'node:crypto';
5
+ import { PDFParse } from 'pdf-parse';
6
+ import { IngestionError, DocumentNotFoundError, JobNotFoundError } from './errors.js';
7
+ import { makeStableId, nowIso, normalizeWhitespace, parseYear } from './utils.js';
8
// Matches a DOI such as "10.1234/abc-def"; case-insensitive so scraped text
// with upper- or lower-case suffixes is captured either way.
const DOI_REGEX = /10\.\d{4,9}\/[\-._;()/:A-Z0-9]+/i;
// Resolve a user-supplied path against the current working directory.
// `path.resolve` already returns absolute inputs as-is (on every platform,
// including Windows drive paths) and normalizes the result, so no manual
// leading-slash check is needed.
const toAbsolutePath = (value) => resolve(value);
// Split raw text into trimmed lines, accepting both LF and CRLF endings.
const splitLines = (text) => text.split(/\r?\n/).map((line) => line.trim());
// Heuristic: does this line look like a common scholarly-paper section
// heading ("Abstract", "Methods", "References", ...)?
const isLikelyHeading = (line) => /^(abstract|introduction|background|related work|method(?:s)?|materials|results|discussion|conclusion|limitations|references)\b/i.test(line.trim());
12
// Partition extracted text into heading-delimited sections. Lines that look
// like scholarly headings start a new section; everything else accumulates
// into the body of the current one (defaulting to "Body" before the first
// recognized heading). Sections whose normalized text is empty are dropped.
const splitIntoSections = (text) => {
  const lines = splitLines(text).filter((line) => line.length > 0);
  if (lines.length === 0) {
    return [];
  }
  const sections = [];
  let heading = 'Body';
  let buffer = [];
  // Flush the buffered lines as one section, skipping empty bodies.
  const flush = () => {
    const body = normalizeWhitespace(buffer.join(' '));
    if (body.length > 0) {
      sections.push({
        id: makeStableId([heading, body.slice(0, 120)], 'section'),
        heading,
        text: body,
        pageStart: null,
        pageEnd: null
      });
    }
  };
  for (const line of lines) {
    if (isLikelyHeading(line)) {
      // A new heading closes the current section (if it had any content)
      // and becomes the heading for subsequent lines.
      if (buffer.length > 0) {
        flush();
        buffer = [];
      }
      heading = line;
    } else {
      buffer.push(line);
    }
  }
  if (buffer.length > 0) {
    flush();
  }
  return sections;
};
51
// Pull candidate bibliography entries out of raw text. Prefers everything
// after an explicit "References" heading; otherwise falls back to the last
// 120 lines, where references usually live. Keeps at most 60 lines long
// enough (> 30 chars) to plausibly be citations.
const extractReferences = (text) => {
  const lines = splitLines(text);
  const headingAt = lines.findIndex((line) => /^references$/i.test(line));
  const candidates = headingAt >= 0 ? lines.slice(headingAt + 1) : lines.slice(-120);
  const entries = [];
  for (const line of candidates) {
    if (line.length <= 30) {
      continue; // too short to be a citation line
    }
    if (entries.length === 60) {
      break; // cap the number of extracted references
    }
    entries.push({
      rawText: line,
      doi: line.match(DOI_REGEX)?.[0]?.toLowerCase() ?? null,
      title: null,
      year: parseYear(line),
      authors: []
    });
  }
  return entries;
};
70
// Heuristic title/abstract recovery for plain-text extraction: the first
// non-empty line is taken as the title, and the abstract is a handful of
// lines starting at an "Abstract" marker (standalone word or prefix).
const extractTitleAndAbstract = (text) => {
  const lines = splitLines(text).filter((line) => line.length > 0);
  const title = lines[0] ?? null;
  const isAbstractMarker = (line) => /^abstract$/i.test(line) || /^abstract[:\s]/i.test(line);
  const markerAt = lines.findIndex(isAbstractMarker);
  // Includes the marker line itself plus up to five following lines.
  const abstract = markerAt >= 0
    ? normalizeWhitespace(lines.slice(markerAt, markerAt + 6).join(' '))
    : null;
  return { title, abstract };
};
83
// Convert a GROBID TEI-XML response into the internal parse-result shape
// using lightweight regex scraping (no XML parser dependency): main title,
// tag-stripped body text, heuristic sections, and up to 120 <biblStruct>
// reference entries.
const parseGrobidXml = (xml) => {
  // Drop all XML tags from a fragment and collapse the whitespace left behind.
  const stripTags = (fragment) => normalizeWhitespace(fragment.replace(/<[^>]+>/g, ' '));
  const rawTitle = xml.match(/<title[^>]*type="main"[^>]*>([\s\S]*?)<\/title>/i)?.[1] ?? null;
  const bodyXml = xml.match(/<body>([\s\S]*?)<\/body>/i)?.[1] ?? '';
  const text = stripTags(bodyXml);
  const references = [...xml.matchAll(/<biblStruct[\s\S]*?<\/biblStruct>/gim)]
    .slice(0, 120)
    .map(([entry]) => {
      const raw = stripTags(entry);
      const refTitle = entry.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1] ?? null;
      return {
        rawText: raw,
        doi: entry.match(DOI_REGEX)?.[0]?.toLowerCase() ?? null,
        title: refTitle ? stripTags(refTitle) : null,
        year: parseYear(raw),
        authors: []
      };
    });
  return {
    parserName: 'grobid',
    parserVersion: 'service',
    // Lower confidence when no body text could be extracted at all.
    confidence: text.length > 0 ? 0.85 : 0.65,
    title: rawTitle ? stripTags(rawTitle) : null,
    abstract: null,
    fullText: text,
    sections: splitIntoSections(text),
    references
  };
};
114
/**
 * Coordinates PDF ingestion jobs: resolves a source (local file, DOI, or
 * remote URL), downloads the PDF when needed, runs a parser cascade
 * (GROBID -> Python sidecar -> pdf-parse), and stores the resulting
 * structured document in memory.
 *
 * Jobs and documents live in in-memory Maps keyed by stable IDs, so state
 * does not survive a process restart.
 */
export class IngestionService {
  config;
  logger;
  literatureService;
  // jobId -> job record (status, source, parser info, provenance).
  jobs = new Map();
  // documentId -> fully parsed document.
  documents = new Map();
  constructor(config, logger, literatureService) {
    this.config = config;
    this.logger = logger;
    this.literatureService = literatureService;
  }
  /**
   * Creates a job record and starts background processing immediately.
   * Returns the queued job; poll getJob() to observe progress.
   *
   * @param {object} input - source descriptor: optional doi, paperUrl,
   *   pdfUrl, localPdfPath, and parseMode ('auto'|'grobid'|'sidecar'|other).
   * @returns {object} the newly created job record (status 'queued').
   */
  enqueueIngestion(input) {
    // documentId is deterministic for a given source so repeated ingestions
    // of the same paper converge on one document; jobId includes a UUID so
    // every enqueue gets its own job.
    const sourceSeed = [input.doi ?? null, input.paperUrl ?? null, input.pdfUrl ?? null, input.localPdfPath ?? null];
    const documentId = makeStableId(sourceSeed, 'doc');
    const jobId = makeStableId([...sourceSeed, randomUUID()], 'job');
    const job = {
      jobId,
      documentId,
      status: 'queued',
      createdAt: nowIso(),
      startedAt: null,
      completedAt: null,
      source: {
        doi: input.doi ?? null,
        paperUrl: input.paperUrl ?? null,
        pdfUrl: input.pdfUrl ?? null,
        localPdfPath: input.localPdfPath ?? null
      },
      parserName: null,
      parserConfidence: null,
      licenseState: 'unknown',
      error: null,
      warnings: [],
      provenance: []
    };
    this.jobs.set(jobId, job);
    // Fire-and-forget: processing runs in the background, and failures are
    // recorded on the job record rather than thrown to the caller.
    void this.processJob(jobId, input).catch((error) => {
      const current = this.jobs.get(jobId);
      if (!current) {
        return;
      }
      current.status = 'failed';
      current.completedAt = nowIso();
      current.error = error instanceof Error ? error.message : String(error);
      this.jobs.set(jobId, current);
    });
    return job;
  }
  /**
   * Looks up a job by ID.
   * @throws {JobNotFoundError} when no such job exists.
   */
  getJob(jobId) {
    const job = this.jobs.get(jobId);
    if (!job) {
      throw new JobNotFoundError(jobId);
    }
    return job;
  }
  /**
   * Looks up a parsed document by ID.
   * @throws {DocumentNotFoundError} when no such document exists.
   */
  getDocument(documentId) {
    const document = this.documents.get(documentId);
    if (!document) {
      throw new DocumentNotFoundError(documentId);
    }
    return document;
  }
  /**
   * Background worker for one job: resolves the source, parses the PDF,
   * stores the document, and marks the job succeeded. Any throw here is
   * caught by enqueueIngestion's handler, which marks the job failed.
   */
  async processJob(jobId, input) {
    const job = this.jobs.get(jobId);
    if (!job) {
      return;
    }
    job.status = 'running';
    job.startedAt = nowIso();
    this.jobs.set(jobId, job);
    const resolved = await this.resolveSource(input);
    // Record the fully resolved source on the job for observability.
    job.source = {
      doi: resolved.doi,
      paperUrl: resolved.paperUrl,
      pdfUrl: resolved.pdfUrl,
      localPdfPath: resolved.localPdfPath
    };
    job.licenseState = resolved.licenseState;
    const parserMode = input.parseMode ?? 'auto';
    const parseResult = await this.parseSourcePdf(resolved, parserMode);
    const document = {
      documentId: job.documentId,
      source: {
        doi: resolved.doi,
        url: resolved.paperUrl ?? resolved.pdfUrl,
        localPath: resolved.localPdfPath
      },
      parser: {
        parserName: parseResult.parserName,
        parserVersion: parseResult.parserVersion,
        confidence: parseResult.confidence
      },
      title: parseResult.title,
      abstract: parseResult.abstract,
      fullText: parseResult.fullText,
      sections: parseResult.sections,
      references: parseResult.references,
      tables: [],
      equations: [],
      figures: [],
      createdAt: nowIso(),
      provenance: [
        {
          // Sources resolved through the literature service carry OpenAlex
          // provenance; everything else is attributed to scraping.
          provider: resolved.provenanceWork ? 'openalex' : 'scholar_scrape',
          sourceUrl: resolved.paperUrl ?? resolved.pdfUrl,
          fetchedAt: nowIso(),
          confidence: parseResult.confidence,
          notes: `${parseResult.parserName}:${parseResult.parserVersion}`
        }
      ]
    };
    this.documents.set(document.documentId, document);
    job.status = 'succeeded';
    job.completedAt = nowIso();
    job.parserName = parseResult.parserName;
    job.parserConfidence = parseResult.confidence;
    job.provenance = document.provenance;
    this.jobs.set(jobId, job);
  }
  /**
   * Turns user input into a concrete PDF source. Local paths take priority
   * (subject to configuration); otherwise a remote PDF URL is resolved from
   * the explicit pdfUrl, an OpenAlex lookup by DOI, or a paperUrl that
   * already points at a .pdf file.
   *
   * @throws {IngestionError} when ingestion is disabled by configuration or
   *   no downloadable PDF URL can be determined.
   */
  async resolveSource(input) {
    if (input.localPdfPath) {
      if (!this.config.researchAllowLocalPdfs) {
        throw new IngestionError('Local PDF ingestion is disabled by configuration.');
      }
      const absolutePath = toAbsolutePath(input.localPdfPath);
      // Fail fast if the file is missing or unreadable.
      await fs.access(absolutePath);
      return {
        doi: input.doi ?? null,
        paperUrl: input.paperUrl ?? null,
        pdfUrl: input.pdfUrl ?? null,
        localPdfPath: absolutePath,
        licenseState: 'user_provided',
        provenanceWork: null
      };
    }
    if (!this.config.researchAllowRemotePdfs) {
      throw new IngestionError('Remote PDF ingestion is disabled by configuration.');
    }
    let resolvedWork = null;
    if (input.doi) {
      resolvedWork = await this.literatureService.resolveByDoi(input.doi);
    }
    const resolvedPdfUrl = input.pdfUrl ??
      resolvedWork?.openAccess.pdfUrl ??
      (input.paperUrl?.toLowerCase().endsWith('.pdf') ? input.paperUrl : null);
    if (!resolvedPdfUrl) {
      throw new IngestionError('Unable to resolve a downloadable PDF URL from input.');
    }
    return {
      doi: input.doi ?? resolvedWork?.doi ?? null,
      paperUrl: input.paperUrl ?? resolvedWork?.url ?? null,
      pdfUrl: resolvedPdfUrl,
      localPdfPath: null,
      licenseState: 'open_access',
      provenanceWork: resolvedWork
    };
  }
  /**
   * Obtains the PDF file (local or downloaded to a temp path) and tries
   * each parser in the order given by resolveParserOrder, falling through
   * on failure. The temp file is always cleaned up.
   *
   * @throws {IngestionError} when every parser strategy fails.
   */
  async parseSourcePdf(source, parseMode) {
    const { filePath, cleanup } = await this.obtainPdfFile(source);
    try {
      const modes = this.resolveParserOrder(parseMode);
      for (const mode of modes) {
        try {
          switch (mode) {
            case 'grobid': {
              if (!this.config.researchGrobidUrl) {
                // Not configured: skip to the next mode without logging.
                continue;
              }
              return await this.parseWithGrobid(filePath);
            }
            case 'sidecar': {
              if (!this.config.researchPythonSidecarUrl) {
                continue;
              }
              return await this.parseWithSidecar(filePath);
            }
            case 'simple': {
              return await this.parseWithSimplePdf(filePath);
            }
          }
        }
        catch (error) {
          // A parser failure is not fatal while fallbacks remain.
          this.logger.warn('Parser mode failed, trying fallback', {
            mode,
            filePath,
            error: error instanceof Error ? error.message : String(error)
          });
          continue;
        }
      }
      throw new IngestionError('All parser strategies failed for this PDF source.');
    }
    finally {
      await cleanup();
    }
  }
  /**
   * Maps a requested parse mode to an ordered list of strategies to try.
   * 'auto' and 'grobid' share the same preference order; any unrecognized
   * mode falls back to the simple text extractor only.
   */
  resolveParserOrder(parseMode) {
    if (parseMode === 'auto') {
      return ['grobid', 'sidecar', 'simple'];
    }
    if (parseMode === 'grobid') {
      return ['grobid', 'sidecar', 'simple'];
    }
    if (parseMode === 'sidecar') {
      return ['sidecar', 'grobid', 'simple'];
    }
    return ['simple'];
  }
  /**
   * Returns a local file path for the PDF plus a cleanup callback.
   * Local sources are used in place (no-op cleanup); remote sources are
   * downloaded into the OS temp directory and removed by cleanup.
   *
   * @throws {IngestionError} on a missing URL or a non-2xx download.
   */
  async obtainPdfFile(source) {
    if (source.localPdfPath) {
      return {
        filePath: source.localPdfPath,
        cleanup: async () => undefined
      };
    }
    if (!source.pdfUrl) {
      throw new IngestionError('Missing PDF URL after source resolution.');
    }
    const response = await fetch(source.pdfUrl, {
      headers: {
        accept: 'application/pdf,*/*'
      }
    });
    if (!response.ok) {
      throw new IngestionError(`Failed to download PDF. HTTP ${response.status}`);
    }
    const bytes = await response.arrayBuffer();
    const tempPath = resolve(tmpdir(), `scholar-mcp-${Date.now()}-${randomUUID()}.pdf`);
    await fs.writeFile(tempPath, Buffer.from(bytes));
    return {
      filePath: tempPath,
      cleanup: async () => {
        // Best effort: ignore unlink failures (e.g. already removed).
        await fs.unlink(tempPath).catch(() => undefined);
      }
    };
  }
  /**
   * Last-resort parser: plain text extraction via pdf-parse with heuristic
   * recovery of title, abstract, sections, and references.
   *
   * @throws {IngestionError} when no text could be extracted.
   */
  async parseWithSimplePdf(filePath) {
    const buffer = await fs.readFile(filePath);
    const parser = new PDFParse({ data: buffer });
    let parsed;
    try {
      parsed = await parser.getText();
    } finally {
      // Always release parser resources, even when extraction throws
      // (previously destroy() was skipped on error, leaking the handle).
      await parser.destroy();
    }
    const rawText = parsed.text ?? '';
    const text = normalizeWhitespace(rawText);
    if (!text) {
      throw new IngestionError('Simple PDF parser returned empty text.');
    }
    // Heuristics run on the raw (line-structured) text, not the
    // whitespace-normalized version, because they rely on line breaks.
    const sections = splitIntoSections(rawText);
    const references = extractReferences(rawText);
    const { title, abstract } = extractTitleAndAbstract(rawText);
    return {
      parserName: 'pdf-parse',
      parserVersion: '2.x',
      confidence: 0.62,
      title,
      abstract,
      fullText: text,
      sections,
      references
    };
  }
  /**
   * Parses via a GROBID service (multipart upload of the PDF to
   * /api/processFulltextDocument, TEI-XML response).
   *
   * @throws {IngestionError} when GROBID is unconfigured, returns a non-2xx
   *   status, or yields no extractable body text.
   */
  async parseWithGrobid(filePath) {
    if (!this.config.researchGrobidUrl) {
      throw new IngestionError('GROBID URL is not configured.');
    }
    // NOTE(review): new URL('/path', base) drops any path prefix on the
    // configured base URL — assumes researchGrobidUrl is an origin.
    const url = new URL('/api/processFulltextDocument', this.config.researchGrobidUrl);
    const buffer = await fs.readFile(filePath);
    const formData = new FormData();
    formData.set('input', new Blob([buffer], { type: 'application/pdf' }), basename(filePath));
    formData.set('consolidateHeader', '1');
    formData.set('consolidateCitations', '1');
    const response = await fetch(url, {
      method: 'POST',
      body: formData
    });
    if (!response.ok) {
      throw new IngestionError(`GROBID returned HTTP ${response.status}`);
    }
    const xml = await response.text();
    const parsed = parseGrobidXml(xml);
    if (!parsed.fullText) {
      throw new IngestionError('GROBID response did not include extractable body text.');
    }
    return parsed;
  }
  /**
   * Parses via a Python sidecar service (JSON POST of the file path to
   * /parse). The sidecar's fields are used where present, with local
   * heuristics filling in missing sections/references.
   *
   * @throws {IngestionError} when the sidecar is unconfigured, returns a
   *   non-2xx status, or yields empty full text.
   */
  async parseWithSidecar(filePath) {
    if (!this.config.researchPythonSidecarUrl) {
      throw new IngestionError('Python sidecar URL is not configured.');
    }
    const url = new URL('/parse', this.config.researchPythonSidecarUrl);
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'content-type': 'application/json'
      },
      body: JSON.stringify({
        filePath
      })
    });
    if (!response.ok) {
      throw new IngestionError(`Python sidecar returned HTTP ${response.status}`);
    }
    const payload = (await response.json());
    const fullText = normalizeWhitespace(payload.fullText ?? '');
    if (!fullText) {
      throw new IngestionError('Python sidecar returned empty full text.');
    }
    return {
      parserName: payload.parserName ?? 'python-sidecar',
      parserVersion: payload.parserVersion ?? 'unknown',
      confidence: payload.confidence ?? 0.74,
      title: payload.title ?? null,
      abstract: payload.abstract ?? null,
      fullText,
      sections: payload.sections ?? splitIntoSections(fullText),
      references: payload.references ?? extractReferences(fullText)
    };
  }
}