scholar-mcp 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -21
- package/dist/config.js +0 -2
- package/dist/mcp/create-scholar-mcp-server.js +2 -2
- package/dist/research/ingestion-service.js +97 -40
- package/dist/research/literature-service.js +44 -1
- package/dist/research/providers/openalex-client.js +51 -37
- package/dist/research/providers/semantic-scholar-client.js +3 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -24,7 +24,7 @@ Use this if you want Claude Code, Codex, or any MCP-compatible coding agent to r
|
|
|
24
24
|
|
|
25
25
|
- Transports: `stdio` (recommended) and HTTP (`/mcp`)
|
|
26
26
|
- Research providers: Google Scholar, OpenAlex, Crossref, Semantic Scholar
|
|
27
|
-
- Full-text parsing pipeline: `grobid ->
|
|
27
|
+
- Full-text parsing pipeline: `grobid -> simple`
|
|
28
28
|
- Tooling for thesis/paper workflows: ingestion, extraction, references, validation
|
|
29
29
|
|
|
30
30
|
## Quick Start
|
|
@@ -158,24 +158,6 @@ RESEARCH_ALLOW_LOCAL_PDFS = "true"
|
|
|
158
158
|
- "Given this draft section, suggest citations in IEEE style and generate BibTeX."
|
|
159
159
|
- "Validate my manuscript citations against this reference list and show missing citations."
|
|
160
160
|
|
|
161
|
-
## Optional Python Sidecar (better parsing fallback)
|
|
162
|
-
|
|
163
|
-
Run sidecar:
|
|
164
|
-
|
|
165
|
-
```bash
|
|
166
|
-
cd ../../services/python-sidecar
|
|
167
|
-
python -m venv .venv
|
|
168
|
-
source .venv/bin/activate
|
|
169
|
-
pip install -r requirements.txt
|
|
170
|
-
uvicorn app:app --host 127.0.0.1 --port 8090
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
Then set:
|
|
174
|
-
|
|
175
|
-
```bash
|
|
176
|
-
RESEARCH_PYTHON_SIDECAR_URL=http://127.0.0.1:8090
|
|
177
|
-
```
|
|
178
|
-
|
|
179
161
|
## Configuration
|
|
180
162
|
|
|
181
163
|
Most users only need these:
|
|
@@ -186,7 +168,6 @@ Most users only need these:
|
|
|
186
168
|
- `RESEARCH_ALLOW_LOCAL_PDFS`: allow local PDF ingestion (default: `true`)
|
|
187
169
|
- `SCHOLAR_MCP_API_KEY`: optional bearer token for HTTP mode
|
|
188
170
|
- `RESEARCH_GROBID_URL`: optional GROBID endpoint
|
|
189
|
-
- `RESEARCH_PYTHON_SIDECAR_URL`: optional sidecar endpoint
|
|
190
171
|
|
|
191
172
|
The CLI loads `.env` from the current working directory automatically at startup.
|
|
192
173
|
|
|
@@ -197,7 +178,7 @@ Advanced options exist in `src/config.ts` for timeouts, retries, HTTP session ca
|
|
|
197
178
|
- `Invalid environment variable format` in `claude mcp add`:
|
|
198
179
|
- Add `--` before the MCP server name (see Claude setup command above).
|
|
199
180
|
- `Unable to resolve a downloadable PDF URL from input` on DOI ingestion:
|
|
200
|
-
- The DOI landing page may not expose
|
|
181
|
+
- The DOI and landing page may not expose an accessible PDF URL.
|
|
201
182
|
- Retry with `pdf_url` (direct PDF) or `local_pdf_path`.
|
|
202
183
|
- Too many Scholar failures or throttling:
|
|
203
184
|
- Increase `SCHOLAR_REQUEST_DELAY_MS` (for example `500` to `1000`).
|
package/dist/config.js
CHANGED
|
@@ -56,7 +56,6 @@ const envSchema = z.object({
|
|
|
56
56
|
RESEARCH_ALLOW_REMOTE_PDFS: booleanFromEnv(true),
|
|
57
57
|
RESEARCH_ALLOW_LOCAL_PDFS: booleanFromEnv(true),
|
|
58
58
|
RESEARCH_GROBID_URL: z.string().url().optional(),
|
|
59
|
-
RESEARCH_PYTHON_SIDECAR_URL: z.string().url().optional(),
|
|
60
59
|
RESEARCH_SEMANTIC_ENGINE: z.enum(['cloud-llm', 'none']).default('cloud-llm'),
|
|
61
60
|
RESEARCH_CLOUD_MODEL: z.string().default('gpt-4.1-mini'),
|
|
62
61
|
RESEARCH_GRAPH_CACHE_TTL_MS: numberFromEnv(5 * 60 * 1000, 0, 24 * 60 * 60 * 1000),
|
|
@@ -120,7 +119,6 @@ export const parseConfig = (overrides) => {
|
|
|
120
119
|
researchAllowRemotePdfs: env.RESEARCH_ALLOW_REMOTE_PDFS,
|
|
121
120
|
researchAllowLocalPdfs: env.RESEARCH_ALLOW_LOCAL_PDFS,
|
|
122
121
|
researchGrobidUrl: env.RESEARCH_GROBID_URL,
|
|
123
|
-
researchPythonSidecarUrl: env.RESEARCH_PYTHON_SIDECAR_URL,
|
|
124
122
|
researchSemanticEngine: env.RESEARCH_SEMANTIC_ENGINE,
|
|
125
123
|
researchCloudModel: env.RESEARCH_CLOUD_MODEL,
|
|
126
124
|
researchGraphCacheTtlMs: env.RESEARCH_GRAPH_CACHE_TTL_MS,
|
package/dist/mcp/create-scholar-mcp-server.js
CHANGED
|
@@ -124,7 +124,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
|
|
|
124
124
|
});
|
|
125
125
|
server.registerTool('ingest_paper_fulltext', {
|
|
126
126
|
title: 'Ingest Full-Text Paper',
|
|
127
|
-
description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/
|
|
127
|
+
description: 'Resolve and ingest a full-text PDF from DOI/URL/local file, then parse into a structured document using GROBID/simple fallback pipeline.',
|
|
128
128
|
annotations: {
|
|
129
129
|
readOnlyHint: false,
|
|
130
130
|
openWorldHint: true
|
|
@@ -134,7 +134,7 @@ export const createScholarMcpServer = (config, service, researchService, logger)
|
|
|
134
134
|
paper_url: z.string().url().optional().describe('Landing page URL for the paper.'),
|
|
135
135
|
pdf_url: z.string().url().optional().describe('Direct PDF URL.'),
|
|
136
136
|
local_pdf_path: z.string().optional().describe('Local absolute or workspace-relative PDF path.'),
|
|
137
|
-
parse_mode: z.enum(['auto', 'grobid', '
|
|
137
|
+
parse_mode: z.enum(['auto', 'grobid', 'simple']).default('auto'),
|
|
138
138
|
ocr_enabled: z.boolean().default(true).describe('Reserved for OCR-capable parser modes.')
|
|
139
139
|
}
|
|
140
140
|
}, async ({ doi, paper_url, pdf_url, local_pdf_path, parse_mode, ocr_enabled }) => {
|
package/dist/research/ingestion-service.js
CHANGED
|
@@ -6,6 +6,7 @@ import { PDFParse } from 'pdf-parse';
|
|
|
6
6
|
import { IngestionError, DocumentNotFoundError, JobNotFoundError } from './errors.js';
|
|
7
7
|
import { makeStableId, nowIso, normalizeWhitespace, parseYear } from './utils.js';
|
|
8
8
|
const DOI_REGEX = /10\.\d{4,9}\/[\-._;()/:A-Z0-9]+/i;
|
|
9
|
+
const PDF_LINK_REGEX = /href=["']([^"']+\.pdf(?:\?[^"']*)?)["']/i;
|
|
9
10
|
const toAbsolutePath = (value) => (value.startsWith('/') ? value : resolve(process.cwd(), value));
|
|
10
11
|
const splitLines = (text) => text.split(/\r?\n/).map((line) => line.trim());
|
|
11
12
|
const isLikelyHeading = (line) => /^(abstract|introduction|background|related work|method(?:s)?|materials|results|discussion|conclusion|limitations|references)\b/i.test(line.trim());
|
|
@@ -111,6 +112,14 @@ const parseGrobidXml = (xml) => {
|
|
|
111
112
|
references
|
|
112
113
|
};
|
|
113
114
|
};
|
|
115
|
+
const resolveUrlCandidate = (candidate, baseUrl) => {
|
|
116
|
+
try {
|
|
117
|
+
return new URL(candidate, baseUrl).toString();
|
|
118
|
+
}
|
|
119
|
+
catch {
|
|
120
|
+
return null;
|
|
121
|
+
}
|
|
122
|
+
};
|
|
114
123
|
export class IngestionService {
|
|
115
124
|
config;
|
|
116
125
|
logger;
|
|
@@ -253,9 +262,13 @@ export class IngestionService {
|
|
|
253
262
|
if (input.doi) {
|
|
254
263
|
resolvedWork = await this.literatureService.resolveByDoi(input.doi);
|
|
255
264
|
}
|
|
265
|
+
const paperUrlCandidate = input.paperUrl ?? resolvedWork?.url ?? null;
|
|
266
|
+
const paperUrlPdfCandidate = paperUrlCandidate?.toLowerCase().endsWith('.pdf') ? paperUrlCandidate : null;
|
|
267
|
+
const discoveredPdfFromLanding = await this.resolvePdfUrlFromLandingPages([paperUrlCandidate, resolvedWork?.url]);
|
|
256
268
|
const resolvedPdfUrl = input.pdfUrl ??
|
|
257
269
|
resolvedWork?.openAccess.pdfUrl ??
|
|
258
|
-
|
|
270
|
+
paperUrlPdfCandidate ??
|
|
271
|
+
discoveredPdfFromLanding;
|
|
259
272
|
if (!resolvedPdfUrl) {
|
|
260
273
|
throw new IngestionError('Unable to resolve a downloadable PDF URL from input.');
|
|
261
274
|
}
|
|
@@ -281,12 +294,6 @@ export class IngestionService {
|
|
|
281
294
|
}
|
|
282
295
|
return await this.parseWithGrobid(filePath);
|
|
283
296
|
}
|
|
284
|
-
case 'sidecar': {
|
|
285
|
-
if (!this.config.researchPythonSidecarUrl) {
|
|
286
|
-
continue;
|
|
287
|
-
}
|
|
288
|
-
return await this.parseWithSidecar(filePath);
|
|
289
|
-
}
|
|
290
297
|
case 'simple': {
|
|
291
298
|
return await this.parseWithSimplePdf(filePath);
|
|
292
299
|
}
|
|
@@ -309,13 +316,10 @@ export class IngestionService {
|
|
|
309
316
|
}
|
|
310
317
|
resolveParserOrder(parseMode) {
|
|
311
318
|
if (parseMode === 'auto') {
|
|
312
|
-
return ['grobid', '
|
|
319
|
+
return ['grobid', 'simple'];
|
|
313
320
|
}
|
|
314
321
|
if (parseMode === 'grobid') {
|
|
315
|
-
return ['grobid', '
|
|
316
|
-
}
|
|
317
|
-
if (parseMode === 'sidecar') {
|
|
318
|
-
return ['sidecar', 'grobid', 'simple'];
|
|
322
|
+
return ['grobid', 'simple'];
|
|
319
323
|
}
|
|
320
324
|
return ['simple'];
|
|
321
325
|
}
|
|
@@ -331,15 +335,22 @@ export class IngestionService {
|
|
|
331
335
|
}
|
|
332
336
|
const response = await fetch(source.pdfUrl, {
|
|
333
337
|
headers: {
|
|
334
|
-
accept: 'application/pdf,*/*'
|
|
338
|
+
accept: 'application/pdf,*/*',
|
|
339
|
+
'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
|
|
335
340
|
}
|
|
336
341
|
});
|
|
337
342
|
if (!response.ok) {
|
|
338
343
|
throw new IngestionError(`Failed to download PDF. HTTP ${response.status}`);
|
|
339
344
|
}
|
|
340
345
|
const bytes = await response.arrayBuffer();
|
|
346
|
+
const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
|
|
347
|
+
const buffer = Buffer.from(bytes);
|
|
348
|
+
const looksLikePdf = buffer.length >= 4 && buffer.subarray(0, 4).toString('utf8') === '%PDF';
|
|
349
|
+
if (!contentType.includes('application/pdf') && !looksLikePdf) {
|
|
350
|
+
throw new IngestionError(`Downloaded content is not a PDF (content-type: ${contentType || 'unknown'}).`);
|
|
351
|
+
}
|
|
341
352
|
const tempPath = resolve(tmpdir(), `scholar-mcp-${Date.now()}-${randomUUID()}.pdf`);
|
|
342
|
-
await fs.writeFile(tempPath,
|
|
353
|
+
await fs.writeFile(tempPath, buffer);
|
|
343
354
|
return {
|
|
344
355
|
filePath: tempPath,
|
|
345
356
|
cleanup: async () => {
|
|
@@ -394,37 +405,83 @@ export class IngestionService {
|
|
|
394
405
|
}
|
|
395
406
|
return parsed;
|
|
396
407
|
}
|
|
397
|
-
async
|
|
398
|
-
|
|
399
|
-
|
|
408
|
+
async resolvePdfUrlFromLandingPages(urls) {
|
|
409
|
+
const seen = new Set();
|
|
410
|
+
for (const candidate of urls) {
|
|
411
|
+
if (!candidate) {
|
|
412
|
+
continue;
|
|
413
|
+
}
|
|
414
|
+
const normalized = candidate.trim();
|
|
415
|
+
if (!normalized || seen.has(normalized)) {
|
|
416
|
+
continue;
|
|
417
|
+
}
|
|
418
|
+
seen.add(normalized);
|
|
419
|
+
try {
|
|
420
|
+
const discovered = await this.resolvePdfUrlFromLandingPage(normalized);
|
|
421
|
+
if (discovered) {
|
|
422
|
+
return discovered;
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
catch (error) {
|
|
426
|
+
this.logger.debug('Landing page PDF discovery failed', {
|
|
427
|
+
paperUrl: normalized,
|
|
428
|
+
error: error instanceof Error ? error.message : String(error)
|
|
429
|
+
});
|
|
430
|
+
}
|
|
400
431
|
}
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
432
|
+
return null;
|
|
433
|
+
}
|
|
434
|
+
async resolvePdfUrlFromLandingPage(paperUrl) {
|
|
435
|
+
const response = await fetch(paperUrl, {
|
|
404
436
|
headers: {
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
filePath
|
|
409
|
-
})
|
|
437
|
+
accept: 'text/html,application/pdf,*/*',
|
|
438
|
+
'user-agent': 'ScholarMCP/1.0 (+https://github.com/lstudlo/ScholarMCP)'
|
|
439
|
+
}
|
|
410
440
|
});
|
|
411
441
|
if (!response.ok) {
|
|
412
|
-
|
|
442
|
+
return null;
|
|
413
443
|
}
|
|
414
|
-
const
|
|
415
|
-
const
|
|
416
|
-
if (
|
|
417
|
-
|
|
444
|
+
const finalUrl = response.url || paperUrl;
|
|
445
|
+
const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
|
|
446
|
+
if (contentType.includes('application/pdf')) {
|
|
447
|
+
return finalUrl;
|
|
418
448
|
}
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
449
|
+
const html = await response.text();
|
|
450
|
+
if (!html) {
|
|
451
|
+
return null;
|
|
452
|
+
}
|
|
453
|
+
const metaPatterns = [
|
|
454
|
+
/<meta[^>]+name=["']citation_pdf_url["'][^>]+content=["']([^"']+)["'][^>]*>/i,
|
|
455
|
+
/<meta[^>]+content=["']([^"']+)["'][^>]+name=["']citation_pdf_url["'][^>]*>/i,
|
|
456
|
+
/<meta[^>]+property=["']og:pdf["'][^>]+content=["']([^"']+)["'][^>]*>/i,
|
|
457
|
+
/<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:pdf["'][^>]*>/i
|
|
458
|
+
];
|
|
459
|
+
for (const pattern of metaPatterns) {
|
|
460
|
+
const match = html.match(pattern);
|
|
461
|
+
if (match?.[1]) {
|
|
462
|
+
const resolved = resolveUrlCandidate(match[1], finalUrl);
|
|
463
|
+
if (resolved) {
|
|
464
|
+
return resolved;
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
const linkPatterns = [
|
|
469
|
+
/<link[^>]+type=["']application\/pdf["'][^>]+href=["']([^"']+)["'][^>]*>/i,
|
|
470
|
+
/<link[^>]+href=["']([^"']+)["'][^>]+type=["']application\/pdf["'][^>]*>/i
|
|
471
|
+
];
|
|
472
|
+
for (const pattern of linkPatterns) {
|
|
473
|
+
const match = html.match(pattern);
|
|
474
|
+
if (match?.[1]) {
|
|
475
|
+
const resolved = resolveUrlCandidate(match[1], finalUrl);
|
|
476
|
+
if (resolved) {
|
|
477
|
+
return resolved;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
const anchorMatch = html.match(PDF_LINK_REGEX);
|
|
482
|
+
if (anchorMatch?.[1]) {
|
|
483
|
+
return resolveUrlCandidate(anchorMatch[1], finalUrl);
|
|
484
|
+
}
|
|
485
|
+
return null;
|
|
429
486
|
}
|
|
430
487
|
}
|
package/dist/research/literature-service.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { normalizeDoi, normalizeWhitespace, parseYear, tokenizeForRanking } from './utils.js';
|
|
2
2
|
import { ResearchHttpClient } from './http-client.js';
|
|
3
|
+
import { ResearchProviderError } from './errors.js';
|
|
3
4
|
import { OpenAlexClient } from './providers/openalex-client.js';
|
|
4
5
|
import { CrossrefClient } from './providers/crossref-client.js';
|
|
5
6
|
import { SemanticScholarClient } from './providers/semantic-scholar-client.js';
|
|
@@ -292,9 +293,51 @@ export class LiteratureService {
|
|
|
292
293
|
if (!normalized) {
|
|
293
294
|
return null;
|
|
294
295
|
}
|
|
296
|
+
try {
|
|
297
|
+
const openAlexExact = await this.openAlexClient.getWorkByDoi(normalized);
|
|
298
|
+
if (openAlexExact) {
|
|
299
|
+
return {
|
|
300
|
+
title: openAlexExact.title,
|
|
301
|
+
abstract: openAlexExact.abstract,
|
|
302
|
+
year: openAlexExact.year,
|
|
303
|
+
venue: openAlexExact.venue,
|
|
304
|
+
doi: openAlexExact.doi,
|
|
305
|
+
url: openAlexExact.url,
|
|
306
|
+
paperId: openAlexExact.providerId,
|
|
307
|
+
citationCount: openAlexExact.citationCount,
|
|
308
|
+
influentialCitationCount: openAlexExact.influentialCitationCount,
|
|
309
|
+
referenceCount: openAlexExact.referenceCount,
|
|
310
|
+
authors: openAlexExact.authors,
|
|
311
|
+
openAccess: {
|
|
312
|
+
isOpenAccess: openAlexExact.openAccess.isOpenAccess,
|
|
313
|
+
pdfUrl: openAlexExact.openAccess.pdfUrl,
|
|
314
|
+
license: openAlexExact.openAccess.license
|
|
315
|
+
},
|
|
316
|
+
externalIds: openAlexExact.externalIds,
|
|
317
|
+
fieldsOfStudy: openAlexExact.fieldsOfStudy,
|
|
318
|
+
score: openAlexExact.score,
|
|
319
|
+
provenance: [
|
|
320
|
+
{
|
|
321
|
+
provider: 'openalex',
|
|
322
|
+
sourceUrl: openAlexExact.sourceUrl,
|
|
323
|
+
fetchedAt: new Date().toISOString(),
|
|
324
|
+
confidence: providerWeight.openalex
|
|
325
|
+
}
|
|
326
|
+
]
|
|
327
|
+
};
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
catch (error) {
|
|
331
|
+
if (!(error instanceof ResearchProviderError) || error.status !== 404) {
|
|
332
|
+
this.logger.warn('OpenAlex DOI resolve failed', {
|
|
333
|
+
doi: normalized,
|
|
334
|
+
error: error instanceof Error ? error.message : String(error)
|
|
335
|
+
});
|
|
336
|
+
}
|
|
337
|
+
}
|
|
295
338
|
const result = await this.searchGraph({
|
|
296
339
|
query: normalized,
|
|
297
|
-
limit:
|
|
340
|
+
limit: 50,
|
|
298
341
|
sources: ['openalex', 'crossref', 'semantic_scholar']
|
|
299
342
|
});
|
|
300
343
|
return (result.results.find((item) => normalizeDoi(item.doi) === normalized) ??
|
package/dist/research/providers/openalex-client.js
CHANGED
|
@@ -38,43 +38,57 @@ export class OpenAlexClient {
|
|
|
38
38
|
provider: 'openalex',
|
|
39
39
|
url
|
|
40
40
|
});
|
|
41
|
-
return (payload.results ?? []).map((item) =>
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
influentialCitationCount: 0,
|
|
54
|
-
referenceCount: item.referenced_works_count ?? 0,
|
|
55
|
-
authors: (item.authorships ?? [])
|
|
56
|
-
.map((auth) => ({
|
|
57
|
-
name: auth.author?.display_name ?? '',
|
|
58
|
-
authorId: auth.author?.id ?? null
|
|
59
|
-
}))
|
|
60
|
-
.filter((author) => author.name.length > 0),
|
|
61
|
-
openAccess: {
|
|
62
|
-
isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
|
|
63
|
-
pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
|
|
64
|
-
license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
|
|
65
|
-
},
|
|
66
|
-
externalIds: {
|
|
67
|
-
...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
|
|
68
|
-
...(doi ? { doi } : {}),
|
|
69
|
-
...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
|
|
70
|
-
...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
|
|
71
|
-
},
|
|
72
|
-
fieldsOfStudy: (item.concepts ?? [])
|
|
73
|
-
.map((concept) => concept.display_name ?? '')
|
|
74
|
-
.filter((value) => value.length > 0),
|
|
75
|
-
score: item.relevance_score ?? 0.5,
|
|
76
|
-
sourceUrl: url.toString()
|
|
77
|
-
};
|
|
41
|
+
return (payload.results ?? []).map((item) => this.mapWork(item, url.toString()));
|
|
42
|
+
}
|
|
43
|
+
async getWorkByDoi(doi) {
|
|
44
|
+
const normalizedDoi = normalizeDoi(doi);
|
|
45
|
+
if (!normalizedDoi) {
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
const encodedDoiUrl = encodeURIComponent(`https://doi.org/${normalizedDoi}`);
|
|
49
|
+
const url = new URL(`/works/${encodedDoiUrl}`, this.config.researchOpenAlexBaseUrl);
|
|
50
|
+
const payload = await this.httpClient.fetchJson({
|
|
51
|
+
provider: 'openalex',
|
|
52
|
+
url
|
|
78
53
|
});
|
|
54
|
+
return this.mapWork(payload, url.toString());
|
|
55
|
+
}
|
|
56
|
+
mapWork(item, sourceUrl) {
|
|
57
|
+
const doi = normalizeDoi(item.ids?.doi ?? null);
|
|
58
|
+
return {
|
|
59
|
+
provider: 'openalex',
|
|
60
|
+
providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
|
|
61
|
+
title: item.display_name ?? 'Untitled',
|
|
62
|
+
abstract: decodeInvertedAbstract(item.abstract_inverted_index),
|
|
63
|
+
year: parseYear(item.publication_year),
|
|
64
|
+
venue: item.primary_location?.source?.display_name ?? null,
|
|
65
|
+
doi,
|
|
66
|
+
url: item.primary_location?.landing_page_url ?? item.id ?? null,
|
|
67
|
+
citationCount: item.cited_by_count ?? 0,
|
|
68
|
+
influentialCitationCount: 0,
|
|
69
|
+
referenceCount: item.referenced_works_count ?? 0,
|
|
70
|
+
authors: (item.authorships ?? [])
|
|
71
|
+
.map((auth) => ({
|
|
72
|
+
name: auth.author?.display_name ?? '',
|
|
73
|
+
authorId: auth.author?.id ?? null
|
|
74
|
+
}))
|
|
75
|
+
.filter((author) => author.name.length > 0),
|
|
76
|
+
openAccess: {
|
|
77
|
+
isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
|
|
78
|
+
pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
|
|
79
|
+
license: item.primary_location?.license ?? item.open_access?.oa_status ?? null
|
|
80
|
+
},
|
|
81
|
+
externalIds: {
|
|
82
|
+
...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
|
|
83
|
+
...(doi ? { doi } : {}),
|
|
84
|
+
...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
|
|
85
|
+
...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
|
|
86
|
+
},
|
|
87
|
+
fieldsOfStudy: (item.concepts ?? [])
|
|
88
|
+
.map((concept) => concept.display_name ?? '')
|
|
89
|
+
.filter((value) => value.length > 0),
|
|
90
|
+
score: item.relevance_score ?? 0.5,
|
|
91
|
+
sourceUrl
|
|
92
|
+
};
|
|
79
93
|
}
|
|
80
94
|
}
|
package/dist/research/providers/semantic-scholar-client.js
CHANGED
|
@@ -7,9 +7,10 @@ export class SemanticScholarClient {
|
|
|
7
7
|
this.httpClient = httpClient;
|
|
8
8
|
}
|
|
9
9
|
async searchWorks(query, limit) {
|
|
10
|
-
const
|
|
10
|
+
const baseUrl = this.config.researchSemanticScholarBaseUrl.endsWith('/')
|
|
11
11
|
? this.config.researchSemanticScholarBaseUrl
|
|
12
|
-
: `${this.config.researchSemanticScholarBaseUrl}
|
|
12
|
+
: `${this.config.researchSemanticScholarBaseUrl}/`;
|
|
13
|
+
const url = new URL('paper/search', baseUrl);
|
|
13
14
|
url.searchParams.set('query', query);
|
|
14
15
|
url.searchParams.set('limit', String(limit));
|
|
15
16
|
url.searchParams.set('fields', 'paperId,title,abstract,year,venue,externalIds,url,citationCount,influentialCitationCount,referenceCount,isOpenAccess,openAccessPdf,fieldsOfStudy,authors');
|