scholar-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -0
- package/dist/cli/args.js +57 -0
- package/dist/config.js +131 -0
- package/dist/core/logger.js +36 -0
- package/dist/http/start-http-server.js +329 -0
- package/dist/index.js +66 -0
- package/dist/mcp/create-scholar-mcp-server.js +583 -0
- package/dist/mcp/start-stdio-server.js +8 -0
- package/dist/research/citation-service.js +407 -0
- package/dist/research/errors.js +36 -0
- package/dist/research/extraction-service.js +109 -0
- package/dist/research/http-client.js +62 -0
- package/dist/research/index.js +7 -0
- package/dist/research/ingestion-service.js +430 -0
- package/dist/research/literature-service.js +387 -0
- package/dist/research/providers/crossref-client.js +73 -0
- package/dist/research/providers/openalex-client.js +80 -0
- package/dist/research/providers/semantic-scholar-client.js +60 -0
- package/dist/research/research-service.js +53 -0
- package/dist/research/types.js +1 -0
- package/dist/research/utils.js +54 -0
- package/dist/scholar/errors.js +30 -0
- package/dist/scholar/scholar-client.js +99 -0
- package/dist/scholar/scholar-parser.js +251 -0
- package/dist/scholar/scholar-service.js +202 -0
- package/dist/scholar/types.js +1 -0
- package/dist/version.js +14 -0
- package/package.json +49 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
import { normalizeDoi, normalizeWhitespace, parseYear, tokenizeForRanking } from './utils.js';
|
|
2
|
+
import { ResearchHttpClient } from './http-client.js';
|
|
3
|
+
import { OpenAlexClient } from './providers/openalex-client.js';
|
|
4
|
+
import { CrossrefClient } from './providers/crossref-client.js';
|
|
5
|
+
import { SemanticScholarClient } from './providers/semantic-scholar-client.js';
|
|
6
|
+
/**
 * Per-provider confidence weights. The weight is recorded as the `confidence`
 * of each provenance entry and contributes 10% of a work's relevance score.
 * Frozen so the shared lookup table cannot be mutated at runtime.
 */
const providerWeight = Object.freeze({
    openalex: 1,
    crossref: 0.9,
    semantic_scholar: 1.1,
    scholar_scrape: 0.7
});
/** Providers queried when a search request does not list any sources. */
const DEFAULT_SOURCES = Object.freeze(['openalex', 'crossref', 'semantic_scholar']);
|
|
13
|
+
/**
 * Maps a citation count onto a 0..1 score using a log10 scale that saturates
 * at 1 (reached at 9,999 citations and above).
 *
 * @param {number|string|null|undefined} citations - Raw citation count.
 * @returns {number} 0 for missing, non-numeric, zero or negative counts; otherwise a value in (0, 1].
 */
const scoreFromCitations = (citations) => {
    // Coerce up front: `citations + 1` would string-concatenate a numeric string,
    // and undefined/NaN would otherwise leak NaN into the ranking scores.
    const count = Number(citations);
    if (!(count > 0)) {
        return 0;
    }
    return Math.min(1, Math.log10(count + 1) / 4);
};
|
|
19
|
+
/**
 * Builds the comparison key used to detect works that share a title.
 *
 * The title is passed through `normalizeWhitespace`, Latin-style diacritics are
 * folded, the text is lower-cased, and everything except letters, combining
 * marks, digits and whitespace is dropped. Letters from every script are kept,
 * so non-Latin titles no longer collapse into the same empty key.
 *
 * @param {string} title - Raw title as reported by a provider.
 * @returns {string} Normalized key; may be empty when the title has no letters or digits.
 */
const normalizeTitleKey = (title) => normalizeWhitespace(title)
    // NFKD splits accented letters into base letter + combining mark so the mark can be removed.
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    // Dropping punctuation can leave runs of spaces ("a - b" -> "a  b"); collapse them again.
    .replace(/\s+/g, ' ')
    .trim();
|
|
22
|
+
/**
 * Returns the distinct ranking tokens of a title as a Set.
 * Tokenization itself is delegated to `tokenizeForRanking`.
 */
const tokenSetFromTitle = (title) => {
    const tokens = tokenizeForRanking(title);
    return new Set(tokens);
};
|
|
23
|
+
/**
 * Jaccard similarity of two Sets: |a ∩ b| / |a ∪ b|.
 *
 * @param {Set<unknown>} a
 * @param {Set<unknown>} b
 * @returns {number} Value in [0, 1]; 0 when either set is empty.
 */
const setJaccard = (a, b) => {
    const eitherEmpty = a.size === 0 || b.size === 0;
    if (eitherEmpty) {
        return 0;
    }
    const shared = [...a].filter((token) => b.has(token)).length;
    // Union size by inclusion-exclusion.
    const unionSize = a.size + b.size - shared;
    return shared / unionSize;
};
|
|
35
|
+
/**
 * Unions two field lists, preserving first-seen order.
 * Every entry of `current` is kept; falsy entries of `incoming` are skipped.
 *
 * @param {Iterable<string>} current - Fields already stored on the merged work.
 * @param {Iterable<string>} incoming - Fields reported by the work being merged in.
 * @returns {string[]} De-duplicated list.
 */
const mergeFields = (current, incoming) => {
    const union = new Set(current);
    [...incoming]
        .filter(Boolean)
        .forEach((value) => union.add(value));
    return Array.from(union);
};
|
|
44
|
+
/**
 * Checks a publication year against an inclusive `[from, to]` range.
 * A missing range or a missing year never excludes a work.
 *
 * @param {number|null|undefined} year
 * @param {[number, number]|null|undefined} range
 * @returns {boolean}
 */
const isWithinYearRange = (year, range) => {
    if (range && year) {
        const earliest = range[0];
        const latest = range[1];
        return year >= earliest && year <= latest;
    }
    return true;
};
|
|
50
|
+
/**
 * Tests whether a work carries at least one of the requested fields of study.
 * Comparison is case-insensitive and ignores surrounding whitespace.
 * An absent or empty request matches every work.
 *
 * @param {{ fieldsOfStudy: string[] }} work
 * @param {string[]|null|undefined} requestedFields
 * @returns {boolean}
 */
const matchesFieldOfStudy = (work, requestedFields) => {
    const nothingRequested = !requestedFields || requestedFields.length === 0;
    if (nothingRequested) {
        return true;
    }
    const canonical = (field) => field.trim().toLowerCase();
    const available = new Set(work.fieldsOfStudy.map((field) => canonical(field)));
    return requestedFields.some((field) => available.has(canonical(field)));
};
|
|
57
|
+
/**
 * Two publication years are compatible when either one is missing or when
 * they differ by at most two years.
 */
const yearsCompatible = (a, b) => {
    if (!a || !b) {
        return true;
    }
    return Math.abs(a - b) <= 2;
};
|
|
58
|
+
/**
 * Builds the comparison key used to match author names across providers.
 *
 * The name is passed through `normalizeWhitespace`, Latin-style diacritics are
 * folded ("José" and "Jose" compare equal), the text is lower-cased, and
 * everything except letters, combining marks, digits and whitespace is
 * dropped. Letters from every script are kept, so non-Latin names no longer
 * normalize to an empty string.
 *
 * @param {string} name - Author name as reported by a provider.
 * @returns {string} Normalized key; may be empty.
 */
const normalizeAuthorName = (name) => normalizeWhitespace(name)
    // NFKD splits accented letters into base letter + combining mark so the mark can be removed.
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    // Dropping punctuation can leave runs of spaces; collapse them again.
    .replace(/\s+/g, ' ')
    .trim();
|
|
61
|
+
/**
 * Decides whether two author lists plausibly describe the same work.
 *
 * Returns true when either list is empty (nothing contradicts a match), when
 * any author id appears in both lists, or when any normalized author name
 * appears in both lists.
 *
 * @param {{ name: string, authorId?: string|null }[]} left
 * @param {{ name: string, authorId?: string|null }[]} right
 * @returns {boolean}
 */
const sharesAuthorSignal = (left, right) => {
    if (left.length === 0 || right.length === 0) {
        return true;
    }
    // First signal: a shared author identifier.
    const knownIds = new Set();
    for (const { authorId } of left) {
        if (authorId) {
            knownIds.add(authorId);
        }
    }
    const sharesId = knownIds.size > 0
        && right.some(({ authorId }) => Boolean(authorId) && knownIds.has(authorId));
    if (sharesId) {
        return true;
    }
    // Second signal: a shared normalized name; names that normalize to '' are ignored.
    const knownNames = new Set();
    for (const { name } of left) {
        const key = normalizeAuthorName(name);
        if (key.length > 0) {
            knownNames.add(key);
        }
    }
    return right.some(({ name }) => knownNames.has(normalizeAuthorName(name)));
};
|
|
72
|
+
/**
 * Deep-copies a value so callers and the cache never share object references.
 * Uses `structuredClone`; if that throws, falls back to a JSON round trip.
 *
 * @param {unknown} value
 * @returns {unknown} An independent copy of `value`.
 */
const cloneResult = (value) => {
    let copy;
    try {
        copy = structuredClone(value);
    }
    catch {
        copy = JSON.parse(JSON.stringify(value));
    }
    return copy;
};
|
|
80
|
+
/**
 * Federated literature search.
 *
 * Queries the requested providers (OpenAlex, Crossref, Semantic Scholar and an
 * optional scholar scrape via `scholarService`), filters the combined results,
 * merges records that describe the same work, ranks them, and caches the
 * outcome in memory.
 *
 * Config keys read by this class: `researchGraphProviderResultMultiplier`,
 * `researchGraphFuzzyTitleThreshold`, `researchGraphCacheTtlMs`,
 * `researchGraphMaxCacheEntries`, `scholarLanguage`.
 */
export class LiteratureService {
    config;
    logger;
    scholarService;
    httpClient;
    openAlexClient;
    crossrefClient;
    semanticScholarClient;
    // cacheKey -> { value, expiresAt }; Map insertion order doubles as eviction order.
    searchCache = new Map();
    /**
     * @param config - Service configuration (see class comment for the keys used here).
     * @param logger - Logger exposing `debug(message, meta)`.
     * @param scholarService - Service exposing `searchKeywords(...)`, used for the `scholar_scrape` source.
     */
    constructor(config, logger, scholarService) {
        this.config = config;
        this.logger = logger;
        this.scholarService = scholarService;
        this.httpClient = new ResearchHttpClient(config);
        this.openAlexClient = new OpenAlexClient(config, this.httpClient);
        this.crossrefClient = new CrossrefClient(config, this.httpClient);
        this.semanticScholarClient = new SemanticScholarClient(config, this.httpClient);
    }
    /**
     * Searches the requested providers and returns merged, ranked works.
     *
     * A failing provider does not fail the search: its error is reported in
     * `providerErrors` and it contributes no results.
     *
     * @param input - `{ query, limit, sources?, yearRange?, fieldsOfStudy? }`.
     * @returns `{ query, totalResults, results, providerErrors }` (a deep copy, safe to mutate).
     */
    async searchGraph(input) {
        const requestedSources = new Set(input.sources ?? DEFAULT_SOURCES);
        const cacheKey = this.createCacheKey(input, requestedSources);
        const cached = this.getCache(cacheKey);
        if (cached) {
            this.logger.debug('Returning literature graph result from cache', {
                query: input.query,
                sources: [...requestedSources],
                limit: input.limit
            });
            return cached;
        }
        const providerErrors = [];
        // Over-fetch from each provider so enough works survive filtering and merging.
        const providerLimit = Math.max(input.limit, Math.ceil(input.limit * this.config.researchGraphProviderResultMultiplier));
        // Declaration order fixes the order in which provider results are merged.
        const providerSearches = [
            ['openalex', () => this.openAlexClient.searchWorks(input.query, providerLimit)],
            ['crossref', () => this.crossrefClient.searchWorks(input.query, providerLimit)],
            ['semantic_scholar', () => this.semanticScholarClient.searchWorks(input.query, providerLimit)],
            ['scholar_scrape', () => this.searchWithScholarScrape(input.query, providerLimit)]
        ];
        const runProvider = async (provider, search) => {
            try {
                return await search();
            }
            catch (error) {
                providerErrors.push({ provider, message: error instanceof Error ? error.message : String(error) });
                return [];
            }
        };
        const providerPromises = providerSearches
            .filter(([provider]) => requestedSources.has(provider))
            .map(([provider, search]) => runProvider(provider, search));
        const providerResults = (await Promise.all(providerPromises)).flat();
        const filtered = providerResults.filter((work) => isWithinYearRange(work.year, input.yearRange) && matchesFieldOfStudy(work, input.fieldsOfStudy));
        const merged = this.mergeWorks(filtered);
        const ranked = this.rankWorks(merged, requestedSources.size, input.limit);
        const result = {
            query: input.query,
            totalResults: ranked.length,
            results: ranked,
            providerErrors
        };
        // Do not pin a total outage in the cache: when every queried provider failed,
        // the empty result would otherwise be served until the TTL expires.
        const everyProviderFailed = providerPromises.length > 0 && providerErrors.length >= providerPromises.length;
        if (!everyProviderFailed) {
            this.setCache(cacheKey, result);
        }
        this.logger.debug('Literature graph search complete', {
            query: input.query,
            providers: [...requestedSources],
            providerLimit,
            mergedCount: ranked.length,
            providerErrors
        });
        return cloneResult(result);
    }
    /**
     * Collapses provider records that describe the same work into one merged record.
     *
     * Match order for each incoming record: identical normalized DOI, then
     * identical normalized title, then best fuzzy title similarity at or above
     * `researchGraphFuzzyTitleThreshold`. Title-based matches additionally
     * require compatible years and a shared author signal.
     *
     * @param works - Provider records, already filtered.
     * @returns Merged works in first-seen order, each carrying a `provenance` list.
     */
    mergeWorks(works) {
        const merged = new Map();
        const doiToKey = new Map();
        const titleToKeys = new Map();
        // Token sets of merged titles; a merged title never changes, so tokenize once per key.
        const titleTokens = new Map();
        const indexTitle = (titleKey, key) => {
            const keys = titleToKeys.get(titleKey) ?? new Set();
            keys.add(key);
            titleToKeys.set(titleKey, keys);
        };
        const tokensForKey = (key, title) => {
            let tokens = titleTokens.get(key);
            if (!tokens) {
                tokens = tokenSetFromTitle(title);
                titleTokens.set(key, tokens);
            }
            return tokens;
        };
        const isPlausibleMatch = (candidate, work) => yearsCompatible(candidate.year, work.year) && sharesAuthorSignal(candidate.authors, work.authors);
        const resolveTargetKey = (work, normalizedDoi, titleKey) => {
            if (normalizedDoi && doiToKey.has(normalizedDoi)) {
                return doiToKey.get(normalizedDoi) ?? null;
            }
            for (const key of titleToKeys.get(titleKey) ?? []) {
                const candidate = merged.get(key);
                if (candidate && isPlausibleMatch(candidate, work)) {
                    return key;
                }
            }
            const incomingTokens = tokenSetFromTitle(work.title);
            let bestKey = null;
            let bestSimilarity = 0;
            for (const [key, candidate] of merged.entries()) {
                if (!isPlausibleMatch(candidate, work)) {
                    continue;
                }
                const similarity = setJaccard(tokensForKey(key, candidate.title), incomingTokens);
                if (similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                    bestKey = key;
                }
            }
            if (bestKey && bestSimilarity >= this.config.researchGraphFuzzyTitleThreshold) {
                return bestKey;
            }
            return null;
        };
        // Math.max(undefined, n) is NaN, so treat a missing count as 0.
        const maxCount = (a, b) => Math.max(a ?? 0, b ?? 0);
        for (const work of works) {
            const normalizedDoi = normalizeDoi(work.doi);
            const titleKey = normalizeTitleKey(work.title);
            const targetKey = resolveTargetKey(work, normalizedDoi, titleKey);
            const confidence = providerWeight[work.provider] ?? 0.8;
            const provenance = {
                provider: work.provider,
                sourceUrl: work.sourceUrl,
                fetchedAt: new Date().toISOString(),
                confidence
            };
            const relevanceScore = 0.6 * work.score + 0.3 * scoreFromCitations(work.citationCount ?? 0) + 0.1 * confidence;
            if (!targetKey) {
                const generatedKey = normalizedDoi ?? `title:${titleKey}:year:${work.year ?? 'na'}`;
                merged.set(generatedKey, {
                    title: work.title,
                    abstract: work.abstract,
                    year: work.year,
                    venue: work.venue,
                    doi: normalizedDoi,
                    url: work.url,
                    paperId: work.providerId,
                    citationCount: work.citationCount,
                    influentialCitationCount: work.influentialCitationCount,
                    referenceCount: work.referenceCount,
                    authors: work.authors,
                    openAccess: {
                        isOpenAccess: work.openAccess.isOpenAccess,
                        pdfUrl: work.openAccess.pdfUrl,
                        license: work.openAccess.license
                    },
                    externalIds: work.externalIds,
                    fieldsOfStudy: work.fieldsOfStudy,
                    score: relevanceScore,
                    provenance: [provenance]
                });
                if (normalizedDoi) {
                    doiToKey.set(normalizedDoi, generatedKey);
                }
                indexTitle(titleKey, generatedKey);
                continue;
            }
            const existing = merged.get(targetKey);
            if (!existing) {
                continue;
            }
            // First-seen values win for scalar fields; later records only fill gaps.
            existing.abstract = existing.abstract ?? work.abstract;
            existing.year = existing.year ?? work.year;
            existing.venue = existing.venue ?? work.venue;
            existing.url = existing.url ?? work.url;
            existing.doi = existing.doi ?? normalizedDoi;
            existing.citationCount = maxCount(existing.citationCount, work.citationCount);
            existing.influentialCitationCount = maxCount(existing.influentialCitationCount, work.influentialCitationCount);
            existing.referenceCount = maxCount(existing.referenceCount, work.referenceCount);
            existing.authors = existing.authors.length > 0 ? existing.authors : work.authors;
            existing.fieldsOfStudy = mergeFields(existing.fieldsOfStudy, work.fieldsOfStudy);
            existing.externalIds = {
                ...existing.externalIds,
                ...work.externalIds
            };
            existing.openAccess = {
                isOpenAccess: existing.openAccess.isOpenAccess || work.openAccess.isOpenAccess,
                pdfUrl: existing.openAccess.pdfUrl ?? work.openAccess.pdfUrl,
                license: existing.openAccess.license ?? work.openAccess.license
            };
            existing.provenance.push(provenance);
            existing.score = Math.max(existing.score, relevanceScore);
            const latestDoi = existing.doi ?? normalizedDoi;
            if (latestDoi) {
                doiToKey.set(latestDoi, targetKey);
            }
            indexTitle(titleKey, targetKey);
        }
        return [...merged.values()];
    }
    /**
     * Re-scores merged works and returns the top `limit`.
     *
     * Blend: 50% merged relevance, 25% citation score, 15% provider diversity
     * (distinct providers / requested sources), 10% recency. Ties are broken by
     * citation count.
     *
     * @param works - Merged works.
     * @param sourceCount - Number of requested sources.
     * @param limit - Maximum number of works to return.
     */
    rankWorks(works, sourceCount, limit) {
        const currentYear = new Date().getFullYear();
        return works
            .map((work) => {
            const citationScore = scoreFromCitations(work.citationCount ?? 0);
            // Works without a year get a small fixed recency score instead of zero.
            const recencyScore = work.year ? 1 / Math.max(1, currentYear - work.year + 1) : 0.15;
            const diversityScore = Math.min(1, new Set(work.provenance.map((record) => record.provider)).size / Math.max(1, sourceCount));
            const blended = 0.5 * work.score + 0.25 * citationScore + 0.15 * diversityScore + 0.1 * Math.min(1, recencyScore * 2);
            return {
                ...work,
                score: blended
            };
        })
            .sort((a, b) => b.score - a.score || (b.citationCount ?? 0) - (a.citationCount ?? 0))
            .slice(0, limit);
    }
    /**
     * Looks a work up by DOI using a graph search with the normalized DOI as query.
     *
     * @param doi - DOI in any form accepted by `normalizeDoi`.
     * @returns The matching work, or null when the DOI is invalid or nothing was found.
     */
    async resolveByDoi(doi) {
        const normalized = normalizeDoi(doi);
        if (!normalized) {
            return null;
        }
        const result = await this.searchGraph({
            query: normalized,
            limit: 10,
            sources: ['openalex', 'crossref', 'semantic_scholar']
        });
        // NOTE(review): the final fallback returns the top-ranked work even when its DOI
        // does not match the requested one — confirm callers expect that.
        return (result.results.find((item) => normalizeDoi(item.doi) === normalized) ??
            result.results.find((item) => normalizeDoi(item.externalIds.doi) === normalized) ??
            result.results[0] ??
            null);
    }
    /**
     * Builds a cache key that is insensitive to query case/spacing and to the
     * order of fields and sources.
     */
    createCacheKey(input, sources) {
        const normalizedFields = (input.fieldsOfStudy ?? []).map((field) => field.trim().toLowerCase()).sort();
        const normalizedSources = [...sources].sort();
        const normalizedYearRange = input.yearRange ? `${input.yearRange[0]}:${input.yearRange[1]}` : 'none';
        return JSON.stringify({
            query: normalizeWhitespace(input.query).toLowerCase(),
            limit: input.limit,
            yearRange: normalizedYearRange,
            fields: normalizedFields,
            sources: normalizedSources
        });
    }
    /**
     * Returns a deep copy of a live cache entry, or null when caching is
     * disabled (TTL <= 0), the key is unknown, or the entry has expired.
     */
    getCache(cacheKey) {
        if (this.config.researchGraphCacheTtlMs <= 0) {
            return null;
        }
        const cached = this.searchCache.get(cacheKey);
        if (!cached) {
            return null;
        }
        if (cached.expiresAt <= Date.now()) {
            this.searchCache.delete(cacheKey);
            return null;
        }
        return cloneResult(cached.value);
    }
    /**
     * Stores a deep copy of `value`, first purging expired entries and then
     * evicting the oldest entries beyond `researchGraphMaxCacheEntries`.
     * Does nothing when caching is disabled (TTL <= 0).
     */
    setCache(cacheKey, value) {
        if (this.config.researchGraphCacheTtlMs <= 0) {
            return;
        }
        const now = Date.now();
        for (const [key, cached] of this.searchCache.entries()) {
            if (cached.expiresAt <= now) {
                this.searchCache.delete(key);
            }
        }
        this.searchCache.set(cacheKey, {
            value: cloneResult(value),
            expiresAt: now + this.config.researchGraphCacheTtlMs
        });
        while (this.searchCache.size > this.config.researchGraphMaxCacheEntries) {
            const oldestKey = this.searchCache.keys().next().value;
            if (!oldestKey) {
                break;
            }
            this.searchCache.delete(oldestKey);
        }
    }
    /**
     * Runs a keyword search through `scholarService` and maps each paper onto
     * the provider-record shape used by the other clients. Scraped papers get a
     * fixed score of 0.4 and no DOI, venue, external ids or fields of study.
     */
    async searchWithScholarScrape(query, limit) {
        const result = await this.scholarService.searchKeywords({
            query,
            numResults: limit,
            start: 0,
            language: this.config.scholarLanguage
        });
        return result.papers.map((paper) => ({
            provider: 'scholar_scrape',
            providerId: paper.url ?? `scholar:${paper.title}`,
            title: paper.title,
            abstract: paper.abstract || null,
            year: parseYear(paper.year),
            venue: null,
            doi: null,
            url: paper.url ?? null,
            // Scraped fields may be absent; default them so one sparse paper cannot
            // fail the whole provider or inject NaN into scoring.
            citationCount: paper.citedByCount ?? 0,
            influentialCitationCount: 0,
            referenceCount: 0,
            authors: (paper.authorsLine ?? '')
                .split(',')
                .map((name) => ({ name: name.trim() }))
                .filter((author) => author.name.length > 0),
            openAccess: {
                isOpenAccess: Boolean(paper.pdfUrl),
                pdfUrl: paper.pdfUrl ?? null,
                license: null
            },
            externalIds: {},
            fieldsOfStudy: [],
            score: 0.4,
            sourceUrl: result.requestedUrl
        }));
    }
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { normalizeDoi, parseYear } from '../utils.js';
|
|
2
|
+
/**
 * Extracts the publication year from a Crossref work item.
 * Reads the first element of the first `date-parts` entry of `issued`, then of
 * `published`; the first numeric value found is passed to `parseYear`.
 *
 * @param item - Crossref work item.
 * @returns The result of `parseYear`, or null when neither field holds a numeric year.
 */
const parseCrossrefYear = (item) => {
    for (const field of ['issued', 'published']) {
        const candidate = item[field]?.['date-parts']?.[0]?.[0];
        if (typeof candidate === 'number') {
            return parseYear(candidate);
        }
    }
    return null;
};
|
|
13
|
+
/**
 * Converts a marked-up abstract into plain text: every `<...>` tag becomes a
 * space, whitespace runs collapse to one space, and the result is trimmed.
 *
 * @param {string|null|undefined} value - Abstract possibly containing markup tags.
 * @returns {string|null} Plain text, or null when the input is empty or holds no text.
 */
const toPlainAbstract = (value) => {
    if (!value) {
        return null;
    }
    const withoutTags = value.split(/<[^>]+>/).join(' ');
    const plain = withoutTags.split(/\s+/).filter(Boolean).join(' ');
    return plain.length > 0 ? plain : null;
};
|
|
20
|
+
/**
 * Client for the Crossref works search endpoint. Maps Crossref items onto the
 * provider-record shape shared by the research providers.
 *
 * Config key read: `researchCrossrefBaseUrl`.
 */
export class CrossrefClient {
    config;
    httpClient;
    /**
     * @param config - Service configuration.
     * @param httpClient - Client exposing `fetchJson({ provider, url, headers })`.
     */
    constructor(config, httpClient) {
        this.config = config;
        this.httpClient = httpClient;
    }
    /**
     * Searches Crossref with a bibliographic query.
     *
     * @param {string} query - Free-text query sent as `query.bibliographic`.
     * @param {number} limit - Number of rows to request.
     * @returns Provider records; empty when the payload carries no items.
     */
    async searchWorks(query, limit) {
        const url = new URL('/works', this.config.researchCrossrefBaseUrl);
        url.searchParams.set('query.bibliographic', query);
        url.searchParams.set('rows', String(limit));
        const payload = await this.httpClient.fetchJson({
            provider: 'crossref',
            url,
            headers: {
                accept: 'application/json'
            }
        });
        return (payload.message?.items ?? []).map((item) => {
            const doi = normalizeDoi(item.DOI ?? null);
            const linkPdf = (item.link ?? []).find((link) => (link['content-type'] ?? '').includes('pdf'))?.URL ?? null;
            return {
                provider: 'crossref',
                providerId: doi ? `doi:${doi}` : `crossref:${item.URL ?? 'unknown'}`,
                title: item.title?.[0] ?? 'Untitled',
                abstract: toPlainAbstract(item.abstract),
                year: parseCrossrefYear(item),
                venue: item['container-title']?.[0] ?? null,
                doi,
                url: item.URL ?? null,
                citationCount: item['is-referenced-by-count'] ?? 0,
                influentialCitationCount: 0,
                referenceCount: item.reference?.length ?? 0,
                authors: (item.author ?? [])
                    .map((author) => ({
                    name: [author.given ?? '', author.family ?? ''].join(' ').trim(),
                    // ORCID values may arrive as http:// or https:// URLs; keep only the bare identifier.
                    authorId: author.ORCID?.replace(/^https?:\/\/orcid\.org\//i, '') ?? null
                }))
                    .filter((author) => author.name.length > 0),
                openAccess: {
                    // Open access is inferred from the presence of a PDF link.
                    isOpenAccess: Boolean(linkPdf),
                    pdfUrl: linkPdf,
                    license: item.license?.[0]?.URL ?? null
                },
                externalIds: {
                    ...(doi ? { doi } : {})
                },
                fieldsOfStudy: item.subject ?? [],
                score: item.score ?? 0.5,
                sourceUrl: url.toString()
            };
        });
    }
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { normalizeDoi, parseYear } from '../utils.js';
|
|
2
|
+
/**
 * Rebuilds abstract text from an inverted index of the form
 * `{ token: [position, ...] }`. Each token is placed at every position it
 * lists; unfilled positions stay empty and whitespace is collapsed afterwards.
 *
 * @param {Record<string, number[]>|null|undefined} inverted
 * @returns {string|null} The reconstructed text, or null when there is nothing to decode.
 */
const decodeInvertedAbstract = (inverted) => {
    if (!inverted || Object.keys(inverted).length === 0) {
        return null;
    }
    // The highest position determines how many word slots are needed.
    const highest = Object.values(inverted)
        .flat()
        .reduce((top, index) => (index > top ? index : top), 0);
    const slots = new Array(highest + 1).fill('');
    Object.entries(inverted).forEach(([token, positions]) => {
        for (const index of positions) {
            slots[index] = token;
        }
    });
    const text = slots.join(' ').replace(/\s+/g, ' ').trim();
    return text.length > 0 ? text : null;
};
|
|
23
|
+
/**
 * Client for the OpenAlex works search endpoint. Maps OpenAlex items onto the
 * provider-record shape shared by the research providers.
 *
 * Config keys read: `researchOpenAlexBaseUrl`, `researchOpenAlexApiKey`.
 */
export class OpenAlexClient {
    config;
    httpClient;
    /**
     * @param config - Service configuration.
     * @param httpClient - Client exposing `fetchJson({ provider, url })`.
     */
    constructor(config, httpClient) {
        this.config = config;
        this.httpClient = httpClient;
    }
    /**
     * Searches OpenAlex works.
     *
     * @param {string} query - Free-text query sent as `search`.
     * @param {number} limit - Desired page size; capped at 200.
     * @returns Provider records; empty when the payload carries no results.
     */
    async searchWorks(query, limit) {
        const url = new URL('/works', this.config.researchOpenAlexBaseUrl);
        url.searchParams.set('search', query);
        // OpenAlex rejects page sizes above 200, so cap rather than fail the whole provider.
        url.searchParams.set('per-page', String(Math.min(200, limit)));
        if (this.config.researchOpenAlexApiKey) {
            url.searchParams.set('api_key', this.config.researchOpenAlexApiKey);
        }
        const payload = await this.httpClient.fetchJson({
            provider: 'openalex',
            url
        });
        return (payload.results ?? []).map((item) => {
            const doi = normalizeDoi(item.ids?.doi ?? null);
            return {
                provider: 'openalex',
                providerId: item.id ?? `openalex:${item.display_name ?? 'unknown'}`,
                title: item.display_name ?? 'Untitled',
                abstract: decodeInvertedAbstract(item.abstract_inverted_index),
                year: parseYear(item.publication_year),
                venue: item.primary_location?.source?.display_name ?? null,
                doi,
                url: item.primary_location?.landing_page_url ?? item.id ?? null,
                citationCount: item.cited_by_count ?? 0,
                influentialCitationCount: 0,
                referenceCount: item.referenced_works_count ?? 0,
                authors: (item.authorships ?? [])
                    .map((auth) => ({
                    name: auth.author?.display_name ?? '',
                    authorId: auth.author?.id ?? null
                }))
                    .filter((author) => author.name.length > 0),
                openAccess: {
                    isOpenAccess: item.open_access?.is_oa ?? item.open_access?.any_repository_has_fulltext ?? Boolean(item.primary_location?.pdf_url),
                    pdfUrl: item.primary_location?.pdf_url ?? item.open_access?.oa_url ?? null,
                    // `oa_status` (gold/green/closed/...) is an access category, not a license,
                    // so it is not used as a fallback here.
                    license: item.primary_location?.license ?? null
                },
                externalIds: {
                    ...(item.ids?.openalex ? { openalex: item.ids.openalex } : {}),
                    ...(doi ? { doi } : {}),
                    ...(item.ids?.pmid ? { pmid: item.ids.pmid } : {}),
                    ...(item.ids?.pmcid ? { pmcid: item.ids.pmcid } : {})
                },
                fieldsOfStudy: (item.concepts ?? [])
                    .map((concept) => concept.display_name ?? '')
                    .filter((value) => value.length > 0),
                score: item.relevance_score ?? 0.5,
                sourceUrl: url.toString()
            };
        });
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { normalizeDoi, parseYear } from '../utils.js';
|
|
2
|
+
/**
 * Client for the Semantic Scholar paper search endpoint. Maps results onto the
 * provider-record shape shared by the research providers.
 *
 * Config keys read: `researchSemanticScholarBaseUrl`, `researchSemanticScholarApiKey`.
 */
export class SemanticScholarClient {
    config;
    httpClient;
    /**
     * @param config - Service configuration.
     * @param httpClient - Client exposing `fetchJson({ provider, url, headers })`.
     */
    constructor(config, httpClient) {
        this.config = config;
        this.httpClient = httpClient;
    }
    /**
     * Searches Semantic Scholar papers.
     *
     * @param {string} query - Free-text query.
     * @param {number} limit - Desired number of results; capped at 100.
     * @returns Provider records; empty when the payload carries no data.
     */
    async searchWorks(query, limit) {
        const rawBase = this.config.researchSemanticScholarBaseUrl;
        const base = rawBase.endsWith('/') ? rawBase : `${rawBase}/`;
        // Resolve a *relative* path: a leading "/" would discard any path segment of
        // the base URL (e.g. "/graph/v1"), defeating the trailing-slash handling above.
        const url = new URL('paper/search', base);
        url.searchParams.set('query', query);
        // The search endpoint accepts at most 100 results per request.
        url.searchParams.set('limit', String(Math.min(100, limit)));
        url.searchParams.set('fields', 'paperId,title,abstract,year,venue,externalIds,url,citationCount,influentialCitationCount,referenceCount,isOpenAccess,openAccessPdf,fieldsOfStudy,authors');
        const headers = { accept: 'application/json' };
        if (this.config.researchSemanticScholarApiKey) {
            headers['x-api-key'] = this.config.researchSemanticScholarApiKey;
        }
        const payload = await this.httpClient.fetchJson({
            provider: 'semantic_scholar',
            url,
            headers
        });
        return (payload.data ?? []).map((item) => {
            const doi = normalizeDoi(item.externalIds?.DOI ?? null);
            return {
                provider: 'semantic_scholar',
                providerId: item.paperId ?? `semantic:${item.title ?? 'unknown'}`,
                title: item.title ?? 'Untitled',
                abstract: item.abstract ?? null,
                year: parseYear(item.year),
                venue: item.venue ?? null,
                doi,
                url: item.url ?? null,
                citationCount: item.citationCount ?? 0,
                influentialCitationCount: item.influentialCitationCount ?? 0,
                referenceCount: item.referenceCount ?? 0,
                authors: (item.authors ?? [])
                    .map((author) => ({
                    name: author.name ?? '',
                    authorId: author.authorId ?? null
                }))
                    .filter((author) => author.name.length > 0),
                openAccess: {
                    isOpenAccess: item.isOpenAccess ?? Boolean(item.openAccessPdf?.url),
                    pdfUrl: item.openAccessPdf?.url ?? null,
                    license: item.openAccessPdf?.license ?? null
                },
                externalIds: {
                    // Normalized DOI under the lower-case `doi` key, plus the provider's raw ids.
                    ...(doi ? { doi } : {}),
                    ...(item.externalIds ?? {})
                },
                fieldsOfStudy: item.fieldsOfStudy ?? [],
                // This mapping assigns a fixed score to every result.
                score: 0.7,
                sourceUrl: url.toString()
            };
        });
    }
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { CitationService } from './citation-service.js';
|
|
2
|
+
import { ExtractionService } from './extraction-service.js';
|
|
3
|
+
import { LiteratureService } from './literature-service.js';
|
|
4
|
+
import { IngestionService } from './ingestion-service.js';
|
|
5
|
+
/**
 * Facade over the research sub-services: literature search, full-text
 * ingestion, detail extraction and citation tooling. Every public method
 * forwards to exactly one sub-service.
 */
export class ResearchService {
    config;
    logger;
    scholarService;
    literatureService;
    ingestionService;
    extractionService;
    citationService;
    /**
     * Wires up the sub-services. The ingestion and citation services share the
     * single LiteratureService instance created here.
     */
    constructor(config, logger, scholarService) {
        Object.assign(this, { config, logger, scholarService });
        const literature = new LiteratureService(config, logger, scholarService);
        this.literatureService = literature;
        this.ingestionService = new IngestionService(config, logger, literature);
        this.extractionService = new ExtractionService();
        this.citationService = new CitationService(literature);
    }
    /** Factory equivalent to calling the constructor with the same arguments. */
    static fromConfig(config, logger, scholarService) {
        return new ResearchService(config, logger, scholarService);
    }
    /** Forwards to `LiteratureService.searchGraph`. */
    async searchLiteratureGraph(input) {
        const { literatureService } = this;
        return literatureService.searchGraph(input);
    }
    /** Forwards to `LiteratureService.resolveByDoi`. */
    async resolveWorkByDoi(doi) {
        const { literatureService } = this;
        return literatureService.resolveByDoi(doi);
    }
    /** Forwards to `IngestionService.enqueueIngestion`. */
    ingestPaperFullText(input) {
        const { ingestionService } = this;
        return ingestionService.enqueueIngestion(input);
    }
    /** Forwards to `IngestionService.getJob`. */
    getIngestionStatus(jobId) {
        const { ingestionService } = this;
        return ingestionService.getJob(jobId);
    }
    /** Forwards to `IngestionService.getDocument`. */
    getParsedDocument(documentId) {
        const { ingestionService } = this;
        return ingestionService.getDocument(documentId);
    }
    /** Fetches the document from the ingestion service and hands it to `ExtractionService.extract`. */
    extractGranularPaperDetails(documentId, input) {
        return this.extractionService.extract(this.ingestionService.getDocument(documentId), input);
    }
    /** Forwards to `CitationService.suggestContextualCitations`. */
    suggestContextualCitations(input) {
        const { citationService } = this;
        return citationService.suggestContextualCitations(input);
    }
    /** Forwards to `CitationService.buildReferenceList`. */
    buildReferenceList(input) {
        const { citationService } = this;
        return citationService.buildReferenceList(input);
    }
    /** Forwards to `CitationService.validateManuscriptCitations`. */
    validateManuscriptCitations(manuscriptText, references, options) {
        const { citationService } = this;
        return citationService.validateManuscriptCitations(manuscriptText, references, options);
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|