scholar-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -0
- package/dist/cli/args.js +57 -0
- package/dist/config.js +131 -0
- package/dist/core/logger.js +36 -0
- package/dist/http/start-http-server.js +329 -0
- package/dist/index.js +66 -0
- package/dist/mcp/create-scholar-mcp-server.js +583 -0
- package/dist/mcp/start-stdio-server.js +8 -0
- package/dist/research/citation-service.js +407 -0
- package/dist/research/errors.js +36 -0
- package/dist/research/extraction-service.js +109 -0
- package/dist/research/http-client.js +62 -0
- package/dist/research/index.js +7 -0
- package/dist/research/ingestion-service.js +430 -0
- package/dist/research/literature-service.js +387 -0
- package/dist/research/providers/crossref-client.js +73 -0
- package/dist/research/providers/openalex-client.js +80 -0
- package/dist/research/providers/semantic-scholar-client.js +60 -0
- package/dist/research/research-service.js +53 -0
- package/dist/research/types.js +1 -0
- package/dist/research/utils.js +54 -0
- package/dist/scholar/errors.js +30 -0
- package/dist/scholar/scholar-client.js +99 -0
- package/dist/scholar/scholar-parser.js +251 -0
- package/dist/scholar/scholar-service.js +202 -0
- package/dist/scholar/types.js +1 -0
- package/dist/version.js +14 -0
- package/package.json +49 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { createHash, randomUUID } from 'node:crypto';
|
|
2
|
+
/**
 * Returns the current instant as an ISO-8601 string (as produced by
 * `Date.prototype.toISOString`).
 *
 * @returns {string} The current timestamp.
 */
export const nowIso = () => {
    const current = new Date();
    return current.toISOString();
};
|
|
3
|
+
/**
 * Builds an identifier of the form `<prefix>_<suffix>`.
 *
 * Truthy entries of `parts` are joined with `|` and hashed with SHA-1; the
 * first 16 hex characters of the digest become the suffix, so equal parts
 * always yield the same id. When no truthy part remains, a random UUID is
 * used as the suffix instead.
 *
 * @param {Array<*>} parts - Values the id is derived from; falsy entries are ignored.
 * @param {string} prefix - Text placed before the underscore.
 * @returns {string} The generated identifier.
 */
export const makeStableId = (parts, prefix) => {
    const key = parts.filter((part) => Boolean(part)).join('|');
    const suffix = key
        ? createHash('sha1').update(key).digest('hex').slice(0, 16)
        : randomUUID();
    return `${prefix}_${suffix}`;
};
|
|
11
|
+
/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param {string} input - Text to normalize.
 * @returns {string} The normalized text.
 */
export const normalizeWhitespace = (input) => {
    const collapsed = input.replace(/\s+/g, ' ');
    return collapsed.trim();
};
|
|
12
|
+
/**
 * Normalizes a DOI to its bare, lower-case form.
 *
 * Surrounding whitespace, a leading `http(s)://doi.org/` or
 * `http(s)://dx.doi.org/` resolver URL, and a leading `doi:` label are
 * removed before lower-casing.
 *
 * @param {string|null|undefined} doi - Raw DOI text.
 * @returns {string|null} The normalized DOI, or `null` when the input is
 *   missing or nothing is left after normalization.
 */
export const normalizeDoi = (doi) => {
    if (!doi) {
        return null;
    }
    const normalized = doi
        .trim()
        .replace(/^https?:\/\/(dx\.)?doi\.org\//i, '')
        // "doi:10.1000/xyz" is a common citation form; strip the label too.
        .replace(/^doi:\s*/i, '')
        .toLowerCase();
    // Whitespace-only or prefix-only input carries no DOI: report it as absent
    // rather than returning an empty string.
    return normalized.length > 0 ? normalized : null;
};
|
|
21
|
+
/**
 * Extracts a publication year from a number or a string.
 *
 * Numbers are accepted when they are integers between 1000 and 2100
 * inclusive. For strings, the first occurrence of a four-digit sequence
 * starting with `19` or `20` is used. Every other input yields `null`.
 *
 * @param {*} input - Candidate year value.
 * @returns {number|null} The year, or `null` when none can be determined.
 */
export const parseYear = (input) => {
    const isPlausible = (candidate) => candidate >= 1000 && candidate <= 2100;
    if (typeof input === 'number') {
        return Number.isInteger(input) && isPlausible(input) ? input : null;
    }
    if (typeof input !== 'string') {
        return null;
    }
    const found = input.match(/(?:19|20)\d{2}/)?.[0];
    if (!found) {
        return null;
    }
    const parsed = Number.parseInt(found, 10);
    return isPlausible(parsed) ? parsed : null;
};
|
|
36
|
+
/**
 * Restricts `value` to the range bounded below by `min` and above by `max`.
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The bounded value.
 */
export const clamp = (value, min, max) => {
    const raisedToMin = Math.max(min, value);
    return Math.min(max, raisedToMin);
};
|
|
37
|
+
/**
 * Splits text into lower-case tokens for ranking.
 *
 * The input is whitespace-normalized and lower-cased, every character other
 * than `a-z`, `0-9` and whitespace is replaced by a space, and the result is
 * split on single spaces. Tokens shorter than three characters are dropped.
 *
 * @param {string} input - Text to tokenize.
 * @returns {string[]} Tokens of length three or more, in input order.
 */
export const tokenizeForRanking = (input) => {
    const lowered = normalizeWhitespace(input).toLowerCase();
    const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, ' ');
    return alphanumericOnly.split(' ').filter((token) => token.length >= 3);
};
|
|
42
|
+
/**
 * Scores how much of token list `a` also appears in token list `b`.
 *
 * Every element of `a` that is present in `b` counts once (repeated elements
 * of `a` count each time), and the count is divided by the length of the
 * longer list.
 *
 * @param {string[]} a - Tokens to look up.
 * @param {string[]} b - Tokens to look them up in.
 * @returns {number} The overlap ratio, or 0 when either list is empty.
 */
export const overlapScore = (a, b) => {
    const eitherEmpty = a.length === 0 || b.length === 0;
    if (eitherEmpty) {
        return 0;
    }
    const known = new Set(b);
    let shared = 0;
    for (const token of a) {
        shared += known.has(token) ? 1 : 0;
    }
    const longest = Math.max(a.length, b.length);
    return shared / longest;
};
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Base class for the Scholar error types in this module.
 *
 * Carries an arbitrary `details` value alongside the standard message.
 */
export class ScholarError extends Error {
    /** Extra context supplied by the thrower; may be undefined. */
    details;
    /**
     * @param {string} message - Human-readable description.
     * @param {*} [details] - Optional extra context stored on the instance.
     */
    constructor(message, details) {
        super(message);
        this.name = 'ScholarError';
        this.details = details;
    }
}
|
|
9
|
+
/**
 * Error for a failed HTTP fetch; records the requested URL and, when known,
 * the HTTP status.
 */
export class ScholarFetchError extends ScholarError {
    /** URL that was being requested. */
    url;
    /** HTTP status code, or undefined when none was supplied. */
    status;
    /**
     * @param {string} message - Human-readable description.
     * @param {string} url - URL that was being requested.
     * @param {number} [status] - HTTP status code, if any.
     * @param {*} [details] - Optional extra context.
     */
    constructor(message, url, status, details) {
        super(message, details);
        this.name = 'ScholarFetchError';
        this.status = status;
        this.url = url;
    }
}
|
|
19
|
+
/**
 * Fetch error variant for blocked or challenged requests. It never carries an
 * HTTP status: `status` is always passed to the parent as undefined.
 */
export class ScholarBlockedError extends ScholarFetchError {
    /**
     * @param {string} message - Human-readable description.
     * @param {string} url - URL that was being requested.
     * @param {*} [details] - Optional extra context.
     */
    constructor(message, url, details) {
        const noStatus = undefined;
        super(message, url, noStatus, details);
        this.name = 'ScholarBlockedError';
    }
}
|
|
25
|
+
/**
 * Error type for parsing/matching failures; differs from the base class only
 * in its `name`.
 */
export class ScholarParseError extends ScholarError {
    /**
     * @param {string} message - Human-readable description.
     * @param {*} [details] - Optional extra context.
     */
    constructor(message, details) {
        super(message, details);
        this.name = 'ScholarParseError';
    }
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { setTimeout as sleep } from 'node:timers/promises';
|
|
2
|
+
import { ScholarBlockedError, ScholarFetchError } from './errors.js';
|
|
3
|
+
// Request path used for search queries.
const SCHOLAR_SEARCH_PATH = '/scholar';

// Request path used for author profile pages.
const SCHOLAR_CITATIONS_PATH = '/citations';

// Browser-like user-agent string sent with every request.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36';

// A response body matching any of these patterns is treated as a
// block/challenge page instead of a real result page.
const BLOCK_PATTERNS = [
    /detected unusual traffic/i,
    /not a robot/i,
    /please show you're not a robot/i,
    /accounts\.google\.com\/v3\/signin/i,
    /sorry\/index/i
];
|
|
13
|
+
/**
 * Converts a plain object into `URLSearchParams`.
 *
 * Entries whose value is exactly `undefined` are left out; every other value
 * is stringified with `String(...)`.
 *
 * @param {Object<string, *>} values - Query parameters keyed by name.
 * @returns {URLSearchParams} The populated parameter set.
 */
const toSearchParams = (values) => {
    const query = new URLSearchParams();
    for (const [name, raw] of Object.entries(values)) {
        if (raw !== undefined) {
            query.set(name, String(raw));
        }
    }
    return query;
};
|
|
23
|
+
/**
 * HTTP client that downloads Scholar HTML pages with request pacing, a
 * per-request timeout, retries, and block-page detection.
 *
 * Config fields read by this class: `scholarBaseUrl`, `scholarRetryAttempts`,
 * `scholarRetryDelayMs`, `scholarTimeoutMs`, `scholarRequestDelayMs`.
 */
export class ScholarClient {
    /** Configuration object supplied to the constructor. */
    config;
    /** `Date.now()` value recorded just before the most recent request. */
    lastRequestAt = 0;
    /**
     * @param {object} config - Client configuration (see class description).
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Fetches a search results page.
     *
     * @param {Object<string, *>} params - Query parameters; undefined values are omitted.
     * @returns {Promise<{html: string, url: string}>} Page body and the requested URL.
     */
    async fetchScholarSearch(params) {
        return this.fetchHtml(SCHOLAR_SEARCH_PATH, params);
    }
    /**
     * Fetches an author profile page.
     *
     * @param {string} authorId - Value sent as the `user` query parameter.
     * @param {string} language - Value sent as the `hl` query parameter.
     * @returns {Promise<{html: string, url: string}>} Page body and the requested URL.
     */
    async fetchAuthorProfile(authorId, language) {
        const query = { user: authorId, hl: language };
        return this.fetchHtml(SCHOLAR_CITATIONS_PATH, query);
    }
    /**
     * Performs a GET request against `path` (resolved on `scholarBaseUrl`).
     *
     * Up to `scholarRetryAttempts + 1` attempts are made. Any failure inside an
     * attempt (network error, timeout abort, non-2xx status, detected block
     * page) is remembered and, unless it was the last attempt, followed by a
     * pause of `scholarRetryDelayMs`.
     *
     * @param {string} path - Request path.
     * @param {Object<string, *>} params - Query parameters.
     * @returns {Promise<{html: string, url: string}>} Page body and the requested URL.
     * @throws The error of the final attempt, or a ScholarFetchError when no
     *   Error instance was captured.
     */
    async fetchHtml(path, params) {
        const requestUrl = new URL(path, this.config.scholarBaseUrl);
        requestUrl.search = toSearchParams(params).toString();
        const urlText = requestUrl.toString();
        let lastError;
        for (let attempt = 0; attempt <= this.config.scholarRetryAttempts; attempt += 1) {
            await this.waitBeforeRequest();
            const controller = new AbortController();
            // Abort the request once the configured timeout elapses.
            const timeoutId = setTimeout(() => controller.abort(), this.config.scholarTimeoutMs);
            try {
                const response = await fetch(requestUrl, {
                    method: 'GET',
                    headers: {
                        'user-agent': USER_AGENT,
                        accept: 'text/html,application/xhtml+xml'
                    },
                    signal: controller.signal
                });
                // The body is read before the status check.
                const html = await response.text();
                if (!response.ok) {
                    throw new ScholarFetchError(`Google Scholar returned HTTP ${response.status}`, urlText, response.status, { statusText: response.statusText });
                }
                this.assertNotBlocked(html, urlText);
                return { html, url: urlText };
            }
            catch (error) {
                lastError = error;
                if (attempt >= this.config.scholarRetryAttempts) {
                    break;
                }
                await sleep(this.config.scholarRetryDelayMs);
            }
            finally {
                clearTimeout(timeoutId);
            }
        }
        if (lastError instanceof Error) {
            throw lastError;
        }
        throw new ScholarFetchError('Unknown Google Scholar fetch error', urlText);
    }
    /**
     * Throws when the page body matches any of the known block patterns.
     *
     * @param {string} html - Response body to inspect.
     * @param {string} url - URL the body was fetched from.
     * @throws {ScholarBlockedError} When a block pattern matches.
     */
    assertNotBlocked(html, url) {
        for (const pattern of BLOCK_PATTERNS) {
            if (pattern.test(html)) {
                throw new ScholarBlockedError('Google Scholar blocked or challenged this request. Try slower request settings or run from a different network.', url);
            }
        }
    }
    /**
     * Sleeps for whatever remains of `scholarRequestDelayMs` since the previous
     * request, then records the current time as the new request time. A delay
     * setting of zero or less disables the wait.
     *
     * @returns {Promise<void>}
     */
    async waitBeforeRequest() {
        const minimumGapMs = this.config.scholarRequestDelayMs;
        if (!(minimumGapMs <= 0)) {
            const sincePrevious = Date.now() - this.lastRequestAt;
            const remaining = minimumGapMs - sincePrevious;
            if (remaining > 0) {
                await sleep(remaining);
            }
        }
        this.lastRequestAt = Date.now();
    }
}
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
// One or more consecutive whitespace characters.
const TEXT_WHITESPACE = /\s+/g;

// A four-digit year starting with 19 or 20, delimited on each side by a
// non-digit character or a string boundary; the year is capture group 1.
const YEAR_REGEX = /(?:^|\D)((?:19|20)\d{2})(?:\D|$)/g;

// The value of a `user` query parameter inside a URL; the id is capture group 1.
const AUTHOR_ID_REGEX = /[?&]user=([A-Za-z0-9_-]+)/;

/**
 * Collapses whitespace runs to single spaces and trims the result.
 *
 * @param {string} value - Text to clean up.
 * @returns {string} The normalized text.
 */
const normalizeText = (value) => {
    const collapsed = value.replace(TEXT_WHITESPACE, ' ');
    return collapsed.trim();
};
|
|
6
|
+
/**
 * Reads the integer formed by all digit characters in `value`, ignoring
 * everything else (so "Cited by 1,234" gives 1234).
 *
 * @param {string} value - Text containing digits.
 * @returns {number} The parsed integer, or 0 when the text has no digits.
 */
const parseNumber = (value) => {
    const digitsOnly = value.replace(/[^\d]/g, '');
    if (digitsOnly.length === 0) {
        return 0;
    }
    return Number.parseInt(digitsOnly, 10);
};
|
|
10
|
+
/**
 * Reads the integer formed by all digit characters in `value`.
 *
 * @param {string} value - Text containing digits.
 * @returns {number|null} The parsed integer, or `null` when the text has no digits.
 */
const parseNullableNumber = (value) => {
    const digitsOnly = value.replace(/[^\d]/g, '');
    return digitsOnly.length > 0 ? Number.parseInt(digitsOnly, 10) : null;
};
|
|
17
|
+
/**
 * Resolves `href` against `baseUrl` into an absolute URL string.
 *
 * @param {string} baseUrl - Base used for relative references.
 * @param {string|null|undefined} href - Link target to resolve.
 * @returns {string|null} The absolute URL, or `null` when `href` is missing
 *   or the combination cannot be parsed as a URL.
 */
const resolveUrl = (baseUrl, href) => {
    let absolute = null;
    if (href) {
        try {
            absolute = new URL(href, baseUrl).toString();
        }
        catch {
            absolute = null;
        }
    }
    return absolute;
};
|
|
28
|
+
// A standalone four-digit year (1900-2099). Lookarounds are used instead of
// consuming the neighbouring characters, so that two years separated by a
// single character ("2019-2020", "1999 2004") are both matched; a pattern
// that consumes the trailing delimiter cannot start the next match there.
const STANDALONE_YEAR_REGEX = /(?<!\d)(?:19|20)\d{2}(?!\d)/g;
/**
 * Returns the last standalone year (1900-2099) mentioned in `input`.
 *
 * A year is "standalone" when it is not directly preceded or followed by
 * another digit, so digit runs such as "120195" are ignored.
 *
 * @param {string} input - Text to scan.
 * @returns {number|null} The last year found, or `null` when there is none.
 */
const extractYear = (input) => {
    const years = Array.from(input.matchAll(STANDALONE_YEAR_REGEX), (match) => Number.parseInt(match[0], 10));
    return years.at(-1) ?? null;
};
|
|
38
|
+
/**
 * Collects the distinct values of every `user=` query parameter found in a
 * piece of HTML text, in order of first appearance.
 *
 * @param {string} htmlChunk - HTML text to scan.
 * @returns {string[]} Unique author ids.
 */
const extractAuthorIds = (htmlChunk) => {
    const userParam = /[?&]user=([A-Za-z0-9_-]+)/g;
    const captured = Array.from(htmlChunk.matchAll(userParam), (match) => match[1]);
    return [...new Set(captured.filter(Boolean))];
};
|
|
49
|
+
/**
 * Collects the distinct `user=` ids found in the `href` attributes of all
 * elements matching `selector`, in document order of first appearance.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} selector - Selector for the elements to inspect.
 * @returns {string[]} Unique author ids.
 */
const extractAuthorIdsFromElement = ($, selector) => {
    const seen = new Set();
    for (const node of $(selector).toArray()) {
        // Elements without an href, or whose href has no user id, add nothing.
        const authorId = $(node).attr('href')?.match(AUTHOR_ID_REGEX)?.[1];
        if (authorId) {
            seen.add(authorId);
        }
    }
    return [...seen];
};
|
|
66
|
+
/**
 * Parses the HTML of a single search result into a paper record.
 *
 * @param {string} baseUrl - Base used to turn relative links into absolute URLs.
 * @param {string} htmlChunk - HTML of one result element.
 * @returns {object} Record with `title`, `url`, `authorsLine`, `abstract`,
 *   `year`, `citedByCount`, `citedByUrl`, `relatedArticlesUrl`,
 *   `versionsCount`, `versionsUrl`, `pdfUrl` and `authorIds`. Counts default
 *   to 0 and URLs/year to `null` when the corresponding markup is absent.
 */
const parsePaperResult = (baseUrl, htmlChunk) => {
    const $ = load(htmlChunk);
    const titleAnchor = $('h3.gs_rt a').first();
    // Fall back to the heading text when the title is not wrapped in a link.
    const title = normalizeText(titleAnchor.text() || $('h3.gs_rt').first().text());
    const authorNode = $('.gs_a').first();
    const authorsLine = normalizeText(authorNode.text());
    const abstract = normalizeText($('.gs_rs').first().text());
    let citedByCount = 0;
    let citedByUrl = null;
    let relatedArticlesUrl = null;
    let versionsCount = 0;
    let versionsUrl = null;
    // The footer links are told apart by their visible text.
    $('.gs_fl a').each((_, link) => {
        const anchor = $(link);
        const text = normalizeText(anchor.text());
        const href = anchor.attr('href');
        if (text.startsWith('Cited by')) {
            citedByCount = parseNumber(text);
            citedByUrl = resolveUrl(baseUrl, href);
            return;
        }
        if (text.startsWith('Related articles')) {
            relatedArticlesUrl = resolveUrl(baseUrl, href);
            return;
        }
        const versionMatch = text.match(/^All\s+(\d+)\s+versions$/i);
        if (versionMatch) {
            const versionCount = versionMatch[1];
            if (versionCount) {
                versionsCount = Number.parseInt(versionCount, 10);
            }
            versionsUrl = resolveUrl(baseUrl, href);
        }
    });
    return {
        title,
        url: resolveUrl(baseUrl, titleAnchor.attr('href')),
        authorsLine,
        abstract,
        // Prefer a year from the byline; a year that merely appears in the
        // abstract snippet must not override it. The abstract is only a
        // fallback when the byline has no year at all.
        year: extractYear(authorsLine) ?? extractYear(abstract),
        citedByCount,
        citedByUrl,
        relatedArticlesUrl,
        versionsCount,
        versionsUrl,
        pdfUrl: resolveUrl(baseUrl, $('.gs_ggsd a').first().attr('href')),
        authorIds: extractAuthorIds($.html(authorNode) ?? '')
    };
};
|
|
115
|
+
/**
 * Parses a search results page.
 *
 * @param {string} html - Full HTML of the results page.
 * @param {string} baseUrl - Base used to resolve relative links.
 * @param {string} requestedUrl - URL the page was fetched from; echoed in the result.
 * @param {string} query - Query text; echoed in the result.
 * @returns {{query: string, requestedUrl: string, totalResultsText: (string|null), nextPageStart: (number|null), papers: object[]}}
 *   `totalResultsText` is the text of `#gs_ab_md` (or `null` when empty);
 *   `nextPageStart` is the `start` parameter of the pagination link whose
 *   text contains "next", or `null` when no such link or parameter exists.
 */
export const parseScholarSearchResult = (html, baseUrl, requestedUrl, query) => {
    const $ = load(html);
    const papers = [];
    for (const resultElement of $('.gs_r.gs_or.gs_scl').toArray()) {
        const chunk = $.html(resultElement);
        if (typeof chunk === 'string') {
            papers.push(parsePaperResult(baseUrl, chunk));
        }
    }
    // Locate the first pagination link whose visible text mentions "next".
    let nextHref;
    for (const linkElement of $('#gs_n a').toArray()) {
        const link = $(linkElement);
        if (normalizeText(link.text()).toLowerCase().includes('next')) {
            nextHref = link.attr('href');
            break;
        }
    }
    let nextPageStart = null;
    if (nextHref) {
        try {
            const startParam = new URL(nextHref, baseUrl).searchParams.get('start');
            nextPageStart = startParam ? Number.parseInt(startParam, 10) : null;
        }
        catch {
            // An unparseable link simply means no next page is reported.
            nextPageStart = null;
        }
    }
    const summaryText = normalizeText($('#gs_ab_md').first().text());
    return {
        query,
        requestedUrl,
        totalResultsText: summaryText || null,
        nextPageStart,
        papers
    };
};
|
|
147
|
+
/**
 * Collects the distinct author ids referenced by links whose `href` contains
 * `/citations?user=`, in document order of first appearance.
 *
 * @param {string} html - HTML of a search results page.
 * @returns {string[]} Unique author ids.
 */
export const extractAuthorIdsFromSearch = (html) => {
    const $ = load(html);
    const seen = new Set();
    for (const linkElement of $('a[href*="/citations?user="]').toArray()) {
        const authorId = $(linkElement).attr('href')?.match(AUTHOR_ID_REGEX)?.[1];
        if (authorId) {
            seen.add(authorId);
        }
    }
    return [...seen];
};
|
|
165
|
+
/**
 * Reads the citation metrics table (`#gsc_rsb_st`) of an author profile.
 *
 * Each table row is classified by the lower-cased text of its label cell:
 * labels starting with "citations", "h-index" or "i10-index" fill the
 * matching pair of fields from the first and second value cells.
 *
 * @param {Function} $ - Loaded cheerio document of the profile page.
 * @returns {{citationsAll: (number|null), citationsSince: (number|null), hIndexAll: (number|null), hIndexSince: (number|null), i10IndexAll: (number|null), i10IndexSince: (number|null)}}
 *   Metrics; a field stays `null` when its row or value is missing.
 */
const parseAuthorMetrics = ($) => {
    const metrics = {
        citationsAll: null,
        citationsSince: null,
        hIndexAll: null,
        hIndexSince: null,
        i10IndexAll: null,
        i10IndexSince: null
    };
    // [label prefix, field for the first value cell, field for the second].
    const rowKinds = [
        ['citations', 'citationsAll', 'citationsSince'],
        ['h-index', 'hIndexAll', 'hIndexSince'],
        ['i10-index', 'i10IndexAll', 'i10IndexSince']
    ];
    for (const row of $('#gsc_rsb_st tr').toArray()) {
        const rowNode = $(row);
        const label = normalizeText(rowNode.find('td.gsc_rsb_sc1').text()).toLowerCase();
        const values = rowNode
            .find('td.gsc_rsb_std')
            .toArray()
            .map((cell) => normalizeText($(cell).text()));
        if (!label || values.length === 0) {
            continue;
        }
        const kind = rowKinds.find(([prefix]) => label.startsWith(prefix));
        if (!kind) {
            continue;
        }
        const [, allField, sinceField] = kind;
        metrics[allField] = parseNullableNumber(values[0] ?? '');
        metrics[sinceField] = parseNullableNumber(values[1] ?? '');
    }
    return metrics;
};
|
|
206
|
+
/**
 * Parses one publication row of an author profile.
 *
 * @param {Function} $ - Loaded cheerio document of the profile page.
 * @param {*} row - Row element to parse.
 * @param {string} baseUrl - Base used to resolve relative links.
 * @returns {{title: string, detailUrl: (string|null), authors: string, venue: string, year: (number|null), citations: number, citationsUrl: (string|null)}}
 *   `authors` and `venue` are the first and second `.gs_gray` texts of the
 *   row (empty strings when absent).
 */
const parseAuthorPublication = ($, row, baseUrl) => {
    const rowNode = $(row);
    const titleAnchor = rowNode.find('.gsc_a_at').first();
    const citationCell = rowNode.find('.gsc_a_ac').first();
    const [authors = '', venue = ''] = rowNode
        .find('.gs_gray')
        .toArray()
        .map((cell) => normalizeText($(cell).text()));
    // Prefer the inner year element; fall back to the whole year cell.
    const yearText = rowNode.find('.gsc_a_y .gsc_a_h').first().text() || rowNode.find('.gsc_a_y').first().text();
    return {
        title: normalizeText(titleAnchor.text()),
        detailUrl: resolveUrl(baseUrl, titleAnchor.attr('href')),
        authors,
        venue,
        year: parseNullableNumber(yearText),
        citations: parseNumber(citationCell.text()),
        citationsUrl: resolveUrl(baseUrl, citationCell.attr('href'))
    };
};
|
|
220
|
+
/**
 * Parses an author profile page.
 *
 * @param {string} html - Full HTML of the profile page.
 * @param {string} baseUrl - Base used for links and for `profileUrl`.
 * @param {string} authorId - Id the page was requested with; used when no id
 *   can be derived from the page links.
 * @param {number} maxPublications - Maximum number of publication rows to parse.
 * @returns {object} Profile with `authorId`, `authorName`, `profileUrl`,
 *   `affiliation`, `verifiedEmail`, `homepageUrl`, `interests`, `metrics`
 *   and `publications`.
 */
export const parseScholarAuthorProfile = (html, baseUrl, authorId, maxPublications) => {
    const $ = load(html);
    const name = normalizeText($('#gsc_prf_in').text());
    const affiliation = normalizeText($('.gsc_prf_il').first().text()) || null;
    const verifiedLine = normalizeText($('#gsc_prf_ivh').text());
    // The domain is the first whitespace-free token after the label. Capturing
    // up to the first hyphen instead would cut hyphenated domains short
    // ("uni-bonn.de" -> "uni").
    const verifiedEmailMatch = verifiedLine.match(/Verified email at (\S+)/i);
    const publications = $('#gsc_a_b tr.gsc_a_tr')
        .toArray()
        .slice(0, maxPublications)
        .map((row) => parseAuthorPublication($, row, baseUrl));
    const profileAuthorIds = extractAuthorIdsFromElement($, '#gsc_prf_int a, .gs_a a');
    // An id found in the page links takes precedence over the requested one.
    const derivedAuthorId = profileAuthorIds[0] ?? authorId;
    return {
        authorId: derivedAuthorId,
        authorName: name,
        profileUrl: `${baseUrl}/citations?user=${encodeURIComponent(derivedAuthorId)}&hl=en`,
        affiliation,
        verifiedEmail: verifiedEmailMatch?.[1] ? normalizeText(verifiedEmailMatch[1]) : null,
        homepageUrl: resolveUrl(baseUrl, $('#gsc_prf_ivh a').first().attr('href')),
        interests: $('#gsc_prf_int a')
            .toArray()
            .map((el) => normalizeText($(el).text()))
            .filter((interest) => interest.length > 0),
        metrics: parseAuthorMetrics($),
        publications
    };
};
|
|
247
|
+
/**
 * Extracts the author name (text of the first `#gsc_prf_in` element) from a
 * profile page.
 *
 * @param {string} html - HTML of the profile page.
 * @returns {string|null} The name, or `null` when the element is missing or empty.
 */
export const parseAuthorName = (html) => {
    const $ = load(html);
    const heading = $('#gsc_prf_in').first();
    const name = normalizeText(heading.text());
    if (name.length === 0) {
        return null;
    }
    return name;
};
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import { ScholarParseError } from './errors.js';
|
|
2
|
+
import { extractAuthorIdsFromSearch, parseAuthorName, parseScholarAuthorProfile, parseScholarSearchResult } from './scholar-parser.js';
|
|
3
|
+
import { ScholarClient } from './scholar-client.js';
|
|
4
|
+
/**
 * Reduces a personal name to a comparison key: lower-cased, Latin diacritics
 * removed, punctuation turned into spaces, whitespace collapsed and trimmed.
 *
 * Letters, combining marks and digits of every script are kept (Unicode
 * property escapes), so names written in non-Latin scripts, or containing
 * letters without a decomposition such as "ø", do not collapse to an empty
 * or fragmented key.
 *
 * @param {string} value - Name to normalize.
 * @returns {string} The comparison key (may be empty).
 */
const normalizeName = (value) => value
    .toLowerCase()
    .normalize('NFD')
    // Drop the combining diacritical marks produced by NFD ("é" -> "e").
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ')
    .replace(/\s+/g, ' ')
    .trim();
/**
 * Restricts `value` to the range bounded below by `min` and above by `max`.
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The bounded value.
 */
const clamp = (value, min, max) => Math.min(max, Math.max(min, value));
/**
 * Scores how well `candidateName` matches `requestedName`, from 0 to 1.
 *
 * - 1 when the normalized names are equal;
 * - 0.9 when one normalized name appears inside the other as a run of whole
 *   words;
 * - otherwise the number of shared distinct words divided by the larger
 *   distinct-word count;
 * - 0 when either name normalizes to an empty string.
 *
 * @param {string} requestedName - Name that was asked for.
 * @param {string} candidateName - Name to compare against.
 * @returns {number} Similarity score.
 */
const nameSimilarity = (requestedName, candidateName) => {
    const requested = normalizeName(requestedName);
    const candidate = normalizeName(candidateName);
    if (!requested || !candidate) {
        return 0;
    }
    if (requested === candidate) {
        return 1;
    }
    // Pad with spaces so containment is only recognised on word boundaries;
    // a raw substring test would let "li" match inside "william".
    const paddedRequested = ` ${requested} `;
    const paddedCandidate = ` ${candidate} `;
    if (paddedCandidate.includes(paddedRequested) || paddedRequested.includes(paddedCandidate)) {
        return 0.9;
    }
    const requestedTokens = new Set(requested.split(' '));
    const candidateTokens = new Set(candidate.split(' '));
    let overlap = 0;
    for (const token of requestedTokens) {
        if (candidateTokens.has(token)) {
            overlap += 1;
        }
    }
    return overlap / Math.max(requestedTokens.size, candidateTokens.size, 1);
};
|
|
34
|
+
/**
 * High-level Scholar operations: keyword search, advanced search, and author
 * lookup, built on a page-fetching client and the HTML parsers.
 *
 * Config fields read by this class: `scholarMaxResultsPerRequest`,
 * `scholarBaseUrl`.
 */
export class ScholarService {
    /** Client used to download pages. */
    client;
    /** Service configuration. */
    config;
    /** Logger; `warn` and `debug` are called with (message, metadata). */
    logger;
    /**
     * @param {object} client - Object providing `fetchScholarSearch` and `fetchAuthorProfile`.
     * @param {object} config - Service configuration.
     * @param {object} logger - Logger with `warn` and `debug` methods.
     */
    constructor(client, config, logger) {
        this.client = client;
        this.config = config;
        this.logger = logger;
    }
    /**
     * Creates a service with a new ScholarClient built from the same config.
     *
     * @param {object} config - Configuration shared by client and service.
     * @param {object} logger - Logger with `warn` and `debug` methods.
     * @returns {ScholarService} The new service.
     */
    static fromConfig(config, logger) {
        return new ScholarService(new ScholarClient(config), config, logger);
    }
    /**
     * Runs a plain keyword search.
     *
     * @param {{query: string, language: string, numResults: number, start: number}} input - Search options.
     * @returns {Promise<object>} Parsed search result with at most `numResults`
     *   papers (`numResults` is clamped to 1..scholarMaxResultsPerRequest).
     */
    async searchKeywords(input) {
        const numResults = clamp(input.numResults, 1, this.config.scholarMaxResultsPerRequest);
        const params = {
            q: input.query,
            hl: input.language,
            as_sdt: '0,5',
            num: numResults,
            start: input.start
        };
        const { html, url } = await this.client.fetchScholarSearch(params);
        const parsed = parseScholarSearchResult(html, this.config.scholarBaseUrl, url, input.query);
        return {
            ...parsed,
            papers: parsed.papers.slice(0, numResults)
        };
    }
    /**
     * Runs a search with optional author, exact-phrase, excluded-words,
     * title-only and year-range filters.
     *
     * @param {object} input - Search options: `query`, `language`,
     *   `numResults`, `start`, and optionally `author`, `exactPhrase`,
     *   `excludeWords`, `titleOnly`, `yearRange` ([from, to]).
     * @returns {Promise<object>} Parsed search result with at most `numResults` papers.
     */
    async searchAdvanced(input) {
        const numResults = clamp(input.numResults, 1, this.config.scholarMaxResultsPerRequest);
        const params = {
            q: input.query,
            hl: input.language,
            as_sdt: '0,5',
            num: numResults,
            start: input.start,
            as_sauthors: input.author,
            as_epq: input.exactPhrase,
            as_eq: input.excludeWords,
            as_occt: input.titleOnly ? 'title' : undefined
        };
        if (input.yearRange) {
            params.as_ylo = input.yearRange[0];
            params.as_yhi = input.yearRange[1];
        }
        const { html, url } = await this.client.fetchScholarSearch(params);
        const parsed = parseScholarSearchResult(html, this.config.scholarBaseUrl, url, input.query);
        return {
            ...parsed,
            papers: parsed.papers.slice(0, numResults)
        };
    }
    /**
     * Looks up an author by name and returns their profile.
     *
     * When resolving or fetching the profile fails for any reason, a warning
     * is logged and a summary built from an author-filtered paper search is
     * returned instead.
     *
     * @param {string} authorName - Name to look up.
     * @param {number} maxPublications - Requested publication count (clamped
     *   to 1..scholarMaxResultsPerRequest).
     * @param {string} language - Value sent as the `hl` parameter.
     * @returns {Promise<object>} Author profile or fallback summary.
     */
    async getAuthorInfo(authorName, maxPublications, language) {
        const publicationLimit = clamp(maxPublications, 1, this.config.scholarMaxResultsPerRequest);
        try {
            const authorId = await this.findBestAuthorId(authorName, language);
            const { html } = await this.client.fetchAuthorProfile(authorId, language);
            return parseScholarAuthorProfile(html, this.config.scholarBaseUrl, authorId, publicationLimit);
        }
        catch (error) {
            this.logger.warn('Falling back to paper-based author summary', {
                authorName,
                error: error instanceof Error ? error.message : String(error)
            });
            return this.buildAuthorFallback(authorName, publicationLimit, language);
        }
    }
    /**
     * Builds a profile-shaped summary from an author-filtered paper search.
     * The id is the literal 'unresolved' and all profile-only fields are
     * empty.
     *
     * @param {string} authorName - Name used as both query and author filter.
     * @param {number} maxPublications - Number of papers to request.
     * @param {string} language - Value sent as the `hl` parameter.
     * @returns {Promise<object>} Fallback author summary.
     */
    async buildAuthorFallback(authorName, maxPublications, language) {
        const search = await this.searchAdvanced({
            query: authorName,
            author: authorName,
            numResults: maxPublications,
            start: 0,
            language
        });
        const publications = search.papers.map((paper) => ({
            title: paper.title,
            detailUrl: paper.url,
            authors: paper.authorsLine,
            // NOTE(review): venue repeats authorsLine; the paper records used
            // here are read only for title/url/authorsLine/year/citedBy*, so
            // no separate venue value is available - confirm this is intended.
            venue: paper.authorsLine,
            year: paper.year,
            citations: paper.citedByCount,
            citationsUrl: paper.citedByUrl
        }));
        return {
            authorId: 'unresolved',
            authorName,
            profileUrl: `${this.config.scholarBaseUrl}/scholar?q=${encodeURIComponent(authorName)}`,
            affiliation: null,
            verifiedEmail: null,
            homepageUrl: null,
            interests: [],
            metrics: {
                citationsAll: null,
                citationsSince: null,
                hIndexAll: null,
                hIndexSince: null,
                i10IndexAll: null,
                i10IndexSince: null
            },
            publications
        };
    }
    /**
     * Finds the author id whose profile name best matches `authorName`.
     *
     * Up to three search strategies are run until at least ten candidate ids
     * have been collected; the first ten candidates are then fetched one by
     * one and scored with `nameSimilarity`. Scoring stops early on a score of
     * 0.98 or higher.
     *
     * @param {string} authorName - Name to match.
     * @param {string} language - Value sent as the `hl` parameter.
     * @returns {Promise<string>} The best-matching author id.
     * @throws {ScholarParseError} When no candidate is found or none scores above 0.
     */
    async findBestAuthorId(authorName, language) {
        const candidateIds = new Set();
        const strategies = [
            {
                q: `"${authorName}"`,
                as_sauthors: authorName
            },
            {
                q: `${authorName} research`,
                as_sauthors: authorName
            },
            {
                q: authorName,
                as_sauthors: authorName
            }
        ];
        for (const strategy of strategies) {
            const { html } = await this.client.fetchScholarSearch({
                ...strategy,
                hl: language,
                as_sdt: '0,5',
                num: this.config.scholarMaxResultsPerRequest
            });
            extractAuthorIdsFromSearch(html).forEach((id) => candidateIds.add(id));
            if (candidateIds.size >= 10) {
                break;
            }
        }
        if (candidateIds.size === 0) {
            throw new ScholarParseError(`Unable to discover a Google Scholar profile for "${authorName}".`);
        }
        const limitedCandidates = [...candidateIds].slice(0, 10);
        let bestId = null;
        let bestScore = 0;
        for (const candidateId of limitedCandidates) {
            try {
                const { html } = await this.client.fetchAuthorProfile(candidateId, language);
                const candidateName = parseAuthorName(html);
                if (!candidateName) {
                    continue;
                }
                const score = nameSimilarity(authorName, candidateName);
                this.logger.debug('Evaluated author candidate', {
                    requested: authorName,
                    candidateId,
                    candidateName,
                    score
                });
                if (score > bestScore) {
                    bestScore = score;
                    bestId = candidateId;
                }
                if (score >= 0.98) {
                    break;
                }
            }
            catch (error) {
                // Best effort: one failing candidate must not abort the search,
                // but the failure is recorded rather than dropped silently.
                this.logger.debug('Skipping author candidate after failure', {
                    requested: authorName,
                    candidateId,
                    error: error instanceof Error ? error.message : String(error)
                });
            }
        }
        if (!bestId) {
            throw new ScholarParseError(`Unable to match a Google Scholar profile to "${authorName}".`);
        }
        return bestId;
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/version.js
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { createRequire } from 'node:module';
|
|
2
|
+
// CommonJS-style loader bound to this module's location, used to read JSON.
const require = createRequire(import.meta.url);
/**
 * Returns the `version` field of `../package.json` (relative to this module).
 *
 * @returns {string} The version when it is a non-blank string; otherwise the
 *   static fallback '0.0.0' (also used when the file cannot be loaded).
 */
export const getPackageVersion = () => {
    let declared;
    try {
        declared = require('../package.json').version;
    }
    catch {
        // Fall through to static fallback.
        declared = undefined;
    }
    const usable = typeof declared === 'string' && declared.trim().length > 0;
    return usable ? declared : '0.0.0';
};
|