smart-web-mcp 0.8.8 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/CHANGELOG.md +72 -18
  2. package/README.md +23 -15
  3. package/dist/assessment.js +1 -1
  4. package/dist/assessment.js.map +1 -1
  5. package/dist/browser-session.d.ts +1 -0
  6. package/dist/browser-session.js +11 -11
  7. package/dist/browser-session.js.map +1 -1
  8. package/dist/cli.js +5 -1
  9. package/dist/cli.js.map +1 -1
  10. package/dist/dev-runtime.js +2 -1
  11. package/dist/dev-runtime.js.map +1 -1
  12. package/dist/index.js +9 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/mcp-server.js +2 -2
  15. package/dist/mcp-server.js.map +1 -1
  16. package/dist/runtime-temp.d.ts +1 -1
  17. package/dist/runtime-temp.js +12 -17
  18. package/dist/runtime-temp.js.map +1 -1
  19. package/dist/settings.d.ts +25 -5
  20. package/dist/settings.js +55 -60
  21. package/dist/settings.js.map +1 -1
  22. package/dist/shared.d.ts +3 -3
  23. package/dist/shared.js +29 -9
  24. package/dist/shared.js.map +1 -1
  25. package/dist/smartcrawl.js +2 -2
  26. package/dist/smartcrawl.js.map +1 -1
  27. package/dist/smartfetch/academic-fallback.d.ts +7 -0
  28. package/dist/smartfetch/academic-fallback.js +777 -0
  29. package/dist/smartfetch/academic-fallback.js.map +1 -0
  30. package/dist/smartfetch/archive-fallback.js +7 -57
  31. package/dist/smartfetch/archive-fallback.js.map +1 -1
  32. package/dist/smartfetch/generic-fallbacks.d.ts +92 -0
  33. package/dist/smartfetch/generic-fallbacks.js +616 -0
  34. package/dist/smartfetch/generic-fallbacks.js.map +1 -0
  35. package/dist/smartfetch/jina-reader.js +62 -3
  36. package/dist/smartfetch/jina-reader.js.map +1 -1
  37. package/dist/smartfetch/paywall.d.ts +7 -0
  38. package/dist/smartfetch/paywall.js +77 -0
  39. package/dist/smartfetch/paywall.js.map +1 -0
  40. package/dist/smartfetch/pipeline.d.ts +1 -1
  41. package/dist/smartfetch/pipeline.js +4 -3
  42. package/dist/smartfetch/pipeline.js.map +1 -1
  43. package/dist/smartfetch/provider-policy.js +2 -2
  44. package/dist/smartfetch/provider-policy.js.map +1 -1
  45. package/dist/smartfetch/provider-types.d.ts +1 -0
  46. package/dist/smartfetch/providers/acmicpc.d.ts +2 -0
  47. package/dist/smartfetch/providers/acmicpc.js +57 -0
  48. package/dist/smartfetch/providers/acmicpc.js.map +1 -0
  49. package/dist/smartfetch/providers/article.d.ts +1 -2
  50. package/dist/smartfetch/providers/article.js +161 -1
  51. package/dist/smartfetch/providers/article.js.map +1 -1
  52. package/dist/smartfetch/providers/atcoder.d.ts +2 -0
  53. package/dist/smartfetch/providers/atcoder.js +54 -0
  54. package/dist/smartfetch/providers/atcoder.js.map +1 -0
  55. package/dist/smartfetch/providers/codeforces.d.ts +2 -0
  56. package/dist/smartfetch/providers/codeforces.js +54 -0
  57. package/dist/smartfetch/providers/codeforces.js.map +1 -0
  58. package/dist/smartfetch/providers/cp-common.d.ts +8 -0
  59. package/dist/smartfetch/providers/cp-common.js +62 -0
  60. package/dist/smartfetch/providers/cp-common.js.map +1 -0
  61. package/dist/smartfetch/providers/index.js +15 -1
  62. package/dist/smartfetch/providers/index.js.map +1 -1
  63. package/dist/smartfetch/providers/jungol.d.ts +2 -0
  64. package/dist/smartfetch/providers/jungol.js +43 -0
  65. package/dist/smartfetch/providers/jungol.js.map +1 -0
  66. package/dist/smartfetch/providers/kakao-map.d.ts +2 -0
  67. package/dist/smartfetch/providers/kakao-map.js +150 -0
  68. package/dist/smartfetch/providers/kakao-map.js.map +1 -0
  69. package/dist/smartfetch/providers/linkedin.js +5 -5
  70. package/dist/smartfetch/providers/linkedin.js.map +1 -1
  71. package/dist/smartfetch/providers/map-utils.d.ts +7 -0
  72. package/dist/smartfetch/providers/map-utils.js +56 -0
  73. package/dist/smartfetch/providers/map-utils.js.map +1 -0
  74. package/dist/smartfetch/providers/naver-map.d.ts +2 -0
  75. package/dist/smartfetch/providers/naver-map.js +183 -0
  76. package/dist/smartfetch/providers/naver-map.js.map +1 -0
  77. package/dist/smartfetch/providers/qoj.d.ts +2 -0
  78. package/dist/smartfetch/providers/qoj.js +54 -0
  79. package/dist/smartfetch/providers/qoj.js.map +1 -0
  80. package/dist/smartfetch/providers/reddit.js +2 -2
  81. package/dist/smartfetch/providers/reddit.js.map +1 -1
  82. package/dist/smartfetch/providers/solvedac.js +193 -2
  83. package/dist/smartfetch/providers/solvedac.js.map +1 -1
  84. package/dist/smartfetch/providers/x.js +4 -7
  85. package/dist/smartfetch/providers/x.js.map +1 -1
  86. package/dist/smartfetch/providers/youtube.js +2 -2
  87. package/dist/smartfetch/providers/youtube.js.map +1 -1
  88. package/dist/smartfetch.js +22 -9
  89. package/dist/smartfetch.js.map +1 -1
  90. package/dist/smartsearch.js +199 -26
  91. package/dist/smartsearch.js.map +1 -1
  92. package/dist/test-settings.d.ts +9 -0
  93. package/dist/test-settings.js +40 -0
  94. package/dist/test-settings.js.map +1 -0
  95. package/package.json +7 -6
@@ -0,0 +1,777 @@
1
+ import { configText } from "../settings.js";
2
+ import { absolutizeUrl, configEnabled, dedupeUrls, extractMetaDescription, extractMetaName, extractMetaProperty, extractTitleFromHtml } from "../shared.js";
3
+ import { fetchProviderJson } from "./provider-policy.js";
4
/**
 * Reports whether the academic fallback as a whole is switched on.
 *
 * Looks up the "fetch.enableAcademicFallback" setting through configEnabled,
 * passing `true` as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function academicFallbackEnabled() {
    const isOn = configEnabled("fetch.enableAcademicFallback", true);
    return isOn;
}
7
/**
 * Reports whether the OpenAlex lookup is switched on.
 *
 * Looks up "fetch.enableOpenAlex" through configEnabled, passing `true`
 * as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function openAlexEnabled() {
    const isOn = configEnabled("fetch.enableOpenAlex", true);
    return isOn;
}
10
/**
 * Reports whether the Europe PMC lookup is switched on.
 *
 * Looks up "fetch.enableEuropePmc" through configEnabled, passing `true`
 * as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function europePmcEnabled() {
    const isOn = configEnabled("fetch.enableEuropePmc", true);
    return isOn;
}
13
/**
 * Reports whether the bioRxiv API lookup is switched on.
 *
 * Looks up "fetch.enableBiorxivApi" through configEnabled, passing `true`
 * as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function biorxivApiEnabled() {
    const isOn = configEnabled("fetch.enableBiorxivApi", true);
    return isOn;
}
16
/**
 * Reports whether the Unpaywall lookup is switched on.
 *
 * Looks up "fetch.enableUnpaywall" through configEnabled, passing `true`
 * as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function unpaywallEnabled() {
    const isOn = configEnabled("fetch.enableUnpaywall", true);
    return isOn;
}
19
/**
 * Reports whether the Semantic Scholar lookup is switched on.
 *
 * Looks up "fetch.enableSemanticScholar" through configEnabled, passing
 * `true` as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function semanticScholarEnabled() {
    const isOn = configEnabled("fetch.enableSemanticScholar", true);
    return isOn;
}
22
/**
 * Reports whether the CORE discovery lookup is switched on.
 *
 * Looks up "fetch.enableCoreDiscovery" through configEnabled, passing
 * `true` as the second argument.
 *
 * @returns {*} Whatever configEnabled yields for that key.
 */
function coreDiscoveryEnabled() {
    const isOn = configEnabled("fetch.enableCoreDiscovery", true);
    return isOn;
}
25
/**
 * Reads the "fetch.unpaywallEmail" setting and returns it with whitespace
 * collapsed and trimmed.
 *
 * @returns {string} The cleaned setting text; empty when nothing is set.
 */
function unpaywallEmail() {
    const configured = configText("fetch.unpaywallEmail");
    return normalizeSpace(configured);
}
28
/**
 * Reads the "fetch.semanticScholarApiKey" setting and returns it with
 * whitespace collapsed and trimmed.
 *
 * @returns {string} The cleaned setting text; empty when nothing is set.
 */
function semanticScholarApiKey() {
    const configured = configText("fetch.semanticScholarApiKey");
    return normalizeSpace(configured);
}
31
/**
 * Reads the "fetch.coreApiKey" setting and returns it with whitespace
 * collapsed and trimmed.
 *
 * @returns {string} The cleaned setting text; empty when nothing is set.
 */
function coreApiKey() {
    const configured = configText("fetch.coreApiKey");
    return normalizeSpace(configured);
}
34
/**
 * Converts a value to text, collapses every whitespace run to one space
 * and strips leading/trailing whitespace.
 *
 * @param {*} value - Any value; falsy values are treated as the empty string.
 * @returns {string} The cleaned text.
 */
function normalizeSpace(value) {
    const raw = value ? String(value) : "";
    // Splitting on whitespace runs and dropping the empty edge pieces is
    // the same as "collapse, then trim".
    return raw.split(/\s+/).filter(Boolean).join(" ");
}
37
/**
 * Returns the argument when it is a non-null, non-array object; otherwise
 * returns a fresh empty object so callers can read properties safely.
 *
 * @param {*} value - Candidate value.
 * @returns {object} `value` itself or `{}`.
 */
function objectValue(value) {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
        return {};
    }
    return value;
}
40
/**
 * Returns the argument when it is an array, otherwise a fresh empty array.
 *
 * @param {*} value - Candidate value.
 * @returns {Array} `value` itself or `[]`.
 */
function arrayValue(value) {
    if (!Array.isArray(value)) {
        return [];
    }
    return value;
}
43
/**
 * Coerces a value to whitespace-normalized text.
 *
 * @param {*} value - Any value; falsy values become the empty string.
 * @returns {string} The cleaned text.
 */
function textValue(value) {
    const asString = String(value || "");
    return normalizeSpace(asString);
}
46
/**
 * Normalizes each entry, drops the ones that end up empty and removes
 * duplicates while keeping first-seen order.
 *
 * @param {Array} values - Raw entries.
 * @returns {string[]} Distinct, non-empty, whitespace-normalized texts.
 */
function uniqueTexts(values) {
    const seen = new Set();
    for (const value of values) {
        const cleaned = normalizeSpace(value);
        if (cleaned) {
            seen.add(cleaned);
        }
    }
    return [...seen];
}
49
/**
 * Returns the cleaned text when it starts with "http://" or "https://"
 * (case-insensitive); otherwise returns the empty string.
 *
 * @param {*} value - Candidate URL.
 * @returns {string} The cleaned URL text or "".
 */
function httpUrl(value) {
    const candidate = textValue(value);
    if (/^https?:\/\//i.test(candidate)) {
        return candidate;
    }
    return "";
}
53
/**
 * Derives a display name from a person-like record.
 *
 * Prefers `name`, then `display_name`. Otherwise joins a given-name part
 * (`given`, `given_name`, `first`) and a family-name part (`family`,
 * `family_name`, `last`) with a single space.
 *
 * @param {*} value - Person record; non-objects are treated as empty.
 * @returns {string} The name, or "" when no usable field is present.
 */
function personName(value) {
    const person = objectValue(value);
    // First candidate that is non-empty after cleaning, or "".
    const firstText = (...candidates) => {
        for (const candidate of candidates) {
            const cleaned = textValue(candidate);
            if (cleaned) {
                return cleaned;
            }
        }
        return "";
    };
    const whole = firstText(person.name, person.display_name);
    if (whole) {
        return whole;
    }
    const givenPart = firstText(person.given, person.given_name, person.first);
    const familyPart = firstText(person.family, person.family_name, person.last);
    return normalizeSpace([givenPart, familyPart].filter(Boolean).join(" "));
}
62
/**
 * Turns a list of person-like records into distinct display names.
 *
 * @param {*} value - Expected to be an array of person records; anything
 *   else is treated as an empty list.
 * @returns {string[]} Distinct non-empty names in first-seen order.
 */
function authorNames(value) {
    const people = arrayValue(value);
    const names = people.map((entry) => personName(entry));
    return uniqueTexts(names);
}
65
/**
 * Walks a nested value and appends to `out` every string property whose
 * key matches `keyPattern`.
 *
 * Arrays are descended element by element and objects property by
 * property. Strings under non-matching keys are ignored, and numbers,
 * booleans and nulls are skipped.
 *
 * @param {*} value - Structure to walk.
 * @param {RegExp} keyPattern - Tested against each property key that holds a string.
 * @param {string[]} out - Accumulator, mutated in place.
 * @returns {void}
 */
function collectUrlFields(value, keyPattern, out) {
    if (Array.isArray(value)) {
        value.forEach((item) => collectUrlFields(item, keyPattern, out));
        return;
    }
    const isContainer = Boolean(value) && typeof value === "object";
    if (!isContainer) {
        return;
    }
    Object.entries(value).forEach(([key, entry]) => {
        if (typeof entry === "string") {
            if (keyPattern.test(key)) {
                out.push(entry);
            }
            return;
        }
        const isScalar = entry === null || typeof entry === "number" || typeof entry === "boolean";
        if (!isScalar) {
            collectUrlFields(entry, keyPattern, out);
        }
    });
}
84
/**
 * Parses the attributes of one HTML tag string into a lookup object.
 *
 * Attribute names are lower-cased and values are whitespace-normalized.
 * Double-quoted, single-quoted and bare values are accepted, and an
 * attribute without a value maps to "". The first occurrence of a name
 * wins, and a token literally named "meta" is skipped.
 *
 * @param {string} tag - Raw tag text such as `<meta name="x" content="y">`.
 * @returns {Object<string, string>} Attribute name to cleaned value.
 */
function parseTagAttributes(tag) {
    const pattern = /([^\s=/>]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
    const attributes = {};
    for (const [, rawName, doubleQuoted, singleQuoted, bare] of tag.matchAll(pattern)) {
        const key = String(rawName || "").trim().toLowerCase();
        const shouldSkip = !key || key === "meta" || key in attributes;
        if (shouldSkip) {
            continue;
        }
        const rawValue = doubleQuoted ?? singleQuoted ?? bare ?? "";
        attributes[key] = normalizeSpace(String(rawValue || ""));
    }
    return attributes;
}
96
/**
 * Collects the `content` values of every `<meta>` tag whose `name`
 * attribute is one of the requested names (case-insensitive).
 *
 * @param {string} html - Page markup to scan.
 * @param {string[]} names - Meta names to accept.
 * @returns {string[]} Distinct non-empty content values in document order.
 */
function allMetaNames(html, names) {
    const wanted = new Set(names.map((name) => name.toLowerCase()));
    const tags = html.match(/<meta\b[^>]*>/gi) ?? [];
    const contents = tags
        .map((tag) => parseTagAttributes(String(tag || "")))
        .filter((attributes) => wanted.has(String(attributes.name || "").toLowerCase()))
        .map((attributes) => normalizeSpace(attributes.content || ""))
        .filter(Boolean);
    return uniqueTexts(contents);
}
110
/**
 * Heuristically decides whether a URL points at a PDF.
 *
 * Matches a "/pdf/" path segment, a path ending in "/pdf", a ".pdf"
 * extension, or a "pdf" query parameter.
 *
 * @param {string} url - URL text to inspect.
 * @returns {boolean} True when any of the patterns matches.
 */
function looksLikePdfUrl(url) {
    const pdfPatterns = [
        /\/pdf\//i,
        /\/pdf(?:$|[?#])/i,
        /\.pdf(?:$|[?#])/i,
        /[?&]pdf(?:=|%3d|$)/i,
    ];
    return pdfPatterns.some((pattern) => pattern.test(url));
}
113
/**
 * Extracts a bare DOI ("10.<registrant>/<suffix>") from arbitrary text
 * such as a doi.org link, a meta tag value or a URL path.
 *
 * The input is percent-decoded first so encoded slashes ("%2F") are
 * recognised. Page links and meta tags are untrusted and may carry
 * malformed escapes, for which `decodeURIComponent` throws a URIError.
 * In that case the raw text is scanned instead, so one bad link cannot
 * abort the caller.
 *
 * @param {*} value - Text that may contain a DOI; falsy values yield "".
 * @returns {string} The DOI without trailing `)`, `>`, `.`, `,` or `;`,
 *   or "" when none is found.
 */
function normalizeDoi(value) {
    const raw = String(value || "");
    let decoded;
    try {
        decoded = decodeURIComponent(raw);
    }
    catch {
        // Malformed percent-escape: fall back to the undecoded text.
        decoded = raw;
    }
    const match = decoded.match(/10\.\d{4,9}\/[-._;()/:A-Z0-9]+/i);
    if (!match) {
        return "";
    }
    // The match cannot contain whitespace, so only trailing sentence
    // punctuation needs stripping.
    return match[0].replace(/[)>.,;]+$/g, "");
}
118
/**
 * Builds the https://doi.org/ resolver link for a DOI.
 *
 * @param {string} doi - Bare DOI.
 * @returns {string} The resolver URL, or "" when `doi` is falsy.
 */
function doiLinkUrl(doi) {
    if (!doi) {
        return "";
    }
    return `https://doi.org/${doi}`;
}
121
/**
 * Returns the first DOI found among links that point at doi.org.
 *
 * Links are examined in order and scanning stops at the first one that
 * yields a DOI, so later links are never parsed.
 *
 * @param {Iterable<string>} values - Candidate link URLs.
 * @returns {string} The first DOI found, or "".
 */
function extractDoiFromLinks(values) {
    const doiHost = /doi\.org\//i;
    for (const link of values) {
        const candidate = doiHost.test(link) ? normalizeDoi(link) : "";
        if (candidate) {
            return candidate;
        }
    }
    return "";
}
131
/**
 * Extracts the PubMed ID from a pubmed.ncbi.nlm.nih.gov article URL.
 *
 * Only a purely numeric first path segment counts as a PMID. Site pages
 * such as "/advanced/" return "", and sub-pages such as
 * "/12345678/similar" return just the number. Returning the whole path
 * would produce bogus identifiers for callers that use the PMID in
 * lookups.
 *
 * @param {string} url - URL to inspect.
 * @returns {string} The PMID digits, or "" when the URL is unparsable,
 *   has a different host, or does not start with a numeric segment.
 */
function pubmedIdFromUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return "";
    }
    if (parsed.hostname !== "pubmed.ncbi.nlm.nih.gov") {
        return "";
    }
    const match = parsed.pathname.match(/^\/(\d+)(?:\/|$)/);
    return match ? match[1] : "";
}
142
/**
 * Extracts a PMC identifier from a URL whose path contains
 * "/articles/PMC<digits>" (case-insensitive). The host is not checked.
 *
 * @param {string} url - URL to inspect.
 * @returns {string} Upper-cased identifier such as "PMC1234567", or ""
 *   when the URL is unparsable or has no such segment.
 */
function pmcIdFromUrl(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname;
    }
    catch {
        return "";
    }
    const found = /\/articles\/(PMC\d+)/i.exec(pathname);
    return found ? found[1].toUpperCase() : "";
}
152
/**
 * Identifies which preprint server a URL belongs to, based on its host
 * (with or without the "www." prefix).
 *
 * @param {string} url - URL to inspect.
 * @returns {string} "biorxiv", "medrxiv", or "" for any other host or an
 *   unparsable URL.
 */
function preprintServer(url) {
    let host;
    try {
        host = new URL(url).hostname.toLowerCase();
    }
    catch {
        return "";
    }
    switch (host) {
        case "www.biorxiv.org":
        case "biorxiv.org":
            return "biorxiv";
        case "www.medrxiv.org":
        case "medrxiv.org":
            return "medrxiv";
        default:
            return "";
    }
}
165
/**
 * Extracts the 10.1101 preprint DOI from a "/content/10.1101/..." URL.
 *
 * The DOI in such URLs is followed by an optional version marker ("v2")
 * and an optional view suffix. Besides ".full" and ".full.pdf", the
 * suffixes ".full.pdf+html", ".full-text", ".abstract", ".article-info",
 * ".article-metrics", ".supplementary-material" and ".figures-only" are
 * stripped, as is a trailing slash. Without this they would be swallowed
 * into the DOI.
 *
 * @param {string} url - URL to inspect.
 * @returns {string} The bare DOI, or "" when the URL is unparsable or is
 *   not a "/content/10.1101/..." path.
 */
function preprintDoiFromUrl(url) {
    let pathname;
    try {
        pathname = new URL(url).pathname;
    }
    catch {
        return "";
    }
    const viewSuffix = "(?:\\.(?:full(?:\\.pdf)?(?:\\+html)?|full-text|abstract|article-info|article-metrics|supplementary-material|figures-only))?";
    const pattern = new RegExp(`/content/(10\\.1101/[^?#]+?)(?:v\\d+)?${viewSuffix}/?$`, "i");
    const match = pathname.match(pattern);
    return normalizeDoi(String(match?.[1] || ""));
}
175
/**
 * Produces a short site label from a URL's host.
 *
 * A few known hosts map to fixed labels. Any other host yields its
 * second-to-last dot-separated label (for example "springer" for
 * "link.springer.com"), and a single-label host yields itself. A leading
 * "www." is ignored.
 *
 * @param {string} url - URL to inspect.
 * @returns {string} The label, or "" when the URL cannot be parsed.
 */
function siteLabel(url) {
    const fixedLabels = new Map([
        ["pubmed.ncbi.nlm.nih.gov", "pubmed"],
        ["pmc.ncbi.nlm.nih.gov", "pmc"],
        ["biorxiv.org", "biorxiv"],
        ["medrxiv.org", "medrxiv"],
    ]);
    let host;
    try {
        host = new URL(url).hostname.toLowerCase().replace(/^www\./, "");
    }
    catch {
        return "";
    }
    const fixed = fixedLabels.get(host);
    if (fixed) {
        return fixed;
    }
    const labels = host.split(".");
    if (labels.length < 2) {
        return host;
    }
    return labels[labels.length - 2] || host;
}
193
/**
 * Checks whether a URL's host is one of a fixed list of scholarly sites,
 * or a subdomain of one. A leading "www." is ignored.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} True for a listed host or subdomain; false otherwise
 *   or when the URL cannot be parsed.
 */
function looksAcademicHost(url) {
    const academicHosts = [
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "medrxiv.org",
        "nature.com",
        "sciencedirect.com",
        "springer.com",
        "link.springer.com",
        "onlinelibrary.wiley.com",
        "academic.oup.com",
        "jamanetwork.com",
        "nejm.org",
        "thelancet.com",
        "cell.com",
        "plos.org",
        "biomedcentral.com",
        "frontiersin.org",
        "sagepub.com",
        "tandfonline.com",
        "ieeexplore.ieee.org",
        "dl.acm.org",
    ];
    let host;
    try {
        host = new URL(url).hostname.toLowerCase().replace(/^www\./, "");
    }
    catch {
        return false;
    }
    for (const known of academicHosts) {
        if (host === known || host.endsWith(`.${known}`)) {
            return true;
        }
    }
    return false;
}
225
/**
 * Reads author names from a normalized post record.
 *
 * Uses `post.authors` when it is an array. Otherwise falls back to the
 * single `post.author` field.
 *
 * @param {object} post - Post record.
 * @returns {string[]} Distinct cleaned author names, possibly empty.
 */
function authorListFromPost(post) {
    const { authors } = post;
    if (!Array.isArray(authors)) {
        const single = textValue(post.author);
        return single ? [single] : [];
    }
    return uniqueTexts(authors.map((entry) => textValue(entry)));
}
232
/**
 * Gathers bibliographic signals for the current page from the normalized
 * post, the page's citation / Dublin Core / Open Graph meta tags, its
 * links and its URL.
 *
 * For each field the post value wins when present. Otherwise the listed
 * meta tags are tried in order, and some fields then fall back to a
 * value derived from the URL.
 *
 * @param {object} context - Fetch context; reads `active.content`,
 *   `active.links`, `resolvedUrl` and `url`.
 * @param {object} normalized - Normalized result; reads `post`.
 * @returns {object} Signals record with title, abstract, authors, doi,
 *   pmid, pmcid, journal, published, pdfUrls, fullTextUrls, license,
 *   oaStatus, publishedDoi, category, jatsXml, site and canonicalUrl.
 */
function extractAcademicSignals(context, normalized) {
    const html = String(context.active.content || "");
    const post = objectValue(normalized.post);
    const resolvedUrl = context.resolvedUrl || context.url;
    const preprintDoi = preprintDoiFromUrl(resolvedUrl);
    const hostLooksAcademic = looksAcademicHost(resolvedUrl);
    const meta = (name) => extractMetaName(html, name);
    const metaProperty = (name) => extractMetaProperty(html, name);

    const title = textValue(post.title)
        || normalizeSpace(meta("citation_title") || meta("dc.title") || metaProperty("og:title") || extractTitleFromHtml(html));
    const abstract = textValue(post.description)
        || normalizeSpace(meta("citation_abstract") || meta("dc.description") || metaProperty("og:description") || extractMetaDescription(html));
    const authors = uniqueTexts([
        ...authorListFromPost(post),
        ...allMetaNames(html, ["citation_author", "dc.contributor", "dc.creator"]),
    ]);
    // Page links are consulted for a DOI only on known academic hosts.
    const doi = normalizeDoi(textValue(post.doi)
        || meta("citation_doi")
        || preprintDoi
        || (hostLooksAcademic ? extractDoiFromLinks(context.active.links) : ""));
    const pmid = textValue(post.pmid) || meta("citation_pmid") || pubmedIdFromUrl(resolvedUrl);
    const pmcid = textValue(post.pmcid)
        || normalizeSpace(meta("citation_pmcid") || pmcIdFromUrl(resolvedUrl)).toUpperCase();
    const journal = textValue(post.journal)
        || normalizeSpace(meta("citation_journal_title") || meta("dc.source") || meta("citation_publisher"));
    const published = textValue(post.published)
        || normalizeSpace(meta("citation_publication_date") || meta("citation_online_date") || meta("dc.date"));
    const pdfUrls = dedupeUrls([
        textValue(post.pdf_url),
        absolutizeUrl(meta("citation_pdf_url"), resolvedUrl),
        ...context.active.links.filter((link) => looksLikePdfUrl(link)),
    ]);
    const fullTextUrls = dedupeUrls([
        textValue(post.full_text_url),
        absolutizeUrl(meta("citation_fulltext_html_url"), resolvedUrl),
        textValue(post.open_access_url),
    ]);

    return {
        title,
        abstract,
        authors,
        doi,
        pmid,
        pmcid,
        journal,
        published,
        pdfUrls,
        fullTextUrls,
        license: textValue(post.open_access_license),
        oaStatus: textValue(post.open_access_status),
        publishedDoi: normalizeDoi(textValue(post.published_doi)),
        category: textValue(post.category),
        jatsXml: textValue(post.jats_xml),
        site: textValue(post.site) || siteLabel(resolvedUrl),
        canonicalUrl: textValue(post.url) || resolvedUrl,
    };
}
279
/**
 * Merges a partial signals record into `target` in place.
 *
 * Scalar fields are filled only when the target's value is falsy, and
 * authors only when the target has none. The link lists are always
 * combined and de-duplicated. By default source links go after the
 * target's own; with `options.linkOrder === "prepend"` they go first.
 *
 * @param {object} target - Signals record to update; must already have
 *   `authors`, `pdfUrls` and `fullTextUrls` arrays.
 * @param {object} source - Partial record supplying missing values.
 * @param {{linkOrder?: string}} [options] - Optional link ordering.
 * @returns {void}
 */
function mergeSignals(target, source, options) {
    const sourceLinksFirst = options?.linkOrder === "prepend";
    const fillText = (key) => {
        target[key] ||= normalizeSpace(source[key] || "");
    };
    const fillDoi = (key) => {
        target[key] ||= normalizeDoi(source[key] || "");
    };
    const combineLinks = (key) => {
        const incoming = source[key] || [];
        const existing = target[key];
        target[key] = sourceLinksFirst
            ? dedupeUrls([...incoming, ...existing])
            : dedupeUrls([...existing, ...incoming]);
    };

    fillText("title");
    fillText("abstract");
    if (target.authors.length === 0) {
        target.authors = uniqueTexts(source.authors || []);
    }
    fillDoi("doi");
    fillText("pmid");
    target.pmcid ||= normalizeSpace(source.pmcid || "").toUpperCase();
    fillText("journal");
    fillText("published");
    combineLinks("pdfUrls");
    combineLinks("fullTextUrls");
    fillText("license");
    fillText("oaStatus");
    fillDoi("publishedDoi");
    fillText("category");
    fillText("jatsXml");
    fillText("site");
    fillText("canonicalUrl");
}
303
/**
 * Looks up a preprint through the api.biorxiv.org "details" endpoint and
 * maps the first record of the returned collection to a signals record.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {string} server - Preprint server name used in the API path and
 *   in the generated canonical URL.
 * @param {string} doi - Preprint DOI to look up.
 * @returns {Promise<object|null>} The mapped record, or null when the
 *   request is not ok or the collection has no usable first record.
 */
async function fetchBioRxivRecord(context, server, doi) {
    const endpoint = `https://api.biorxiv.org/details/${server}/${doi}/na/json`;
    const response = await fetchProviderJson(endpoint, context.timeoutMs, {
        headers: {
            accept: "application/json,text/plain,*/*",
            "user-agent": "Mozilla/5.0",
        },
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const payload = objectValue(response.data);
    const collection = Array.isArray(payload.collection) ? payload.collection : [];
    const record = objectValue(collection[0]);
    if (Object.keys(record).length === 0) {
        return null;
    }
    const authors = uniqueTexts(textValue(record.authors).split(";").map((name) => name.trim()));
    const normalizedServer = server.toLowerCase();
    const version = textValue(record.version) || "1";
    const preprintDoi = normalizeDoi(textValue(record.doi) || doi);
    // Without a DOI no versioned URL can be built; fall back to the
    // URL that was originally requested.
    const canonicalUrl = preprintDoi
        ? `https://www.${normalizedServer}.org/content/${preprintDoi}v${version}`
        : context.url;
    const hasCanonical = Boolean(canonicalUrl);
    return {
        title: textValue(record.title),
        // textValue already collapses whitespace runs.
        abstract: textValue(record.abstract),
        authors,
        doi: preprintDoi,
        publishedDoi: normalizeDoi(textValue(record.published)),
        published: textValue(record.date),
        pdfUrls: hasCanonical ? [`${canonicalUrl}.full.pdf`] : [],
        fullTextUrls: hasCanonical ? [canonicalUrl] : [],
        license: textValue(record.license),
        category: textValue(record.category),
        jatsXml: textValue(record.jatsxml),
        site: normalizedServer,
        canonicalUrl,
    };
}
341
/**
 * Builds a Europe PMC search query from the best available identifier,
 * preferring DOI, then PMCID, then PMID.
 *
 * @param {object} signals - Record with optional `doi`, `pmcid`, `pmid`.
 * @returns {string} The query string, or "" when no identifier is set.
 */
function europePmcQuery(signals) {
    const { doi, pmcid, pmid } = signals;
    if (doi) {
        return `DOI:${doi}`;
    }
    if (pmcid) {
        return `PMCID:${pmcid}`;
    }
    return pmid ? `EXT_ID:${pmid} AND SRC:MED` : "";
}
350
/**
 * Picks the open-access links out of a Europe PMC entry's
 * `fullTextUrlList.fullTextUrl` items.
 *
 * An item counts as open access when its `availabilityCode` is "OA" or
 * its `availability` is "open access" (both case-insensitive). A URL is
 * listed as a PDF when it looks like one or when the first open-access
 * item with that URL declares `documentStyle` "pdf". A URL is listed as
 * full text when it does not look like a PDF or contains "articles/PMC".
 *
 * @param {object} entry - Europe PMC result entry.
 * @returns {{pdfUrls: string[], fullTextUrls: string[]}} The two lists.
 */
function europePmcLinks(entry) {
    const rawItems = objectValue(entry.fullTextUrlList).fullTextUrl;
    const candidates = Array.isArray(rawItems) ? rawItems : [];
    const isOpenAccess = (item) => textValue(item.availabilityCode).toUpperCase() === "OA"
        || textValue(item.availability).toLowerCase() === "open access";
    const openItems = candidates.map((item) => objectValue(item)).filter(isOpenAccess);
    const urls = openItems.map((item) => textValue(item.url)).filter(Boolean);
    const declaredAsPdf = (url) => {
        const owner = openItems.find((item) => textValue(item.url) === url);
        return textValue(objectValue(owner).documentStyle).toLowerCase() === "pdf";
    };
    return {
        pdfUrls: urls.filter((url) => looksLikePdfUrl(url) || declaredAsPdf(url)),
        fullTextUrls: urls.filter((url) => !looksLikePdfUrl(url) || /articles\/PMC/i.test(url)),
    };
}
360
/**
 * Searches Europe PMC for the paper identified by the given signals and
 * maps the first hit to a signals record.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {object} signals - Record with optional `doi`, `pmcid`, `pmid`.
 * @returns {Promise<object|null>} The mapped record, or null when no
 *   identifier is available, the request is not ok, or there is no
 *   usable first hit.
 */
async function fetchEuropePmcRecord(context, signals) {
    const query = europePmcQuery(signals);
    if (!query) {
        return null;
    }
    const endpoint = `https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=${encodeURIComponent(query)}&format=json&pageSize=1&resultType=core`;
    const response = await fetchProviderJson(endpoint, context.timeoutMs, {
        headers: {
            accept: "application/json,text/plain,*/*",
            "user-agent": "Mozilla/5.0",
        },
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const hits = objectValue(objectValue(response.data).resultList).result;
    const entry = objectValue(Array.isArray(hits) ? hits[0] : undefined);
    if (Object.keys(entry).length === 0) {
        return null;
    }
    const { pdfUrls, fullTextUrls } = europePmcLinks(entry);
    const authorParts = textValue(entry.authorString).split(/[;,]/).map((name) => name.trim());
    return {
        title: textValue(entry.title),
        abstract: textValue(entry.abstractText),
        authors: uniqueTexts(authorParts),
        doi: normalizeDoi(textValue(entry.doi)),
        pmid: textValue(entry.pmid),
        pmcid: textValue(entry.pmcid).toUpperCase(),
        journal: textValue(entry.journalTitle),
        published: textValue(entry.firstPublicationDate) || textValue(entry.firstIndexDate),
        pdfUrls,
        fullTextUrls,
    };
}
395
/**
 * Reads author display names from an OpenAlex work's `authorships` list
 * (each item's `author.display_name`).
 *
 * @param {object} data - OpenAlex work object.
 * @returns {string[]} Distinct non-empty names in first-seen order.
 */
function openAlexAuthors(data) {
    if (!Array.isArray(data.authorships)) {
        return uniqueTexts([]);
    }
    const names = data.authorships.map((authorship) => {
        const author = objectValue(objectValue(authorship).author);
        return textValue(author.display_name);
    });
    return uniqueTexts(names);
}
399
/**
 * Collects PDF and landing-page links from an OpenAlex work.
 *
 * Candidates come from `best_oa_location`, `primary_location`,
 * `open_access.oa_url` and the first six entries of `locations`, in that
 * priority order, and are then de-duplicated.
 *
 * @param {object} data - OpenAlex work object.
 * @returns {{pdfUrls: string[], fullTextUrls: string[]}} The two lists.
 */
function openAlexLinks(data) {
    const best = objectValue(data.best_oa_location);
    const primary = objectValue(data.primary_location);
    const firstLocations = (Array.isArray(data.locations) ? data.locations.slice(0, 6) : [])
        .map((location) => objectValue(location));
    const locationPdfs = firstLocations.map((location) => textValue(location.pdf_url));
    const locationPages = firstLocations.map((location) => textValue(location.landing_page_url));
    const pdfUrls = dedupeUrls([
        textValue(best.pdf_url),
        textValue(primary.pdf_url),
        ...locationPdfs,
    ]);
    const fullTextUrls = dedupeUrls([
        textValue(best.landing_page_url),
        textValue(primary.landing_page_url),
        textValue(objectValue(data.open_access).oa_url),
        ...locationPages,
    ]);
    return { pdfUrls, fullTextUrls };
}
416
/**
 * Fetches an OpenAlex work by DOI and maps it to a signals record.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {string} doi - DOI of the work.
 * @returns {Promise<object|null>} The mapped record, or null when the
 *   request is not ok or the response body is empty.
 */
async function fetchOpenAlexRecord(context, doi) {
    const workId = encodeURIComponent(`doi:${doi}`);
    const response = await fetchProviderJson(`https://api.openalex.org/works/${workId}`, context.timeoutMs, {
        headers: {
            accept: "application/json,text/plain,*/*",
            "user-agent": "Mozilla/5.0",
        },
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const work = objectValue(response.data);
    if (Object.keys(work).length === 0) {
        return null;
    }
    const { pdfUrls, fullTextUrls } = openAlexLinks(work);
    const best = objectValue(work.best_oa_location);
    const primary = objectValue(work.primary_location);
    const journal = textValue(objectValue(best.source).display_name)
        || textValue(objectValue(primary.source).display_name)
        || textValue(primary.display_name);
    return {
        title: textValue(work.title),
        authors: openAlexAuthors(work),
        journal,
        published: textValue(work.publication_date),
        license: textValue(best.license) || textValue(primary.license),
        oaStatus: textValue(objectValue(work.open_access).oa_status),
        pdfUrls,
        fullTextUrls,
    };
}
448
/**
 * Collects open-access links and a license from an Unpaywall record.
 *
 * Looks at `best_oa_location` first, then at the first eight entries of
 * `oa_locations`. Only http(s) URLs are kept, and doi.org links are
 * removed from the full-text list.
 *
 * @param {object} data - Unpaywall response object.
 * @returns {{pdfUrls: string[], fullTextUrls: string[], license: string}}
 *   De-duplicated link lists and the first license found (or "").
 */
function unpaywallLinks(data) {
    const best = objectValue(data.best_oa_location);
    const locations = arrayValue(data.oa_locations).map((location) => objectValue(location)).slice(0, 8);
    const pick = (field) => locations.map((location) => httpUrl(location[field]));
    const pdfUrls = dedupeUrls([httpUrl(best.url_for_pdf), ...pick("url_for_pdf")]);
    const pageCandidates = dedupeUrls([
        httpUrl(best.url_for_landing_page),
        httpUrl(best.url),
        ...pick("url_for_landing_page"),
        ...pick("url"),
    ]);
    const fullTextUrls = pageCandidates.filter((url) => !/doi\.org\//i.test(url));
    const locationLicenses = uniqueTexts(locations.map((location) => textValue(location.license)));
    const license = textValue(best.license) || locationLicenses[0] || "";
    return { pdfUrls, fullTextUrls, license };
}
464
/**
 * Fetches the Unpaywall record for a DOI and maps it to a signals record.
 * The request is skipped when no Unpaywall email is configured.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {string} doi - DOI to look up.
 * @returns {Promise<object|null>} The mapped record, or null when no
 *   email is configured, the request is not ok, or the body is empty.
 */
async function fetchUnpaywallRecord(context, doi) {
    const email = unpaywallEmail();
    if (!email) {
        return null;
    }
    const endpoint = `https://api.unpaywall.org/v2/${encodeURIComponent(doi)}?email=${encodeURIComponent(email)}`;
    const response = await fetchProviderJson(endpoint, context.timeoutMs, {
        headers: {
            accept: "application/json,text/plain,*/*",
            "user-agent": "Mozilla/5.0",
        },
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const record = objectValue(response.data);
    if (Object.keys(record).length === 0) {
        return null;
    }
    const { pdfUrls, fullTextUrls, license } = unpaywallLinks(record);
    return {
        title: textValue(record.title),
        authors: authorNames(record.z_authors),
        journal: textValue(record.journal_name),
        published: textValue(record.published_date) || textValue(record.year),
        license,
        oaStatus: textValue(record.oa_status),
        pdfUrls,
        fullTextUrls,
        canonicalUrl: textValue(record.doi_url) || doiLinkUrl(doi),
    };
}
495
/**
 * Builds the prefixed paper identifier used in the Semantic Scholar
 * request path, preferring the DOI over the PMID.
 *
 * @param {object} signals - Record with optional `doi` and `pmid`.
 * @returns {string} "DOI:<doi>", "PMID:<pmid>", or "" when neither is set.
 */
function semanticScholarPaperId(signals) {
    const { doi, pmid } = signals;
    if (doi) {
        return `DOI:${doi}`;
    }
    return pmid ? `PMID:${pmid}` : "";
}
502
/**
 * Fetches a paper from the Semantic Scholar Graph API and maps it to a
 * signals record. The configured API key, if any, is sent as "x-api-key".
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {object} signals - Record with optional `doi` and `pmid`.
 * @returns {Promise<object|null>} The mapped record, or null when no
 *   identifier is available, the request is not ok, or the body is empty.
 */
async function fetchSemanticScholarRecord(context, signals) {
    const paperId = semanticScholarPaperId(signals);
    if (!paperId) {
        return null;
    }
    const apiKey = semanticScholarApiKey();
    const headers = { accept: "application/json,text/plain,*/*" };
    if (apiKey) {
        headers["x-api-key"] = apiKey;
    }
    headers["user-agent"] = "Mozilla/5.0";
    const endpoint = `https://api.semanticscholar.org/graph/v1/paper/${encodeURIComponent(paperId)}?fields=title,abstract,authors,venue,year,externalIds,openAccessPdf,isOpenAccess,journal,url`;
    const response = await fetchProviderJson(endpoint, context.timeoutMs, { headers }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const paper = objectValue(response.data);
    if (Object.keys(paper).length === 0) {
        return null;
    }
    const openAccessPdf = objectValue(paper.openAccessPdf);
    const externalIds = objectValue(paper.externalIds);
    const journal = objectValue(paper.journal);
    const openUrl = httpUrl(openAccessPdf.url);
    // The "openAccessPdf" URL is filed under PDFs only when it looks like
    // one; otherwise it is treated as a full-text page.
    const openUrlIsPdf = Boolean(openUrl) && looksLikePdfUrl(openUrl);
    return {
        title: textValue(paper.title),
        abstract: textValue(paper.abstract),
        authors: authorNames(paper.authors),
        doi: normalizeDoi(textValue(externalIds.DOI)),
        pmid: textValue(externalIds.PubMed),
        journal: textValue(journal.name) || textValue(paper.venue),
        published: textValue(paper.year),
        license: textValue(openAccessPdf.license),
        oaStatus: textValue(openAccessPdf.status).toLowerCase() || (paper.isOpenAccess === true ? "open" : ""),
        pdfUrls: openUrlIsPdf ? [openUrl] : [],
        fullTextUrls: openUrl && !openUrlIsPdf ? [openUrl] : [],
        canonicalUrl: textValue(paper.url),
    };
}
541
/**
 * Harvests usable links from a CORE API payload.
 *
 * Every string property whose key ends in "download", "fulltext" /
 * "full text", "reader" or "url" is considered. Non-http(s) values,
 * api.core.ac.uk endpoints and doi.org links are discarded, and the rest
 * are split into PDF-looking and other links.
 *
 * @param {*} data - CORE response object (or sub-object) to scan.
 * @returns {{pdfUrls: string[], fullTextUrls: string[]}} The two lists.
 */
function coreSearchLinks(data) {
    const harvested = [];
    collectUrlFields(data, /(download|full.?text|reader|url)$/i, harvested);
    const httpOnly = harvested.map((candidate) => httpUrl(candidate)).filter(Boolean);
    const isApiEndpoint = (url) => /^https?:\/\/api\.core\.ac\.uk\//i.test(url);
    const isDoiLink = (url) => /doi\.org\//i.test(url);
    const usable = dedupeUrls(httpOnly).filter((url) => !isApiEndpoint(url) && !isDoiLink(url));
    const pdfUrls = usable.filter((url) => looksLikePdfUrl(url));
    const fullTextUrls = usable.filter((url) => !looksLikePdfUrl(url));
    return { pdfUrls, fullTextUrls };
}
550
/**
 * Searches the CORE v3 works index for a DOI and maps the best result to
 * a signals record. The request is skipped when no CORE API key is
 * configured.
 *
 * The result whose DOI equals the requested one is preferred; otherwise
 * the first result is used.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {string} doi - DOI to search for.
 * @returns {Promise<object|null>} The mapped record, or null when no key
 *   is configured, the request is not ok, or there are no results.
 */
async function fetchCoreSearchRecord(context, doi) {
    const apiKey = coreApiKey();
    if (!apiKey) {
        return null;
    }
    const query = encodeURIComponent(`doi:"${doi}"`);
    const endpoint = `https://api.core.ac.uk/v3/search/works?q=${query}&limit=3&offset=0&sort=relevance`;
    const response = await fetchProviderJson(endpoint, context.timeoutMs, {
        headers: {
            accept: "application/json,text/plain,*/*",
            authorization: `Bearer ${apiKey}`,
            "user-agent": "Mozilla/5.0",
        },
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const results = arrayValue(objectValue(response.data).results).map((item) => objectValue(item));
    const exactMatch = results.find((item) => normalizeDoi(textValue(item.doi)) === doi);
    const entry = exactMatch || results[0] || null;
    if (!entry) {
        return null;
    }
    const { pdfUrls, fullTextUrls } = coreSearchLinks(entry);
    const journalTitles = uniqueTexts(arrayValue(entry.journals).map((journal) => textValue(objectValue(journal).title)));
    const workId = textValue(entry.id);
    return {
        title: textValue(entry.title),
        abstract: textValue(entry.abstract) || textValue(entry.description),
        authors: authorNames(entry.authors),
        doi: normalizeDoi(textValue(entry.doi)),
        journal: journalTitles[0] || textValue(entry.publisher),
        published: textValue(entry.publishedDate) || textValue(entry.yearPublished) || textValue(entry.depositedDate),
        pdfUrls,
        fullTextUrls,
        site: "core",
        canonicalUrl: workId ? `https://core.ac.uk/works/${workId}` : "",
    };
}
589
/**
 * POSTs the DOI to the CORE v3 "discover" endpoint and returns whatever
 * links the response contains as a links-only signals record.
 *
 * @param {object} context - Fetch context; reads `timeoutMs` and `url`.
 * @param {string} doi - DOI to discover.
 * @returns {Promise<object|null>} A record carrying only the DOI, links
 *   and site "core". Null when the request is not ok, the body is empty,
 *   or no usable link is found.
 */
async function fetchCoreDiscoverRecord(context, doi) {
    const response = await fetchProviderJson("https://api.core.ac.uk/v3/discover", context.timeoutMs, {
        method: "POST",
        headers: {
            accept: "application/json,text/plain,*/*",
            "content-type": "application/json",
            "user-agent": "Mozilla/5.0",
        },
        body: JSON.stringify({ doi }),
    }, {
        mode: "relay",
        sourceUrl: context.url,
    });
    if (!response.ok) {
        return null;
    }
    const payload = objectValue(response.data);
    if (Object.keys(payload).length === 0) {
        return null;
    }
    const { pdfUrls, fullTextUrls } = coreSearchLinks(payload);
    const foundNothing = pdfUrls.length === 0 && fullTextUrls.length === 0;
    if (foundNothing) {
        return null;
    }
    return {
        title: "",
        abstract: "",
        authors: [],
        doi,
        journal: "",
        published: "",
        pdfUrls,
        fullTextUrls,
        site: "core",
        canonicalUrl: "",
    };
}
623
/**
 * Queries CORE through both the search and the discover endpoints, one
 * after the other, and merges the two results into a single record.
 *
 * Search metadata is merged first, so it takes precedence over the
 * discover result. Discover links are appended after the search links.
 *
 * @param {object} context - Fetch context passed to both lookups.
 * @param {string} doi - DOI to look up.
 * @returns {Promise<object|null>} The merged record, or null when both
 *   lookups come back empty.
 */
async function fetchCoreRecord(context, doi) {
    const fromSearch = await fetchCoreSearchRecord(context, doi);
    const fromDiscover = await fetchCoreDiscoverRecord(context, doi);
    const gotNothing = !fromSearch && !fromDiscover;
    if (gotNothing) {
        return null;
    }
    const combined = {
        title: "",
        abstract: "",
        authors: [],
        doi: "",
        journal: "",
        published: "",
        pdfUrls: [],
        fullTextUrls: [],
        site: "core",
        canonicalUrl: "",
    };
    if (fromSearch) {
        mergeSignals(combined, fromSearch, { linkOrder: "prepend" });
    }
    if (fromDiscover) {
        mergeSignals(combined, fromDiscover);
    }
    return combined;
}
646
/**
 * Decides whether a signals record carries any evidence that the page is
 * a scholarly paper: an identifier, an author, a journal or a link.
 *
 * Checks run in a fixed order and stop at the first hit.
 *
 * @param {object} signals - Signals record.
 * @returns {boolean} True when at least one signal is present.
 */
function hasPaperSignals(signals) {
    if (signals.doi || signals.pmid || signals.pmcid) {
        return true;
    }
    if (signals.authors.length > 0) {
        return true;
    }
    if (signals.journal) {
        return true;
    }
    if (signals.pdfUrls.length > 0) {
        return true;
    }
    return signals.fullTextUrls.length > 0;
}
655
/**
 * Builds the text body for a paper: a narrative block followed by a
 * block of "Label: value" metadata lines.
 *
 * The narrative is the current text when it is longer than 600
 * characters. Otherwise it is the abstract, falling back to the current
 * text when there is no abstract. Empty metadata lines are omitted,
 * identical blocks are emitted once, and the result is capped at 50000
 * characters.
 *
 * @param {string} currentText - Text already extracted for the page.
 * @param {object} signals - Signals record supplying the metadata.
 * @param {string} bestPdfUrl - PDF link to advertise, or "".
 * @param {string} bestFullTextUrl - Full-text link to advertise, or "".
 * @returns {string} The composed text.
 */
function composePaperText(currentText, signals, bestPdfUrl, bestFullTextUrl) {
    const hasSubstantialBody = currentText.length > 600;
    const narrative = hasSubstantialBody ? currentText : (signals.abstract || currentText);
    const rows = [
        ["Authors", signals.authors.length > 0 ? signals.authors.join(", ") : ""],
        ["Journal", signals.journal],
        ["Published", signals.published],
        ["DOI", signals.doi],
        ["Published DOI", signals.publishedDoi],
        ["PMID", signals.pmid],
        ["PMCID", signals.pmcid],
        ["Open access", signals.oaStatus],
        ["License", signals.license],
        ["Category", signals.category],
        ["PDF", bestPdfUrl],
        ["Full text", bestFullTextUrl],
        ["JATS XML", signals.jatsXml],
    ];
    const metadata = rows
        .filter(([, value]) => value)
        .map(([label, value]) => `${label}: ${value}`)
        .join("\n");
    const narrativeBlock = normalizeSpace(narrative) ? narrative.trim() : "";
    const blocks = new Set([narrativeBlock, metadata].filter(Boolean));
    return [...blocks].join("\n\n").slice(0, 50000);
}
677
/**
 * Enriches a normalized fetch result with academic metadata when the page
 * looks like a paper.
 *
 * Steps, in order:
 *  1. Bail out unchanged when the academic fallback is disabled.
 *  2. Extract signals from the page; for a recognised preprint server, merge
 *     the bioRxiv API record (when that API is enabled and a DOI is known).
 *  3. Bail out unchanged when there are no paper signals and the post kind
 *     does not start with "paper_".
 *  4. Merge Europe PMC and then OpenAlex records. These run sequentially on
 *     purpose: each lookup reads identifiers from `signals`, which the
 *     previous merge may have filled in.
 *  5. Build the next post and outbound link list; return them only when they
 *     differ from the input.
 *
 * @param {object} context - Fetch context; `url` and optional `resolvedUrl` are read here.
 * @param {object} normalized - Normalized result; `post` and `outbound_links` are read here.
 * @returns {Promise<{normalized: object, retrievalMethods: string[]}>} The
 *   (possibly enriched) result and the names of the academic sources that
 *   contributed. `retrievalMethods` is empty whenever `normalized` is
 *   returned unchanged.
 */
export async function maybeUseAcademicFallback(context, normalized) {
    if (!academicFallbackEnabled()) {
        return { normalized, retrievalMethods: [] };
    }
    const post = objectValue(normalized.post);
    const resolvedUrl = context.resolvedUrl || context.url;
    const signals = extractAcademicSignals(context, normalized);
    const retrievalMethods = [];
    const isPaperKind = String(textValue(post.kind)).startsWith("paper_");

    const server = preprintServer(resolvedUrl);
    if (server && biorxivApiEnabled()) {
        const apiDoi = signals.doi || preprintDoiFromUrl(resolvedUrl);
        if (apiDoi) {
            const preprint = await fetchBioRxivRecord(context, server, apiDoi);
            if (preprint) {
                mergeSignals(signals, preprint);
                retrievalMethods.push("academic_biorxiv_api");
            }
        }
    }
    if (!hasPaperSignals(signals) && !isPaperKind) {
        return { normalized, retrievalMethods: [] };
    }
    if (europePmcEnabled() && (signals.doi || signals.pmid || signals.pmcid)) {
        const europePmc = await fetchEuropePmcRecord(context, signals);
        if (europePmc) {
            mergeSignals(signals, europePmc);
            retrievalMethods.push("academic_europepmc");
        }
    }
    if (openAlexEnabled() && signals.doi) {
        const openAlex = await fetchOpenAlexRecord(context, signals.doi);
        if (openAlex) {
            mergeSignals(signals, openAlex);
            // After the merge, move OpenAlex links to the front of both lists.
            signals.pdfUrls = dedupeUrls([...(openAlex.pdfUrls || []), ...signals.pdfUrls]);
            signals.fullTextUrls = dedupeUrls([...(openAlex.fullTextUrls || []), ...signals.fullTextUrls]);
            retrievalMethods.push("academic_openalex");
        }
    }

    const title = signals.title || textValue(post.title);
    const description = signals.abstract || textValue(post.description);
    const doiUrl = doiLinkUrl(signals.doi);
    const pubmedUrl = signals.pmid ? `https://pubmed.ncbi.nlm.nih.gov/${signals.pmid}/` : "";
    const pmcUrl = signals.pmcid ? `https://pmc.ncbi.nlm.nih.gov/articles/${signals.pmcid}/` : "";
    const bestPdfUrl = signals.pdfUrls[0] || "";
    // The full-text link must differ from the PDF link; fall back to PMC, then the DOI link.
    const bestFullTextUrl = signals.fullTextUrls.find((url) => url !== bestPdfUrl) || pmcUrl || doiUrl || "";
    const kind = server ? "paper_preprint" : isPaperKind ? textValue(post.kind) : "paper_article";
    // Single source of truth for both `status` and `partial`: a record is
    // complete when it has a title plus a description, a link or an identifier.
    const isComplete = Boolean(title
        && (description || bestPdfUrl || bestFullTextUrl || signals.doi || signals.pmid || signals.pmcid));
    const status = isComplete ? "ok" : textValue(post.status) || "reference_only";
    const text = composePaperText(String(post.text || "").trim(), signals, bestPdfUrl, bestFullTextUrl);
    const outboundLinks = dedupeUrls([
        // A result without outbound_links is treated as having none.
        ...(normalized.outbound_links || []),
        signals.canonicalUrl,
        doiUrl,
        pubmedUrl,
        pmcUrl,
        ...signals.pdfUrls,
        ...signals.fullTextUrls,
        signals.jatsXml,
    ]);
    const authors = signals.authors.length > 0 ? signals.authors : authorListFromPost(post);
    const nextPost = {
        ...post,
        url: textValue(post.url) || signals.canonicalUrl || context.url,
        title,
        description,
        text,
        ...(authors.length > 0 ? { authors } : {}),
        ...(signals.journal ? { journal: signals.journal } : {}),
        ...(signals.published ? { published: signals.published } : {}),
        ...(signals.doi ? { doi: signals.doi } : {}),
        ...(signals.pmid ? { pmid: signals.pmid } : {}),
        ...(signals.pmcid ? { pmcid: signals.pmcid } : {}),
        ...(signals.publishedDoi ? { published_doi: signals.publishedDoi } : {}),
        ...(signals.oaStatus ? { open_access_status: signals.oaStatus } : {}),
        ...(signals.license ? { open_access_license: signals.license } : {}),
        ...(bestPdfUrl ? { pdf_url: bestPdfUrl } : {}),
        ...(bestFullTextUrl ? { full_text_url: bestFullTextUrl } : {}),
        ...(signals.category ? { category: signals.category } : {}),
        ...(signals.jatsXml ? { jats_xml: signals.jatsXml } : {}),
        ...(signals.site ? { site: signals.site } : {}),
        kind,
        status,
    };
    // Compare serialized forms so an enrichment that changes nothing is
    // reported as "no fallback used".
    const postChanged = JSON.stringify(normalized.post || null) !== JSON.stringify(nextPost);
    const linksChanged = JSON.stringify(normalized.outbound_links) !== JSON.stringify(outboundLinks);
    if (!postChanged && !linksChanged) {
        return { normalized, retrievalMethods: [] };
    }
    return {
        normalized: {
            ...normalized,
            post: nextPost,
            outbound_links: outboundLinks,
            partial: !isComplete,
        },
        retrievalMethods: Array.from(new Set(retrievalMethods)),
    };
}
777
+ //# sourceMappingURL=academic-fallback.js.map