search_paper 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1365 @@
1
+ // src/client.ts
2
+ import { GhostClient } from "ghostfetch";
3
/**
 * Builds a GhostClient with this package's default settings.
 *
 * @param {object} [options]
 * @param {string} [options.browser] - browser profile; defaults to "Chrome_131".
 * @param {number} [options.timeout] - request timeout in ms; defaults to 15000.
 * @param {*} [options.proxy] - forwarded as `proxy` when truthy.
 * @param {*} [options.proxyPool] - forwarded as `proxyPool` when truthy, together
 *   with round-robin rotation.
 * @returns {GhostClient} the configured client.
 */
function createClient(options) {
  const config = {
    browser: options?.browser ?? "Chrome_131",
    autoSolveJsChallenge: true,
    timeout: options?.timeout ?? 15000
  };
  if (options?.proxy) {
    config.proxy = options.proxy;
  }
  if (options?.proxyPool) {
    config.proxyPool = options.proxyPool;
    config.proxyRotation = "round-robin";
  }
  return new GhostClient(config);
}
15
+
16
+ // src/utils/sleep.ts
17
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
20
+
21
+ // src/utils/rate-limiter.ts
22
/**
 * Spaces calls at least `minIntervalMs` (+ random jitter up to `jitterMs`) apart.
 *
 * Each `wait()` reserves its slot synchronously before sleeping, so callers
 * that arrive concurrently are released one interval apart instead of all
 * reading the same `lastRequest` and being released together.
 */
var RateLimiter = class {
  /**
   * @param {number} minIntervalMs - minimum gap between two released callers.
   * @param {number} [jitterMs=0] - upper bound of the random extra delay.
   */
  constructor(minIntervalMs, jitterMs = 0) {
    this.minIntervalMs = minIntervalMs;
    this.jitterMs = jitterMs;
  }
  /** Timestamp (ms) of the most recently reserved slot. */
  lastRequest = 0;
  /**
   * Resolves once the caller is allowed to issue its request.
   * @returns {Promise<void>}
   */
  async wait() {
    const now = Date.now();
    const interval = this.minIntervalMs + Math.random() * this.jitterMs;
    const slot = Math.max(now, this.lastRequest + interval);
    // Reserve before awaiting: the next caller must queue behind this slot.
    this.lastRequest = slot;
    if (slot > now) {
      await sleep(slot - now);
    }
  }
};
38
/**
 * Rate limiter whose interval grows when the remote side blocks us and
 * shrinks again after a run of successes.
 *
 * Two fixes over a plain "remember the last request" limiter:
 *  - `wait()` reserves its slot synchronously, so concurrent callers are
 *    released one interval apart rather than all at once;
 *  - `isBlocked` expires once the current back-off interval has elapsed since
 *    the last block. Otherwise a caller that refuses to send while blocked can
 *    never reach `success()` and stays locked out for the process lifetime.
 */
var AdaptiveRateLimiter = class {
  /**
   * @param {number} baseIntervalMs - interval used when nothing is blocking.
   * @param {number} [baseJitterMs=0] - upper bound of random extra delay.
   * @param {number} [maxIntervalMs=120000] - cap for the backed-off interval.
   * @param {number} [backoffMultiplier=3] - growth factor applied by `backoff()`.
   * @param {number} [cooldownAfter=2] - consecutive successes needed to shrink.
   * @param {number} [cooldownFactor=0.7] - shrink factor applied by `success()`.
   */
  constructor(baseIntervalMs, baseJitterMs = 0, maxIntervalMs = 120000, backoffMultiplier = 3, cooldownAfter = 2, cooldownFactor = 0.7) {
    this.baseIntervalMs = baseIntervalMs;
    this.baseJitterMs = baseJitterMs;
    this.maxIntervalMs = maxIntervalMs;
    this.backoffMultiplier = backoffMultiplier;
    this.cooldownAfter = cooldownAfter;
    this.cooldownFactor = cooldownFactor;
    this.currentIntervalMs = baseIntervalMs;
  }
  /** Timestamp (ms) of the most recently reserved slot. */
  lastRequest = 0;
  /** Timestamp (ms) of the most recent `backoff()` call. */
  lastBlockAt = 0;
  currentIntervalMs;
  consecutiveSuccesses = 0;
  consecutiveBlocks = 0;
  /** Interval currently in effect (ms). */
  get interval() {
    return this.currentIntervalMs;
  }
  /**
   * True after two or more consecutive blocks, but only while the current
   * back-off interval has not yet elapsed since the last block.
   */
  get isBlocked() {
    if (this.consecutiveBlocks < 2) return false;
    return Date.now() - this.lastBlockAt < this.currentIntervalMs;
  }
  /**
   * Resolves once the caller is allowed to issue its request.
   * @returns {Promise<void>}
   */
  async wait() {
    const now = Date.now();
    const jitter = Math.random() * this.baseJitterMs;
    const slot = Math.max(now, this.lastRequest + this.currentIntervalMs + jitter);
    // Reserve before awaiting so the next caller queues behind this slot.
    this.lastRequest = slot;
    if (slot > now) {
      await sleep(slot - now);
    }
  }
  /** Call after a successful request: gradually shrinks the delay. */
  success() {
    this.consecutiveSuccesses++;
    this.consecutiveBlocks = 0;
    if (this.consecutiveSuccesses >= this.cooldownAfter && this.currentIntervalMs > this.baseIntervalMs) {
      this.currentIntervalMs = Math.max(
        this.baseIntervalMs,
        Math.round(this.currentIntervalMs * this.cooldownFactor)
      );
      this.consecutiveSuccesses = 0;
    }
  }
  /** Call when a block is detected: grows the delay exponentially up to the cap. */
  backoff() {
    this.consecutiveSuccesses = 0;
    this.consecutiveBlocks++;
    this.lastBlockAt = Date.now();
    this.currentIntervalMs = Math.min(
      this.maxIntervalMs,
      Math.round(this.currentIntervalMs * this.backoffMultiplier)
    );
  }
};
92
+
93
+ // src/utils/retry.ts
94
+ import {
95
+ TimeoutError,
96
+ NetworkError,
97
+ ProxyError,
98
+ ChallengeError
99
+ } from "ghostfetch";
100
+
101
+ // src/utils/errors.ts
102
/**
 * Error raised for a non-OK HTTP response.
 * Exposes the numeric `status` and the `statusText` alongside the message.
 */
var HttpError = class extends Error {
  /**
   * @param {number} status - HTTP status code.
   * @param {string} statusText - HTTP status text.
   */
  constructor(status, statusText) {
    const message = `HTTP ${status}: ${statusText}`;
    super(message);
    Object.assign(this, { status, statusText, name: "HttpError" });
  }
};
110
+
111
+ // src/utils/retry.ts
112
/**
 * Runs `fn`, retrying retryable failures with exponential back-off plus jitter.
 *
 * @param {() => Promise<*>} fn - operation to attempt.
 * @param {object} [options]
 * @param {number} [options.maxRetries=3] - retries after the first attempt.
 * @param {number} [options.baseDelayMs=500] - delay unit; doubles per attempt.
 * @returns {Promise<*>} whatever `fn` resolves to.
 * @throws the last error once retries are exhausted, or immediately when
 *   `isRetryable` rejects the error.
 */
async function withRetry(fn, options) {
  const retryLimit = options?.maxRetries ?? 3;
  const delayUnit = options?.baseDelayMs ?? 500;
  let attempt = 0;
  while (attempt <= retryLimit) {
    try {
      return await fn();
    } catch (err) {
      const outOfAttempts = attempt === retryLimit;
      if (outOfAttempts || !isRetryable(err)) {
        throw err;
      }
      const jitter = Math.random() * delayUnit * 0.5;
      await sleep(delayUnit * 2 ** attempt + jitter);
    }
    attempt += 1;
  }
  throw new Error("unreachable");
}
126
/**
 * Decides whether a failed attempt is worth repeating.
 *
 * Timeout, network and proxy errors are retryable; challenge errors are not;
 * an HttpError is retryable only for status 429 or 503. Anything else is not.
 * The checks run in this order because the relationship between the imported
 * error classes is not visible here.
 *
 * @param {*} err - the value thrown by the attempt.
 * @returns {boolean}
 */
function isRetryable(err) {
  const transientTypes = [TimeoutError, NetworkError, ProxyError];
  if (transientTypes.some((ErrorType) => err instanceof ErrorType)) {
    return true;
  }
  if (err instanceof ChallengeError) {
    return false;
  }
  return err instanceof HttpError && (err.status === 429 || err.status === 503);
}
136
+
137
+ // src/sources/semantic-scholar.ts
138
// Root of the Semantic Scholar Graph API used by every request below.
var BASE_URL = "https://api.semanticscholar.org/graph/v1";

// Module-level limiters: every SemanticScholarSource instance shares one of
// these, chosen by whether it was given an API key.
var sharedRateLimiters = {
  withKey: new RateLimiter(200, 100),
  withoutKey: new RateLimiter(5000, 1500)
};

// Comma-separated `fields` value requested for search, single and batch lookups.
var PAPER_FIELDS = [
  "paperId", "externalIds", "url",
  "title", "abstract", "venue", "year",
  "referenceCount", "citationCount",
  "isOpenAccess", "openAccessPdf",
  "authors.name", "authors.authorId", "authors.affiliations",
  "journal", "publicationDate", "s2FieldsOfStudy"
].join();

// Comma-separated `fields` value requested for the citations/references endpoints.
var NESTED_PAPER_FIELDS = [
  "paperId", "externalIds", "url",
  "title", "abstract", "venue", "year",
  "referenceCount", "citationCount",
  "openAccessPdf",
  "authors",
  "journal", "publicationDate", "s2FieldsOfStudy"
].join();
178
/**
 * Converts a raw Semantic Scholar paper record into this package's paper shape.
 *
 * Robust to records without an `authors` array (treated as no authors) and
 * de-duplicates field-of-study categories, which the API can report more than
 * once for the same category.
 *
 * @param {object} raw - paper object as returned by the Graph API.
 * @returns {object} normalized paper with `source: "semantic_scholar"`.
 */
function toPaper(raw) {
  const authors = (raw.authors ?? []).map((a, i) => ({
    name: a.name,
    affiliations: a.affiliations?.length ? a.affiliations : void 0,
    scholarId: a.authorId ?? void 0,
    isFirstAuthor: i === 0 ? true : void 0
  }));
  // Only build a journal object when at least one of its two fields is present.
  const journal = raw.journal?.name || raw.journal?.volume ? {
    name: raw.journal.name || void 0,
    volume: raw.journal.volume || void 0
  } : void 0;
  const categories = (raw.s2FieldsOfStudy ?? []).map((f) => f.category);
  // A Set keeps first-seen order while dropping repeated categories.
  const keywords = categories.length ? [...new Set(categories)] : void 0;
  return {
    title: raw.title,
    authors,
    abstract: raw.abstract ?? void 0,
    year: raw.year ?? void 0,
    publicationDate: raw.publicationDate ?? void 0,
    venue: raw.venue || raw.journal?.name || void 0,
    doi: raw.externalIds?.DOI ?? void 0,
    url: raw.url,
    pdfUrl: raw.openAccessPdf?.url || void 0,
    citationCount: raw.citationCount ?? void 0,
    source: "semantic_scholar",
    sourceId: raw.paperId,
    keywords,
    journal
  };
}
207
/**
 * Paper source backed by the Semantic Scholar Graph API.
 *
 * Requests go through a module-level rate limiter (one for keyed, one for
 * anonymous use) and are retried via `withRetry`. A 404 is reported as
 * "no data" (null) rather than as an error.
 */
var SemanticScholarSource = class {
  name = "semantic_scholar";
  client;
  apiKey;
  rateLimiter;
  /**
   * @param {object} client - HTTP client exposing `fetch(url, init)`.
   * @param {object} [options]
   * @param {string} [options.semanticScholarApiKey] - sent as `x-api-key` when set.
   */
  constructor(client, options) {
    this.client = client;
    this.apiKey = options?.semanticScholarApiKey;
    this.rateLimiter = this.apiKey ? sharedRateLimiters.withKey : sharedRateLimiters.withoutKey;
  }
  /**
   * Keyword search.
   * @param {string} query
   * @param {object} [options] - `limit` (capped at 100), `offset`,
   *   `year: {from, to}`, `sort: "citations" | "date"`.
   * @returns {Promise<object>} `{ query, totalResults, papers, nextPageToken?, source }`.
   */
  async search(query, options) {
    const limit = Math.min(options?.limit ?? 10, 100);
    const offset = options?.offset ?? 0;
    const params = new URLSearchParams({
      query,
      fields: PAPER_FIELDS,
      offset: String(offset),
      limit: String(limit)
    });
    if (options?.year) {
      const from = options.year.from ?? "";
      const to = options.year.to ?? "";
      if (from || to) {
        // Either side may be empty, producing an open-ended range such as "2020-".
        params.set("year", `${from}-${to}`);
      }
    }
    if (options?.sort === "citations") {
      params.set("sort", "citationCount:desc");
    } else if (options?.sort === "date") {
      params.set("sort", "publicationDate:desc");
    }
    const data = await this.fetchApi(`/paper/search?${params}`);
    if (!data) {
      return {
        query,
        totalResults: 0,
        papers: [],
        source: "semantic_scholar"
      };
    }
    return {
      query,
      totalResults: data.total,
      // Tolerate a payload without a `data` array (treated as zero papers).
      papers: (data.data ?? []).map(toPaper),
      nextPageToken: data.next != null ? String(data.next) : void 0,
      source: "semantic_scholar"
    };
  }
  /**
   * Looks up one paper by id.
   * @param {string} id
   * @returns {Promise<object|null>} the paper, or null when not found.
   */
  async getPaper(id) {
    const data = await this.fetchApi(
      `/paper/${encodeURIComponent(id)}?fields=${PAPER_FIELDS}`
    );
    return data ? toPaper(data) : null;
  }
  /**
   * Batch lookup; the result has one slot per requested id, null for misses.
   * @param {string[]} ids
   * @returns {Promise<Array<object|null>>}
   */
  async getPapers(ids) {
    if (ids.length === 0) return [];
    const data = await this.fetchApi(
      `/paper/batch?fields=${PAPER_FIELDS}`,
      {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ ids })
      }
    );
    if (!data) return ids.map(() => null);
    return data.map((raw) => raw && raw.paperId ? toPaper(raw) : null);
  }
  /**
   * Papers that cite `id`.
   * @param {string} id
   * @param {object} [options] - `limit` (capped at 100) and `offset`.
   * @returns {Promise<object>} `{ query, papers, nextPageToken?, source }`.
   */
  async getCitations(id, options) {
    return this.fetchLinkedPapers(id, "citations", "citingPaper", options);
  }
  /**
   * Papers that `id` cites.
   * @param {string} id
   * @param {object} [options] - `limit` (capped at 100) and `offset`.
   * @returns {Promise<object>} `{ query, papers, nextPageToken?, source }`.
   */
  async getReferences(id, options) {
    return this.fetchLinkedPapers(id, "references", "citedPaper", options);
  }
  /**
   * Shared implementation of getCitations/getReferences.
   * @param {string} id - paper id.
   * @param {string} edge - endpoint suffix ("citations" or "references").
   * @param {string} paperKey - property of each row holding the linked paper.
   * @param {object} [options] - `limit` and `offset`.
   */
  async fetchLinkedPapers(id, edge, paperKey, options) {
    const limit = Math.min(options?.limit ?? 10, 100);
    const offset = options?.offset ?? 0;
    const params = `fields=${NESTED_PAPER_FIELDS}&offset=${offset}&limit=${limit}`;
    const data = await this.fetchApi(
      `/paper/${encodeURIComponent(id)}/${edge}?${params}`
    );
    if (!data) {
      return { query: id, papers: [], source: "semantic_scholar" };
    }
    return {
      query: id,
      // Rows whose linked paper is missing or has no paperId are skipped.
      papers: (data.data ?? []).map((row) => row[paperKey]).filter((p) => p && p.paperId).map(toPaper),
      nextPageToken: data.next != null ? String(data.next) : void 0,
      source: "semantic_scholar"
    };
  }
  /**
   * Rate-limited, retried JSON request against the API.
   * @param {string} path - path and query appended to BASE_URL.
   * @param {object} [init] - optional `method`, `headers`, `body`.
   * @returns {Promise<*|null>} parsed JSON, or null on HTTP 404.
   * @throws {HttpError} for any other non-OK response once retries are exhausted.
   */
  async fetchApi(path, init) {
    const headers = { ...init?.headers };
    if (this.apiKey) {
      headers["x-api-key"] = this.apiKey;
    }
    return withRetry(async () => {
      // Waiting inside the retried function keeps retries rate-limited too.
      await this.rateLimiter.wait();
      const res = await this.client.fetch(`${BASE_URL}${path}`, {
        method: init?.method,
        headers,
        body: init?.body
      });
      if (!res.ok) {
        if (res.status === 404) return null;
        throw new HttpError(res.status, res.statusText);
      }
      return res.json();
    }, { maxRetries: 5, baseDelayMs: 5000 });
  }
};
330
+
331
+ // src/parsers/scholar-html.ts
332
+ import * as cheerio from "cheerio";
333
/**
 * Reports whether the loaded page contains any of the known CAPTCHA markers.
 *
 * @param {Function} $ - selector function; called with a CSS selector and
 *   expected to return an object with a `length`.
 * @returns {boolean}
 */
function detectCaptcha($) {
  const captchaSelectors = ["#gs_captcha_ccl", "#recaptcha", "form#captcha-form"];
  return captchaSelectors.some((selector) => $(selector).length > 0);
}
336
/**
 * Parses the "authors - venue, year - publisher" line of a Scholar result.
 *
 * Non-breaking spaces are normalized first, because the separator between the
 * fields may be written with U+00A0 instead of a plain space; without this the
 * venue and year end up inside the author list. A trailing ellipsis on a
 * truncated author name is removed.
 *
 * @param {string} text - raw text of the metadata line.
 * @returns {{authors: Array<{name: string}>, year: (number|undefined), venue: (string|undefined)}}
 */
function parseMetaText(text) {
  const parts = text.replace(/\u00a0/g, " ").split(" - ");
  const authorsPart = parts[0] ?? "";
  const authors = authorsPart
    .split(",")
    // "N Parmar…" -> "N Parmar"; a lone "…" becomes empty and is filtered out.
    .map((name) => name.replace(/\u2026+\s*$/, "").trim())
    .filter((name) => name.length > 0)
    .map((name) => ({ name }));
  const venuePart = parts[1] ?? "";
  const yearPattern = /\b(19|20)\d{2}\b/;
  const yearMatch = venuePart.match(yearPattern);
  const year = yearMatch ? Number(yearMatch[0]) : void 0;
  // The venue is what remains once the year and a dangling comma are removed.
  const venue = venuePart.replace(yearPattern, "").replace(/,\s*$/, "").trim() || void 0;
  return { authors, year, venue };
}
346
/**
 * Extracts the number from a "Cited by N" label.
 *
 * @param {string} text - link text to inspect.
 * @returns {number|undefined} the citation count, or undefined when absent.
 */
function parseCitedBy(text) {
  const found = text.match(/Cited by (\d+)/);
  if (!found) {
    return void 0;
  }
  return Number(found[1]);
}
350
/**
 * Parses a Google Scholar result page.
 *
 * @param {string} html - page markup.
 * @returns {{papers: object[], isCaptcha: boolean}} an empty list with
 *   `isCaptcha: true` when a CAPTCHA marker is found; otherwise one paper per
 *   result entry that has a linked title.
 */
function parseScholarHtml(html) {
  const $ = cheerio.load(html);
  if (detectCaptcha($)) {
    return { papers: [], isCaptcha: true };
  }
  const papers = [];
  $("div.gs_r.gs_or.gs_scl").each((_, node) => {
    const $item = $(node);
    const $titleLink = $item.find("h3.gs_rt a");
    const title = $titleLink.text().trim();
    // Entries without a linked title are skipped.
    if (!title) return;
    const href = $titleLink.attr("href");
    const { authors, year, venue } = parseMetaText($item.find("div.gs_a").text());
    const snippet = $item.find("div.gs_rs").text().trim();
    const $citedByLink = $item
      .find("div.gs_fl a")
      .filter((_2, a) => $(a).text().startsWith("Cited by"))
      .first();
    const pdfHref = $item.find("div.gs_or_ggsm a").attr("href");
    papers.push({
      title,
      authors,
      abstract: snippet || void 0,
      year,
      venue,
      url: href || "",
      pdfUrl: pdfHref || void 0,
      citationCount: parseCitedBy($citedByLink.text()),
      source: "google_scholar"
    });
  });
  return { papers, isCaptcha: false };
}
383
+
384
+ // src/sources/google-scholar.ts
385
// Default limiter for every GoogleScholarSource that is not given its own.
var sharedRateLimiter = new AdaptiveRateLimiter(
  2000, // base interval: 2 s
  3000, // jitter: up to 3 s
  120000, // maximum interval: 2 min
  3, // multiply the interval by 3 on CAPTCHA
  2, // shrink after 2 consecutive successes
  0.7 // shrink by 30% each time
);
// Number of extra attempts made after a CAPTCHA page is detected.
var MAX_CAPTCHA_RETRIES = 1;
400
/**
 * Paper source that scrapes Google Scholar result pages.
 *
 * Uses an adaptive rate limiter: a detected CAPTCHA triggers `backoff()`, a
 * parsed page triggers `success()`, and no request is sent at all while the
 * limiter reports `isBlocked`.
 */
var GoogleScholarSource = class {
  name = "google_scholar";
  client;
  rateLimiter;
  /**
   * @param {object} client - HTTP client exposing `fetch(url)`.
   * @param {object} [rateLimiter] - limiter to use instead of the shared one.
   */
  constructor(client, rateLimiter) {
    this.client = client;
    this.rateLimiter = rateLimiter ?? sharedRateLimiter;
  }
  /**
   * Runs a search.
   * @param {string} query
   * @param {object} [options] - `limit` (capped at 20), `offset`,
   *   `year: {from, to}`, `sort: "date"`.
   * @returns {Promise<object>} `{ query, papers, nextPageToken?, source }`, or a
   *   result carrying a CAPTCHA entry in `errors` when blocked.
   * @throws {HttpError} when the page request keeps failing.
   */
  async search(query, options) {
    const source = "google_scholar";
    // Fresh object per call so callers never share a result instance.
    const captchaResult = () => ({
      query,
      papers: [],
      source,
      errors: [{
        source,
        message: "Google Scholar CAPTCHA detected",
        code: "CAPTCHA"
      }]
    });
    const offset = options?.offset ?? 0;
    const limit = Math.min(options?.limit ?? 10, 20);
    const params = new URLSearchParams({
      q: query,
      start: String(offset),
      num: String(limit),
      hl: "en"
    });
    const yearRange = options?.year;
    if (yearRange?.from) {
      params.set("as_ylo", String(yearRange.from));
    }
    if (yearRange?.to) {
      params.set("as_yhi", String(yearRange.to));
    }
    if (options?.sort === "date") {
      params.set("scisbd", "1");
    }
    const url = `https://scholar.google.com/scholar?${params}`;
    if (this.rateLimiter.isBlocked) {
      return captchaResult();
    }
    let attempt = 0;
    while (attempt <= MAX_CAPTCHA_RETRIES) {
      await this.rateLimiter.wait();
      const html = await withRetry(async () => {
        const res = await this.client.fetch(url);
        if (!res.ok) {
          throw new HttpError(res.status, res.statusText);
        }
        return res.text();
      }, { maxRetries: 2, baseDelayMs: 5000 });
      const { papers, isCaptcha } = parseScholarHtml(html);
      if (!isCaptcha) {
        this.rateLimiter.success();
        return {
          query,
          papers,
          // A full page suggests there may be more results after it.
          nextPageToken: papers.length >= limit ? String(offset + limit) : void 0,
          source
        };
      }
      this.rateLimiter.backoff();
      console.warn(
        `[GoogleScholar] CAPTCHA \uAC10\uC9C0 (${attempt + 1}/${MAX_CAPTCHA_RETRIES + 1}), \uB2E4\uC74C \uC694\uCCAD\uAE4C\uC9C0 ~${Math.round(this.rateLimiter.interval / 1e3)}\uCD08 \uB300\uAE30`
      );
      if (attempt === MAX_CAPTCHA_RETRIES) {
        return captchaResult();
      }
      attempt += 1;
    }
    return { query, papers: [], source };
  }
  /**
   * Looks a paper up by searching for the quoted id and taking the first hit.
   * @param {string} id
   * @returns {Promise<object|null>}
   */
  async getPaper(id) {
    const { papers } = await this.search(`"${id}"`, { limit: 1 });
    return papers[0] ?? null;
  }
};
483
+
484
+ // src/parsers/arxiv-xml.ts
485
+ import { XMLParser } from "fast-xml-parser";
486
// XML parser for arXiv Atom feeds. Attributes are kept (prefixed with "@_"),
// and the elements that may repeat within an entry are always surfaced as arrays.
var parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  isArray: (name) => name === "author" || name === "category" || name === "link"
});
491
/**
 * Pulls the arXiv identifier out of an entry id containing "abs/<id>",
 * dropping a trailing version suffix such as "v2".
 *
 * @param {string} entryId
 * @returns {string|undefined} the bare id, or undefined when there is no match.
 */
function extractArxivId(entryId) {
  const found = entryId.match(/abs\/(.+?)(?:v\d+)?$/);
  const bareId = found?.[1];
  return bareId ? bareId : void 0;
}
495
/**
 * Reads the `arxiv:doi` value of a parsed entry.
 * The value may be a plain string or an object whose text is under "#text".
 *
 * @param {object} entry - parsed feed entry.
 * @returns {*} the DOI, or undefined when absent.
 */
function extractDoi(entry) {
  const node = entry["arxiv:doi"];
  if (!node) {
    return void 0;
  }
  if (typeof node === "string") {
    return node;
  }
  return node["#text"] ?? void 0;
}
500
/**
 * Reads the `arxiv:journal_ref` value of a parsed entry.
 * The value may be a plain string or an object whose text is under "#text".
 *
 * @param {object} entry - parsed feed entry.
 * @returns {*} the journal reference, or undefined when absent.
 */
function extractJournalRef(entry) {
  const node = entry["arxiv:journal_ref"];
  if (!node) {
    return void 0;
  }
  if (typeof node === "string") {
    return node;
  }
  return node["#text"] ?? void 0;
}
505
/**
 * Converts one parsed arXiv feed entry into this package's paper shape.
 *
 * - The year is taken in UTC so it always agrees with `publicationDate`
 *   (the date part of the same timestamp) regardless of the host timezone.
 * - Entries without `author` or `link` elements are accepted.
 * - Title and summary are coerced to strings, since an XML parser may hand
 *   back a number for purely numeric text.
 *
 * @param {object} entry - parsed `<entry>` element.
 * @returns {object} normalized paper with `source: "arxiv"`.
 */
function entryToPaper(entry) {
  const collapse = (value) => String(value).replace(/\s+/g, " ").trim();
  const authors = (entry.author ?? []).map((a) => ({
    name: a.name,
    // One or many affiliations: normalize to an array of strings.
    affiliations: a["arxiv:affiliation"] ? [].concat(a["arxiv:affiliation"]).map(
      (aff) => typeof aff === "string" ? aff : aff["#text"] ?? String(aff)
    ) : void 0
  }));
  if (authors.length > 0) authors[0].isFirstAuthor = true;
  const pdfLink = (entry.link ?? []).find((l) => l["@_title"] === "pdf");
  const published = entry.published ? String(entry.published) : void 0;
  const publishedAt = published ? new Date(published) : void 0;
  const hasValidDate = publishedAt !== void 0 && !Number.isNaN(publishedAt.getTime());
  return {
    title: collapse(entry.title),
    authors,
    abstract: entry.summary != null ? collapse(entry.summary) || void 0 : void 0,
    year: hasValidDate ? publishedAt.getUTCFullYear() : void 0,
    publicationDate: published ? published.split("T")[0] : void 0,
    url: entry.id,
    pdfUrl: pdfLink?.["@_href"] || void 0,
    doi: extractDoi(entry),
    venue: extractJournalRef(entry),
    source: "arxiv",
    sourceId: extractArxivId(entry.id),
    tags: entry.category?.map((c) => c["@_term"])
  };
}
531
/**
 * Parses an arXiv Atom response into papers plus the reported total.
 *
 * Because the parser keeps attributes, an element that carries one (such as a
 * namespace declaration on `opensearch:totalResults`) is returned as an object
 * with its text under "#text"; the total is unwrapped accordingly instead of
 * being coerced to NaN.
 *
 * @param {string} xml - response body.
 * @returns {{papers: object[], totalResults: (number|undefined)}} entries
 *   lacking an id or title are skipped; a feed without entries yields total 0.
 */
function parseArxivResponse(xml) {
  const feed = parser.parse(xml).feed;
  if (!feed || !feed.entry) {
    return { papers: [], totalResults: 0 };
  }
  const entries = Array.isArray(feed.entry) ? feed.entry : [feed.entry];
  const papers = entries.filter((entry) => entry.id && entry.title).map(entryToPaper);
  const rawTotal = feed["opensearch:totalResults"];
  const totalText = rawTotal !== null && typeof rawTotal === "object" ? rawTotal["#text"] : rawTotal;
  // NaN and 0 are both reported as "unknown" here.
  const totalResults = Number(totalText) || void 0;
  return { papers, totalResults };
}
546
+
547
+ // src/parsers/ar5iv-html.ts
548
+ import * as cheerio2 from "cheerio";
549
// Lower-cased section headings that are left out of the parsed section list.
var EXCLUDED_HEADINGS = /* @__PURE__ */ new Set([
  // Declarations and statements
  "author contributions",
  "funding",
  "institutional review board statement",
  "informed consent statement",
  "data availability statement",
  "conflicts of interest",
  // Back matter
  "references",
  "acknowledgements",
  "acknowledgments",
  "article metrics"
]);
561
/**
 * Parses an ar5iv HTML rendering into its structured parts.
 *
 * @param {string} html - page markup.
 * @param {string} arxivId - paper id, forwarded to the figure parser.
 * @returns {{authors: object[], sections: object[], figures: object[], tables: object[]}}
 */
function parseAr5ivHtml(html, arxivId) {
  const $ = cheerio2.load(html);
  return {
    authors: parseAuthors($),
    sections: parseSections($),
    figures: parseFigures($, arxivId),
    tables: parseTables($)
  };
}
569
/**
 * Extracts the author list from an ar5iv page.
 *
 * Structured creator blocks are preferred. When they yield nothing, the inline
 * authors container is parsed as free text instead.
 *
 * @param {Function} $ - loaded cheerio document.
 * @returns {object[]} authors with `name` and optional `affiliations`,
 *   `isFirstAuthor`, `isCorresponding`.
 */
function parseAuthors($) {
  const authors = [];

  // Names appearing in dedicated "corresponding" creator blocks.
  const correspondingNames = /* @__PURE__ */ new Set();
  $("span.ltx_creator.ltx_role_corresponding, div.ltx_creator.ltx_role_corresponding").each((_, el) => {
    const name = cleanText($(el).find(".ltx_personname").text());
    if (name) correspondingNames.add(name);
  });

  $("span.ltx_creator.ltx_role_author, div.ltx_creator.ltx_role_author").each((_, el) => {
    const $block = $(el);
    const $name = $block.find(".ltx_personname");
    if ($name.length === 0) return;
    const name = cleanText($name.text());
    if (!name) return;
    const affiliations = [];
    $block.find(".ltx_contact.ltx_role_affiliation").each((__, affEl) => {
      const aff = cleanText($(affEl).text());
      if (aff) affiliations.push(aff);
    });
    // An author block with an e-mail contact is also treated as corresponding.
    const hasEmail = $block.find(".ltx_contact.ltx_role_email").length > 0;
    const isCorresponding = correspondingNames.has(name) || hasEmail;
    authors.push({
      name,
      affiliations: affiliations.length > 0 ? affiliations : void 0,
      isFirstAuthor: authors.length === 0 ? true : void 0,
      isCorresponding: isCorresponding || void 0
    });
  });
  if (authors.length > 0) {
    return authors;
  }

  // Fallback: unstructured author markup.
  const $inline = $("div.ltx_authors, span.ltx_authors");
  if ($inline.length > 0) {
    const inlineEntries = parseInlineAuthors($inline.first().html() || "");
    inlineEntries.forEach((entry, index) => {
      if (!entry.name) return;
      authors.push({
        name: entry.name,
        affiliations: entry.affiliations.length > 0 ? entry.affiliations : void 0,
        isFirstAuthor: index === 0 ? true : void 0
      });
    });
  }
  return authors;
}
621
/**
 * Parses free-form author markup into names and affiliations.
 *
 * The text is split into author chunks on "\AND" or a standalone "&". Within a
 * chunk the first non-empty line is the name (footnote marks, digits and
 * e-mail addresses removed) and later lines are affiliations, except lines
 * that contain "@", are purely numeric, or are two characters or shorter.
 *
 * @param {string} html - inner HTML of the authors container.
 * @returns {Array<{name: string, affiliations: string[]}>}
 */
function parseInlineAuthors(html) {
  const $ = cheerio2.load(`<div>${html}</div>`);
  const text = $("div").first().text();
  const authorSeparator = /(?:\\\s*AND\s*|(?<!\w)&(?!\w@))/;
  return text.split(authorSeparator).flatMap((chunk) => {
    const lines = chunk.split("\n").map((l) => l.trim()).filter(Boolean);
    if (lines.length === 0) return [];
    const [nameLine, ...otherLines] = lines;
    const name = nameLine
      .replace(/[*†‡§¶∗]+/g, "")
      .replace(/\d+/g, "")
      .replace(/[,;]+$/, "")
      .trim()
      .replace(/\S+@\S+/g, "")
      .trim();
    if (!name || name.length < 2) return [];
    const affiliations = otherLines
      .map((line) => line.trim())
      .filter((line) => line && !line.includes("@") && !/^\d+$/.test(line) && line.length > 2)
      .map((line) => line.replace(/[,;]+$/, "").trim());
    return [{ name, affiliations }];
  });
}
643
/**
 * Extracts sections and appendices from an ar5iv page.
 *
 * Each selected element contributes its heading (leading numbering removed),
 * a level derived from its class, and the text of its paragraphs joined by
 * blank lines. Paragraphs inside figures are ignored, as are sections whose
 * heading is in EXCLUDED_HEADINGS.
 *
 * @param {Function} $ - loaded cheerio document.
 * @returns {Array<{heading: string, level: number, content: (string|undefined)}>}
 */
function parseSections($) {
  // Checked in order; the first class name found in the class attribute wins.
  const levelByClass = [
    ["ltx_section", 1],
    ["ltx_subsection", 2],
    ["ltx_subsubsection", 3],
    ["ltx_appendix", 1]
  ];
  const sections = [];
  $("section.ltx_section, section.ltx_appendix").each((_, el) => {
    const $section = $(el);
    const className = $section.attr("class") || "";
    const matched = levelByClass.find(([cls]) => className.includes(cls));
    const level = matched ? matched[1] : 1;

    const $heading = $section.children("h2, h3, h4, h5, h6").first();
    if (!$heading.length) return;
    const heading = cleanText($heading.text())
      .replace(/^[\d.]+\s*/, "")
      .replace(/^Appendix\s+[A-Z]\s*/, "Appendix ")
      .trim();
    const isSkipped = !heading || EXCLUDED_HEADINGS.has(heading.toLowerCase()) || className.includes("ltx_bibliography");
    if (isSkipped) return;

    const paragraphs = [];
    $section.find(".ltx_para, .ltx_paragraph").each((__, paraEl) => {
      const $para = $(paraEl);
      if ($para.closest("figure").length > 0) return;
      const paragraphText = cleanText($para.text());
      if (paragraphText) paragraphs.push(paragraphText);
    });

    let displayHeading = heading;
    if (className.includes("ltx_appendix") && !heading.startsWith("Appendix")) {
      displayHeading = `Appendix: ${heading}`;
    }
    sections.push({
      heading: displayHeading,
      level,
      content: paragraphs.length > 0 ? paragraphs.join("\n\n") : void 0
    });
  });
  return sections;
}
684
/**
 * Extracts labelled figures from an ar5iv page.
 *
 * Image sources are resolved with URL semantics against the paper's ar5iv page,
 * so root-relative ("/html/..."), relative ("x1.png") and protocol-relative
 * ("//host/x.png") sources all produce valid absolute URLs. Sources that are
 * already http(s) URLs are returned unchanged; `data:` URIs are ignored.
 *
 * @param {Function} $ - loaded cheerio document.
 * @param {string} [arxivId] - paper id, used to build the page URL that
 *   relative sources are resolved against.
 * @returns {Array<{label: string, caption: (string|undefined), filePath: (string|undefined)}>}
 */
function parseFigures($, arxivId) {
  const figures = [];
  const siteRoot = "https://ar5iv.labs.arxiv.org";
  const pageUrl = arxivId ? `${siteRoot}/html/${arxivId}` : `${siteRoot}/`;
  const toAbsolute = (src) => {
    if (/^https?:\/\//i.test(src)) return src;
    try {
      return new URL(src, pageUrl).href;
    } catch {
      // An unparseable source is treated the same as a missing image.
      return void 0;
    }
  };
  $("figure.ltx_figure").each((_, el) => {
    const fig = $(el);
    const captionEl = fig.find("figcaption");
    const tagEl = captionEl.find(".ltx_tag.ltx_tag_figure");
    const label = cleanText(tagEl.text()).replace(/:\s*$/, "").trim();
    // Figures without a label tag are skipped.
    if (!label) return;
    const fullCaption = cleanText(captionEl.text());
    const caption = fullCaption.replace(label, "").replace(/^:\s*/, "").trim();
    const imgEl = fig.find("img").first();
    let filePath;
    if (imgEl.length) {
      const src = imgEl.attr("src") || "";
      if (src && !src.startsWith("data:")) {
        filePath = toAbsolute(src);
      }
    }
    figures.push({
      label,
      caption: caption || void 0,
      filePath: filePath || void 0
    });
  });
  return figures;
}
711
/**
 * Extracts labelled tables from an ar5iv page.
 *
 * The first row containing a `th` cell becomes `headers`; every other
 * non-empty row goes to `rows`. Tables without a label tag are skipped.
 *
 * @param {Function} $ - loaded cheerio document.
 * @returns {Array<{label: string, caption: (string|undefined), headers: (string[]|undefined), rows: (string[][]|undefined)}>}
 */
function parseTables($) {
  const tables = [];
  $("figure.ltx_table").each((_, el) => {
    const $figure = $(el);
    const $caption = $figure.find("figcaption");
    const label = cleanText($caption.find(".ltx_tag.ltx_tag_table").text()).replace(/:\s*$/, "").trim();
    if (!label) return;
    const caption = cleanText($caption.text()).replace(label, "").replace(/^:\s*/, "").trim();

    const headers = [];
    const rows = [];
    const $table = $figure.find("table.ltx_tabular").first();
    if ($table.length) {
      let headerParsed = false;
      $table.find("tr").each((__, rowEl) => {
        const $row = $(rowEl);
        const isHeader = !headerParsed && $row.find("th").length > 0;
        const cells = [];
        $row.find("td, th").each((___, cellEl) => {
          cells.push(cleanText($(cellEl).text()));
        });
        if (cells.length === 0) return;
        if (isHeader) {
          headers.push(...cells);
          headerParsed = true;
          return;
        }
        rows.push(cells);
      });
    }
    tables.push({
      label,
      caption: caption || void 0,
      headers: headers.length > 0 ? headers : void 0,
      rows: rows.length > 0 ? rows : void 0
    });
  });
  return tables;
}
752
/**
 * Collapses every run of whitespace (including newlines) into one space and
 * trims both ends.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanText(text) {
  // \s already covers newlines, so one pass is enough.
  const singleSpaced = text.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
755
+
756
+ // src/sources/arxiv.ts
757
// Limiter shared by every ArxivSource instance: 3000 ms base interval with up to 1000 ms of jitter.
var sharedRateLimiter2 = new RateLimiter(3000, 1000);
758
/**
 * Turns a user query into an arXiv `search_query` value.
 * A query that already starts with a field prefix (ti:, au:, abs:, co:, jr:,
 * cat:, rn:, id:, all:) is passed through; anything else is searched in all fields.
 *
 * @param {string} query
 * @returns {string}
 */
function buildSearchQuery(query) {
  const hasFieldPrefix = /^(ti|au|abs|co|jr|cat|rn|id|all):/.test(query);
  return hasFieldPrefix ? query : `all:${query}`;
}
764
/**
 * Reduces the common ways of writing an arXiv reference to a bare identifier.
 *
 * Accepts a bare id ("2101.00001", "hep-th/9901001v2"), an "arXiv:"-prefixed
 * id (case-insensitive), or an arxiv.org abs/pdf URL (optionally ending in
 * ".pdf"). Surrounding whitespace is ignored. A version suffix is kept.
 *
 * @param {string} id
 * @returns {string}
 */
function normalizeArxivId(id) {
  return id
    .trim()
    .replace(/^https?:\/\/(?:www\.|export\.)?arxiv\.org\/(?:abs|pdf)\//i, "")
    .replace(/\.pdf$/i, "")
    .replace(/^arxiv:/i, "");
}
767
/**
 * Paper source backed by the arXiv export API, with optional full-text
 * enrichment from the ar5iv HTML rendering.
 *
 * All requests share one module-level rate limiter and are retried through
 * `withRetry`.
 */
var ArxivSource = class {
  name = "arxiv";
  client;
  rateLimiter;
  /**
   * @param {object} client - HTTP client exposing `fetch(url)`.
   */
  constructor(client) {
    this.client = client;
    this.rateLimiter = sharedRateLimiter2;
  }
  /**
   * Searches arXiv.
   *
   * `options.year` is honoured by adding a `submittedDate` range to the query.
   * A missing bound is filled with 1991 (lower) or next calendar year (upper).
   *
   * @param {string} query
   * @param {object} [options] - `limit` (capped at 2000), `offset`,
   *   `year: {from, to}`, `sort: "date"`.
   * @returns {Promise<object>} `{ query, totalResults, papers, nextPageToken?, source }`.
   * @throws {HttpError} when the request keeps failing.
   */
  async search(query, options) {
    const limit = Math.min(options?.limit ?? 10, 2000);
    const offset = options?.offset ?? 0;
    let searchQuery = buildSearchQuery(query);
    const yearFrom = options?.year?.from;
    const yearTo = options?.year?.to;
    if (yearFrom || yearTo) {
      const lower = `${String(yearFrom || 1991).padStart(4, "0")}01010000`;
      const upper = `${String(yearTo || new Date().getUTCFullYear() + 1).padStart(4, "0")}12312359`;
      // Parenthesize the user query so the date filter applies to all of it.
      searchQuery = `(${searchQuery}) AND submittedDate:[${lower} TO ${upper}]`;
    }
    const params = new URLSearchParams({
      search_query: searchQuery,
      start: String(offset),
      max_results: String(limit)
    });
    if (options?.sort === "date") {
      params.set("sortBy", "submittedDate");
      params.set("sortOrder", "descending");
    }
    const xml = await this.fetchText(`https://export.arxiv.org/api/query?${params}`, 4);
    const { papers, totalResults } = parseArxivResponse(xml);
    return {
      query,
      totalResults,
      papers,
      // A full page suggests there may be more results after it.
      nextPageToken: papers.length >= limit ? String(offset + limit) : void 0,
      source: "arxiv"
    };
  }
  /**
   * Fetches the metadata of one paper.
   * @param {string} id - arXiv id, with or without an "arXiv:" prefix.
   * @returns {Promise<object|null>} the paper, or null when the feed has no entry.
   */
  async getPaper(id) {
    const arxivId = normalizeArxivId(id);
    const xml = await this.fetchText(
      `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(arxivId)}`,
      4
    );
    const { papers } = parseArxivResponse(xml);
    return papers[0] ?? null;
  }
  /**
   * Fetches a paper's metadata and enriches it from the ar5iv HTML rendering:
   * authors (only when the HTML provides affiliations), sections, figures and
   * tables. A failure of the HTML step is logged and the metadata-only paper
   * is returned.
   *
   * @param {string} id - arXiv id, with or without an "arXiv:" prefix.
   * @returns {Promise<object|null>} null when the paper itself is not found.
   */
  async getFullPaper(id) {
    const arxivId = normalizeArxivId(id);
    const paper = await this.getPaper(arxivId);
    if (!paper) return null;
    try {
      const html = await this.fetchText(`https://ar5iv.labs.arxiv.org/html/${arxivId}`, 3);
      const parsed = parseAr5ivHtml(html, arxivId);
      // Replace API authors only when the HTML adds affiliation information.
      if (parsed.authors.some((a) => a.affiliations?.length)) {
        paper.authors = parsed.authors;
      }
      if (parsed.sections.length > 0) paper.sections = parsed.sections;
      if (parsed.figures.length > 0) paper.figures = parsed.figures;
      if (parsed.tables.length > 0) paper.tables = parsed.tables;
    } catch (e) {
      // Best-effort enrichment; tolerate non-Error throwables when logging.
      console.warn(`ar5iv HTML \uD30C\uC2F1 \uC2E4\uD328 (${arxivId}): ${e?.message ?? e}`);
    }
    return paper;
  }
  /**
   * Rate-limited, retried GET returning the response body as text.
   * @param {string} url
   * @param {number} maxRetries - retries after the first attempt.
   * @returns {Promise<string>}
   * @throws {HttpError} for a non-OK response once retries are exhausted.
   */
  async fetchText(url, maxRetries) {
    return withRetry(async () => {
      // Waiting inside the retried function keeps retries rate-limited too.
      await this.rateLimiter.wait();
      const res = await this.client.fetch(url);
      if (!res.ok) {
        throw new HttpError(res.status, res.statusText);
      }
      return res.text();
    }, { maxRetries, baseDelayMs: 3000 });
  }
};
854
+
855
+ // src/utils/url.ts
856
/**
 * Produces a comparison key for a URL.
 *
 * Forces https, drops a leading "www.", the fragment and trailing slashes.
 * Query parameters are kept, because many sites identify the resource in the
 * query string (dropping them would give different resources the same key).
 * Only well-known tracking parameters are removed, and the rest are sorted by
 * name so parameter order does not matter.
 *
 * @param {string} url
 * @returns {string} the normalized URL, or the input unchanged when it cannot
 *   be parsed.
 */
function normalizeUrl(url) {
  const trackingParam = /^(?:utm_\w+|fbclid|gclid|msclkid|mc_cid|mc_eid)$/i;
  try {
    const parsed = new URL(url);
    parsed.protocol = "https:";
    parsed.hostname = parsed.hostname.replace(/^www\./, "");
    parsed.hash = "";
    const kept = [...parsed.searchParams]
      .filter(([key]) => !trackingParam.test(key))
      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
    parsed.search = new URLSearchParams(kept).toString();
    parsed.pathname = parsed.pathname.replace(/\/+$/, "");
    // Without a query the string may still end in the root "/".
    return parsed.toString().replace(/\/+$/, "");
  } catch {
    // Not an absolute URL: fall back to the raw value as the key.
    return url;
  }
}
870
/**
 * Produces a comparison key for a title: lower-cased, punctuation removed,
 * whitespace collapsed.
 *
 * Letters, combining marks and digits of every script are kept (not only
 * ASCII), so titles written in non-Latin scripts do not all collapse to the
 * same empty key. For plain ASCII titles the result is the same as keeping
 * only a-z and 0-9.
 *
 * @param {string} title
 * @returns {string}
 */
function normalizeTitle(title) {
  return title
    .normalize("NFKC")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
873
+
874
+ // src/utils/dedup.ts
875
/**
 * Merges papers that refer to the same work and returns the unique ones in
 * first-seen order.
 *
 * A paper is identified by up to three keys: DOI, canonical URL and normalized
 * title. It is merged into the first earlier paper that shares any key, unless
 * the two carry different DOIs. After merging, every key of the merged record
 * points at it, so the outcome does not depend on which duplicate came first.
 * Empty normalized titles are never used as a key.
 *
 * Note: duplicates are merged into the first-seen paper object in place (via
 * `mergePaper`).
 *
 * @param {object[]} papers
 * @returns {object[]} unique papers.
 */
function deduplicatePapers(papers) {
  const doiOf = (paper) => (paper.doi ? paper.doi.toLowerCase() : void 0);
  // Keys in priority order: DOI, then canonical URL, then title.
  const keysOf = (paper) => {
    const keys = [];
    const doi = doiOf(paper);
    if (doi) keys.push(`doi:${doi}`);
    if (paper.canonicalUrl) keys.push(`url:${normalizeUrl(paper.canonicalUrl)}`);
    const title = normalizeTitle(paper.title);
    if (title) keys.push(`title:${title}`);
    return keys;
  };
  const seen = /* @__PURE__ */ new Map();
  const unique = [];
  for (const paper of papers) {
    const keys = keysOf(paper);
    const doi = doiOf(paper);
    let target;
    for (const key of keys) {
      const candidate = seen.get(key);
      if (!candidate) continue;
      const candidateDoi = doiOf(candidate);
      // Two different DOIs mean two different works, even with equal titles.
      if (doi && candidateDoi && doi !== candidateDoi) continue;
      target = candidate;
      break;
    }
    if (target) {
      if (target !== paper) mergePaper(target, paper);
    } else {
      target = paper;
      unique.push(paper);
    }
    // Register the incoming keys plus any the merge added to the target.
    for (const key of [...keys, ...keysOf(target)]) {
      if (!seen.has(key)) seen.set(key, target);
    }
  }
  return unique;
}
921
/**
 * Folds the information of a duplicate (`source`) into `target`, in place.
 *
 * Scalar and list fields are only filled where `target` has no value. This
 * includes `publicationDate`, `keywords` and `journal`, so they are not lost
 * when the richer record happens to be the duplicate. The citation count
 * becomes the larger of the two. Authors are replaced only when `source` has
 * more authors or richer ones (scholar id / affiliations) and `target` has no
 * rich authors of its own.
 *
 * @param {object} target - record that is kept and mutated.
 * @param {object} source - duplicate whose data is merged in.
 * @returns {void}
 */
function mergePaper(target, source) {
  const fillIfMissing = [
    "abstract",
    "doi",
    "pdfUrl",
    "venue",
    "year",
    "publicationDate",
    "sourceId",
    "canonicalUrl",
    "tags",
    "keywords",
    "journal",
    "references",
    "impactFactor"
  ];
  for (const field of fillIfMissing) {
    target[field] ??= source[field];
  }
  if (source.citationCount != null) {
    target.citationCount = Math.max(target.citationCount ?? 0, source.citationCount);
  }
  const isRich = (a) => a.scholarId || a.affiliations;
  const sourceIsBetter = source.authors.length > target.authors.length || source.authors.some(isRich);
  if (sourceIsBetter && !target.authors.some(isRich)) {
    target.authors = source.authors;
  }
}
941
+
942
+ // src/utils/canonical-url.ts
943
/**
 * Follows redirects for papers that lack a `canonicalUrl` and stores the final
 * URL on the paper, in place. Semantic Scholar papers are skipped.
 *
 * The final URL is only recorded for a successful (OK) response with a
 * non-empty URL. Error and block pages are ignored, so unrelated papers that
 * are redirected to the same error page do not end up sharing a canonical URL.
 * Requests run in batches of five; individual failures are ignored because
 * this step is best-effort.
 *
 * @param {object} client - HTTP client exposing `fetch(url, init)`.
 * @param {object[]} papers - papers to update.
 * @returns {Promise<void>}
 */
async function resolveCanonicalUrls(client, papers) {
  const targets = papers.filter(
    (p) => !p.canonicalUrl && p.url && p.source !== "semantic_scholar"
  );
  if (targets.length === 0) return;
  const CONCURRENCY = 5;
  for (let i = 0; i < targets.length; i += CONCURRENCY) {
    const batch = targets.slice(i, i + CONCURRENCY);
    await Promise.allSettled(
      batch.map(async (paper) => {
        try {
          const res = await client.fetch(paper.url, {
            method: "GET",
            redirect: "follow",
            timeout: 5000
          });
          if (res.ok && res.url) {
            paper.canonicalUrl = res.url;
          }
          // Read the body so the response is fully consumed.
          await res.text();
        } catch {
          // Best-effort: a paper that cannot be resolved keeps no canonicalUrl.
        }
      })
    );
  }
}
967
+
968
+ // src/data/impact-factors.ts
969
+ var impactFactors = {
970
+ // === 의학 ===
971
+ "the lancet": 98.4,
972
+ "lancet": 98.4,
973
+ "new england journal of medicine": 158.5,
974
+ "the new england journal of medicine": 158.5,
975
+ "nejm": 158.5,
976
+ "jama": 63.1,
977
+ "bmj": 93.6,
978
+ "nature medicine": 82.9,
979
+ "lancet oncology": 41.6,
980
+ "the lancet oncology": 41.6,
981
+ "lancet neurology": 46.5,
982
+ "lancet infectious diseases": 36.4,
983
+ "lancet psychiatry": 30.8,
984
+ "annals of internal medicine": 39.2,
985
+ "circulation": 35.5,
986
+ "european heart journal": 35.3,
987
+ "journal of clinical oncology": 45.3,
988
+ "gastroenterology": 25.7,
989
+ "gut": 24.5,
990
+ "hepatology": 14,
991
+ "journal of hepatology": 25.7,
992
+ "diabetes care": 14.8,
993
+ "annals of oncology": 32,
994
+ // === 종합 과학 ===
995
+ "nature": 64.8,
996
+ "science": 56.9,
997
+ "cell": 45.5,
998
+ "proceedings of the national academy of sciences": 11.1,
999
+ "pnas": 11.1,
1000
+ "nature communications": 16.6,
1001
+ "science advances": 13.6,
1002
+ "scientific reports": 4.6,
1003
+ "plos one": 3.7,
1004
+ "plos biology": 9.8,
1005
+ "elife": 7.7,
1006
+ // === 생명과학 ===
1007
+ "nature biotechnology": 46.9,
1008
+ "nature genetics": 31.7,
1009
+ "nature methods": 48,
1010
+ "nature neuroscience": 25,
1011
+ "nature cell biology": 21.3,
1012
+ "nature immunology": 27.7,
1013
+ "nature structural & molecular biology": 12.5,
1014
+ "nature chemical biology": 12.9,
1015
+ "nature microbiology": 20.5,
1016
+ "molecular cell": 14.5,
1017
+ "genes & development": 10.5,
1018
+ "cell stem cell": 19.8,
1019
+ "cell metabolism": 27.7,
1020
+ "cell reports": 8.8,
1021
+ "current biology": 9.2,
1022
+ "developmental cell": 10.7,
1023
+ "neuron": 16.2,
1024
+ "immunity": 32.4,
1025
+ "journal of experimental medicine": 15.3,
1026
+ "genome research": 7,
1027
+ "genome biology": 12.3,
1028
+ "nucleic acids research": 14.9,
1029
+ "bioinformatics": 5.8,
1030
+ "molecular biology and evolution": 10.7,
1031
+ "plant cell": 11.6,
1032
+ // === 화학 ===
1033
+ "chemical reviews": 62.1,
1034
+ "nature chemistry": 23.5,
1035
+ "journal of the american chemical society": 15,
1036
+ "angewandte chemie international edition": 16.6,
1037
+ "chemical society reviews": 46.2,
1038
+ "nature catalysis": 37.8,
1039
+ "acs nano": 15.8,
1040
+ "nano letters": 10.8,
1041
+ "chemistry of materials": 7.2,
1042
+ "journal of physical chemistry letters": 5.7,
1043
+ "analytical chemistry": 7.4,
1044
+ // === 물리학 ===
1045
+ "nature physics": 19.6,
1046
+ "physical review letters": 8.6,
1047
+ "reviews of modern physics": 54,
1048
+ "physical review x": 12.5,
1049
+ "physical review d": 5,
1050
+ "physical review b": 3.7,
1051
+ "astrophysical journal": 5.3,
1052
+ "monthly notices of the royal astronomical society": 4.8,
1053
+ "astronomy & astrophysics": 6.5,
1054
+ // === 컴퓨터과학 (저널) ===
1055
+ "ieee transactions on pattern analysis and machine intelligence": 20.8,
1056
+ "ieee tpami": 20.8,
1057
+ "international journal of computer vision": 11.6,
1058
+ "ijcv": 11.6,
1059
+ "journal of machine learning research": 6,
1060
+ "jmlr": 6,
1061
+ "artificial intelligence": 5.1,
1062
+ "ieee transactions on neural networks and learning systems": 10.4,
1063
+ "ieee transactions on image processing": 10.6,
1064
+ "ieee transactions on information forensics and security": 6.8,
1065
+ "ieee transactions on software engineering": 7.4,
1066
+ "acm computing surveys": 16.6,
1067
+ "ieee transactions on knowledge and data engineering": 8.9,
1068
+ "ieee transactions on cybernetics": 11.8,
1069
+ "pattern recognition": 8,
1070
+ "information sciences": 8.1,
1071
+ "expert systems with applications": 8.5,
1072
+ "knowledge-based systems": 8.8,
1073
+ "neural networks": 7.8,
1074
+ // === 컴퓨터과학 (학회 — IF 없음, null) ===
1075
+ "neurips": null,
1076
+ "nips": null,
1077
+ "icml": null,
1078
+ "iclr": null,
1079
+ "cvpr": null,
1080
+ "iccv": null,
1081
+ "eccv": null,
1082
+ "aaai": null,
1083
+ "ijcai": null,
1084
+ "acl": null,
1085
+ "emnlp": null,
1086
+ "naacl": null,
1087
+ "sigir": null,
1088
+ "kdd": null,
1089
+ "www": null,
1090
+ "chi": null,
1091
+ "uist": null,
1092
+ "osdi": null,
1093
+ "sosp": null,
1094
+ "sigmod": null,
1095
+ "vldb": null,
1096
+ "icse": null,
1097
+ "fse": null,
1098
+ "pldi": null,
1099
+ "popl": null,
1100
+ "stoc": null,
1101
+ "focs": null,
1102
+ "isca": null,
1103
+ "micro": null,
1104
+ "hpca": null,
1105
+ "asplos": null,
1106
+ "sc": null,
1107
+ "siggraph": null,
1108
+ "ccs": null,
1109
+ "sp": null,
1110
+ "usenix security": null,
1111
+ "ndss": null,
1112
+ // === 공학 ===
1113
+ "nature energy": 49.7,
1114
+ "nature electronics": 33.7,
1115
+ "nature materials": 37.2,
1116
+ "advanced materials": 27.4,
1117
+ "energy & environmental science": 32.5,
1118
+ "joule": 38.6,
1119
+ "advanced energy materials": 24.4,
1120
+ "advanced functional materials": 18.5,
1121
+ "acs energy letters": 19.3,
1122
+ "nano energy": 16.8,
1123
+ // === 환경/지구과학 ===
1124
+ "nature climate change": 28.3,
1125
+ "nature geoscience": 18.3,
1126
+ "nature sustainability": 25.7,
1127
+ "nature ecology & evolution": 16,
1128
+ "global change biology": 11.6,
1129
+ "environmental science & technology": 11.4,
1130
+ // === 심리학/인지과학 ===
1131
+ "nature human behaviour": 21.4,
1132
+ "psychological bulletin": 22.4,
1133
+ "annual review of psychology": 23.6,
1134
+ "psychological science": 10.2,
1135
+ "trends in cognitive sciences": 16.7,
1136
+ // === 사회과학/경제 ===
1137
+ "quarterly journal of economics": 11.1,
1138
+ "american economic review": 10.5,
1139
+ "econometrica": 6.5,
1140
+ "journal of political economy": 9.1,
1141
+ "review of economic studies": 7.8,
1142
+ "journal of finance": 8.9,
1143
+ "review of financial studies": 7.2
1144
+ };
1145
+
1146
+ // src/utils/impact-factor.ts
1147
+ function getImpactFactor(venue) {
1148
+ if (!venue) return void 0;
1149
+ const normalized = venue.toLowerCase().trim();
1150
+ if (!normalized) return void 0;
1151
+ if (normalized in impactFactors) {
1152
+ return impactFactors[normalized] ?? void 0;
1153
+ }
1154
+ let bestMatch;
1155
+ for (const [key, value] of Object.entries(impactFactors)) {
1156
+ if (normalized.includes(key) || key.includes(normalized)) {
1157
+ if (!bestMatch || key.length > bestMatch.key.length) {
1158
+ bestMatch = { key, value };
1159
+ }
1160
+ }
1161
+ }
1162
+ if (bestMatch) {
1163
+ return bestMatch.value ?? void 0;
1164
+ }
1165
+ return void 0;
1166
+ }
1167
+
1168
+ // src/search.ts
1169
/**
 * Searches every requested source in parallel and returns one merged,
 * de-duplicated and sorted result list.
 *
 * A source that fails does not fail the search: its error is converted with
 * `toSourceError`, tagged with the source name and reported in `errors`.
 * The HTTP client created for the call is always destroyed before returning,
 * including when source resolution throws.
 *
 * @param {string} query - Search text passed to each source.
 * @param {object} [options] - Optional settings read here: `client` (client
 *   options), `sources` (source type names), `limit` (default 10) and `sort`.
 *   The whole object is also forwarded to each source's `search`.
 * @returns {Promise<{query: string, totalResults: number, papers: object[],
 *   source: string, errors: (object[]|undefined)}>} `totalResults` counts the
 *   unique papers before the `limit` cut; `errors` is `undefined` when empty.
 * @throws {Error} When `options.sources` contains an unknown source type.
 */
async function searchPapers(query, options) {
  const client = createClient(options?.client);
  try {
    // Resolved inside the try block so the client is released even when an
    // unknown source type makes resolveSources throw.
    const sources = resolveSources(client, options?.sources, options?.client);
    const limit = options?.limit ?? 10;

    // The async wrapper turns a synchronous throw from a source into a
    // rejection, so it is reported per source like any other failure.
    const results = await Promise.allSettled(
      sources.map(async (src) => src.search(query, { ...options, limit }))
    );

    const allPapers = [];
    const errors = [];
    results.forEach((result, index) => {
      if (result.status === "fulfilled") {
        allPapers.push(...result.value.papers);
        if (result.value.errors) errors.push(...result.value.errors);
      } else {
        const err = toSourceError(result.reason);
        err.source = sources[index].name;
        errors.push(err);
      }
    });

    // Canonical URLs and impact factors are filled in before deduplication.
    await resolveCanonicalUrls(client, allPapers);
    applyImpactFactors(allPapers);
    const unique = deduplicatePapers(allPapers);
    const sorted = sortPapers(unique, options?.sort);

    return {
      query,
      totalResults: unique.length,
      papers: sorted.slice(0, limit),
      source: sources[0]?.name ?? "semantic_scholar",
      errors: errors.length > 0 ? errors : void 0
    };
  } finally {
    await client.destroy();
  }
}
1204
/**
 * Builds one source instance per requested source type, in request order.
 *
 * @param {object} client - Shared HTTP client handed to every source.
 * @param {string[]} [sourceTypes] - Source type names; when omitted, all of
 *   `semantic_scholar`, `google_scholar` and `arxiv` are used.
 * @param {object} [clientOptions] - Client options, passed only to the
 *   Semantic Scholar source.
 * @returns {object[]} The instantiated sources.
 * @throws {Error} When a requested type is not one of the known names.
 */
function resolveSources(client, sourceTypes, clientOptions) {
  // A Map keeps lookups limited to the names registered here.
  const factories = new Map([
    ["semantic_scholar", () => new SemanticScholarSource(client, clientOptions)],
    ["google_scholar", () => new GoogleScholarSource(client)],
    ["arxiv", () => new ArxivSource(client)],
  ]);

  const requested = sourceTypes ?? ["semantic_scholar", "google_scholar", "arxiv"];
  return requested.map((type) => {
    const create = factories.get(type);
    if (!create) {
      throw new Error(`Unknown source type: ${type}`);
    }
    return create();
  });
}
1219
/**
 * Converts an arbitrary thrown value into a source error record.
 *
 * `HttpError` instances are classified by status (429 -> `RATE_LIMITED`,
 * 503 -> `TIMEOUT`, anything else -> `UNKNOWN`). Other values are classified
 * by their `name` property. The `source` field is always
 * `"semantic_scholar"` here.
 *
 * @param {unknown} err - The rejection reason or thrown value.
 * @returns {{source: string, message: string, code: string}} Error record.
 */
function toSourceError(err) {
  const asSourceError = (message, code) => ({
    source: "semantic_scholar",
    message,
    code,
  });

  if (err instanceof HttpError) {
    let httpCode = "UNKNOWN";
    if (err.status === 429) {
      httpCode = "RATE_LIMITED";
    } else if (err.status === 503) {
      httpCode = "TIMEOUT";
    }
    return asSourceError(err.message, httpCode);
  }

  const codeByErrorName = new Map([
    ["TimeoutError", "TIMEOUT"],
    ["NetworkError", "NETWORK_ERROR"],
    ["ProxyError", "NETWORK_ERROR"],
    ["ChallengeError", "CAPTCHA"],
  ]);
  const namedCode = codeByErrorName.get(err?.name ?? "");
  if (namedCode !== undefined) {
    // Recognised error names report their full string form as the message.
    return asSourceError(String(err), namedCode);
  }

  return asSourceError(err instanceof Error ? err.message : String(err), "UNKNOWN");
}
1244
/**
 * Fills in `impactFactor` on every paper that has a venue but no impact
 * factor yet, using `getImpactFactor(venue)`. Papers are mutated in place.
 *
 * @param {object[]} papers - Papers to annotate.
 * @returns {void}
 */
function applyImpactFactors(papers) {
  for (const paper of papers) {
    // Skip papers that already have a value or have no venue to look up.
    if (paper.impactFactor != null || !paper.venue) continue;
    paper.impactFactor = getImpactFactor(paper.venue);
  }
}
1251
/**
 * Sorts papers in place and returns the same array.
 *
 * - `"date"`: newest `year` first (missing year counts as 0).
 * - `"citations"`: highest `citationCount` first (missing counts as 0).
 * - anything else: highest `impactFactor` first (missing counts as -1),
 *   ties broken by citation count.
 *
 * @param {object[]} papers - Papers to sort; the array itself is reordered.
 * @param {string} [sort] - Sort mode.
 * @returns {object[]} The `papers` array, now sorted.
 */
function sortPapers(papers, sort) {
  const byCitationsDesc = (a, b) => (b.citationCount ?? 0) - (a.citationCount ?? 0);

  let comparator;
  switch (sort) {
    case "date":
      comparator = (a, b) => (b.year ?? 0) - (a.year ?? 0);
      break;
    case "citations":
      comparator = byCitationsDesc;
      break;
    default:
      comparator = (a, b) => {
        const impactDelta = (b.impactFactor ?? -1) - (a.impactFactor ?? -1);
        return impactDelta !== 0 ? impactDelta : byCitationsDesc(a, b);
      };
  }
  return papers.sort(comparator);
}
1264
+
1265
+ // src/get-paper.ts
1266
/**
 * Fetches a single paper from Semantic Scholar by DOI.
 *
 * A client is created for the call and destroyed afterwards, whether the
 * lookup succeeds or throws.
 *
 * @param {string} doi - DOI of the paper; sent as the identifier `DOI:<doi>`.
 * @param {object} [options] - Optional settings; `options.client` is used for
 *   both the client and the source.
 * @returns {Promise<*>} Whatever the Semantic Scholar source's `getPaper` resolves to.
 */
async function getPaper(doi, options) {
  const clientOptions = options?.client;
  const client = createClient(clientOptions);
  try {
    const semanticScholar = new SemanticScholarSource(client, clientOptions);
    // Awaited inside the try so the client is destroyed only after the lookup settles.
    const paper = await semanticScholar.getPaper(`DOI:${doi}`);
    return paper;
  } finally {
    await client.destroy();
  }
}
1275
+
1276
+ // src/get-full-paper.ts
1277
/**
 * Fetches a paper by DOI or arXiv id, combining Semantic Scholar metadata
 * with arXiv full-paper data when both are available.
 *
 * An id is treated as an arXiv id when it looks like `NNNN.NNNNN` (optionally
 * with a `vN` suffix) or starts with `arxiv:` (case-insensitive); otherwise
 * it is treated as a DOI. Each lookup is best-effort: a lookup that throws is
 * treated as "not found". The client is always destroyed before returning.
 *
 * @param {string} id - DOI or arXiv identifier.
 * @param {object} [options] - Optional settings; `options.client` configures
 *   the client and the Semantic Scholar source.
 * @returns {Promise<object|null>} The merged paper, whichever single result
 *   was found, or `null` when neither lookup produced a paper.
 */
async function getFullPaper(id, options) {
  const client = createClient(options?.client);
  try {
    const semanticScholar = new SemanticScholarSource(client, options?.client);
    const arxivSource = new ArxivSource(client);

    const arxivPrefix = /^arxiv:/i;
    const looksLikeArxiv = /^\d{4}\.\d{4,5}(v\d+)?$/.test(id) || arxivPrefix.test(id);
    let arxivId;
    let lookupKey;
    if (looksLikeArxiv) {
      arxivId = id.replace(arxivPrefix, "");
      lookupKey = `ARXIV:${arxivId}`;
    } else {
      lookupKey = `DOI:${id}`;
    }

    // Runs a lookup and maps any failure to null (deliberate best-effort).
    const orNull = async (task) => {
      try {
        return await task();
      } catch {
        return null;
      }
    };

    // The two lookups run one after the other, Semantic Scholar first.
    const s2Paper = await orNull(() => semanticScholar.getPaper(lookupKey));
    const fullPaper = arxivId
      ? await orNull(() => arxivSource.getFullPaper(arxivId))
      : null;

    if (s2Paper && fullPaper) {
      return mergePapers(s2Paper, fullPaper);
    }
    if (!s2Paper && !fullPaper) return null;
    return s2Paper ?? fullPaper;
  } finally {
    await client.destroy();
  }
}
1306
/**
 * Combines a Semantic Scholar record with the arXiv record of the same paper
 * into a new object; neither input is modified here.
 *
 * @param {object} s2 - Semantic Scholar paper; the base of the result.
 * @param {object} arxiv - arXiv paper providing body structure and fallbacks.
 * @returns {object} A copy of `s2` with authors, body structure and gaps
 *   filled from `arxiv`.
 */
function mergePapers(s2, arxiv) {
  const merged = { ...s2 };

  // Enrich author info when arXiv provides affiliation data.
  merged.authors = mergeAuthors(s2.authors, arxiv.authors);

  // Body structure is taken from arXiv first.
  merged.sections = arxiv.sections ?? s2.sections;
  merged.figures = arxiv.figures ?? s2.figures;
  merged.tables = arxiv.tables ?? s2.tables;

  // Fields missing in S2 are filled in from arXiv.
  merged.abstract = s2.abstract ?? arxiv.abstract;
  merged.pdfUrl = s2.pdfUrl ?? arxiv.pdfUrl;
  merged.tags = s2.tags ?? arxiv.tags;

  return merged;
}
1321
/**
 * Enriches Semantic Scholar authors with affiliation and role flags taken
 * from the arXiv author list.
 *
 * If either list is empty the other one is returned as is, and if no arXiv
 * author has affiliations the S2 list is returned untouched. Otherwise each
 * S2 author is paired with the arXiv author at the same position when the
 * names match, or else with the first arXiv author whose name matches.
 *
 * @param {object[]} s2Authors - Authors from Semantic Scholar.
 * @param {object[]} arxivAuthors - Authors from arXiv.
 * @returns {object[]} One of the input arrays, or a new array of new author
 *   objects in S2 order.
 */
function mergeAuthors(s2Authors, arxivAuthors) {
  if (!arxivAuthors.length) return s2Authors;
  if (!s2Authors.length) return arxivAuthors;

  const arxivHasAffiliations = arxivAuthors.some((author) => author.affiliations?.length);
  if (!arxivHasAffiliations) return s2Authors;

  // Same-position match is preferred; a name search is the fallback.
  const findCounterpart = (author, position) => {
    const samePosition = arxivAuthors[position];
    if (samePosition && namesMatch(author.name, samePosition.name)) {
      return samePosition;
    }
    return arxivAuthors.find((candidate) => namesMatch(author.name, candidate.name));
  };

  return s2Authors.map((author, position) => {
    // The first listed author is flagged as first author when nothing else says so.
    const leadFlag = position === 0 ? true : void 0;
    const counterpart = findCounterpart(author, position);
    if (!counterpart) {
      return {
        ...author,
        isFirstAuthor: author.isFirstAuthor || leadFlag,
      };
    }
    return {
      ...author,
      affiliations: author.affiliations ?? counterpart.affiliations,
      isFirstAuthor: counterpart.isFirstAuthor || author.isFirstAuthor || leadFlag,
      isCorresponding: counterpart.isCorresponding || author.isCorresponding || void 0,
    };
  });
}
1343
/**
 * Decides whether two author names plausibly refer to the same person.
 *
 * Names are compared after folding case and diacritics and dropping every
 * character that is neither a letter nor whitespace. Two names match when the
 * normalised forms are identical, or when their last words are identical and
 * longer than one character (so "John Smith" matches "J. Smith").
 *
 * Names that normalise to nothing (empty, digits or punctuation only) never
 * match, so unrelated names cannot be paired through an empty string.
 *
 * @param {string} a - First author name.
 * @param {string} b - Second author name.
 * @returns {boolean} `true` when the names are considered the same author.
 */
function namesMatch(a, b) {
  const normalize = (name) =>
    name
      // Decompose accented letters, then drop the combining marks ("José" -> "Jose").
      .normalize("NFKD")
      .replace(/\p{M}/gu, "")
      .toLowerCase()
      // Keep letters of any script; an ASCII-only filter would erase non-Latin names.
      .replace(/[^\p{L}\s]/gu, "")
      .trim();

  const na = normalize(a);
  const nb = normalize(b);
  if (!na || !nb) return false;
  if (na === nb) return true;

  const lastA = na.split(/\s+/).pop() ?? "";
  const lastB = nb.split(/\s+/).pop() ?? "";
  return lastA === lastB && lastA.length > 1;
}
1352
+ export {
1353
+ AdaptiveRateLimiter,
1354
+ ArxivSource,
1355
+ GoogleScholarSource,
1356
+ HttpError,
1357
+ RateLimiter,
1358
+ SemanticScholarSource,
1359
+ createClient,
1360
+ deduplicatePapers,
1361
+ getFullPaper,
1362
+ getPaper,
1363
+ searchPapers,
1364
+ withRetry
1365
+ };