gyoshu 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1048 +0,0 @@
1
- /**
2
- * Literature Client for Gyoshu Research System
3
- *
4
- * Provides access to academic literature APIs (Crossref, arXiv) with:
5
- * - Local caching to reduce API calls
6
- * - Rate limiting (1 request/sec for Crossref)
7
- * - Retry logic with exponential backoff
8
- * - Consistent Citation interface across sources
9
- *
10
- * Usage:
11
- * ```typescript
12
- * import { searchCrossref, searchArxiv, getCitationByDOI } from './literature-client';
13
- *
14
- * // Search by title
15
- * const results = await searchCrossref('neural network optimization');
16
- *
17
- * // Get specific paper by DOI
18
- * const citation = await getCitationByDOI('10.1038/nature12373');
19
- *
20
- * // Search arXiv
21
- * const arxivResults = await searchArxiv('transformer attention mechanism');
22
- * ```
23
- *
24
- * @module literature-client
25
- */
26
-
27
- import * as fs from "fs/promises";
28
- import * as path from "path";
29
- import { getReportsRootDir, ensureDirSync } from "./paths";
30
- import { durableAtomicWrite, fileExists } from "./atomic-write";
31
-
32
- // =============================================================================
33
- // INTERFACES
34
- // =============================================================================
35
-
36
/**
 * Represents an academic citation/paper reference.
 *
 * Normalized shape shared by the Crossref and arXiv clients so callers
 * can treat results from either source uniformly.
 */
export interface Citation {
  /** First author or all authors (formatted string); "Unknown" when absent */
  authors: string;
  /** Paper title; "Untitled" when the source provides none */
  title: string;
  /** Publication year */
  year: number | null;
  /** Journal or venue name (for arXiv, the primary category, e.g. "arXiv:cs.LG") */
  journal: string | null;
  /** Digital Object Identifier */
  doi: string | null;
  /** URL to the paper (publisher page or PDF) */
  url: string | null;
  /** Paper abstract (if available) */
  abstract: string | null;
  /** arXiv ID if from arXiv (e.g., "2301.12345") */
  arxivId: string | null;
  /** Source of this citation: 'crossref' | 'arxiv' */
  source: "crossref" | "arxiv";
}
59
-
60
/**
 * Result of a literature search operation.
 */
export interface SearchResult {
  /** Array of matching citations */
  citations: Citation[];
  /** Total number of results available (may be more than returned) */
  totalResults: number;
  /** The query that was executed */
  query: string;
  /** Source of results: 'crossref' | 'arxiv' */
  source: "crossref" | "arxiv";
  /** Whether results came from cache (true = no network call was made) */
  fromCache: boolean;
}
75
-
76
/**
 * Cache entry for a single citation or search result.
 */
interface CacheEntry {
  /** The cached data */
  data: Citation | SearchResult;
  /** Timestamp when cached (ms since epoch); compared against CACHE_EXPIRY_MS */
  cachedAt: number;
}
85
-
86
/**
 * Literature cache stored as a JSON file on disk.
 */
export interface LiteratureCache {
  /** Version for future migrations; a mismatch causes the cache to be cleared */
  version: number;
  /** Map of cache keys to entries */
  entries: Record<string, CacheEntry>;
  /** Last time cache was written (ISO 8601 string) */
  lastUpdated: string;
}
97
-
98
// =============================================================================
// CONSTANTS
// =============================================================================

/** Cache file name (lives in the reports root directory) */
const CACHE_FILE_NAME = ".gyoshu-literature-cache.json";

/** Cache expiry in milliseconds (7 days) */
const CACHE_EXPIRY_MS = 7 * 24 * 60 * 60 * 1000;

/** Current cache schema version; bump to invalidate previously written caches */
const CACHE_VERSION = 1;

/** Crossref API base URL */
const CROSSREF_API_BASE = "https://api.crossref.org";

/** arXiv API base URL (HTTPS required for security) */
const ARXIV_API_BASE = "https://export.arxiv.org/api/query";

/** Rate limit: minimum ms between Crossref requests (1 request/sec) */
const CROSSREF_RATE_LIMIT_MS = 1000;

/** Rate limit: minimum ms between arXiv requests (etiquette requirement) */
const ARXIV_RATE_LIMIT_MS = 3000;

/** Per-request fetch timeout in milliseconds */
const FETCH_TIMEOUT_MS = 10000;

/** Total number of fetch attempts per request (loop bound in fetchWithRetry) */
const MAX_RETRIES = 3;

/** Base delay for exponential backoff (ms); doubles each failed attempt */
const BASE_RETRY_DELAY_MS = 1000;

/** User agent for API requests (Crossref requires this for polite pool) */
const USER_AGENT = "Gyoshu-Research-System/1.0 (https://github.com/gyoshu; mailto:research@gyoshu.dev)";
134
-
135
- // =============================================================================
136
- // RATE LIMITING
137
- // =============================================================================
138
-
139
/** Timestamp (ms epoch) of the last Crossref API call; 0 = never called */
let lastCrossrefCallTime = 0;

/** Timestamp (ms epoch) of the last arXiv API call; 0 = never called */
let lastArxivCallTime = 0;
144
-
145
- /**
146
- * Wait to ensure rate limit compliance before making a Crossref API call.
147
- */
148
- async function waitForCrossrefRateLimit(): Promise<void> {
149
- const now = Date.now();
150
- const timeSinceLastCall = now - lastCrossrefCallTime;
151
-
152
- if (timeSinceLastCall < CROSSREF_RATE_LIMIT_MS) {
153
- const waitTime = CROSSREF_RATE_LIMIT_MS - timeSinceLastCall;
154
- await sleep(waitTime);
155
- }
156
-
157
- lastCrossrefCallTime = Date.now();
158
- }
159
-
160
- /**
161
- * Wait to ensure rate limit compliance before making an arXiv API call.
162
- * arXiv requires 3 second delay between requests per their usage guidelines.
163
- */
164
- async function waitForArxivRateLimit(): Promise<void> {
165
- const now = Date.now();
166
- const timeSinceLastCall = now - lastArxivCallTime;
167
-
168
- if (timeSinceLastCall < ARXIV_RATE_LIMIT_MS) {
169
- const waitTime = ARXIV_RATE_LIMIT_MS - timeSinceLastCall;
170
- await sleep(waitTime);
171
- }
172
-
173
- lastArxivCallTime = Date.now();
174
- }
175
-
176
- /**
177
- * Sleep for specified milliseconds.
178
- */
179
- function sleep(ms: number): Promise<void> {
180
- return new Promise((resolve) => setTimeout(resolve, ms));
181
- }
182
-
183
- // =============================================================================
184
- // CACHE LAYER
185
- // =============================================================================
186
-
187
- /**
188
- * Get the path to the literature cache file.
189
- */
190
- function getCachePath(): string {
191
- const reportsDir = getReportsRootDir();
192
- return path.join(reportsDir, CACHE_FILE_NAME);
193
- }
194
-
195
- /**
196
- * Load the literature cache from disk.
197
- * Returns empty cache if file doesn't exist or is invalid.
198
- */
199
- async function loadCache(): Promise<LiteratureCache> {
200
- const cachePath = getCachePath();
201
-
202
- try {
203
- if (!(await fileExists(cachePath))) {
204
- return createEmptyCache();
205
- }
206
-
207
- const content = await fs.readFile(cachePath, "utf-8");
208
- const cache = JSON.parse(content) as LiteratureCache;
209
-
210
- // Check version compatibility
211
- if (cache.version !== CACHE_VERSION) {
212
- console.warn(`[literature-client] Cache version mismatch, clearing cache`);
213
- return createEmptyCache();
214
- }
215
-
216
- return cache;
217
- } catch (error) {
218
- console.warn(`[literature-client] Failed to load cache: ${(error as Error).message}`);
219
- return createEmptyCache();
220
- }
221
- }
222
-
223
- /**
224
- * Create an empty cache object.
225
- */
226
- function createEmptyCache(): LiteratureCache {
227
- return {
228
- version: CACHE_VERSION,
229
- entries: {},
230
- lastUpdated: new Date().toISOString(),
231
- };
232
- }
233
-
234
- /**
235
- * Save the cache to disk atomically.
236
- */
237
- async function saveCache(cache: LiteratureCache): Promise<void> {
238
- const cachePath = getCachePath();
239
-
240
- // Ensure the reports directory exists
241
- const reportsDir = getReportsRootDir();
242
- ensureDirSync(reportsDir);
243
-
244
- cache.lastUpdated = new Date().toISOString();
245
-
246
- try {
247
- await durableAtomicWrite(cachePath, JSON.stringify(cache, null, 2));
248
- } catch (error) {
249
- console.warn(`[literature-client] Failed to save cache: ${(error as Error).message}`);
250
- }
251
- }
252
-
253
- /**
254
- * Get a cached entry if it exists and is not expired.
255
- */
256
- async function getCached<T>(key: string): Promise<T | null> {
257
- const cache = await loadCache();
258
- const entry = cache.entries[key];
259
-
260
- if (!entry) {
261
- return null;
262
- }
263
-
264
- const age = Date.now() - entry.cachedAt;
265
- if (age > CACHE_EXPIRY_MS) {
266
- // Expired, remove from cache
267
- delete cache.entries[key];
268
- await saveCache(cache);
269
- return null;
270
- }
271
-
272
- return entry.data as T;
273
- }
274
-
275
- /**
276
- * Store an entry in the cache.
277
- */
278
- async function setCached<T extends Citation | SearchResult>(key: string, data: T): Promise<void> {
279
- const cache = await loadCache();
280
-
281
- cache.entries[key] = {
282
- data,
283
- cachedAt: Date.now(),
284
- };
285
-
286
- await saveCache(cache);
287
- }
288
-
289
- /**
290
- * Generate a cache key for a search query.
291
- */
292
- function searchCacheKey(source: string, query: string, limit: number): string {
293
- return `search:${source}:${query.toLowerCase().trim()}:${limit}`;
294
- }
295
-
296
- /**
297
- * Generate a cache key for a DOI lookup.
298
- */
299
- function doiCacheKey(doi: string): string {
300
- return `doi:${doi.toLowerCase().trim()}`;
301
- }
302
-
303
- // =============================================================================
304
- // RETRY LOGIC
305
- // =============================================================================
306
-
307
- /**
308
- * Error thrown when API call fails after all retries.
309
- */
310
- export class LiteratureAPIError extends Error {
311
- constructor(
312
- message: string,
313
- public readonly statusCode?: number,
314
- public readonly source?: string
315
- ) {
316
- super(message);
317
- this.name = "LiteratureAPIError";
318
- }
319
- }
320
-
321
- /**
322
- * Execute a fetch with retry logic and exponential backoff.
323
- *
324
- * @param url - URL to fetch
325
- * @param options - Fetch options
326
- * @returns Response object
327
- * @throws LiteratureAPIError if all retries fail
328
- */
329
- async function fetchWithRetry(url: string, options: RequestInit = {}): Promise<Response> {
330
- let lastError: Error | null = null;
331
-
332
- for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
333
- try {
334
- const response = await fetch(url, {
335
- ...options,
336
- headers: {
337
- "User-Agent": USER_AGENT,
338
- Accept: "application/json",
339
- ...options.headers,
340
- },
341
- signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
342
- });
343
-
344
- // Success or client error (4xx) - don't retry
345
- if (response.ok || (response.status >= 400 && response.status < 500 && response.status !== 429)) {
346
- return response;
347
- }
348
-
349
- // Rate limited or server error - retry with backoff
350
- if (response.status === 429 || response.status >= 500) {
351
- const delay = BASE_RETRY_DELAY_MS * Math.pow(2, attempt);
352
- console.warn(`[literature-client] Request failed with ${response.status}, retrying in ${delay}ms...`);
353
- await sleep(delay);
354
- lastError = new LiteratureAPIError(`HTTP ${response.status}`, response.status);
355
- continue;
356
- }
357
-
358
- // Unexpected status
359
- return response;
360
- } catch (error) {
361
- // Network error - retry with backoff
362
- const delay = BASE_RETRY_DELAY_MS * Math.pow(2, attempt);
363
- console.warn(`[literature-client] Network error, retrying in ${delay}ms: ${(error as Error).message}`);
364
- await sleep(delay);
365
- lastError = error as Error;
366
- }
367
- }
368
-
369
- throw new LiteratureAPIError(
370
- `Failed after ${MAX_RETRIES} retries: ${lastError?.message || "Unknown error"}`,
371
- undefined,
372
- "network"
373
- );
374
- }
375
-
376
- // =============================================================================
377
- // CROSSREF API CLIENT
378
- // =============================================================================
379
-
380
- /**
381
- * Parse a Crossref work item into a Citation.
382
- */
383
- function parseCrossrefWork(work: Record<string, unknown>): Citation {
384
- // Extract authors
385
- const authorList = work.author as Array<{ given?: string; family?: string }> | undefined;
386
- let authors = "Unknown";
387
- if (authorList && authorList.length > 0) {
388
- authors = authorList
389
- .map((a) => {
390
- if (a.given && a.family) {
391
- return `${a.given} ${a.family}`;
392
- } else if (a.family) {
393
- return a.family;
394
- }
395
- return "Unknown";
396
- })
397
- .join(", ");
398
- }
399
-
400
- // Extract title
401
- const titleArray = work.title as string[] | undefined;
402
- const title = titleArray && titleArray.length > 0 ? titleArray[0] : "Untitled";
403
-
404
- // Extract year
405
- const published = work.published as { "date-parts"?: number[][] } | undefined;
406
- const issued = work.issued as { "date-parts"?: number[][] } | undefined;
407
- const dateParts = published?.["date-parts"]?.[0] || issued?.["date-parts"]?.[0];
408
- const year = dateParts && dateParts.length > 0 ? dateParts[0] : null;
409
-
410
- // Extract journal/venue
411
- const containerTitle = work["container-title"] as string[] | undefined;
412
- const journal = containerTitle && containerTitle.length > 0 ? containerTitle[0] : null;
413
-
414
- // Extract DOI
415
- const doi = (work.DOI as string) || null;
416
-
417
- // Extract URL
418
- const url = (work.URL as string) || (doi ? `https://doi.org/${doi}` : null);
419
-
420
- // Extract abstract
421
- const abstract = (work.abstract as string) || null;
422
-
423
- return {
424
- authors,
425
- title,
426
- year,
427
- journal,
428
- doi,
429
- url,
430
- abstract,
431
- arxivId: null,
432
- source: "crossref",
433
- };
434
- }
435
-
436
- /**
437
- * Get citation metadata for a DOI from Crossref.
438
- *
439
- * @param doi - Digital Object Identifier (e.g., "10.1038/nature12373")
440
- * @returns Citation object or null if not found
441
- *
442
- * @example
443
- * const citation = await getCitationByDOI('10.1038/nature12373');
444
- * console.log(citation?.title);
445
- */
446
- export async function getCitationByDOI(doi: string): Promise<Citation | null> {
447
- // Normalize DOI
448
- const normalizedDoi = doi.replace(/^https?:\/\/doi\.org\//i, "").trim();
449
-
450
- // Check cache first
451
- const cacheKey = doiCacheKey(normalizedDoi);
452
- const cached = await getCached<Citation>(cacheKey);
453
- if (cached) {
454
- return cached;
455
- }
456
-
457
- // Make API call
458
- await waitForCrossrefRateLimit();
459
-
460
- const url = `${CROSSREF_API_BASE}/works/${encodeURIComponent(normalizedDoi)}`;
461
-
462
- try {
463
- const response = await fetchWithRetry(url);
464
-
465
- if (response.status === 404) {
466
- return null;
467
- }
468
-
469
- if (!response.ok) {
470
- throw new LiteratureAPIError(
471
- `Crossref API error: ${response.status} ${response.statusText}`,
472
- response.status,
473
- "crossref"
474
- );
475
- }
476
-
477
- const data = (await response.json()) as { message: Record<string, unknown> };
478
- const citation = parseCrossrefWork(data.message);
479
-
480
- // Cache the result
481
- await setCached(cacheKey, citation);
482
-
483
- return citation;
484
- } catch (error) {
485
- if (error instanceof LiteratureAPIError) {
486
- throw error;
487
- }
488
- throw new LiteratureAPIError(
489
- `Failed to fetch DOI ${normalizedDoi}: ${(error as Error).message}`,
490
- undefined,
491
- "crossref"
492
- );
493
- }
494
- }
495
-
496
- /**
497
- * Search Crossref for papers matching a title query.
498
- *
499
- * @param query - Search query (title, author, keyword)
500
- * @param limit - Maximum number of results to return (default: 10, max: 100)
501
- * @returns SearchResult with matching citations
502
- *
503
- * @example
504
- * const results = await searchCrossref('deep learning optimization', 5);
505
- * console.log(`Found ${results.totalResults} results`);
506
- * results.citations.forEach(c => console.log(c.title));
507
- */
508
- export async function searchCrossref(query: string, limit: number = 10): Promise<SearchResult> {
509
- // Clamp limit to reasonable range
510
- const clampedLimit = Math.min(Math.max(1, limit), 100);
511
-
512
- // Check cache first
513
- const cacheKey = searchCacheKey("crossref", query, clampedLimit);
514
- const cached = await getCached<SearchResult>(cacheKey);
515
- if (cached) {
516
- return { ...cached, fromCache: true };
517
- }
518
-
519
- // Make API call
520
- await waitForCrossrefRateLimit();
521
-
522
- const params = new URLSearchParams({
523
- query: query,
524
- rows: clampedLimit.toString(),
525
- });
526
-
527
- const url = `${CROSSREF_API_BASE}/works?${params.toString()}`;
528
-
529
- try {
530
- const response = await fetchWithRetry(url);
531
-
532
- if (!response.ok) {
533
- throw new LiteratureAPIError(
534
- `Crossref API error: ${response.status} ${response.statusText}`,
535
- response.status,
536
- "crossref"
537
- );
538
- }
539
-
540
- const data = (await response.json()) as {
541
- message: {
542
- "total-results": number;
543
- items: Array<Record<string, unknown>>;
544
- };
545
- };
546
-
547
- const citations = data.message.items.map(parseCrossrefWork);
548
-
549
- const result: SearchResult = {
550
- citations,
551
- totalResults: data.message["total-results"],
552
- query,
553
- source: "crossref",
554
- fromCache: false,
555
- };
556
-
557
- // Cache the result
558
- await setCached(cacheKey, result);
559
-
560
- return result;
561
- } catch (error) {
562
- if (error instanceof LiteratureAPIError) {
563
- throw error;
564
- }
565
- throw new LiteratureAPIError(
566
- `Crossref search failed: ${(error as Error).message}`,
567
- undefined,
568
- "crossref"
569
- );
570
- }
571
- }
572
-
573
- // =============================================================================
574
- // ARXIV API CLIENT
575
- // =============================================================================
576
-
577
- /**
578
- * Parse arXiv Atom XML response into Citations.
579
- * Uses regex-based parsing to avoid XML library dependency.
580
- */
581
- function parseArxivResponse(xml: string): { citations: Citation[]; totalResults: number } {
582
- const citations: Citation[] = [];
583
-
584
- // Extract total results
585
- const totalMatch = xml.match(/<opensearch:totalResults[^>]*>(\d+)<\/opensearch:totalResults>/);
586
- const totalResults = totalMatch ? parseInt(totalMatch[1], 10) : 0;
587
-
588
- // Extract entries
589
- const entryRegex = /<entry>([\s\S]*?)<\/entry>/g;
590
- let entryMatch;
591
-
592
- while ((entryMatch = entryRegex.exec(xml)) !== null) {
593
- const entry = entryMatch[1];
594
-
595
- // Extract title (remove newlines)
596
- const titleMatch = entry.match(/<title>([\s\S]*?)<\/title>/);
597
- const title = titleMatch
598
- ? titleMatch[1].replace(/\s+/g, " ").trim()
599
- : "Untitled";
600
-
601
- // Extract authors
602
- const authorRegex = /<author>[\s\S]*?<name>([\s\S]*?)<\/name>[\s\S]*?<\/author>/g;
603
- const authors: string[] = [];
604
- let authorMatch;
605
- while ((authorMatch = authorRegex.exec(entry)) !== null) {
606
- authors.push(authorMatch[1].trim());
607
- }
608
-
609
- // Extract abstract/summary
610
- const summaryMatch = entry.match(/<summary>([\s\S]*?)<\/summary>/);
611
- const abstract = summaryMatch
612
- ? summaryMatch[1].replace(/\s+/g, " ").trim()
613
- : null;
614
-
615
- // Extract arXiv ID from id URL
616
- const idMatch = entry.match(/<id>http:\/\/arxiv\.org\/abs\/([\d.]+v?\d*)<\/id>/);
617
- const arxivId = idMatch ? idMatch[1] : null;
618
-
619
- // Extract published date (for year)
620
- const publishedMatch = entry.match(/<published>(\d{4})-\d{2}-\d{2}/);
621
- const year = publishedMatch ? parseInt(publishedMatch[1], 10) : null;
622
-
623
- // Extract PDF link
624
- const pdfMatch = entry.match(/<link[^>]*title="pdf"[^>]*href="([^"]+)"/);
625
- const pdfUrl = pdfMatch ? pdfMatch[1] : null;
626
-
627
- // Extract abstract page URL
628
- const absMatch = entry.match(/<link[^>]*type="text\/html"[^>]*href="([^"]+)"/);
629
- const absUrl = absMatch ? absMatch[1] : (arxivId ? `https://arxiv.org/abs/${arxivId}` : null);
630
-
631
- // Extract primary category for journal/venue equivalent
632
- const categoryMatch = entry.match(/<arxiv:primary_category[^>]*term="([^"]+)"/);
633
- const category = categoryMatch ? `arXiv:${categoryMatch[1]}` : "arXiv";
634
-
635
- citations.push({
636
- authors: authors.length > 0 ? authors.join(", ") : "Unknown",
637
- title,
638
- year,
639
- journal: category,
640
- doi: null, // arXiv papers may have DOIs but not directly in the API
641
- url: pdfUrl || absUrl,
642
- abstract,
643
- arxivId,
644
- source: "arxiv",
645
- });
646
- }
647
-
648
- return { citations, totalResults };
649
- }
650
-
651
- /**
652
- * Search arXiv for papers matching a query.
653
- *
654
- * @param query - Search query (supports arXiv search syntax)
655
- * @param limit - Maximum number of results to return (default: 10, max: 100)
656
- * @returns SearchResult with matching citations
657
- *
658
- * @example
659
- * const results = await searchArxiv('all:transformer attention', 5);
660
- * results.citations.forEach(c => {
661
- * console.log(`${c.title} (arXiv:${c.arxivId})`);
662
- * console.log(` PDF: ${c.url}`);
663
- * });
664
- */
665
- export async function searchArxiv(query: string, limit: number = 10): Promise<SearchResult> {
666
- // Clamp limit to reasonable range
667
- const clampedLimit = Math.min(Math.max(1, limit), 100);
668
-
669
- // Check cache first
670
- const cacheKey = searchCacheKey("arxiv", query, clampedLimit);
671
- const cached = await getCached<SearchResult>(cacheKey);
672
- if (cached) {
673
- return { ...cached, fromCache: true };
674
- }
675
-
676
- // Build query URL
677
- // arXiv uses 'search_query' parameter with special field prefixes:
678
- // - ti: title, au: author, abs: abstract, all: all fields
679
- // If user doesn't specify a field, search all
680
- const searchQuery = query.includes(":") ? query : `all:${query}`;
681
-
682
- const params = new URLSearchParams({
683
- search_query: searchQuery,
684
- start: "0",
685
- max_results: clampedLimit.toString(),
686
- sortBy: "relevance",
687
- sortOrder: "descending",
688
- });
689
-
690
- const url = `${ARXIV_API_BASE}?${params.toString()}`;
691
-
692
- try {
693
- // Enforce arXiv rate limiting (3s between requests)
694
- await waitForArxivRateLimit();
695
-
696
- const response = await fetchWithRetry(url, {
697
- headers: {
698
- Accept: "application/atom+xml",
699
- },
700
- });
701
-
702
- if (!response.ok) {
703
- throw new LiteratureAPIError(
704
- `arXiv API error: ${response.status} ${response.statusText}`,
705
- response.status,
706
- "arxiv"
707
- );
708
- }
709
-
710
- const xml = await response.text();
711
- const { citations, totalResults } = parseArxivResponse(xml);
712
-
713
- const result: SearchResult = {
714
- citations,
715
- totalResults,
716
- query,
717
- source: "arxiv",
718
- fromCache: false,
719
- };
720
-
721
- // Cache the result
722
- await setCached(cacheKey, result);
723
-
724
- return result;
725
- } catch (error) {
726
- if (error instanceof LiteratureAPIError) {
727
- throw error;
728
- }
729
- throw new LiteratureAPIError(
730
- `arXiv search failed: ${(error as Error).message}`,
731
- undefined,
732
- "arxiv"
733
- );
734
- }
735
- }
736
-
737
- /**
738
- * Get a paper by arXiv ID.
739
- *
740
- * @param arxivId - arXiv identifier (e.g., "2301.12345" or "2301.12345v2")
741
- * @returns Citation object or null if not found
742
- *
743
- * @example
744
- * const paper = await getArxivPaper('2301.07041');
745
- * console.log(paper?.title);
746
- * console.log(paper?.url); // PDF URL
747
- */
748
- export async function getArxivPaper(arxivId: string): Promise<Citation | null> {
749
- // Normalize arXiv ID (remove arxiv: prefix if present)
750
- const normalizedId = arxivId.replace(/^arxiv:/i, "").trim();
751
-
752
- // Check cache first
753
- const cacheKey = `arxiv:${normalizedId}`;
754
- const cached = await getCached<Citation>(cacheKey);
755
- if (cached) {
756
- return cached;
757
- }
758
-
759
- // Use id_list parameter for direct lookup
760
- const params = new URLSearchParams({
761
- id_list: normalizedId,
762
- });
763
-
764
- const url = `${ARXIV_API_BASE}?${params.toString()}`;
765
-
766
- try {
767
- // Enforce arXiv rate limiting (3s between requests)
768
- await waitForArxivRateLimit();
769
-
770
- const response = await fetchWithRetry(url, {
771
- headers: {
772
- Accept: "application/atom+xml",
773
- },
774
- });
775
-
776
- if (!response.ok) {
777
- throw new LiteratureAPIError(
778
- `arXiv API error: ${response.status} ${response.statusText}`,
779
- response.status,
780
- "arxiv"
781
- );
782
- }
783
-
784
- const xml = await response.text();
785
- const { citations } = parseArxivResponse(xml);
786
-
787
- if (citations.length === 0) {
788
- return null;
789
- }
790
-
791
- const citation = citations[0];
792
-
793
- // Cache the result
794
- await setCached(cacheKey, citation);
795
-
796
- return citation;
797
- } catch (error) {
798
- if (error instanceof LiteratureAPIError) {
799
- throw error;
800
- }
801
- throw new LiteratureAPIError(
802
- `Failed to fetch arXiv paper ${normalizedId}: ${(error as Error).message}`,
803
- undefined,
804
- "arxiv"
805
- );
806
- }
807
- }
808
-
809
- // =============================================================================
810
- // UNIFIED SEARCH
811
- // =============================================================================
812
-
813
- /**
814
- * Search both Crossref and arXiv for papers matching a query.
815
- * Results are combined and deduplicated where possible.
816
- *
817
- * @param query - Search query
818
- * @param options - Search options
819
- * @returns Combined SearchResult from both sources
820
- *
821
- * @example
822
- * const results = await searchLiterature('neural network pruning', {
823
- * limit: 5,
824
- * sources: ['crossref', 'arxiv']
825
- * });
826
- */
827
- export async function searchLiterature(
828
- query: string,
829
- options: {
830
- limit?: number;
831
- sources?: Array<"crossref" | "arxiv">;
832
- } = {}
833
- ): Promise<SearchResult> {
834
- const { limit = 10, sources = ["crossref", "arxiv"] } = options;
835
-
836
- const results: Citation[] = [];
837
- let totalResults = 0;
838
- let fromCache = true;
839
-
840
- // Search each requested source
841
- const searchPromises: Promise<void>[] = [];
842
-
843
- if (sources.includes("crossref")) {
844
- searchPromises.push(
845
- searchCrossref(query, limit)
846
- .then((result) => {
847
- results.push(...result.citations);
848
- totalResults += result.totalResults;
849
- if (!result.fromCache) fromCache = false;
850
- })
851
- .catch((error) => {
852
- console.warn(`[literature-client] Crossref search failed: ${error.message}`);
853
- })
854
- );
855
- }
856
-
857
- if (sources.includes("arxiv")) {
858
- searchPromises.push(
859
- searchArxiv(query, limit)
860
- .then((result) => {
861
- results.push(...result.citations);
862
- totalResults += result.totalResults;
863
- if (!result.fromCache) fromCache = false;
864
- })
865
- .catch((error) => {
866
- console.warn(`[literature-client] arXiv search failed: ${error.message}`);
867
- })
868
- );
869
- }
870
-
871
- await Promise.all(searchPromises);
872
-
873
- // Sort by year (newest first), with null years at the end
874
- results.sort((a, b) => {
875
- if (a.year === null && b.year === null) return 0;
876
- if (a.year === null) return 1;
877
- if (b.year === null) return -1;
878
- return b.year - a.year;
879
- });
880
-
881
- // Limit total results
882
- const limitedResults = results.slice(0, limit);
883
-
884
- return {
885
- citations: limitedResults,
886
- totalResults,
887
- query,
888
- source: sources.length === 1 ? sources[0] : "crossref", // Default to crossref for mixed
889
- fromCache,
890
- };
891
- }
892
-
893
- // =============================================================================
894
- // CITATION FORMATTING
895
- // =============================================================================
896
-
897
- /**
898
- * Format a Citation as APA style reference.
899
- *
900
- * @param citation - Citation to format
901
- * @returns APA formatted string
902
- *
903
- * @example
904
- * const apa = formatCitationAPA(citation);
905
- * // "Smith, J., & Jones, M. (2023). Paper title. Journal Name."
906
- */
907
- export function formatCitationAPA(citation: Citation): string {
908
- const parts: string[] = [];
909
-
910
- // Authors
911
- parts.push(citation.authors);
912
-
913
- // Year
914
- if (citation.year) {
915
- parts.push(`(${citation.year}).`);
916
- } else {
917
- parts.push("(n.d.).");
918
- }
919
-
920
- // Title
921
- parts.push(citation.title + ".");
922
-
923
- // Journal/venue
924
- if (citation.journal) {
925
- parts.push(`*${citation.journal}*.`);
926
- }
927
-
928
- // DOI or URL
929
- if (citation.doi) {
930
- parts.push(`https://doi.org/${citation.doi}`);
931
- } else if (citation.url) {
932
- parts.push(citation.url);
933
- }
934
-
935
- return parts.join(" ");
936
- }
937
-
938
- /**
939
- * Format a Citation as BibTeX entry.
940
- *
941
- * @param citation - Citation to format
942
- * @param key - BibTeX entry key (defaults to generated key)
943
- * @returns BibTeX formatted string
944
- *
945
- * @example
946
- * const bibtex = formatCitationBibTeX(citation);
947
- */
948
- export function formatCitationBibTeX(citation: Citation, key?: string): string {
949
- // Generate a key if not provided
950
- const entryKey =
951
- key ||
952
- (citation.authors.split(",")[0].split(" ").pop() || "unknown") +
953
- (citation.year || "nd");
954
-
955
- const lines: string[] = [];
956
-
957
- const entryType = citation.arxivId ? "misc" : "article";
958
- lines.push(`@${entryType}{${entryKey.toLowerCase().replace(/\s+/g, "")},`);
959
-
960
- // Author
961
- lines.push(` author = {${citation.authors}},`);
962
-
963
- // Title
964
- lines.push(` title = {${citation.title}},`);
965
-
966
- // Year
967
- if (citation.year) {
968
- lines.push(` year = {${citation.year}},`);
969
- }
970
-
971
- // Journal
972
- if (citation.journal) {
973
- lines.push(` journal = {${citation.journal}},`);
974
- }
975
-
976
- // DOI
977
- if (citation.doi) {
978
- lines.push(` doi = {${citation.doi}},`);
979
- }
980
-
981
- // URL
982
- if (citation.url) {
983
- lines.push(` url = {${citation.url}},`);
984
- }
985
-
986
- // arXiv ID
987
- if (citation.arxivId) {
988
- lines.push(` eprint = {${citation.arxivId}},`);
989
- lines.push(` archivePrefix = {arXiv},`);
990
- }
991
-
992
- lines.push("}");
993
-
994
- return lines.join("\n");
995
- }
996
-
997
- // =============================================================================
998
- // CACHE MANAGEMENT
999
- // =============================================================================
1000
-
1001
- /**
1002
- * Clear the literature cache.
1003
- * Useful for testing or when cache needs to be refreshed.
1004
- */
1005
- export async function clearLiteratureCache(): Promise<void> {
1006
- const cachePath = getCachePath();
1007
-
1008
- try {
1009
- if (await fileExists(cachePath)) {
1010
- await fs.unlink(cachePath);
1011
- }
1012
- } catch (error) {
1013
- console.warn(`[literature-client] Failed to clear cache: ${(error as Error).message}`);
1014
- }
1015
- }
1016
-
1017
- /**
1018
- * Get cache statistics.
1019
- *
1020
- * @returns Object with cache stats
1021
- */
1022
- export async function getCacheStats(): Promise<{
1023
- entryCount: number;
1024
- oldestEntry: string | null;
1025
- newestEntry: string | null;
1026
- cacheFilePath: string;
1027
- }> {
1028
- const cache = await loadCache();
1029
- const entries = Object.values(cache.entries);
1030
-
1031
- if (entries.length === 0) {
1032
- return {
1033
- entryCount: 0,
1034
- oldestEntry: null,
1035
- newestEntry: null,
1036
- cacheFilePath: getCachePath(),
1037
- };
1038
- }
1039
-
1040
- const sorted = entries.sort((a, b) => a.cachedAt - b.cachedAt);
1041
-
1042
- return {
1043
- entryCount: entries.length,
1044
- oldestEntry: new Date(sorted[0].cachedAt).toISOString(),
1045
- newestEntry: new Date(sorted[sorted.length - 1].cachedAt).toISOString(),
1046
- cacheFilePath: getCachePath(),
1047
- };
1048
- }