docrev 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/doi.js ADDED
@@ -0,0 +1,823 @@
1
+ /**
2
+ * DOI validation and fetching utilities
3
+ * Check DOIs in .bib files, fetch BibTeX from DOIs
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+
8
+ // Entry types that typically don't have DOIs
9
+ const NO_DOI_TYPES = new Set([
10
+ 'book', // Books often don't have DOIs (chapters might)
11
+ 'inbook', // Book chapters - variable
12
+ 'thesis', // Theses rarely have DOIs
13
+ 'mastersthesis',
14
+ 'phdthesis',
15
+ 'misc', // Catch-all, often no DOI
16
+ 'unpublished', // By definition
17
+ 'manual', // Software manuals
18
+ 'techreport', // Some do, many don't
19
+ 'booklet',
20
+ ]);
21
+
22
+ // Entry types that should have DOIs
23
+ const EXPECT_DOI_TYPES = new Set([
24
+ 'article', // Journal articles should have DOIs
25
+ 'inproceedings', // Conference papers usually do
26
+ 'proceedings',
27
+ 'incollection', // Book chapters in collections
28
+ ]);
29
+
30
+ /**
31
+ * Parse .bib file and extract entries with DOI info
32
+ * @param {string} bibPath
33
+ * @returns {Array<{key: string, type: string, doi: string|null, title: string, authorRaw: string, year: number|null, journal: string, skip: boolean, expectDoi: boolean, noDoi: boolean, line: number}>}
34
+ */
35
+ export function parseBibEntries(bibPath) {
36
+ if (!fs.existsSync(bibPath)) {
37
+ return [];
38
+ }
39
+
40
+ const content = fs.readFileSync(bibPath, 'utf-8');
41
+ const entries = [];
42
+ const lines = content.split('\n');
43
+
44
+ // Pattern for bib entries: @type{key,
45
+ const entryPattern = /@(\w+)\s*\{\s*([^,\s]+)\s*,/g;
46
+
47
+ let match;
48
+ while ((match = entryPattern.exec(content)) !== null) {
49
+ const type = match[1].toLowerCase();
50
+ const key = match[2];
51
+ const startPos = match.index;
52
+
53
+ // Find the line number
54
+ let line = 1;
55
+ for (let i = 0; i < startPos; i++) {
56
+ if (content[i] === '\n') line++;
57
+ }
58
+
59
+ // Find the end of this entry (matching closing brace)
60
+ let braceCount = 0;
61
+ let entryEnd = startPos;
62
+ let inEntry = false;
63
+
64
+ for (let i = startPos; i < content.length; i++) {
65
+ if (content[i] === '{') {
66
+ braceCount++;
67
+ inEntry = true;
68
+ } else if (content[i] === '}') {
69
+ braceCount--;
70
+ if (inEntry && braceCount === 0) {
71
+ entryEnd = i + 1;
72
+ break;
73
+ }
74
+ }
75
+ }
76
+
77
+ const entryContent = content.slice(startPos, entryEnd);
78
+
79
+ // Extract DOI field
80
+ const doiMatch = entryContent.match(/\bdoi\s*=\s*[{"]([^}"]+)[}"]/i);
81
+ let doi = doiMatch ? doiMatch[1].trim() : null;
82
+
83
+ // Clean DOI - remove URL prefix if present
84
+ if (doi) {
85
+ doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
86
+ }
87
+
88
+ // Extract title for display
89
+ const titleMatch = entryContent.match(/\btitle\s*=\s*[{"]([^}"]+)[}"]/i);
90
+ const title = titleMatch ? titleMatch[1].trim().slice(0, 60) : '';
91
+
92
+ // Extract author for lookup
93
+ const authorMatch = entryContent.match(/\bauthor\s*=\s*[{"]([^}"]+)[}"]/i);
94
+ const authorRaw = authorMatch ? authorMatch[1].trim() : '';
95
+
96
+ // Extract year
97
+ const yearMatch = entryContent.match(/\byear\s*=\s*[{"]?(\d{4})[}"]?/i);
98
+ const year = yearMatch ? parseInt(yearMatch[1], 10) : null;
99
+
100
+ // Extract journal
101
+ const journalMatch = entryContent.match(/\bjournal\s*=\s*[{"]([^}"]+)[}"]/i);
102
+ const journal = journalMatch ? journalMatch[1].trim() : '';
103
+
104
+ // Check for skip marker: nodoi = {true} or nodoi = true
105
+ const skipMatch = entryContent.match(/\bnodoi\s*=\s*[{"]?(true|yes|1)[}"]?/i);
106
+ const skip = !!skipMatch;
107
+
108
+ // Check for comment marker before entry: % no-doi
109
+ const linesBefore = content.slice(Math.max(0, startPos - 200), startPos);
110
+ const commentSkip = /% *no-?doi/i.test(linesBefore);
111
+
112
+ entries.push({
113
+ key,
114
+ type,
115
+ doi,
116
+ title,
117
+ authorRaw,
118
+ year,
119
+ journal,
120
+ skip: skip || commentSkip,
121
+ expectDoi: EXPECT_DOI_TYPES.has(type),
122
+ noDoi: NO_DOI_TYPES.has(type),
123
+ line,
124
+ });
125
+ }
126
+
127
+ return entries;
128
+ }
129
+
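A minimal usage sketch for the parser above (illustrative only, not part of the shipped file); the relative import specifier and the refs.bib path are assumptions.

import { parseBibEntries } from './lib/doi.js';

const entries = parseBibEntries('refs.bib');
for (const e of entries) {
  // e.doi is null when the entry has no doi field; e.skip reflects nodoi/% no-doi markers
  console.log(`${e.key} (${e.type}) line ${e.line}: ${e.doi ?? 'no DOI'}${e.skip ? ' [skipped]' : ''}`);
}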
130
+ /**
131
+ * Validate DOI format
132
+ * @param {string} doi
133
+ * @returns {boolean}
134
+ */
135
+ export function isValidDoiFormat(doi) {
136
+ if (!doi) return false;
137
+ // DOI format: 10.prefix/suffix
138
+ // Prefix is 4+ digits, suffix can contain most characters
139
+ return /^10\.\d{4,}\/[^\s]+$/.test(doi);
140
+ }
141
+
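A quick illustration of the format check; the DOI strings below are made up and only meant to exercise the pattern.

import { isValidDoiFormat } from './lib/doi.js';

console.log(isValidDoiFormat('10.1000/xyz123'));                  // true: 10.<4+ digits>/<suffix>
console.log(isValidDoiFormat('https://doi.org/10.1000/xyz123'));  // false: URL prefix must be stripped first
console.log(isValidDoiFormat('10.99/short'));                     // false: prefix needs at least 4 digits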
142
+ /**
143
+ * Check if DOI resolves via DataCite (for Zenodo, Figshare, etc.)
144
+ * @param {string} doi
145
+ * @returns {Promise<{valid: boolean, metadata?: object, error?: string}>}
146
+ */
147
+ async function checkDoiDataCite(doi) {
148
+ try {
149
+ const response = await fetch(`https://api.datacite.org/dois/${encodeURIComponent(doi)}`, {
150
+ headers: {
151
+ 'Accept': 'application/vnd.api+json',
152
+ 'User-Agent': 'rev-cli/0.2.0',
153
+ },
154
+ });
155
+
156
+ if (response.status === 404) {
157
+ return { valid: false, error: 'DOI not found in DataCite' };
158
+ }
159
+
160
+ if (!response.ok) {
161
+ return { valid: false, error: `HTTP ${response.status}` };
162
+ }
163
+
164
+ const data = await response.json();
165
+ const attrs = data.data?.attributes;
166
+
167
+ if (!attrs) {
168
+ return { valid: false, error: 'Invalid DataCite response' };
169
+ }
170
+
171
+ return {
172
+ valid: true,
173
+ source: 'datacite',
174
+ metadata: {
175
+ title: attrs.titles?.[0]?.title || '',
176
+ authors: attrs.creators?.map(c => `${c.givenName || ''} ${c.familyName || ''}`.trim()) || [],
177
+ year: attrs.publicationYear,
178
+ journal: attrs.publisher || '',
179
+ type: attrs.types?.resourceTypeGeneral || '',
180
+ },
181
+ };
182
+ } catch (err) {
183
+ return { valid: false, error: err.message };
184
+ }
185
+ }
186
+
187
+ /**
188
+ * Check if DOI resolves (exists) - tries Crossref first, then DataCite
189
+ * @param {string} doi
190
+ * @returns {Promise<{valid: boolean, source?: string, metadata?: object, error?: string}>}
191
+ */
192
+ export async function checkDoi(doi) {
193
+ if (!isValidDoiFormat(doi)) {
194
+ return { valid: false, error: 'Invalid DOI format' };
195
+ }
196
+
197
+ // Zenodo DOIs start with 10.5281 - check DataCite first
198
+ const isZenodo = doi.startsWith('10.5281/');
199
+ const isFigshare = doi.startsWith('10.6084/');
200
+ const isDataCiteLikely = isZenodo || isFigshare;
201
+
202
+ if (isDataCiteLikely) {
203
+ const dataciteResult = await checkDoiDataCite(doi);
204
+ if (dataciteResult.valid) {
205
+ return dataciteResult;
206
+ }
207
+ }
208
+
209
+ try {
210
+ // Use Crossref API to check DOI
211
+ const response = await fetch(`https://api.crossref.org/works/${encodeURIComponent(doi)}`, {
212
+ headers: {
213
+ 'User-Agent': 'rev-cli/0.2.0 (mailto:dev@example.com)',
214
+ },
215
+ });
216
+
217
+ if (response.status === 404) {
218
+ // Try DataCite as fallback (if not already tried)
219
+ if (!isDataCiteLikely) {
220
+ const dataciteResult = await checkDoiDataCite(doi);
221
+ if (dataciteResult.valid) {
222
+ return dataciteResult;
223
+ }
224
+ }
225
+ return { valid: false, error: 'DOI not found' };
226
+ }
227
+
228
+ if (!response.ok) {
229
+ return { valid: false, error: `HTTP ${response.status}` };
230
+ }
231
+
232
+ const data = await response.json();
233
+ const work = data.message;
234
+
235
+ return {
236
+ valid: true,
237
+ source: 'crossref',
238
+ metadata: {
239
+ title: work.title?.[0] || '',
240
+ authors: work.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
241
+ year: work.published?.['date-parts']?.[0]?.[0] || work.created?.['date-parts']?.[0]?.[0],
242
+ journal: work['container-title']?.[0] || '',
243
+ type: work.type,
244
+ },
245
+ };
246
+ } catch (err) {
247
+ return { valid: false, error: err.message };
248
+ }
249
+ }
250
+
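A usage sketch for the resolver above; the Zenodo-style DOI is a placeholder and may not resolve, and the import specifier is an assumption.

import { checkDoi } from './lib/doi.js';

const result = await checkDoi('10.5281/zenodo.1234567');  // 10.5281/... routes to DataCite first
if (result.valid) {
  console.log(`${result.source}: ${result.metadata.title} (${result.metadata.year})`);
} else {
  console.error(`DOI check failed: ${result.error}`);
}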
251
+ /**
252
+ * Fetch BibTeX from DOI using content negotiation
253
+ * @param {string} doi
254
+ * @returns {Promise<{success: boolean, bibtex?: string, error?: string}>}
255
+ */
256
+ export async function fetchBibtex(doi) {
257
+ // Clean DOI
258
+ doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
259
+
260
+ if (!isValidDoiFormat(doi)) {
261
+ return { success: false, error: 'Invalid DOI format' };
262
+ }
263
+
264
+ try {
265
+ const response = await fetch(`https://doi.org/${encodeURIComponent(doi)}`, {
266
+ headers: {
267
+ 'Accept': 'application/x-bibtex',
268
+ 'User-Agent': 'rev-cli/0.2.0',
269
+ },
270
+ redirect: 'follow',
271
+ });
272
+
273
+ if (!response.ok) {
274
+ return { success: false, error: `HTTP ${response.status}` };
275
+ }
276
+
277
+ const bibtex = await response.text();
278
+
279
+ if (!bibtex.includes('@')) {
280
+ return { success: false, error: 'Invalid BibTeX response' };
281
+ }
282
+
283
+ return { success: true, bibtex: bibtex.trim() };
284
+ } catch (err) {
285
+ return { success: false, error: err.message };
286
+ }
287
+ }
288
+
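A sketch of fetching a record via content negotiation; the DOI is a placeholder.

import { fetchBibtex } from './lib/doi.js';

const res = await fetchBibtex('https://doi.org/10.1000/xyz123');  // URL form is accepted; the prefix is stripped
if (res.success) {
  console.log(res.bibtex);        // BibTeX returned by doi.org content negotiation
} else {
  console.error(res.error);
}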
289
+ /**
290
+ * Check all DOIs in a .bib file
291
+ * @param {string} bibPath
292
+ * @param {object} options
293
+ * @returns {Promise<{entries: Array, valid: number, invalid: number, missing: number, skipped: number}>}
294
+ */
295
+ export async function checkBibDois(bibPath, options = {}) {
296
+ const { checkMissing = false, parallel = 5 } = options;
297
+
298
+ const entries = parseBibEntries(bibPath);
299
+ const results = [];
300
+
301
+ let valid = 0;
302
+ let invalid = 0;
303
+ let missing = 0;
304
+ let skipped = 0;
305
+
306
+ // Process in batches to avoid rate limiting
307
+ for (let i = 0; i < entries.length; i += parallel) {
308
+ const batch = entries.slice(i, i + parallel);
309
+
310
+ const batchResults = await Promise.all(
311
+ batch.map(async (entry) => {
312
+ // Skip if marked
313
+ if (entry.skip) {
314
+ skipped++;
315
+ return { ...entry, status: 'skipped', message: 'Marked as no-doi' };
316
+ }
317
+
318
+ // No DOI field
319
+ if (!entry.doi) {
320
+ if (entry.noDoi) {
321
+ // Expected - books, theses, etc.
322
+ skipped++;
323
+ return { ...entry, status: 'skipped', message: `${entry.type} typically has no DOI` };
324
+ } else if (entry.expectDoi) {
325
+ // Should have DOI but doesn't
326
+ missing++;
327
+ return { ...entry, status: 'missing', message: 'Expected DOI for article/proceedings' };
328
+ } else {
329
+ skipped++;
330
+ return { ...entry, status: 'skipped', message: 'No DOI field' };
331
+ }
332
+ }
333
+
334
+ // Validate DOI format first
335
+ if (!isValidDoiFormat(entry.doi)) {
336
+ invalid++;
337
+ return { ...entry, status: 'invalid', message: 'Invalid DOI format' };
338
+ }
339
+
340
+ // Check if DOI resolves
341
+ const check = await checkDoi(entry.doi);
342
+ if (check.valid) {
343
+ valid++;
344
+ return { ...entry, status: 'valid', metadata: check.metadata };
345
+ } else {
346
+ invalid++;
347
+ return { ...entry, status: 'invalid', message: check.error };
348
+ }
349
+ })
350
+ );
351
+
352
+ results.push(...batchResults);
353
+
354
+ // Small delay between batches to be nice to the API
355
+ if (i + parallel < entries.length) {
356
+ await new Promise(r => setTimeout(r, 200));
357
+ }
358
+ }
359
+
360
+ return { entries: results, valid, invalid, missing, skipped };
361
+ }
362
+
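How a caller might run a whole-file check and report the summary counts; the refs.bib path is an assumption.

import { checkBibDois } from './lib/doi.js';

const report = await checkBibDois('refs.bib', { parallel: 5 });
console.log(`valid: ${report.valid}, invalid: ${report.invalid}, missing: ${report.missing}, skipped: ${report.skipped}`);
for (const e of report.entries.filter(e => e.status === 'invalid')) {
  console.log(`  ${e.key} (line ${e.line}): ${e.message}`);
}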
363
+ /**
364
+ * Search DataCite API (for Zenodo, Figshare, etc.)
365
+ * @param {string} title
366
+ * @param {string} author
367
+ * @param {number} year
368
+ * @returns {Promise<Array>}
369
+ */
370
+ async function searchDataCite(title, author = '', year = null) {
371
+ try {
372
+ // DataCite query syntax
373
+ let query = `titles.title:${title.replace(/[{}]/g, '')}`;
374
+ if (author) {
375
+ query += ` AND creators.name:${author}`;
376
+ }
377
+ if (year) {
378
+ query += ` AND publicationYear:${year}`;
379
+ }
380
+
381
+ const params = new URLSearchParams({
382
+ query: query,
383
+ 'page[size]': '5',
384
+ });
385
+
386
+ const response = await fetch(`https://api.datacite.org/dois?${params}`, {
387
+ headers: {
388
+ 'Accept': 'application/vnd.api+json',
389
+ 'User-Agent': 'rev-cli/0.2.0',
390
+ },
391
+ });
392
+
393
+ if (!response.ok) return [];
394
+
395
+ const data = await response.json();
396
+ const items = data.data || [];
397
+
398
+ return items.map(item => {
399
+ const attrs = item.attributes;
400
+ return {
401
+ DOI: item.id,
402
+ title: [attrs.titles?.[0]?.title || ''],
403
+ author: attrs.creators?.map(c => ({ family: c.familyName, given: c.givenName })) || [],
404
+ 'published-print': { 'date-parts': [[attrs.publicationYear]] },
405
+ 'container-title': [attrs.publisher || ''],
406
+ score: 50, // Base score for DataCite results
407
+ source: 'datacite',
408
+ };
409
+ });
410
+ } catch {
411
+ return [];
412
+ }
413
+ }
414
+
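For orientation, with the hypothetical inputs ('Global assessment report', 'Smith', 2019) the helper above sends the following query string (URL-encoded, with page[size]=5) to https://api.datacite.org/dois:

titles.title:Global assessment report AND creators.name:Smith AND publicationYear:2019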
415
+ /**
416
+ * Normalize text for comparison (lowercase, remove special chars)
417
+ * @param {string} text
418
+ * @returns {string}
419
+ */
420
+ function normalizeText(text) {
421
+ return (text || '')
422
+ .toLowerCase()
423
+ .replace(/[{}\\]/g, '') // Remove LaTeX braces
424
+ .replace(/[^a-z0-9\s]/g, ' ') // Replace special chars with space
425
+ .replace(/\s+/g, ' ')
426
+ .trim();
427
+ }
428
+
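A worked example of the normalisation above, using a made-up title string:

// normalizeText('{Biodiversity} \\& Climate -- A Review')
//   -> 'biodiversity climate a review'
// (case folded; braces and backslashes stripped; punctuation replaced by spaces; whitespace collapsed)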
429
+ /**
430
+ * Check if DOI looks like a supplement, figure, or review (not the main paper)
431
+ * @param {string} doi
432
+ * @param {string} title
433
+ * @param {string} journal
434
+ * @returns {boolean}
435
+ */
436
+ function isSupplementOrReview(doi, title = '', journal = '') {
437
+ const doiLower = (doi || '').toLowerCase();
438
+ const titleLower = (title || '').toLowerCase();
439
+ const journalLower = (journal || '').toLowerCase();
440
+
441
+ // Supplement/figure DOI patterns
442
+ if (/\.suppl|\/suppl|\.figure|\/figure|\.s\d+$|_s\d+$/i.test(doiLower)) {
443
+ return true;
444
+ }
445
+
446
+ // F1000/Faculty Opinions (post-publication reviews)
447
+ if (/10\.3410\/f\./i.test(doiLower) || /faculty opinions/i.test(journalLower)) {
448
+ return true;
449
+ }
450
+
451
+ // Title suggests it's supplementary material
452
+ if (/^supplementary|^supporting information|^appendix/i.test(titleLower)) {
453
+ return true;
454
+ }
455
+
456
+ return false;
457
+ }
458
+
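Illustrative cases for the filter above; the identifiers are hypothetical but constructed to match the patterns being tested.

// isSupplementOrReview('10.1371/journal.pone.0012345.s001')                  -> true  (.s<digits> supplement suffix)
// isSupplementOrReview('10.3410/f.718489795.793548134')                      -> true  (F1000/Faculty Opinions prefix)
// isSupplementOrReview('10.1234/example', 'Supplementary material for ...')  -> true  (title starts with "Supplementary")
// isSupplementOrReview('10.1234/example', 'A study of reef fish')            -> false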
459
+ /**
460
+ * Search for DOI by title and author using Crossref API (+ DataCite fallback)
461
+ * @param {string} title
462
+ * @param {string} author - First author's last name
463
+ * @param {number} year - Publication year (optional, improves accuracy)
464
+ * @param {string} journal - Expected journal name (optional, improves accuracy)
465
+ * @returns {Promise<{found: boolean, doi?: string, confidence?: number, metadata?: object, error?: string}>}
466
+ */
467
+ export async function lookupDoi(title, author = '', year = null, journal = '') {
468
+ if (!title || title.length < 10) {
469
+ return { found: false, error: 'Title too short for reliable search' };
470
+ }
471
+
472
+ // Check for keywords that suggest Zenodo/DataCite sources
473
+ const likelyZenodo = /\b(IPBES|zenodo|assessment report|secretariat)\b/i.test(title);
474
+
475
+ try {
476
+ // Build query - title is most important, add author and journal if available
477
+ let query = title;
478
+ if (author) {
479
+ query = `${title} ${author}`;
480
+ }
481
+ // Add journal to query for better matching
482
+ if (journal) {
483
+ query = `${query} ${journal}`;
484
+ }
485
+
486
+ let items = [];
487
+
488
+ // Try structured bibliographic query first (more accurate)
489
+ const structuredParams = new URLSearchParams({
490
+ rows: '10',
491
+ select: 'DOI,title,author,published-print,published-online,container-title,score,type',
492
+ });
493
+ structuredParams.set('query.bibliographic', title);
494
+ if (author) {
495
+ structuredParams.set('query.author', author);
496
+ }
497
+ if (journal) {
498
+ structuredParams.set('query.container-title', journal);
499
+ }
500
+
501
+ let response = await fetch(`https://api.crossref.org/works?${structuredParams}`, {
502
+ headers: {
503
+ 'User-Agent': 'rev-cli/0.2.0 (mailto:dev@example.com)',
504
+ },
505
+ });
506
+
507
+ if (response.ok) {
508
+ const data = await response.json();
509
+ items = data.message?.items || [];
510
+ }
511
+
512
+ // If structured query found few results, also try query.title (often better for exact matches)
513
+ if (items.length < 5) {
514
+ const titleParams = new URLSearchParams({
515
+ rows: '10',
516
+ select: 'DOI,title,author,published-print,published-online,container-title,score,type',
517
+ });
518
+ titleParams.set('query.title', title);
519
+
520
+ const response2 = await fetch(`https://api.crossref.org/works?${titleParams}`, {
521
+ headers: {
522
+ 'User-Agent': 'rev-cli/0.2.0 (mailto:dev@example.com)',
523
+ },
524
+ });
525
+
526
+ if (response2.ok) {
527
+ const data = await response2.json();
528
+ const newItems = data.message?.items || [];
529
+ // Merge results, avoiding duplicates
530
+ const existingDois = new Set(items.map(i => i.DOI));
531
+ for (const item of newItems) {
532
+ if (!existingDois.has(item.DOI)) {
533
+ items.push(item);
534
+ }
535
+ }
536
+ }
537
+ }
538
+
539
+ // If still nothing, try basic query (most lenient)
540
+ if (items.length === 0) {
541
+ const basicParams = new URLSearchParams({
542
+ query: query,
543
+ rows: '10',
544
+ select: 'DOI,title,author,published-print,published-online,container-title,score,type',
545
+ });
546
+
547
+ response = await fetch(`https://api.crossref.org/works?${basicParams}`, {
548
+ headers: {
549
+ 'User-Agent': 'rev-cli/0.2.0 (mailto:dev@example.com)',
550
+ },
551
+ });
552
+
553
+ if (response.ok) {
554
+ const data = await response.json();
555
+ items = data.message?.items || [];
556
+ }
557
+ }
558
+
559
+ // Also search DataCite for Zenodo/institutional repos
560
+ if (likelyZenodo || items.length === 0) {
561
+ const dataciteItems = await searchDataCite(title, author, year);
562
+ items = [...items, ...dataciteItems];
563
+ }
564
+
565
+ if (items.length === 0) {
566
+ return { found: false, error: 'No results found' };
567
+ }
568
+
569
+ const normalizedSearchTitle = normalizeText(title);
570
+ const normalizedJournal = normalizeText(journal);
571
+
572
+ // Score the results
573
+ const scored = items.map(item => {
574
+ let score = 0;
575
+ const itemTitle = item.title?.[0] || '';
576
+ const itemJournal = item['container-title']?.[0] || '';
577
+ const normalizedItemTitle = normalizeText(itemTitle);
578
+ const normalizedItemJournal = normalizeText(itemJournal);
579
+
580
+ // === PENALTY: Supplement/figure/review DOIs ===
581
+ if (isSupplementOrReview(item.DOI, itemTitle, itemJournal)) {
582
+ score -= 100; // Heavy penalty - almost never want these
583
+ }
584
+
585
+ // === Title similarity (most important) ===
586
+ if (normalizedItemTitle === normalizedSearchTitle) {
587
+ score += 100; // Exact match
588
+ } else if (normalizedItemTitle.includes(normalizedSearchTitle) ||
589
+ normalizedSearchTitle.includes(normalizedItemTitle)) {
590
+ score += 50;
591
+ } else {
592
+ // Check word overlap
593
+ const searchWords = normalizedSearchTitle.split(/\s+/).filter(w => w.length > 3);
594
+ const itemWords = normalizedItemTitle.split(/\s+/).filter(w => w.length > 3);
595
+ const overlap = searchWords.filter(w =>
596
+ itemWords.some(iw => iw.includes(w) || w.includes(iw))
597
+ );
598
+ score += (overlap.length / Math.max(searchWords.length, 1)) * 40;
599
+ }
600
+
601
+ // === Author match ===
602
+ if (author && item.author) {
603
+ const authorLower = author.toLowerCase();
604
+ const hasAuthor = item.author.some(a =>
605
+ (a.family || '').toLowerCase().includes(authorLower) ||
606
+ authorLower.includes((a.family || '').toLowerCase())
607
+ );
608
+ if (hasAuthor) score += 30;
609
+ }
610
+
611
+ // === Journal match ===
612
+ if (normalizedJournal && normalizedItemJournal) {
613
+ // Check for journal name match (handles abbreviations)
614
+ const journalWords = normalizedJournal.split(/\s+/).filter(w => w.length > 2);
615
+ const itemJournalWords = normalizedItemJournal.split(/\s+/).filter(w => w.length > 2);
616
+
617
+ // Count matching words
618
+ const journalOverlap = journalWords.filter(w =>
619
+ itemJournalWords.some(iw => iw.includes(w) || w.includes(iw))
620
+ );
621
+
622
+ if (journalOverlap.length >= Math.min(2, journalWords.length)) {
623
+ score += 40; // Good journal match
624
+ } else if (journalOverlap.length >= 1) {
625
+ score += 15; // Partial match
626
+ }
627
+
628
+ // Bonus for exact journal match
629
+ if (normalizedItemJournal === normalizedJournal) {
630
+ score += 20;
631
+ }
632
+ }
633
+
634
+ // === Year match - CRITICAL for accuracy ===
635
+ const itemYear = item['published-print']?.['date-parts']?.[0]?.[0] ||
636
+ item['published-online']?.['date-parts']?.[0]?.[0];
637
+ if (year && itemYear) {
638
+ if (itemYear === year) {
639
+ score += 50; // Exact match - required for high confidence
640
+ } else if (Math.abs(itemYear - year) === 1) {
641
+ score += 20; // Off by one (common for online-first)
642
+ } else {
643
+ score -= 50; // Wrong year = likely wrong paper
644
+ }
645
+ } else if (year && !itemYear) {
646
+ score -= 10; // Can't verify year
647
+ }
648
+
649
+ // Crossref's own relevance score (capped)
650
+ score += Math.min(item.score || 0, 10);
651
+
652
+ return {
653
+ doi: item.DOI,
654
+ title: itemTitle,
655
+ authors: item.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
656
+ year: itemYear,
657
+ journal: itemJournal,
658
+ score,
659
+ crossrefScore: item.score,
660
+ isSupplement: isSupplementOrReview(item.DOI, itemTitle, itemJournal),
661
+ };
662
+ });
663
+
664
+ // Sort by our score
665
+ scored.sort((a, b) => b.score - a.score);
666
+
667
+ // Filter out supplements for the "best" pick (but keep in alternatives)
668
+ const mainPapers = scored.filter(s => !s.isSupplement);
669
+ const best = mainPapers.length > 0 ? mainPapers[0] : scored[0];
670
+
671
+ // Confidence thresholds
672
+ let confidence = 'low';
673
+ if (best.score >= 120) confidence = 'high';
674
+ else if (best.score >= 70) confidence = 'medium';
675
+
676
+ // === Fallback: try DataCite when Crossref confidence is low ===
677
+ if (confidence === 'low' && !likelyZenodo) {
678
+ const dataciteItems = await searchDataCite(title, author, year);
679
+ if (dataciteItems.length > 0) {
680
+ // Score DataCite results with same logic
681
+ for (const dcItem of dataciteItems) {
682
+ const dcTitle = dcItem.title?.[0] || '';
683
+ const normalizedDcTitle = normalizeText(dcTitle);
684
+ let dcScore = 0;
685
+
686
+ // Title match
687
+ if (normalizedDcTitle === normalizedSearchTitle) {
688
+ dcScore += 100;
689
+ } else if (normalizedDcTitle.includes(normalizedSearchTitle) ||
690
+ normalizedSearchTitle.includes(normalizedDcTitle)) {
691
+ dcScore += 50;
692
+ }
693
+
694
+ // Year match
695
+ const dcYear = dcItem['published-print']?.['date-parts']?.[0]?.[0];
696
+ if (year && dcYear && dcYear === year) {
697
+ dcScore += 50;
698
+ }
699
+
700
+ if (dcScore > best.score) {
701
+ return {
702
+ found: true,
703
+ doi: dcItem.DOI,
704
+ confidence: dcScore >= 120 ? 'high' : dcScore >= 70 ? 'medium' : 'low',
705
+ score: dcScore,
706
+ metadata: {
707
+ title: dcTitle,
708
+ authors: dcItem.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
709
+ year: dcYear,
710
+ journal: dcItem['container-title']?.[0] || '',
711
+ },
712
+ alternatives: scored.slice(0, 2),
713
+ source: 'datacite',
714
+ };
715
+ }
716
+ }
717
+ }
718
+ }
719
+
720
+ return {
721
+ found: true,
722
+ doi: best.doi,
723
+ confidence,
724
+ score: best.score,
725
+ metadata: {
726
+ title: best.title,
727
+ authors: best.authors,
728
+ year: best.year,
729
+ journal: best.journal,
730
+ },
731
+ alternatives: scored.filter(s => s.doi !== best.doi).slice(0, 3),
732
+ };
733
+ } catch (err) {
734
+ return { found: false, error: err.message };
735
+ }
736
+ }
737
+
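A usage sketch for the lookup above; the title, author, year, and journal are made up, and the import specifier is an assumption.

import { lookupDoi } from './lib/doi.js';

const hit = await lookupDoi(
  'A hypothetical study of reef fish recruitment',  // title (made up)
  'Doe',                                            // first author's last name
  2018,
  'Coral Reefs'
);
if (hit.found && hit.confidence !== 'low') {
  console.log(`Best match: ${hit.doi} (${hit.confidence}, score ${hit.score})`);
} else if (hit.found) {
  console.log('Low-confidence match, review alternatives:', hit.alternatives);
} else {
  console.log(`No DOI found: ${hit.error}`);
}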
738
+ /**
739
+ * Look up DOIs for all entries missing them in a .bib file
740
+ * @param {string} bibPath
741
+ * @param {object} options
742
+ * @returns {Promise<Array<{key: string, title: string, type: string, journal: string, result: object}>>}
743
+ */
744
+ export async function lookupMissingDois(bibPath, options = {}) {
745
+ const { parallel = 3, onProgress } = options;
746
+
747
+ const entries = parseBibEntries(bibPath);
748
+ const missing = entries.filter(e =>
749
+ !e.doi &&
750
+ !e.skip &&
751
+ !NO_DOI_TYPES.has(e.type)
752
+ );
753
+
754
+ const results = [];
755
+
756
+ for (let i = 0; i < missing.length; i += parallel) {
757
+ const batch = missing.slice(i, i + parallel);
758
+
759
+ const batchResults = await Promise.all(
760
+ batch.map(async (entry) => {
761
+ // Extract first author's last name from the entry
762
+ // This is tricky because BibTeX author format varies
763
+ let author = '';
764
+ if (entry.authorRaw) {
765
+ // First author's last name: "Last, First" gives the part before the comma; "First Last" gives the last word
767
+ const firstAuthor = entry.authorRaw.split(' and ')[0].trim();
768
+ const nameParts = firstAuthor.includes(',') ? firstAuthor.split(',') : firstAuthor.split(/\s+/).reverse();
769
+ author = nameParts[0]?.trim() || '';
769
+ }
770
+
771
+ const result = await lookupDoi(entry.title, author, entry.year, entry.journal);
772
+
773
+ return {
774
+ key: entry.key,
775
+ title: entry.title,
776
+ type: entry.type,
777
+ journal: entry.journal,
778
+ result,
779
+ };
780
+ })
781
+ );
782
+
783
+ results.push(...batchResults);
784
+
785
+ if (onProgress) {
786
+ onProgress(Math.min(i + parallel, missing.length), missing.length);
787
+ }
788
+
789
+ // Rate limiting
790
+ if (i + parallel < missing.length) {
791
+ await new Promise(r => setTimeout(r, 300));
792
+ }
793
+ }
794
+
795
+ return results;
796
+ }
797
+
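A sketch of batch lookup with a progress callback; the refs.bib path is an assumption.

import { lookupMissingDois } from './lib/doi.js';

const lookups = await lookupMissingDois('refs.bib', {
  parallel: 3,
  onProgress: (done, total) => console.log(`looked up ${done}/${total}`),
});
for (const { key, result } of lookups) {
  if (result.found) console.log(`${key}: ${result.doi} [${result.confidence}]`);
  else console.log(`${key}: not found (${result.error})`);
}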
798
+ /**
799
+ * Add a BibTeX entry to a .bib file
800
+ * @param {string} bibPath
801
+ * @param {string} bibtex
802
+ * @returns {{success: boolean, key?: string, error?: string}}
803
+ */
804
+ export function addToBib(bibPath, bibtex) {
805
+ // Extract key from BibTeX
806
+ const keyMatch = bibtex.match(/@\w+\s*\{\s*([^,\s]+)/);
807
+ if (!keyMatch) {
808
+ return { success: false, error: 'Could not extract citation key from BibTeX' };
809
+ }
810
+ const key = keyMatch[1];
811
+
812
+ // Check if key already exists
813
+ const existing = fs.existsSync(bibPath) ? fs.readFileSync(bibPath, 'utf-8') : '';
814
+ if (existing.includes(`{${key},`) || existing.includes(`{${key}\n`)) {
815
+ return { success: false, error: `Key "${key}" already exists in ${bibPath}` };
816
+ }
817
+
818
+ // Append to file
819
+ const newContent = existing.trim() + '\n\n' + bibtex + '\n';
820
+ fs.writeFileSync(bibPath, newContent, 'utf-8');
821
+
822
+ return { success: true, key };
823
+ }
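Finally, a sketch combining fetchBibtex and addToBib; the DOI and the refs.bib path are placeholders.

import { fetchBibtex, addToBib } from './lib/doi.js';

const fetched = await fetchBibtex('10.1000/xyz123');   // placeholder DOI
if (fetched.success) {
  const added = addToBib('refs.bib', fetched.bibtex);
  console.log(added.success ? `Added entry "${added.key}"` : added.error);
}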