docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
package/lib/doi.ts CHANGED
@@ -1,897 +1,897 @@
1
- /**
2
- * DOI validation and fetching utilities
3
- * Check DOIs in .bib files, fetch BibTeX from DOIs
4
- */
5
-
6
- import * as fs from 'fs';
7
- import type { BibEntry, DoiCheckResult, BibtexFetchResult, DoiLookupResult, BibCheckResult } from './types.js';
8
- import { crossrefLimiter, dataciteLimiter, doiOrgLimiter } from './rate-limiter.js';
9
- import { getCachedDoi, cacheDoi } from './doi-cache.js';
10
-
11
- // Entry types that typically don't have DOIs
12
- const NO_DOI_TYPES = new Set([
13
- 'book', // Books often don't have DOIs (chapters might)
14
- 'inbook', // Book chapters - variable
15
- 'thesis', // Theses rarely have DOIs
16
- 'mastersthesis',
17
- 'phdthesis',
18
- 'misc', // Catch-all, often no DOI
19
- 'unpublished', // By definition
20
- 'manual', // Software manuals
21
- 'techreport', // Some do, many don't
22
- 'booklet',
23
- ]);
24
-
25
- // Entry types that should have DOIs
26
- const EXPECT_DOI_TYPES = new Set([
27
- 'article', // Journal articles should have DOIs
28
- 'inproceedings', // Conference papers usually do
29
- 'proceedings',
30
- 'incollection', // Book chapters in collections
31
- ]);
32
-
33
- /**
34
- * Parse .bib file and extract entries with DOI info
35
- */
36
- export function parseBibEntries(bibPath: string): BibEntry[] {
37
- if (!fs.existsSync(bibPath)) {
38
- return [];
39
- }
40
-
41
- const content = fs.readFileSync(bibPath, 'utf-8');
42
- const entries: BibEntry[] = [];
43
- const lines = content.split('\n');
44
-
45
- // Pattern for bib entries: @type{key,
46
- const entryPattern = /@(\w+)\s*\{\s*([^,\s]+)\s*,/g;
47
-
48
- let match: RegExpExecArray | null;
49
- while ((match = entryPattern.exec(content)) !== null) {
50
- const type = match[1]!.toLowerCase();
51
- const key = match[2]!;
52
- const startPos = match.index;
53
-
54
- // Find the line number
55
- let line = 1;
56
- for (let i = 0; i < startPos; i++) {
57
- if (content[i] === '\n') line++;
58
- }
59
-
60
- // Find the end of this entry (matching closing brace)
61
- let braceCount = 0;
62
- let entryEnd = startPos;
63
- let inEntry = false;
64
-
65
- for (let i = startPos; i < content.length; i++) {
66
- if (content[i] === '{') {
67
- braceCount++;
68
- inEntry = true;
69
- } else if (content[i] === '}') {
70
- braceCount--;
71
- if (inEntry && braceCount === 0) {
72
- entryEnd = i + 1;
73
- break;
74
- }
75
- }
76
- }
77
-
78
- const entryContent = content.slice(startPos, entryEnd);
79
-
80
- // Extract DOI field
81
- const doiMatch = entryContent.match(/\bdoi\s*=\s*[{"]([^}"]+)[}"]/i);
82
- let doi = doiMatch ? doiMatch[1]!.trim() : null;
83
-
84
- // Clean DOI - remove URL prefix if present
85
- if (doi) {
86
- doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
87
- }
88
-
89
- // Extract title for display
90
- const titleMatch = entryContent.match(/\btitle\s*=\s*[{"]([^}"]+)[}"]/i);
91
- const title = titleMatch ? titleMatch[1]!.trim().slice(0, 60) : '';
92
-
93
- // Extract author for lookup
94
- const authorMatch = entryContent.match(/\bauthor\s*=\s*[{"]([^}"]+)[}"]/i);
95
- const authorRaw = authorMatch ? authorMatch[1]!.trim() : '';
96
-
97
- // Extract year
98
- const yearMatch = entryContent.match(/\byear\s*=\s*[{"]?(\d{4})[}""]?/i);
99
- const year = yearMatch ? parseInt(yearMatch[1]!) : null;
100
-
101
- // Extract journal
102
- const journalMatch = entryContent.match(/\bjournal\s*=\s*[{"]([^}"]+)[}"]/i);
103
- const journal = journalMatch ? journalMatch[1]!.trim() : '';
104
-
105
- // Check for skip marker: nodoi = {true} or nodoi = true
106
- const skipMatch = entryContent.match(/\bnodoi\s*=\s*[{"]?(true|yes|1)[}""]?/i);
107
- const skip = !!skipMatch;
108
-
109
- // Check for comment marker immediately before entry: % no-doi
110
- // Only look at the text between the last entry end (or start) and this entry
111
- const linesBefore = content.slice(Math.max(0, startPos - 200), startPos);
112
- // Find the last closing brace or start of file to avoid matching comments for previous entries
113
- const lastEntryEnd = linesBefore.lastIndexOf('}');
114
- const relevantBefore = lastEntryEnd >= 0 ? linesBefore.slice(lastEntryEnd + 1) : linesBefore;
115
- const commentSkip = /% *no-?doi/i.test(relevantBefore);
116
-
117
- entries.push({
118
- key,
119
- type,
120
- doi: doi || null,
121
- title,
122
- authorRaw,
123
- year,
124
- journal,
125
- skip: skip || commentSkip,
126
- expectDoi: EXPECT_DOI_TYPES.has(type),
127
- noDoi: NO_DOI_TYPES.has(type),
128
- line,
129
- });
130
- }
131
-
132
- return entries;
133
- }
134
-
135
- /**
136
- * Validate DOI format
137
- */
138
- export function isValidDoiFormat(doi: string): boolean {
139
- if (!doi) return false;
140
- // DOI format: 10.prefix/suffix
141
- // Prefix is 4+ digits, suffix can contain most characters
142
- return /^10\.\d{4,}\/[^\s]+$/.test(doi);
143
- }
144
-
145
- /**
146
- * Check if DOI resolves via DataCite (for Zenodo, Figshare, etc.)
147
- */
148
- async function checkDoiDataCite(doi: string): Promise<DoiCheckResult> {
149
- try {
150
- const response = await dataciteLimiter.fetchWithRetry(
151
- `https://api.datacite.org/dois/${encodeURIComponent(doi)}`,
152
- {
153
- headers: {
154
- 'Accept': 'application/vnd.api+json',
155
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
156
- },
157
- }
158
- );
159
-
160
- if (response.status === 404) {
161
- return { valid: false, error: 'DOI not found in DataCite' };
162
- }
163
-
164
- if (!response.ok) {
165
- return { valid: false, error: `HTTP ${response.status}` };
166
- }
167
-
168
- const data = await response.json() as any;
169
- const attrs = data.data?.attributes;
170
-
171
- if (!attrs) {
172
- return { valid: false, error: 'Invalid DataCite response' };
173
- }
174
-
175
- return {
176
- valid: true,
177
- source: 'datacite',
178
- metadata: {
179
- title: attrs.titles?.[0]?.title || '',
180
- authors: attrs.creators?.map((c: any) => `${c.givenName || ''} ${c.familyName || ''}`.trim()) || [],
181
- year: attrs.publicationYear,
182
- journal: attrs.publisher || '',
183
- type: attrs.types?.resourceTypeGeneral || '',
184
- },
185
- };
186
- } catch (err) {
187
- return { valid: false, error: (err as Error).message };
188
- }
189
- }
190
-
191
- interface CheckDoiOptions {
192
- skipCache?: boolean;
193
- }
194
-
195
- /**
196
- * Check if DOI resolves (exists) - tries Crossref first, then DataCite
197
- * Results are cached for 7 days to reduce API calls.
198
- */
199
- export async function checkDoi(doi: string, options: CheckDoiOptions = {}): Promise<DoiCheckResult & { cached?: boolean }> {
200
- if (!isValidDoiFormat(doi)) {
201
- return { valid: false, error: 'Invalid DOI format' };
202
- }
203
-
204
- // Check cache first (unless skipped)
205
- if (!options.skipCache) {
206
- const cached = getCachedDoi(doi);
207
- if (cached) {
208
- return { ...cached, cached: true } as DoiCheckResult & { cached?: boolean };
209
- }
210
- }
211
-
212
- // Zenodo DOIs start with 10.5281 - check DataCite first
213
- const isZenodo = doi.startsWith('10.5281/');
214
- const isFigshare = doi.startsWith('10.6084/');
215
- const isDataCiteLikely = isZenodo || isFigshare;
216
-
217
- if (isDataCiteLikely) {
218
- const dataciteResult = await checkDoiDataCite(doi);
219
- if (dataciteResult.valid) {
220
- cacheDoi(doi, dataciteResult);
221
- return dataciteResult;
222
- }
223
- }
224
-
225
- try {
226
- // Use Crossref API to check DOI
227
- const response = await crossrefLimiter.fetchWithRetry(
228
- `https://api.crossref.org/works/${encodeURIComponent(doi)}`,
229
- {
230
- headers: {
231
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
232
- },
233
- }
234
- );
235
-
236
- if (response.status === 404) {
237
- // Try DataCite as fallback (if not already tried)
238
- if (!isDataCiteLikely) {
239
- const dataciteResult = await checkDoiDataCite(doi);
240
- if (dataciteResult.valid) {
241
- cacheDoi(doi, dataciteResult);
242
- return dataciteResult;
243
- }
244
- }
245
- const result = { valid: false, error: 'DOI not found' };
246
- cacheDoi(doi, result);
247
- return result;
248
- }
249
-
250
- if (!response.ok) {
251
- // Don't cache transient errors
252
- return { valid: false, error: `HTTP ${response.status}` };
253
- }
254
-
255
- const data = await response.json() as any;
256
- const work = data.message;
257
-
258
- const result: DoiCheckResult = {
259
- valid: true,
260
- source: 'crossref',
261
- metadata: {
262
- title: work.title?.[0] || '',
263
- authors: work.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
264
- year: work.published?.['date-parts']?.[0]?.[0] || work.created?.['date-parts']?.[0]?.[0],
265
- journal: work['container-title']?.[0] || '',
266
- type: work.type,
267
- },
268
- };
269
-
270
- cacheDoi(doi, result);
271
- return result;
272
- } catch (err) {
273
- // Don't cache network errors
274
- return { valid: false, error: (err as Error).message };
275
- }
276
- }
277
-
278
- /**
279
- * Fetch BibTeX from DOI using content negotiation
280
- */
281
- export async function fetchBibtex(doi: string): Promise<BibtexFetchResult> {
282
- // Clean DOI
283
- doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
284
-
285
- if (!isValidDoiFormat(doi)) {
286
- return { success: false, error: 'Invalid DOI format' };
287
- }
288
-
289
- try {
290
- const response = await doiOrgLimiter.fetchWithRetry(
291
- `https://doi.org/${encodeURIComponent(doi)}`,
292
- {
293
- headers: {
294
- 'Accept': 'application/x-bibtex',
295
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
296
- },
297
- redirect: 'follow',
298
- }
299
- );
300
-
301
- if (!response.ok) {
302
- return { success: false, error: `HTTP ${response.status}` };
303
- }
304
-
305
- const bibtex = await response.text();
306
-
307
- if (!bibtex.includes('@')) {
308
- return { success: false, error: 'Invalid BibTeX response' };
309
- }
310
-
311
- return { success: true, bibtex: bibtex.trim() };
312
- } catch (err) {
313
- return { success: false, error: (err as Error).message };
314
- }
315
- }
316
-
317
- interface CheckBibDoisOptions {
318
- checkMissing?: boolean;
319
- parallel?: number;
320
- }
321
-
322
- /**
323
- * Check all DOIs in a .bib file
324
- */
325
- export async function checkBibDois(bibPath: string, options: CheckBibDoisOptions = {}): Promise<BibCheckResult> {
326
- const { checkMissing = false, parallel = 5 } = options;
327
-
328
- const entries = parseBibEntries(bibPath);
329
- const results: Array<BibEntry & { status: string; message?: string; metadata?: object }> = [];
330
-
331
- let valid = 0;
332
- let invalid = 0;
333
- let missing = 0;
334
- let skipped = 0;
335
-
336
- // Process in batches to avoid rate limiting
337
- for (let i = 0; i < entries.length; i += parallel) {
338
- const batch = entries.slice(i, i + parallel);
339
-
340
- const batchResults = await Promise.all(
341
- batch.map(async (entry) => {
342
- // Skip if marked
343
- if (entry.skip) {
344
- skipped++;
345
- return { ...entry, status: 'skipped', message: 'Marked as no-doi' };
346
- }
347
-
348
- // No DOI field
349
- if (!entry.doi) {
350
- if (entry.noDoi) {
351
- // Expected - books, theses, etc.
352
- skipped++;
353
- return { ...entry, status: 'skipped', message: `${entry.type} typically has no DOI` };
354
- } else if (entry.expectDoi) {
355
- // Should have DOI but doesn't
356
- missing++;
357
- return { ...entry, status: 'missing', message: 'Expected DOI for article/proceedings' };
358
- } else {
359
- skipped++;
360
- return { ...entry, status: 'skipped', message: 'No DOI field' };
361
- }
362
- }
363
-
364
- // Validate DOI format first
365
- if (!isValidDoiFormat(entry.doi)) {
366
- invalid++;
367
- return { ...entry, status: 'invalid', message: 'Invalid DOI format' };
368
- }
369
-
370
- // Check if DOI resolves
371
- const check = await checkDoi(entry.doi);
372
- if (check.valid) {
373
- valid++;
374
- return { ...entry, status: 'valid', metadata: check.metadata };
375
- } else {
376
- invalid++;
377
- return { ...entry, status: 'invalid', message: check.error };
378
- }
379
- })
380
- );
381
-
382
- results.push(...batchResults);
383
-
384
- // Small delay between batches to be nice to the API
385
- if (i + parallel < entries.length) {
386
- await new Promise(r => setTimeout(r, 200));
387
- }
388
- }
389
-
390
- return { entries: results, valid, invalid, missing, skipped };
391
- }
392
-
393
- interface DataCiteItem {
394
- id: string;
395
- attributes: {
396
- titles?: Array<{ title: string }>;
397
- creators?: Array<{ givenName?: string; familyName?: string }>;
398
- publicationYear: number;
399
- publisher?: string;
400
- };
401
- }
402
-
403
- /**
404
- * Search DataCite API (for Zenodo, Figshare, etc.)
405
- */
406
- async function searchDataCite(title: string, author: string = '', year: number | null = null): Promise<any[]> {
407
- try {
408
- // DataCite query syntax
409
- let query = `titles.title:${title.replace(/[{}]/g, '')}`;
410
- if (author) {
411
- query += ` AND creators.name:${author}`;
412
- }
413
- if (year) {
414
- query += ` AND publicationYear:${year}`;
415
- }
416
-
417
- const params = new URLSearchParams({
418
- query: query,
419
- 'page[size]': '5',
420
- });
421
-
422
- const response = await dataciteLimiter.fetchWithRetry(
423
- `https://api.datacite.org/dois?${params}`,
424
- {
425
- headers: {
426
- 'Accept': 'application/vnd.api+json',
427
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
428
- },
429
- }
430
- );
431
-
432
- if (!response.ok) return [];
433
-
434
- const data = await response.json() as { data?: DataCiteItem[] };
435
- const items = data.data || [];
436
-
437
- return items.map(item => {
438
- const attrs = item.attributes;
439
- return {
440
- DOI: item.id,
441
- title: [attrs.titles?.[0]?.title || ''],
442
- author: attrs.creators?.map(c => ({ family: c.familyName, given: c.givenName })) || [],
443
- 'published-print': { 'date-parts': [[attrs.publicationYear]] },
444
- 'container-title': [attrs.publisher || ''],
445
- score: 50, // Base score for DataCite results
446
- source: 'datacite',
447
- };
448
- });
449
- } catch {
450
- return [];
451
- }
452
- }
453
-
454
- /**
455
- * Normalize text for comparison (lowercase, remove special chars)
456
- */
457
- function normalizeForMatching(text: string): string {
458
- return (text || '')
459
- .toLowerCase()
460
- .replace(/[{}\\]/g, '') // Remove LaTeX braces
461
- .replace(/[^a-z0-9\s]/g, ' ') // Replace special chars with space
462
- .replace(/\s+/g, ' ')
463
- .trim();
464
- }
465
-
466
- /**
467
- * Check if DOI looks like a supplement, figure, or review (not the main paper)
468
- */
469
- function isSupplementOrReview(doi: string, title: string = '', journal: string = ''): boolean {
470
- const doiLower = (doi || '').toLowerCase();
471
- const titleLower = (title || '').toLowerCase();
472
- const journalLower = (journal || '').toLowerCase();
473
-
474
- // Supplement/figure DOI patterns
475
- if (/\.suppl|\/suppl|\.figure|\/figure|\.s\d+$|_s\d+$/i.test(doiLower)) {
476
- return true;
477
- }
478
-
479
- // F1000/Faculty Opinions (post-publication reviews)
480
- if (/10\.3410\/f\./i.test(doiLower) || /faculty opinions/i.test(journalLower)) {
481
- return true;
482
- }
483
-
484
- // Title suggests it's supplementary material
485
- if (/^supplementary|^supporting information|^appendix/i.test(titleLower)) {
486
- return true;
487
- }
488
-
489
- return false;
490
- }
491
-
492
- interface CrossrefItem {
493
- DOI: string;
494
- title?: string[];
495
- author?: Array<{ given?: string; family?: string }>;
496
- 'published-print'?: { 'date-parts': number[][] };
497
- 'published-online'?: { 'date-parts': number[][] };
498
- 'container-title'?: string[];
499
- score?: number;
500
- type?: string;
501
- }
502
-
503
- /**
504
- * Search for DOI by title and author using Crossref API (+ DataCite fallback)
505
- */
506
- export async function lookupDoi(
507
- title: string,
508
- author: string = '',
509
- year: number | null = null,
510
- journal: string = ''
511
- ): Promise<DoiLookupResult> {
512
- if (!title || title.length < 10) {
513
- return { found: false, error: 'Title too short for reliable search' };
514
- }
515
-
516
- // Check for keywords that suggest Zenodo/DataCite sources
517
- const likelyZenodo = /\b(IPBES|zenodo|assessment report|secretariat)\b/i.test(title);
518
-
519
- try {
520
- // Build query - title is most important, add author and journal if available
521
- let query = title;
522
- if (author) {
523
- query = `${title} ${author}`;
524
- }
525
- // Add journal to query for better matching
526
- if (journal) {
527
- query = `${query} ${journal}`;
528
- }
529
-
530
- let items: CrossrefItem[] = [];
531
-
532
- // Try structured bibliographic query first (more accurate)
533
- const structuredParams = new URLSearchParams({
534
- rows: '10',
535
- select: 'DOI,title,author,published-print,published-online,container-title,score,type',
536
- });
537
- structuredParams.set('query.bibliographic', title);
538
- if (author) {
539
- structuredParams.set('query.author', author);
540
- }
541
- if (journal) {
542
- structuredParams.set('query.container-title', journal);
543
- }
544
-
545
- let response = await crossrefLimiter.fetchWithRetry(
546
- `https://api.crossref.org/works?${structuredParams}`,
547
- {
548
- headers: {
549
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
550
- },
551
- }
552
- );
553
-
554
- if (response.ok) {
555
- const data = await response.json() as { message?: { items?: CrossrefItem[] } };
556
- items = data.message?.items || [];
557
- }
558
-
559
- // If structured query found few results, also try query.title (often better for exact matches)
560
- if (items.length < 5) {
561
- const titleParams = new URLSearchParams({
562
- rows: '10',
563
- select: 'DOI,title,author,published-print,published-online,container-title,score,type',
564
- });
565
- titleParams.set('query.title', title);
566
-
567
- const response2 = await crossrefLimiter.fetchWithRetry(
568
- `https://api.crossref.org/works?${titleParams}`,
569
- {
570
- headers: {
571
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
572
- },
573
- }
574
- );
575
-
576
- if (response2.ok) {
577
- const data = await response2.json() as { message?: { items?: CrossrefItem[] } };
578
- const newItems = data.message?.items || [];
579
- // Merge results, avoiding duplicates
580
- const existingDois = new Set(items.map(i => i.DOI));
581
- for (const item of newItems) {
582
- if (!existingDois.has(item.DOI)) {
583
- items.push(item);
584
- }
585
- }
586
- }
587
- }
588
-
589
- // If still nothing, try basic query (most lenient)
590
- if (items.length === 0) {
591
- const basicParams = new URLSearchParams({
592
- query: query,
593
- rows: '10',
594
- select: 'DOI,title,author,published-print,published-online,container-title,score,type',
595
- });
596
-
597
- response = await crossrefLimiter.fetchWithRetry(
598
- `https://api.crossref.org/works?${basicParams}`,
599
- {
600
- headers: {
601
- 'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
602
- },
603
- }
604
- );
605
-
606
- if (response.ok) {
607
- const data = await response.json() as { message?: { items?: CrossrefItem[] } };
608
- items = data.message?.items || [];
609
- }
610
- }
611
-
612
- // Also search DataCite for Zenodo/institutional repos
613
- if (likelyZenodo || items.length === 0) {
614
- const dataciteItems = await searchDataCite(title, author, year);
615
- items = [...items, ...dataciteItems];
616
- }
617
-
618
- if (items.length === 0) {
619
- return { found: false, error: 'No results found' };
620
- }
621
-
622
- const normalizedSearchTitle = normalizeForMatching(title);
623
- const normalizedJournal = normalizeForMatching(journal);
624
-
625
- // Score the results
626
- const scored = items.map(item => {
627
- let score = 0;
628
- const itemTitle = item.title?.[0] || '';
629
- const itemJournal = item['container-title']?.[0] || '';
630
- const normalizedItemTitle = normalizeForMatching(itemTitle);
631
- const normalizedItemJournal = normalizeForMatching(itemJournal);
632
-
633
- // === PENALTY: Supplement/figure/review DOIs ===
634
- if (isSupplementOrReview(item.DOI, itemTitle, itemJournal)) {
635
- score -= 100; // Heavy penalty - almost never want these
636
- }
637
-
638
- // === Title similarity (most important) ===
639
- if (normalizedItemTitle === normalizedSearchTitle) {
640
- score += 100; // Exact match
641
- } else if (normalizedItemTitle.includes(normalizedSearchTitle) ||
642
- normalizedSearchTitle.includes(normalizedItemTitle)) {
643
- score += 50;
644
- } else {
645
- // Check word overlap
646
- const searchWords = normalizedSearchTitle.split(/\s+/).filter(w => w.length > 3);
647
- const itemWords = normalizedItemTitle.split(/\s+/).filter(w => w.length > 3);
648
- const overlap = searchWords.filter(w =>
649
- itemWords.some(iw => iw.includes(w) || w.includes(iw))
650
- );
651
- score += (overlap.length / Math.max(searchWords.length, 1)) * 40;
652
- }
653
-
654
- // === Author match ===
655
- if (author && item.author) {
656
- const authorLower = author.toLowerCase();
657
- const hasAuthor = item.author.some(a =>
658
- (a.family || '').toLowerCase().includes(authorLower) ||
659
- authorLower.includes((a.family || '').toLowerCase())
660
- );
661
- if (hasAuthor) score += 30;
662
- }
663
-
664
- // === Journal match (NEW) ===
665
- if (normalizedJournal && normalizedItemJournal) {
666
- // Check for journal name match (handles abbreviations)
667
- const journalWords = normalizedJournal.split(/\s+/).filter(w => w.length > 2);
668
- const itemJournalWords = normalizedItemJournal.split(/\s+/).filter(w => w.length > 2);
669
-
670
- // Count matching words
671
- const journalOverlap = journalWords.filter(w =>
672
- itemJournalWords.some(iw => iw.includes(w) || w.includes(iw))
673
- );
674
-
675
- if (journalOverlap.length >= Math.min(2, journalWords.length)) {
676
- score += 40; // Good journal match
677
- } else if (journalOverlap.length >= 1) {
678
- score += 15; // Partial match
679
- }
680
-
681
- // Bonus for exact journal match
682
- if (normalizedItemJournal === normalizedJournal) {
683
- score += 20;
684
- }
685
- }
686
-
687
- // === Year match - CRITICAL for accuracy ===
688
- const itemYear = item['published-print']?.['date-parts']?.[0]?.[0] ||
689
- item['published-online']?.['date-parts']?.[0]?.[0];
690
- if (year && itemYear) {
691
- if (itemYear === year) {
692
- score += 50; // Exact match - required for high confidence
693
- } else if (Math.abs(itemYear - year) === 1) {
694
- score += 20; // Off by one (common for online-first)
695
- } else {
696
- score -= 50; // Wrong year = likely wrong paper
697
- }
698
- } else if (year && !itemYear) {
699
- score -= 10; // Can't verify year
700
- }
701
-
702
- // Crossref's own relevance score (capped)
703
- score += Math.min(item.score || 0, 10);
704
-
705
- return {
706
- doi: item.DOI,
707
- title: itemTitle,
708
- authors: item.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
709
- year: itemYear,
710
- journal: itemJournal,
711
- score,
712
- crossrefScore: item.score,
713
- isSupplement: isSupplementOrReview(item.DOI, itemTitle, itemJournal),
714
- };
715
- });
716
-
717
- // Sort by our score
718
- scored.sort((a, b) => b.score - a.score);
719
-
720
- // Filter out supplements for the "best" pick (but keep in alternatives)
721
- const mainPapers = scored.filter(s => !s.isSupplement);
722
- const best = mainPapers.length > 0 ? mainPapers[0] : scored[0];
723
-
724
- if (!best) {
725
- return { found: false, error: 'No valid results found' };
726
- }
727
-
728
- // Confidence thresholds
729
- let confidence: 'low' | 'medium' | 'high' = 'low';
730
- if (best.score >= 120) confidence = 'high';
731
- else if (best.score >= 70) confidence = 'medium';
732
-
733
- // === NEW: Try DataCite if Crossref confidence is low ===
734
- if (confidence === 'low' && !likelyZenodo) {
735
- const dataciteItems = await searchDataCite(title, author, year);
736
- if (dataciteItems.length > 0) {
737
- // Score DataCite results with same logic
738
- for (const dcItem of dataciteItems) {
739
- const dcTitle = dcItem.title?.[0] || '';
740
- const normalizedDcTitle = normalizeForMatching(dcTitle);
741
- let dcScore = 0;
742
-
743
- // Title match
744
- if (normalizedDcTitle === normalizedSearchTitle) {
745
- dcScore += 100;
746
- } else if (normalizedDcTitle.includes(normalizedSearchTitle) ||
747
- normalizedSearchTitle.includes(normalizedDcTitle)) {
748
- dcScore += 50;
749
- }
750
-
751
- // Year match
752
- const dcYear = dcItem['published-print']?.['date-parts']?.[0]?.[0];
753
- if (year && dcYear && dcYear === year) {
754
- dcScore += 50;
755
- }
756
-
757
- if (dcScore > best.score) {
758
- return {
759
- found: true,
760
- doi: dcItem.DOI,
761
- confidence: dcScore >= 120 ? 'high' : dcScore >= 70 ? 'medium' : 'low',
762
- score: dcScore,
763
- metadata: {
764
- title: dcTitle,
765
- authors: dcItem.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
766
- year: dcYear,
767
- journal: dcItem['container-title']?.[0] || '',
768
- },
769
- alternatives: scored.slice(0, 2),
770
- };
771
- }
772
- }
773
- }
774
- }
775
-
776
- return {
777
- found: true,
778
- doi: best.doi,
779
- confidence,
780
- score: best.score,
781
- metadata: {
782
- title: best.title,
783
- authors: best.authors,
784
- year: best.year || 0,
785
- journal: best.journal,
786
- },
787
- alternatives: scored.filter(s => s.doi !== best.doi).slice(0, 3),
788
- };
789
- } catch (err) {
790
- return { found: false, error: (err as Error).message };
791
- }
792
- }
793
-
794
- interface LookupMissingDoisOptions {
795
- parallel?: number;
796
- onProgress?: (current: number, total: number) => void;
797
- }
798
-
799
- interface LookupMissingDoiResult {
800
- key: string;
801
- title: string;
802
- type: string;
803
- journal: string;
804
- result: DoiLookupResult;
805
- }
806
-
807
- /**
808
- * Look up DOIs for all entries missing them in a .bib file
809
- */
810
- export async function lookupMissingDois(
811
- bibPath: string,
812
- options: LookupMissingDoisOptions = {}
813
- ): Promise<LookupMissingDoiResult[]> {
814
- const { parallel = 3, onProgress } = options;
815
-
816
- const entries = parseBibEntries(bibPath);
817
- const missing = entries.filter(e =>
818
- !e.doi &&
819
- !e.skip &&
820
- !NO_DOI_TYPES.has(e.type)
821
- );
822
-
823
- const results: LookupMissingDoiResult[] = [];
824
-
825
- for (let i = 0; i < missing.length; i += parallel) {
826
- const batch = missing.slice(i, i + parallel);
827
-
828
- const batchResults = await Promise.all(
829
- batch.map(async (entry) => {
830
- // Extract first author's last name from the entry
831
- // This is tricky because BibTeX author format varies
832
- let author = '';
833
- if (entry.authorRaw) {
834
- // Try to get first author's last name
835
- const firstAuthor = entry.authorRaw.split(' and ')[0];
836
- if (firstAuthor) {
837
- const parts = firstAuthor.split(',');
838
- author = parts[0]?.trim() || '';
839
- }
840
- }
841
-
842
- const result = await lookupDoi(entry.title, author, entry.year, entry.journal);
843
-
844
- return {
845
- key: entry.key,
846
- title: entry.title,
847
- type: entry.type,
848
- journal: entry.journal,
849
- result,
850
- };
851
- })
852
- );
853
-
854
- results.push(...batchResults);
855
-
856
- if (onProgress) {
857
- onProgress(Math.min(i + parallel, missing.length), missing.length);
858
- }
859
-
860
- // Rate limiting
861
- if (i + parallel < missing.length) {
862
- await new Promise(r => setTimeout(r, 300));
863
- }
864
- }
865
-
866
- return results;
867
- }
868
-
869
- interface AddToBibResult {
870
- success: boolean;
871
- key?: string;
872
- error?: string;
873
- }
874
-
875
- /**
876
- * Add a BibTeX entry to a .bib file
877
- */
878
- export function addToBib(bibPath: string, bibtex: string): AddToBibResult {
879
- // Extract key from BibTeX
880
- const keyMatch = bibtex.match(/@\w+\s*\{\s*([^,\s]+)/);
881
- if (!keyMatch) {
882
- return { success: false, error: 'Could not extract citation key from BibTeX' };
883
- }
884
- const key = keyMatch[1];
885
-
886
- // Check if key already exists
887
- const existing = fs.existsSync(bibPath) ? fs.readFileSync(bibPath, 'utf-8') : '';
888
- if (existing.includes(`{${key},`) || existing.includes(`{${key}\n`)) {
889
- return { success: false, error: `Key "${key}" already exists in ${bibPath}` };
890
- }
891
-
892
- // Append to file
893
- const newContent = existing.trim() + '\n\n' + bibtex + '\n';
894
- fs.writeFileSync(bibPath, newContent, 'utf-8');
895
-
896
- return { success: true, key };
897
- }
1
+ /**
2
+ * DOI validation and fetching utilities
3
+ * Check DOIs in .bib files, fetch BibTeX from DOIs
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import type { BibEntry, DoiCheckResult, BibtexFetchResult, DoiLookupResult, BibCheckResult } from './types.js';
8
+ import { crossrefLimiter, dataciteLimiter, doiOrgLimiter } from './rate-limiter.js';
9
+ import { getCachedDoi, cacheDoi } from './doi-cache.js';
10
+
11
+ // Entry types that typically don't have DOIs
12
+ const NO_DOI_TYPES = new Set([
13
+ 'book', // Books often don't have DOIs (chapters might)
14
+ 'inbook', // Book chapters - variable
15
+ 'thesis', // Theses rarely have DOIs
16
+ 'mastersthesis',
17
+ 'phdthesis',
18
+ 'misc', // Catch-all, often no DOI
19
+ 'unpublished', // By definition
20
+ 'manual', // Software manuals
21
+ 'techreport', // Some do, many don't
22
+ 'booklet',
23
+ ]);
24
+
25
+ // Entry types that should have DOIs
26
+ const EXPECT_DOI_TYPES = new Set([
27
+ 'article', // Journal articles should have DOIs
28
+ 'inproceedings', // Conference papers usually do
29
+ 'proceedings',
30
+ 'incollection', // Book chapters in collections
31
+ ]);
32
+
33
+ /**
34
+ * Parse .bib file and extract entries with DOI info
35
+ */
36
+ export function parseBibEntries(bibPath: string): BibEntry[] {
37
+ if (!fs.existsSync(bibPath)) {
38
+ return [];
39
+ }
40
+
41
+ const content = fs.readFileSync(bibPath, 'utf-8');
42
+ const entries: BibEntry[] = [];
43
+ const lines = content.split('\n');
44
+
45
+ // Pattern for bib entries: @type{key,
46
+ const entryPattern = /@(\w+)\s*\{\s*([^,\s]+)\s*,/g;
47
+
48
+ let match: RegExpExecArray | null;
49
+ while ((match = entryPattern.exec(content)) !== null) {
50
+ const type = match[1]!.toLowerCase();
51
+ const key = match[2]!;
52
+ const startPos = match.index;
53
+
54
+ // Find the line number
55
+ let line = 1;
56
+ for (let i = 0; i < startPos; i++) {
57
+ if (content[i] === '\n') line++;
58
+ }
59
+
60
+ // Find the end of this entry (matching closing brace)
61
+ let braceCount = 0;
62
+ let entryEnd = startPos;
63
+ let inEntry = false;
64
+
65
+ for (let i = startPos; i < content.length; i++) {
66
+ if (content[i] === '{') {
67
+ braceCount++;
68
+ inEntry = true;
69
+ } else if (content[i] === '}') {
70
+ braceCount--;
71
+ if (inEntry && braceCount === 0) {
72
+ entryEnd = i + 1;
73
+ break;
74
+ }
75
+ }
76
+ }
77
+
78
+ const entryContent = content.slice(startPos, entryEnd);
79
+
80
+ // Extract DOI field
81
+ const doiMatch = entryContent.match(/\bdoi\s*=\s*[{"]([^}"]+)[}"]/i);
82
+ let doi = doiMatch ? doiMatch[1]!.trim() : null;
83
+
84
+ // Clean DOI - remove URL prefix if present
85
+ if (doi) {
86
+ doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
87
+ }
88
+
89
+ // Extract title for display
90
+ const titleMatch = entryContent.match(/\btitle\s*=\s*[{"]([^}"]+)[}"]/i);
91
+ const title = titleMatch ? titleMatch[1]!.trim().slice(0, 60) : '';
92
+
93
+ // Extract author for lookup
94
+ const authorMatch = entryContent.match(/\bauthor\s*=\s*[{"]([^}"]+)[}"]/i);
95
+ const authorRaw = authorMatch ? authorMatch[1]!.trim() : '';
96
+
97
+ // Extract year
98
+ const yearMatch = entryContent.match(/\byear\s*=\s*[{"]?(\d{4})[}""]?/i);
99
+ const year = yearMatch ? parseInt(yearMatch[1]!) : null;
100
+
101
+ // Extract journal
102
+ const journalMatch = entryContent.match(/\bjournal\s*=\s*[{"]([^}"]+)[}"]/i);
103
+ const journal = journalMatch ? journalMatch[1]!.trim() : '';
104
+
105
+ // Check for skip marker: nodoi = {true} or nodoi = true
106
+ const skipMatch = entryContent.match(/\bnodoi\s*=\s*[{"]?(true|yes|1)[}""]?/i);
107
+ const skip = !!skipMatch;
108
+
109
+ // Check for comment marker immediately before entry: % no-doi
110
+ // Only look at the text between the last entry end (or start) and this entry
111
+ const linesBefore = content.slice(Math.max(0, startPos - 200), startPos);
112
+ // Find the last closing brace or start of file to avoid matching comments for previous entries
113
+ const lastEntryEnd = linesBefore.lastIndexOf('}');
114
+ const relevantBefore = lastEntryEnd >= 0 ? linesBefore.slice(lastEntryEnd + 1) : linesBefore;
115
+ const commentSkip = /% *no-?doi/i.test(relevantBefore);
116
+
117
+ entries.push({
118
+ key,
119
+ type,
120
+ doi: doi || null,
121
+ title,
122
+ authorRaw,
123
+ year,
124
+ journal,
125
+ skip: skip || commentSkip,
126
+ expectDoi: EXPECT_DOI_TYPES.has(type),
127
+ noDoi: NO_DOI_TYPES.has(type),
128
+ line,
129
+ });
130
+ }
131
+
132
+ return entries;
133
+ }
134
+
135
+ /**
136
+ * Validate DOI format
137
+ */
138
+ export function isValidDoiFormat(doi: string): boolean {
139
+ if (!doi) return false;
140
+ // DOI format: 10.prefix/suffix
141
+ // Prefix is 4+ digits, suffix can contain most characters
142
+ return /^10\.\d{4,}\/[^\s]+$/.test(doi);
143
+ }
144
+
145
/**
 * Check whether a DOI resolves via the DataCite REST API
 * (covers Zenodo, Figshare and similar repositories).
 *
 * @param doi - Bare DOI (no doi.org prefix); URL-encoded into the request.
 * @returns { valid: true, source: 'datacite', metadata } on success, or
 *   { valid: false, error } on 404, non-OK HTTP status, a malformed
 *   response body, or any thrown error (network failures never propagate).
 */
async function checkDoiDataCite(doi: string): Promise<DoiCheckResult> {
  try {
    // fetchWithRetry comes from rate-limiter.ts (not shown here);
    // presumably it throttles and retries — confirm semantics there.
    const response = await dataciteLimiter.fetchWithRetry(
      `https://api.datacite.org/dois/${encodeURIComponent(doi)}`,
      {
        headers: {
          'Accept': 'application/vnd.api+json', // JSON:API format used by DataCite
          'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
        },
      }
    );

    if (response.status === 404) {
      return { valid: false, error: 'DOI not found in DataCite' };
    }

    if (!response.ok) {
      return { valid: false, error: `HTTP ${response.status}` };
    }

    const data = await response.json() as any;
    // JSON:API envelope: the record lives under data.attributes
    const attrs = data.data?.attributes;

    if (!attrs) {
      return { valid: false, error: 'Invalid DataCite response' };
    }

    // Map DataCite attributes onto the same metadata shape checkDoi
    // produces for Crossref results ("journal" carries the publisher,
    // since DataCite records datasets/software rather than articles).
    return {
      valid: true,
      source: 'datacite',
      metadata: {
        title: attrs.titles?.[0]?.title || '',
        authors: attrs.creators?.map((c: any) => `${c.givenName || ''} ${c.familyName || ''}`.trim()) || [],
        year: attrs.publicationYear,
        journal: attrs.publisher || '',
        type: attrs.types?.resourceTypeGeneral || '',
      },
    };
  } catch (err) {
    return { valid: false, error: (err as Error).message };
  }
}
190
+
191
interface CheckDoiOptions {
  // When true, bypass the local DOI cache and always hit the network
  skipCache?: boolean;
}

/**
 * Check if DOI resolves (exists) - tries Crossref first, then DataCite.
 * Results are cached (per the cache module, 7 days) to reduce API calls.
 *
 * Resolution order:
 *   1. reject syntactically invalid DOIs without any network call;
 *   2. return a cached result (flagged `cached: true`) unless skipCache;
 *   3. for Zenodo (10.5281/) and Figshare (10.6084/) prefixes, try
 *      DataCite first;
 *   4. query Crossref; on 404, fall back to DataCite (if not tried);
 *   5. cache definitive outcomes (found / not-found) but NOT transient
 *      HTTP or network errors.
 *
 * @param doi - Bare DOI string, e.g. "10.1234/abc".
 * @param options - See CheckDoiOptions.
 * @returns DoiCheckResult, plus `cached: true` when served from cache.
 */
export async function checkDoi(doi: string, options: CheckDoiOptions = {}): Promise<DoiCheckResult & { cached?: boolean }> {
  if (!isValidDoiFormat(doi)) {
    return { valid: false, error: 'Invalid DOI format' };
  }

  // Check cache first (unless skipped)
  if (!options.skipCache) {
    const cached = getCachedDoi(doi);
    if (cached) {
      return { ...cached, cached: true } as DoiCheckResult & { cached?: boolean };
    }
  }

  // Zenodo DOIs start with 10.5281 - check DataCite first
  const isZenodo = doi.startsWith('10.5281/');
  const isFigshare = doi.startsWith('10.6084/');
  const isDataCiteLikely = isZenodo || isFigshare;

  if (isDataCiteLikely) {
    const dataciteResult = await checkDoiDataCite(doi);
    if (dataciteResult.valid) {
      cacheDoi(doi, dataciteResult);
      return dataciteResult;
    }
    // Invalid DataCite result: fall through and let Crossref try anyway
  }

  try {
    // Use Crossref API to check DOI
    const response = await crossrefLimiter.fetchWithRetry(
      `https://api.crossref.org/works/${encodeURIComponent(doi)}`,
      {
        headers: {
          // mailto in the UA puts us in Crossref's "polite" pool
          'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
        },
      }
    );

    if (response.status === 404) {
      // Try DataCite as fallback (if not already tried)
      if (!isDataCiteLikely) {
        const dataciteResult = await checkDoiDataCite(doi);
        if (dataciteResult.valid) {
          cacheDoi(doi, dataciteResult);
          return dataciteResult;
        }
      }
      // Definitive "does not exist" — safe to cache
      const result = { valid: false, error: 'DOI not found' };
      cacheDoi(doi, result);
      return result;
    }

    if (!response.ok) {
      // Don't cache transient errors
      return { valid: false, error: `HTTP ${response.status}` };
    }

    const data = await response.json() as any;
    const work = data.message;

    const result: DoiCheckResult = {
      valid: true,
      source: 'crossref',
      metadata: {
        title: work.title?.[0] || '',
        authors: work.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
        // Prefer the publication year; fall back to the Crossref record
        // creation year when no published date is available
        year: work.published?.['date-parts']?.[0]?.[0] || work.created?.['date-parts']?.[0]?.[0],
        journal: work['container-title']?.[0] || '',
        type: work.type,
      },
    };

    cacheDoi(doi, result);
    return result;
  } catch (err) {
    // Don't cache network errors
    return { valid: false, error: (err as Error).message };
  }
}
277
+
278
/**
 * Fetch BibTeX for a DOI via doi.org content negotiation
 * (Accept: application/x-bibtex).
 *
 * @param doi - DOI, optionally with an http(s)://(dx.)doi.org/ prefix
 *   (stripped before use).
 * @returns { success: true, bibtex } with the trimmed BibTeX text, or
 *   { success: false, error } for an invalid DOI format, non-OK HTTP
 *   status, a response body with no '@' (not BibTeX), or a thrown
 *   network error.
 */
export async function fetchBibtex(doi: string): Promise<BibtexFetchResult> {
  // Clean DOI
  doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');

  if (!isValidDoiFormat(doi)) {
    return { success: false, error: 'Invalid DOI format' };
  }

  try {
    const response = await doiOrgLimiter.fetchWithRetry(
      `https://doi.org/${encodeURIComponent(doi)}`,
      {
        headers: {
          'Accept': 'application/x-bibtex', // ask the resolver for BibTeX
          'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
        },
        // doi.org redirects to the registration agency's metadata service
        redirect: 'follow',
      }
    );

    if (!response.ok) {
      return { success: false, error: `HTTP ${response.status}` };
    }

    const bibtex = await response.text();

    // Cheap sanity check: every BibTeX entry starts with '@'
    if (!bibtex.includes('@')) {
      return { success: false, error: 'Invalid BibTeX response' };
    }

    return { success: true, bibtex: bibtex.trim() };
  } catch (err) {
    return { success: false, error: (err as Error).message };
  }
}
316
+
317
interface CheckBibDoisOptions {
  // NOTE(review): currently destructured but never read in checkBibDois —
  // confirm whether missing-DOI lookup was meant to be gated on this
  checkMissing?: boolean;
  // Number of entries checked concurrently per batch (default 5)
  parallel?: number;
}

/**
 * Check all DOIs in a .bib file.
 *
 * Entries are processed in batches of `parallel` with a 200ms pause
 * between batches to be gentle on the APIs. Each entry is classified as:
 *   - 'skipped': marked no-doi, a type that typically lacks DOIs, or no
 *     DOI field on a type where that is acceptable;
 *   - 'missing': no DOI field on a type that should have one;
 *   - 'invalid': bad DOI syntax, or the DOI does not resolve;
 *   - 'valid': resolves via checkDoi (metadata attached).
 *
 * @param bibPath - Path to the .bib file.
 * @returns Per-entry results plus valid/invalid/missing/skipped counts.
 */
export async function checkBibDois(bibPath: string, options: CheckBibDoisOptions = {}): Promise<BibCheckResult> {
  const { checkMissing = false, parallel = 5 } = options;

  const entries = parseBibEntries(bibPath);
  const results: Array<BibEntry & { status: string; message?: string; metadata?: object }> = [];

  // Counters are mutated from inside the Promise.all callbacks; this is
  // safe in Node's single-threaded model, but keep the increments paired
  // with their returned status.
  let valid = 0;
  let invalid = 0;
  let missing = 0;
  let skipped = 0;

  // Process in batches to avoid rate limiting
  for (let i = 0; i < entries.length; i += parallel) {
    const batch = entries.slice(i, i + parallel);

    const batchResults = await Promise.all(
      batch.map(async (entry) => {
        // Skip if marked
        if (entry.skip) {
          skipped++;
          return { ...entry, status: 'skipped', message: 'Marked as no-doi' };
        }

        // No DOI field
        if (!entry.doi) {
          if (entry.noDoi) {
            // Expected - books, theses, etc.
            skipped++;
            return { ...entry, status: 'skipped', message: `${entry.type} typically has no DOI` };
          } else if (entry.expectDoi) {
            // Should have DOI but doesn't
            missing++;
            return { ...entry, status: 'missing', message: 'Expected DOI for article/proceedings' };
          } else {
            skipped++;
            return { ...entry, status: 'skipped', message: 'No DOI field' };
          }
        }

        // Validate DOI format first (avoids a pointless network call)
        if (!isValidDoiFormat(entry.doi)) {
          invalid++;
          return { ...entry, status: 'invalid', message: 'Invalid DOI format' };
        }

        // Check if DOI resolves
        const check = await checkDoi(entry.doi);
        if (check.valid) {
          valid++;
          return { ...entry, status: 'valid', metadata: check.metadata };
        } else {
          invalid++;
          return { ...entry, status: 'invalid', message: check.error };
        }
      })
    );

    results.push(...batchResults);

    // Small delay between batches to be nice to the API
    if (i + parallel < entries.length) {
      await new Promise(r => setTimeout(r, 200));
    }
  }

  return { entries: results, valid, invalid, missing, skipped };
}
392
+
393
// Shape of a record in DataCite's JSON:API response (only the fields we read)
interface DataCiteItem {
  id: string; // the DOI itself
  attributes: {
    titles?: Array<{ title: string }>;
    creators?: Array<{ givenName?: string; familyName?: string }>;
    publicationYear: number;
    publisher?: string;
  };
}

/**
 * Search the DataCite API (for Zenodo, Figshare, etc.).
 *
 * Results are reshaped to mimic Crossref work items so lookupDoi can
 * score Crossref and DataCite candidates with the same code path.
 *
 * @param title - Title to search; LaTeX braces are stripped from the query.
 * @param author - Optional creator name, ANDed into the query.
 * @param year - Optional publication year, ANDed into the query.
 * @returns Up to 5 Crossref-shaped items (tagged source: 'datacite',
 *   with a flat base score of 50); [] on any HTTP or network failure —
 *   errors are deliberately swallowed since this is a fallback path.
 */
async function searchDataCite(title: string, author: string = '', year: number | null = null): Promise<any[]> {
  try {
    // DataCite query syntax (Lucene-style field:value clauses)
    let query = `titles.title:${title.replace(/[{}]/g, '')}`;
    if (author) {
      query += ` AND creators.name:${author}`;
    }
    if (year) {
      query += ` AND publicationYear:${year}`;
    }

    const params = new URLSearchParams({
      query: query,
      'page[size]': '5',
    });

    const response = await dataciteLimiter.fetchWithRetry(
      `https://api.datacite.org/dois?${params}`,
      {
        headers: {
          'Accept': 'application/vnd.api+json',
          'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
        },
      }
    );

    if (!response.ok) return [];

    const data = await response.json() as { data?: DataCiteItem[] };
    const items = data.data || [];

    // Adapt each JSON:API record to the Crossref item shape used by lookupDoi
    return items.map(item => {
      const attrs = item.attributes;
      return {
        DOI: item.id,
        title: [attrs.titles?.[0]?.title || ''],
        author: attrs.creators?.map(c => ({ family: c.familyName, given: c.givenName })) || [],
        'published-print': { 'date-parts': [[attrs.publicationYear]] },
        'container-title': [attrs.publisher || ''],
        score: 50, // Base score for DataCite results
        source: 'datacite',
      };
    });
  } catch {
    return [];
  }
}
453
+
454
+ /**
455
+ * Normalize text for comparison (lowercase, remove special chars)
456
+ */
457
+ function normalizeForMatching(text: string): string {
458
+ return (text || '')
459
+ .toLowerCase()
460
+ .replace(/[{}\\]/g, '') // Remove LaTeX braces
461
+ .replace(/[^a-z0-9\s]/g, ' ') // Replace special chars with space
462
+ .replace(/\s+/g, ' ')
463
+ .trim();
464
+ }
465
+
466
+ /**
467
+ * Check if DOI looks like a supplement, figure, or review (not the main paper)
468
+ */
469
+ function isSupplementOrReview(doi: string, title: string = '', journal: string = ''): boolean {
470
+ const doiLower = (doi || '').toLowerCase();
471
+ const titleLower = (title || '').toLowerCase();
472
+ const journalLower = (journal || '').toLowerCase();
473
+
474
+ // Supplement/figure DOI patterns
475
+ if (/\.suppl|\/suppl|\.figure|\/figure|\.s\d+$|_s\d+$/i.test(doiLower)) {
476
+ return true;
477
+ }
478
+
479
+ // F1000/Faculty Opinions (post-publication reviews)
480
+ if (/10\.3410\/f\./i.test(doiLower) || /faculty opinions/i.test(journalLower)) {
481
+ return true;
482
+ }
483
+
484
+ // Title suggests it's supplementary material
485
+ if (/^supplementary|^supporting information|^appendix/i.test(titleLower)) {
486
+ return true;
487
+ }
488
+
489
+ return false;
490
+ }
491
+
492
// Subset of a Crossref "work" object that the scoring code reads
interface CrossrefItem {
  DOI: string;
  title?: string[];
  author?: Array<{ given?: string; family?: string }>;
  'published-print'?: { 'date-parts': number[][] };
  'published-online'?: { 'date-parts': number[][] };
  'container-title'?: string[];
  score?: number; // Crossref's own relevance score
  type?: string;
}

/**
 * Search for a DOI by title/author/journal/year using the Crossref API,
 * with DataCite as a fallback for repository-hosted items.
 *
 * Pipeline:
 *   1. structured Crossref query (query.bibliographic/author/container-title);
 *   2. if < 5 hits, merge in a query.title search (deduped by DOI);
 *   3. if still empty, a plain free-text query;
 *   4. DataCite search when the title smells like Zenodo/IPBES material
 *      or nothing was found;
 *   5. score every candidate (title similarity, author, journal, year,
 *      supplement penalty, capped Crossref relevance) and pick the best
 *      non-supplement; if confidence is still low, give DataCite one
 *      more chance to beat the best score.
 *
 * Confidence: score >= 120 'high', >= 70 'medium', else 'low'.
 *
 * @param title - Work title; must be at least 10 chars.
 * @param author - First author's last name (optional).
 * @param year - Expected publication year (optional; strongly weighted).
 * @param journal - Journal/container name (optional).
 * @returns Best match with confidence, score, metadata and up to 3
 *   alternatives; { found: false, error } when nothing usable turns up.
 */
export async function lookupDoi(
  title: string,
  author: string = '',
  year: number | null = null,
  journal: string = ''
): Promise<DoiLookupResult> {
  if (!title || title.length < 10) {
    return { found: false, error: 'Title too short for reliable search' };
  }

  // Check for keywords that suggest Zenodo/DataCite sources
  const likelyZenodo = /\b(IPBES|zenodo|assessment report|secretariat)\b/i.test(title);

  try {
    // Build query - title is most important, add author and journal if available
    let query = title;
    if (author) {
      query = `${title} ${author}`;
    }
    // Add journal to query for better matching
    if (journal) {
      query = `${query} ${journal}`;
    }

    let items: CrossrefItem[] = [];

    // Try structured bibliographic query first (more accurate)
    const structuredParams = new URLSearchParams({
      rows: '10',
      select: 'DOI,title,author,published-print,published-online,container-title,score,type',
    });
    structuredParams.set('query.bibliographic', title);
    if (author) {
      structuredParams.set('query.author', author);
    }
    if (journal) {
      structuredParams.set('query.container-title', journal);
    }

    let response = await crossrefLimiter.fetchWithRetry(
      `https://api.crossref.org/works?${structuredParams}`,
      {
        headers: {
          'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
        },
      }
    );

    if (response.ok) {
      const data = await response.json() as { message?: { items?: CrossrefItem[] } };
      items = data.message?.items || [];
    }

    // If structured query found few results, also try query.title (often better for exact matches)
    if (items.length < 5) {
      const titleParams = new URLSearchParams({
        rows: '10',
        select: 'DOI,title,author,published-print,published-online,container-title,score,type',
      });
      titleParams.set('query.title', title);

      const response2 = await crossrefLimiter.fetchWithRetry(
        `https://api.crossref.org/works?${titleParams}`,
        {
          headers: {
            'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
          },
        }
      );

      if (response2.ok) {
        const data = await response2.json() as { message?: { items?: CrossrefItem[] } };
        const newItems = data.message?.items || [];
        // Merge results, avoiding duplicates
        const existingDois = new Set(items.map(i => i.DOI));
        for (const item of newItems) {
          if (!existingDois.has(item.DOI)) {
            items.push(item);
          }
        }
      }
    }

    // If still nothing, try basic query (most lenient)
    if (items.length === 0) {
      const basicParams = new URLSearchParams({
        query: query,
        rows: '10',
        select: 'DOI,title,author,published-print,published-online,container-title,score,type',
      });

      response = await crossrefLimiter.fetchWithRetry(
        `https://api.crossref.org/works?${basicParams}`,
        {
          headers: {
            'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
          },
        }
      );

      if (response.ok) {
        const data = await response.json() as { message?: { items?: CrossrefItem[] } };
        items = data.message?.items || [];
      }
    }

    // Also search DataCite for Zenodo/institutional repos
    if (likelyZenodo || items.length === 0) {
      const dataciteItems = await searchDataCite(title, author, year);
      items = [...items, ...dataciteItems];
    }

    if (items.length === 0) {
      return { found: false, error: 'No results found' };
    }

    const normalizedSearchTitle = normalizeForMatching(title);
    const normalizedJournal = normalizeForMatching(journal);

    // Score the results
    const scored = items.map(item => {
      let score = 0;
      const itemTitle = item.title?.[0] || '';
      const itemJournal = item['container-title']?.[0] || '';
      const normalizedItemTitle = normalizeForMatching(itemTitle);
      const normalizedItemJournal = normalizeForMatching(itemJournal);

      // === PENALTY: Supplement/figure/review DOIs ===
      if (isSupplementOrReview(item.DOI, itemTitle, itemJournal)) {
        score -= 100; // Heavy penalty - almost never want these
      }

      // === Title similarity (most important) ===
      if (normalizedItemTitle === normalizedSearchTitle) {
        score += 100; // Exact match
      } else if (normalizedItemTitle.includes(normalizedSearchTitle) ||
                 normalizedSearchTitle.includes(normalizedItemTitle)) {
        score += 50;
      } else {
        // Check word overlap (only words longer than 3 chars count)
        const searchWords = normalizedSearchTitle.split(/\s+/).filter(w => w.length > 3);
        const itemWords = normalizedItemTitle.split(/\s+/).filter(w => w.length > 3);
        const overlap = searchWords.filter(w =>
          itemWords.some(iw => iw.includes(w) || w.includes(iw))
        );
        // Up to 40 points, proportional to the fraction of words matched
        score += (overlap.length / Math.max(searchWords.length, 1)) * 40;
      }

      // === Author match ===
      if (author && item.author) {
        const authorLower = author.toLowerCase();
        const hasAuthor = item.author.some(a =>
          (a.family || '').toLowerCase().includes(authorLower) ||
          authorLower.includes((a.family || '').toLowerCase())
        );
        if (hasAuthor) score += 30;
      }

      // === Journal match ===
      if (normalizedJournal && normalizedItemJournal) {
        // Check for journal name match (handles abbreviations)
        const journalWords = normalizedJournal.split(/\s+/).filter(w => w.length > 2);
        const itemJournalWords = normalizedItemJournal.split(/\s+/).filter(w => w.length > 2);

        // Count matching words
        const journalOverlap = journalWords.filter(w =>
          itemJournalWords.some(iw => iw.includes(w) || w.includes(iw))
        );

        if (journalOverlap.length >= Math.min(2, journalWords.length)) {
          score += 40; // Good journal match
        } else if (journalOverlap.length >= 1) {
          score += 15; // Partial match
        }

        // Bonus for exact journal match
        if (normalizedItemJournal === normalizedJournal) {
          score += 20;
        }
      }

      // === Year match - CRITICAL for accuracy ===
      const itemYear = item['published-print']?.['date-parts']?.[0]?.[0] ||
                       item['published-online']?.['date-parts']?.[0]?.[0];
      if (year && itemYear) {
        if (itemYear === year) {
          score += 50; // Exact match - required for high confidence
        } else if (Math.abs(itemYear - year) === 1) {
          score += 20; // Off by one (common for online-first)
        } else {
          score -= 50; // Wrong year = likely wrong paper
        }
      } else if (year && !itemYear) {
        score -= 10; // Can't verify year
      }

      // Crossref's own relevance score (capped so it can't dominate)
      score += Math.min(item.score || 0, 10);

      return {
        doi: item.DOI,
        title: itemTitle,
        authors: item.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
        year: itemYear,
        journal: itemJournal,
        score,
        crossrefScore: item.score,
        isSupplement: isSupplementOrReview(item.DOI, itemTitle, itemJournal),
      };
    });

    // Sort by our score
    scored.sort((a, b) => b.score - a.score);

    // Filter out supplements for the "best" pick (but keep in alternatives)
    const mainPapers = scored.filter(s => !s.isSupplement);
    const best = mainPapers.length > 0 ? mainPapers[0] : scored[0];

    if (!best) {
      return { found: false, error: 'No valid results found' };
    }

    // Confidence thresholds
    let confidence: 'low' | 'medium' | 'high' = 'low';
    if (best.score >= 120) confidence = 'high';
    else if (best.score >= 70) confidence = 'medium';

    // === Try DataCite if Crossref confidence is low ===
    // (skipped when likelyZenodo — DataCite was already merged in above)
    if (confidence === 'low' && !likelyZenodo) {
      const dataciteItems = await searchDataCite(title, author, year);
      if (dataciteItems.length > 0) {
        // Score DataCite results with same logic (title + year only)
        for (const dcItem of dataciteItems) {
          const dcTitle = dcItem.title?.[0] || '';
          const normalizedDcTitle = normalizeForMatching(dcTitle);
          let dcScore = 0;

          // Title match
          if (normalizedDcTitle === normalizedSearchTitle) {
            dcScore += 100;
          } else if (normalizedDcTitle.includes(normalizedSearchTitle) ||
                     normalizedSearchTitle.includes(normalizedDcTitle)) {
            dcScore += 50;
          }

          // Year match
          const dcYear = dcItem['published-print']?.['date-parts']?.[0]?.[0];
          if (year && dcYear && dcYear === year) {
            dcScore += 50;
          }

          // First DataCite item that beats the Crossref best wins outright
          if (dcScore > best.score) {
            return {
              found: true,
              doi: dcItem.DOI,
              confidence: dcScore >= 120 ? 'high' : dcScore >= 70 ? 'medium' : 'low',
              score: dcScore,
              metadata: {
                title: dcTitle,
                authors: dcItem.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
                year: dcYear,
                journal: dcItem['container-title']?.[0] || '',
              },
              alternatives: scored.slice(0, 2),
            };
          }
        }
      }
    }

    return {
      found: true,
      doi: best.doi,
      confidence,
      score: best.score,
      metadata: {
        title: best.title,
        authors: best.authors,
        year: best.year || 0,
        journal: best.journal,
      },
      alternatives: scored.filter(s => s.doi !== best.doi).slice(0, 3),
    };
  } catch (err) {
    return { found: false, error: (err as Error).message };
  }
}
793
+
794
interface LookupMissingDoisOptions {
  // Number of concurrent lookups per batch (default 3)
  parallel?: number;
  // Called after each batch with (entries processed so far, total)
  onProgress?: (current: number, total: number) => void;
}

interface LookupMissingDoiResult {
  key: string;
  title: string;
  type: string;
  journal: string;
  result: DoiLookupResult;
}

/**
 * Look up DOIs for all entries missing them in a .bib file.
 *
 * Considers only entries that have no DOI, are not marked skip, and are
 * not of a type in NO_DOI_TYPES. Lookups run in batches of `parallel`
 * with a 300ms pause between batches (rate limiting).
 *
 * @param bibPath - Path to the .bib file.
 * @param options - Concurrency and progress callback.
 * @returns One result per candidate entry, in file order.
 */
export async function lookupMissingDois(
  bibPath: string,
  options: LookupMissingDoisOptions = {}
): Promise<LookupMissingDoiResult[]> {
  const { parallel = 3, onProgress } = options;

  const entries = parseBibEntries(bibPath);
  const missing = entries.filter(e =>
    !e.doi &&
    !e.skip &&
    !NO_DOI_TYPES.has(e.type)
  );

  const results: LookupMissingDoiResult[] = [];

  for (let i = 0; i < missing.length; i += parallel) {
    const batch = missing.slice(i, i + parallel);

    const batchResults = await Promise.all(
      batch.map(async (entry) => {
        // Extract first author's last name from the entry.
        // This is tricky because BibTeX author format varies; this
        // handles "Last, First and Last2, First2" — a "First Last"
        // entry would yield the whole name instead of the surname.
        let author = '';
        if (entry.authorRaw) {
          // Try to get first author's last name
          const firstAuthor = entry.authorRaw.split(' and ')[0];
          if (firstAuthor) {
            const parts = firstAuthor.split(',');
            author = parts[0]?.trim() || '';
          }
        }

        const result = await lookupDoi(entry.title, author, entry.year, entry.journal);

        return {
          key: entry.key,
          title: entry.title,
          type: entry.type,
          journal: entry.journal,
          result,
        };
      })
    );

    results.push(...batchResults);

    if (onProgress) {
      onProgress(Math.min(i + parallel, missing.length), missing.length);
    }

    // Rate limiting
    if (i + parallel < missing.length) {
      await new Promise(r => setTimeout(r, 300));
    }
  }

  return results;
}
868
+
869
+ interface AddToBibResult {
870
+ success: boolean;
871
+ key?: string;
872
+ error?: string;
873
+ }
874
+
875
+ /**
876
+ * Add a BibTeX entry to a .bib file
877
+ */
878
+ export function addToBib(bibPath: string, bibtex: string): AddToBibResult {
879
+ // Extract key from BibTeX
880
+ const keyMatch = bibtex.match(/@\w+\s*\{\s*([^,\s]+)/);
881
+ if (!keyMatch) {
882
+ return { success: false, error: 'Could not extract citation key from BibTeX' };
883
+ }
884
+ const key = keyMatch[1];
885
+
886
+ // Check if key already exists
887
+ const existing = fs.existsSync(bibPath) ? fs.readFileSync(bibPath, 'utf-8') : '';
888
+ if (existing.includes(`{${key},`) || existing.includes(`{${key}\n`)) {
889
+ return { success: false, error: `Key "${key}" already exists in ${bibPath}` };
890
+ }
891
+
892
+ // Append to file
893
+ const newContent = existing.trim() + '\n\n' + bibtex + '\n';
894
+ fs.writeFileSync(bibPath, newContent, 'utf-8');
895
+
896
+ return { success: true, key };
897
+ }