docrev 0.9.11 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -9
- package/.gitattributes +1 -1
- package/CHANGELOG.md +149 -149
- package/PLAN-tables-and-postprocess.md +850 -850
- package/README.md +391 -391
- package/bin/rev.js +11 -11
- package/bin/rev.ts +145 -145
- package/completions/rev.bash +127 -127
- package/completions/rev.ps1 +210 -210
- package/completions/rev.zsh +207 -207
- package/dev_notes/stress2/build_adversarial.ts +186 -186
- package/dev_notes/stress2/drift_matcher.ts +62 -62
- package/dev_notes/stress2/probe_anchors.ts +35 -35
- package/dev_notes/stress2/project/discussion.before.md +3 -3
- package/dev_notes/stress2/project/discussion.md +3 -3
- package/dev_notes/stress2/project/methods.before.md +20 -20
- package/dev_notes/stress2/project/methods.md +20 -20
- package/dev_notes/stress2/project/rev.yaml +5 -5
- package/dev_notes/stress2/project/sections.yaml +4 -4
- package/dev_notes/stress2/sections.yaml +5 -5
- package/dev_notes/stress2/trace_placement.ts +50 -50
- package/dev_notes/stresstest_boundaries.ts +27 -27
- package/dev_notes/stresstest_drift_apply.ts +43 -43
- package/dev_notes/stresstest_drift_compare.ts +43 -43
- package/dev_notes/stresstest_drift_v2.ts +54 -54
- package/dev_notes/stresstest_inspect.ts +54 -54
- package/dev_notes/stresstest_pstyle.ts +55 -55
- package/dev_notes/stresstest_section_debug.ts +23 -23
- package/dev_notes/stresstest_split.ts +70 -70
- package/dev_notes/stresstest_trace.ts +19 -19
- package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
- package/dist/lib/build.d.ts +50 -1
- package/dist/lib/build.d.ts.map +1 -1
- package/dist/lib/build.js +80 -30
- package/dist/lib/build.js.map +1 -1
- package/dist/lib/commands/build.d.ts.map +1 -1
- package/dist/lib/commands/build.js +38 -5
- package/dist/lib/commands/build.js.map +1 -1
- package/dist/lib/commands/utilities.js +164 -164
- package/dist/lib/commands/word-tools.js +8 -8
- package/dist/lib/grammar.js +3 -3
- package/dist/lib/import.d.ts.map +1 -1
- package/dist/lib/import.js +146 -24
- package/dist/lib/import.js.map +1 -1
- package/dist/lib/pdf-comments.js +44 -44
- package/dist/lib/plugins.js +57 -57
- package/dist/lib/pptx-themes.js +115 -115
- package/dist/lib/spelling.js +2 -2
- package/dist/lib/templates.js +387 -387
- package/dist/lib/themes.js +51 -51
- package/dist/lib/types.d.ts +20 -0
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/word-extraction.d.ts +6 -0
- package/dist/lib/word-extraction.d.ts.map +1 -1
- package/dist/lib/word-extraction.js +46 -3
- package/dist/lib/word-extraction.js.map +1 -1
- package/dist/lib/wordcomments.d.ts.map +1 -1
- package/dist/lib/wordcomments.js +23 -5
- package/dist/lib/wordcomments.js.map +1 -1
- package/eslint.config.js +27 -27
- package/lib/anchor-match.ts +276 -276
- package/lib/annotations.ts +644 -644
- package/lib/build.ts +1300 -1227
- package/lib/citations.ts +160 -160
- package/lib/commands/build.ts +833 -801
- package/lib/commands/citations.ts +515 -515
- package/lib/commands/comments.ts +1050 -1050
- package/lib/commands/context.ts +174 -174
- package/lib/commands/core.ts +309 -309
- package/lib/commands/doi.ts +435 -435
- package/lib/commands/file-ops.ts +372 -372
- package/lib/commands/history.ts +320 -320
- package/lib/commands/index.ts +87 -87
- package/lib/commands/init.ts +259 -259
- package/lib/commands/merge-resolve.ts +378 -378
- package/lib/commands/preview.ts +178 -178
- package/lib/commands/project-info.ts +244 -244
- package/lib/commands/quality.ts +517 -517
- package/lib/commands/response.ts +454 -454
- package/lib/commands/section-boundaries.ts +82 -82
- package/lib/commands/sections.ts +451 -451
- package/lib/commands/sync.ts +706 -706
- package/lib/commands/text-ops.ts +449 -449
- package/lib/commands/utilities.ts +448 -448
- package/lib/commands/verify-anchors.ts +272 -272
- package/lib/commands/word-tools.ts +340 -340
- package/lib/comment-realign.ts +517 -517
- package/lib/config.ts +84 -84
- package/lib/crossref.ts +781 -781
- package/lib/csl.ts +191 -191
- package/lib/dependencies.ts +98 -98
- package/lib/diff-engine.ts +465 -465
- package/lib/doi-cache.ts +115 -115
- package/lib/doi.ts +897 -897
- package/lib/equations.ts +506 -506
- package/lib/errors.ts +346 -346
- package/lib/format.ts +541 -541
- package/lib/git.ts +326 -326
- package/lib/grammar.ts +303 -303
- package/lib/image-registry.ts +180 -180
- package/lib/import.ts +911 -792
- package/lib/journals.ts +543 -543
- package/lib/merge.ts +633 -633
- package/lib/orcid.ts +144 -144
- package/lib/pdf-comments.ts +263 -263
- package/lib/pdf-import.ts +524 -524
- package/lib/plugins.ts +362 -362
- package/lib/postprocess.ts +188 -188
- package/lib/pptx-color-filter.lua +37 -37
- package/lib/pptx-template.ts +469 -469
- package/lib/pptx-themes.ts +483 -483
- package/lib/protect-restore.ts +520 -520
- package/lib/rate-limiter.ts +94 -94
- package/lib/response.ts +197 -197
- package/lib/restore-references.ts +240 -240
- package/lib/review.ts +327 -327
- package/lib/schema.ts +417 -417
- package/lib/scientific-words.ts +73 -73
- package/lib/sections.ts +335 -335
- package/lib/slides.ts +756 -756
- package/lib/spelling.ts +334 -334
- package/lib/templates.ts +526 -526
- package/lib/themes.ts +742 -742
- package/lib/trackchanges.ts +247 -247
- package/lib/tui.ts +450 -450
- package/lib/types.ts +550 -530
- package/lib/undo.ts +250 -250
- package/lib/utils.ts +69 -69
- package/lib/variables.ts +179 -179
- package/lib/word-extraction.ts +806 -759
- package/lib/word.ts +643 -643
- package/lib/wordcomments.ts +817 -798
- package/package.json +137 -137
- package/scripts/postbuild.js +28 -28
- package/skill/REFERENCE.md +431 -431
- package/skill/SKILL.md +258 -258
- package/tsconfig.json +26 -26
- package/types/index.d.ts +525 -525
package/lib/doi.ts
CHANGED
|
@@ -1,897 +1,897 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* DOI validation and fetching utilities
|
|
3
|
-
* Check DOIs in .bib files, fetch BibTeX from DOIs
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import * as fs from 'fs';
|
|
7
|
-
import type { BibEntry, DoiCheckResult, BibtexFetchResult, DoiLookupResult, BibCheckResult } from './types.js';
|
|
8
|
-
import { crossrefLimiter, dataciteLimiter, doiOrgLimiter } from './rate-limiter.js';
|
|
9
|
-
import { getCachedDoi, cacheDoi } from './doi-cache.js';
|
|
10
|
-
|
|
11
|
-
// Entry types that typically don't have DOIs
|
|
12
|
-
const NO_DOI_TYPES = new Set([
|
|
13
|
-
'book', // Books often don't have DOIs (chapters might)
|
|
14
|
-
'inbook', // Book chapters - variable
|
|
15
|
-
'thesis', // Theses rarely have DOIs
|
|
16
|
-
'mastersthesis',
|
|
17
|
-
'phdthesis',
|
|
18
|
-
'misc', // Catch-all, often no DOI
|
|
19
|
-
'unpublished', // By definition
|
|
20
|
-
'manual', // Software manuals
|
|
21
|
-
'techreport', // Some do, many don't
|
|
22
|
-
'booklet',
|
|
23
|
-
]);
|
|
24
|
-
|
|
25
|
-
// Entry types that should have DOIs
|
|
26
|
-
const EXPECT_DOI_TYPES = new Set([
|
|
27
|
-
'article', // Journal articles should have DOIs
|
|
28
|
-
'inproceedings', // Conference papers usually do
|
|
29
|
-
'proceedings',
|
|
30
|
-
'incollection', // Book chapters in collections
|
|
31
|
-
]);
|
|
32
|
-
|
|
33
|
-
/**
|
|
34
|
-
* Parse .bib file and extract entries with DOI info
|
|
35
|
-
*/
|
|
36
|
-
export function parseBibEntries(bibPath: string): BibEntry[] {
|
|
37
|
-
if (!fs.existsSync(bibPath)) {
|
|
38
|
-
return [];
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
const content = fs.readFileSync(bibPath, 'utf-8');
|
|
42
|
-
const entries: BibEntry[] = [];
|
|
43
|
-
const lines = content.split('\n');
|
|
44
|
-
|
|
45
|
-
// Pattern for bib entries: @type{key,
|
|
46
|
-
const entryPattern = /@(\w+)\s*\{\s*([^,\s]+)\s*,/g;
|
|
47
|
-
|
|
48
|
-
let match: RegExpExecArray | null;
|
|
49
|
-
while ((match = entryPattern.exec(content)) !== null) {
|
|
50
|
-
const type = match[1]!.toLowerCase();
|
|
51
|
-
const key = match[2]!;
|
|
52
|
-
const startPos = match.index;
|
|
53
|
-
|
|
54
|
-
// Find the line number
|
|
55
|
-
let line = 1;
|
|
56
|
-
for (let i = 0; i < startPos; i++) {
|
|
57
|
-
if (content[i] === '\n') line++;
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
// Find the end of this entry (matching closing brace)
|
|
61
|
-
let braceCount = 0;
|
|
62
|
-
let entryEnd = startPos;
|
|
63
|
-
let inEntry = false;
|
|
64
|
-
|
|
65
|
-
for (let i = startPos; i < content.length; i++) {
|
|
66
|
-
if (content[i] === '{') {
|
|
67
|
-
braceCount++;
|
|
68
|
-
inEntry = true;
|
|
69
|
-
} else if (content[i] === '}') {
|
|
70
|
-
braceCount--;
|
|
71
|
-
if (inEntry && braceCount === 0) {
|
|
72
|
-
entryEnd = i + 1;
|
|
73
|
-
break;
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
const entryContent = content.slice(startPos, entryEnd);
|
|
79
|
-
|
|
80
|
-
// Extract DOI field
|
|
81
|
-
const doiMatch = entryContent.match(/\bdoi\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
82
|
-
let doi = doiMatch ? doiMatch[1]!.trim() : null;
|
|
83
|
-
|
|
84
|
-
// Clean DOI - remove URL prefix if present
|
|
85
|
-
if (doi) {
|
|
86
|
-
doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
// Extract title for display
|
|
90
|
-
const titleMatch = entryContent.match(/\btitle\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
91
|
-
const title = titleMatch ? titleMatch[1]!.trim().slice(0, 60) : '';
|
|
92
|
-
|
|
93
|
-
// Extract author for lookup
|
|
94
|
-
const authorMatch = entryContent.match(/\bauthor\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
95
|
-
const authorRaw = authorMatch ? authorMatch[1]!.trim() : '';
|
|
96
|
-
|
|
97
|
-
// Extract year
|
|
98
|
-
const yearMatch = entryContent.match(/\byear\s*=\s*[{"]?(\d{4})[}""]?/i);
|
|
99
|
-
const year = yearMatch ? parseInt(yearMatch[1]!) : null;
|
|
100
|
-
|
|
101
|
-
// Extract journal
|
|
102
|
-
const journalMatch = entryContent.match(/\bjournal\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
103
|
-
const journal = journalMatch ? journalMatch[1]!.trim() : '';
|
|
104
|
-
|
|
105
|
-
// Check for skip marker: nodoi = {true} or nodoi = true
|
|
106
|
-
const skipMatch = entryContent.match(/\bnodoi\s*=\s*[{"]?(true|yes|1)[}""]?/i);
|
|
107
|
-
const skip = !!skipMatch;
|
|
108
|
-
|
|
109
|
-
// Check for comment marker immediately before entry: % no-doi
|
|
110
|
-
// Only look at the text between the last entry end (or start) and this entry
|
|
111
|
-
const linesBefore = content.slice(Math.max(0, startPos - 200), startPos);
|
|
112
|
-
// Find the last closing brace or start of file to avoid matching comments for previous entries
|
|
113
|
-
const lastEntryEnd = linesBefore.lastIndexOf('}');
|
|
114
|
-
const relevantBefore = lastEntryEnd >= 0 ? linesBefore.slice(lastEntryEnd + 1) : linesBefore;
|
|
115
|
-
const commentSkip = /% *no-?doi/i.test(relevantBefore);
|
|
116
|
-
|
|
117
|
-
entries.push({
|
|
118
|
-
key,
|
|
119
|
-
type,
|
|
120
|
-
doi: doi || null,
|
|
121
|
-
title,
|
|
122
|
-
authorRaw,
|
|
123
|
-
year,
|
|
124
|
-
journal,
|
|
125
|
-
skip: skip || commentSkip,
|
|
126
|
-
expectDoi: EXPECT_DOI_TYPES.has(type),
|
|
127
|
-
noDoi: NO_DOI_TYPES.has(type),
|
|
128
|
-
line,
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
return entries;
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
/**
|
|
136
|
-
* Validate DOI format
|
|
137
|
-
*/
|
|
138
|
-
export function isValidDoiFormat(doi: string): boolean {
|
|
139
|
-
if (!doi) return false;
|
|
140
|
-
// DOI format: 10.prefix/suffix
|
|
141
|
-
// Prefix is 4+ digits, suffix can contain most characters
|
|
142
|
-
return /^10\.\d{4,}\/[^\s]+$/.test(doi);
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
/**
|
|
146
|
-
* Check if DOI resolves via DataCite (for Zenodo, Figshare, etc.)
|
|
147
|
-
*/
|
|
148
|
-
async function checkDoiDataCite(doi: string): Promise<DoiCheckResult> {
|
|
149
|
-
try {
|
|
150
|
-
const response = await dataciteLimiter.fetchWithRetry(
|
|
151
|
-
`https://api.datacite.org/dois/${encodeURIComponent(doi)}`,
|
|
152
|
-
{
|
|
153
|
-
headers: {
|
|
154
|
-
'Accept': 'application/vnd.api+json',
|
|
155
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
156
|
-
},
|
|
157
|
-
}
|
|
158
|
-
);
|
|
159
|
-
|
|
160
|
-
if (response.status === 404) {
|
|
161
|
-
return { valid: false, error: 'DOI not found in DataCite' };
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
if (!response.ok) {
|
|
165
|
-
return { valid: false, error: `HTTP ${response.status}` };
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
const data = await response.json() as any;
|
|
169
|
-
const attrs = data.data?.attributes;
|
|
170
|
-
|
|
171
|
-
if (!attrs) {
|
|
172
|
-
return { valid: false, error: 'Invalid DataCite response' };
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
return {
|
|
176
|
-
valid: true,
|
|
177
|
-
source: 'datacite',
|
|
178
|
-
metadata: {
|
|
179
|
-
title: attrs.titles?.[0]?.title || '',
|
|
180
|
-
authors: attrs.creators?.map((c: any) => `${c.givenName || ''} ${c.familyName || ''}`.trim()) || [],
|
|
181
|
-
year: attrs.publicationYear,
|
|
182
|
-
journal: attrs.publisher || '',
|
|
183
|
-
type: attrs.types?.resourceTypeGeneral || '',
|
|
184
|
-
},
|
|
185
|
-
};
|
|
186
|
-
} catch (err) {
|
|
187
|
-
return { valid: false, error: (err as Error).message };
|
|
188
|
-
}
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
interface CheckDoiOptions {
|
|
192
|
-
skipCache?: boolean;
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
/**
|
|
196
|
-
* Check if DOI resolves (exists) - tries Crossref first, then DataCite
|
|
197
|
-
* Results are cached for 7 days to reduce API calls.
|
|
198
|
-
*/
|
|
199
|
-
export async function checkDoi(doi: string, options: CheckDoiOptions = {}): Promise<DoiCheckResult & { cached?: boolean }> {
|
|
200
|
-
if (!isValidDoiFormat(doi)) {
|
|
201
|
-
return { valid: false, error: 'Invalid DOI format' };
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// Check cache first (unless skipped)
|
|
205
|
-
if (!options.skipCache) {
|
|
206
|
-
const cached = getCachedDoi(doi);
|
|
207
|
-
if (cached) {
|
|
208
|
-
return { ...cached, cached: true } as DoiCheckResult & { cached?: boolean };
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
// Zenodo DOIs start with 10.5281 - check DataCite first
|
|
213
|
-
const isZenodo = doi.startsWith('10.5281/');
|
|
214
|
-
const isFigshare = doi.startsWith('10.6084/');
|
|
215
|
-
const isDataCiteLikely = isZenodo || isFigshare;
|
|
216
|
-
|
|
217
|
-
if (isDataCiteLikely) {
|
|
218
|
-
const dataciteResult = await checkDoiDataCite(doi);
|
|
219
|
-
if (dataciteResult.valid) {
|
|
220
|
-
cacheDoi(doi, dataciteResult);
|
|
221
|
-
return dataciteResult;
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
try {
|
|
226
|
-
// Use Crossref API to check DOI
|
|
227
|
-
const response = await crossrefLimiter.fetchWithRetry(
|
|
228
|
-
`https://api.crossref.org/works/${encodeURIComponent(doi)}`,
|
|
229
|
-
{
|
|
230
|
-
headers: {
|
|
231
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
232
|
-
},
|
|
233
|
-
}
|
|
234
|
-
);
|
|
235
|
-
|
|
236
|
-
if (response.status === 404) {
|
|
237
|
-
// Try DataCite as fallback (if not already tried)
|
|
238
|
-
if (!isDataCiteLikely) {
|
|
239
|
-
const dataciteResult = await checkDoiDataCite(doi);
|
|
240
|
-
if (dataciteResult.valid) {
|
|
241
|
-
cacheDoi(doi, dataciteResult);
|
|
242
|
-
return dataciteResult;
|
|
243
|
-
}
|
|
244
|
-
}
|
|
245
|
-
const result = { valid: false, error: 'DOI not found' };
|
|
246
|
-
cacheDoi(doi, result);
|
|
247
|
-
return result;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if (!response.ok) {
|
|
251
|
-
// Don't cache transient errors
|
|
252
|
-
return { valid: false, error: `HTTP ${response.status}` };
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
const data = await response.json() as any;
|
|
256
|
-
const work = data.message;
|
|
257
|
-
|
|
258
|
-
const result: DoiCheckResult = {
|
|
259
|
-
valid: true,
|
|
260
|
-
source: 'crossref',
|
|
261
|
-
metadata: {
|
|
262
|
-
title: work.title?.[0] || '',
|
|
263
|
-
authors: work.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
264
|
-
year: work.published?.['date-parts']?.[0]?.[0] || work.created?.['date-parts']?.[0]?.[0],
|
|
265
|
-
journal: work['container-title']?.[0] || '',
|
|
266
|
-
type: work.type,
|
|
267
|
-
},
|
|
268
|
-
};
|
|
269
|
-
|
|
270
|
-
cacheDoi(doi, result);
|
|
271
|
-
return result;
|
|
272
|
-
} catch (err) {
|
|
273
|
-
// Don't cache network errors
|
|
274
|
-
return { valid: false, error: (err as Error).message };
|
|
275
|
-
}
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
/**
|
|
279
|
-
* Fetch BibTeX from DOI using content negotiation
|
|
280
|
-
*/
|
|
281
|
-
export async function fetchBibtex(doi: string): Promise<BibtexFetchResult> {
|
|
282
|
-
// Clean DOI
|
|
283
|
-
doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
|
|
284
|
-
|
|
285
|
-
if (!isValidDoiFormat(doi)) {
|
|
286
|
-
return { success: false, error: 'Invalid DOI format' };
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
try {
|
|
290
|
-
const response = await doiOrgLimiter.fetchWithRetry(
|
|
291
|
-
`https://doi.org/${encodeURIComponent(doi)}`,
|
|
292
|
-
{
|
|
293
|
-
headers: {
|
|
294
|
-
'Accept': 'application/x-bibtex',
|
|
295
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
296
|
-
},
|
|
297
|
-
redirect: 'follow',
|
|
298
|
-
}
|
|
299
|
-
);
|
|
300
|
-
|
|
301
|
-
if (!response.ok) {
|
|
302
|
-
return { success: false, error: `HTTP ${response.status}` };
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
const bibtex = await response.text();
|
|
306
|
-
|
|
307
|
-
if (!bibtex.includes('@')) {
|
|
308
|
-
return { success: false, error: 'Invalid BibTeX response' };
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
return { success: true, bibtex: bibtex.trim() };
|
|
312
|
-
} catch (err) {
|
|
313
|
-
return { success: false, error: (err as Error).message };
|
|
314
|
-
}
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
interface CheckBibDoisOptions {
|
|
318
|
-
checkMissing?: boolean;
|
|
319
|
-
parallel?: number;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
/**
|
|
323
|
-
* Check all DOIs in a .bib file
|
|
324
|
-
*/
|
|
325
|
-
export async function checkBibDois(bibPath: string, options: CheckBibDoisOptions = {}): Promise<BibCheckResult> {
|
|
326
|
-
const { checkMissing = false, parallel = 5 } = options;
|
|
327
|
-
|
|
328
|
-
const entries = parseBibEntries(bibPath);
|
|
329
|
-
const results: Array<BibEntry & { status: string; message?: string; metadata?: object }> = [];
|
|
330
|
-
|
|
331
|
-
let valid = 0;
|
|
332
|
-
let invalid = 0;
|
|
333
|
-
let missing = 0;
|
|
334
|
-
let skipped = 0;
|
|
335
|
-
|
|
336
|
-
// Process in batches to avoid rate limiting
|
|
337
|
-
for (let i = 0; i < entries.length; i += parallel) {
|
|
338
|
-
const batch = entries.slice(i, i + parallel);
|
|
339
|
-
|
|
340
|
-
const batchResults = await Promise.all(
|
|
341
|
-
batch.map(async (entry) => {
|
|
342
|
-
// Skip if marked
|
|
343
|
-
if (entry.skip) {
|
|
344
|
-
skipped++;
|
|
345
|
-
return { ...entry, status: 'skipped', message: 'Marked as no-doi' };
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
// No DOI field
|
|
349
|
-
if (!entry.doi) {
|
|
350
|
-
if (entry.noDoi) {
|
|
351
|
-
// Expected - books, theses, etc.
|
|
352
|
-
skipped++;
|
|
353
|
-
return { ...entry, status: 'skipped', message: `${entry.type} typically has no DOI` };
|
|
354
|
-
} else if (entry.expectDoi) {
|
|
355
|
-
// Should have DOI but doesn't
|
|
356
|
-
missing++;
|
|
357
|
-
return { ...entry, status: 'missing', message: 'Expected DOI for article/proceedings' };
|
|
358
|
-
} else {
|
|
359
|
-
skipped++;
|
|
360
|
-
return { ...entry, status: 'skipped', message: 'No DOI field' };
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
// Validate DOI format first
|
|
365
|
-
if (!isValidDoiFormat(entry.doi)) {
|
|
366
|
-
invalid++;
|
|
367
|
-
return { ...entry, status: 'invalid', message: 'Invalid DOI format' };
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
// Check if DOI resolves
|
|
371
|
-
const check = await checkDoi(entry.doi);
|
|
372
|
-
if (check.valid) {
|
|
373
|
-
valid++;
|
|
374
|
-
return { ...entry, status: 'valid', metadata: check.metadata };
|
|
375
|
-
} else {
|
|
376
|
-
invalid++;
|
|
377
|
-
return { ...entry, status: 'invalid', message: check.error };
|
|
378
|
-
}
|
|
379
|
-
})
|
|
380
|
-
);
|
|
381
|
-
|
|
382
|
-
results.push(...batchResults);
|
|
383
|
-
|
|
384
|
-
// Small delay between batches to be nice to the API
|
|
385
|
-
if (i + parallel < entries.length) {
|
|
386
|
-
await new Promise(r => setTimeout(r, 200));
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
return { entries: results, valid, invalid, missing, skipped };
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
interface DataCiteItem {
|
|
394
|
-
id: string;
|
|
395
|
-
attributes: {
|
|
396
|
-
titles?: Array<{ title: string }>;
|
|
397
|
-
creators?: Array<{ givenName?: string; familyName?: string }>;
|
|
398
|
-
publicationYear: number;
|
|
399
|
-
publisher?: string;
|
|
400
|
-
};
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
/**
|
|
404
|
-
* Search DataCite API (for Zenodo, Figshare, etc.)
|
|
405
|
-
*/
|
|
406
|
-
async function searchDataCite(title: string, author: string = '', year: number | null = null): Promise<any[]> {
|
|
407
|
-
try {
|
|
408
|
-
// DataCite query syntax
|
|
409
|
-
let query = `titles.title:${title.replace(/[{}]/g, '')}`;
|
|
410
|
-
if (author) {
|
|
411
|
-
query += ` AND creators.name:${author}`;
|
|
412
|
-
}
|
|
413
|
-
if (year) {
|
|
414
|
-
query += ` AND publicationYear:${year}`;
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
const params = new URLSearchParams({
|
|
418
|
-
query: query,
|
|
419
|
-
'page[size]': '5',
|
|
420
|
-
});
|
|
421
|
-
|
|
422
|
-
const response = await dataciteLimiter.fetchWithRetry(
|
|
423
|
-
`https://api.datacite.org/dois?${params}`,
|
|
424
|
-
{
|
|
425
|
-
headers: {
|
|
426
|
-
'Accept': 'application/vnd.api+json',
|
|
427
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
428
|
-
},
|
|
429
|
-
}
|
|
430
|
-
);
|
|
431
|
-
|
|
432
|
-
if (!response.ok) return [];
|
|
433
|
-
|
|
434
|
-
const data = await response.json() as { data?: DataCiteItem[] };
|
|
435
|
-
const items = data.data || [];
|
|
436
|
-
|
|
437
|
-
return items.map(item => {
|
|
438
|
-
const attrs = item.attributes;
|
|
439
|
-
return {
|
|
440
|
-
DOI: item.id,
|
|
441
|
-
title: [attrs.titles?.[0]?.title || ''],
|
|
442
|
-
author: attrs.creators?.map(c => ({ family: c.familyName, given: c.givenName })) || [],
|
|
443
|
-
'published-print': { 'date-parts': [[attrs.publicationYear]] },
|
|
444
|
-
'container-title': [attrs.publisher || ''],
|
|
445
|
-
score: 50, // Base score for DataCite results
|
|
446
|
-
source: 'datacite',
|
|
447
|
-
};
|
|
448
|
-
});
|
|
449
|
-
} catch {
|
|
450
|
-
return [];
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
/**
|
|
455
|
-
* Normalize text for comparison (lowercase, remove special chars)
|
|
456
|
-
*/
|
|
457
|
-
function normalizeForMatching(text: string): string {
|
|
458
|
-
return (text || '')
|
|
459
|
-
.toLowerCase()
|
|
460
|
-
.replace(/[{}\\]/g, '') // Remove LaTeX braces
|
|
461
|
-
.replace(/[^a-z0-9\s]/g, ' ') // Replace special chars with space
|
|
462
|
-
.replace(/\s+/g, ' ')
|
|
463
|
-
.trim();
|
|
464
|
-
}
|
|
465
|
-
|
|
466
|
-
/**
|
|
467
|
-
* Check if DOI looks like a supplement, figure, or review (not the main paper)
|
|
468
|
-
*/
|
|
469
|
-
function isSupplementOrReview(doi: string, title: string = '', journal: string = ''): boolean {
|
|
470
|
-
const doiLower = (doi || '').toLowerCase();
|
|
471
|
-
const titleLower = (title || '').toLowerCase();
|
|
472
|
-
const journalLower = (journal || '').toLowerCase();
|
|
473
|
-
|
|
474
|
-
// Supplement/figure DOI patterns
|
|
475
|
-
if (/\.suppl|\/suppl|\.figure|\/figure|\.s\d+$|_s\d+$/i.test(doiLower)) {
|
|
476
|
-
return true;
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
// F1000/Faculty Opinions (post-publication reviews)
|
|
480
|
-
if (/10\.3410\/f\./i.test(doiLower) || /faculty opinions/i.test(journalLower)) {
|
|
481
|
-
return true;
|
|
482
|
-
}
|
|
483
|
-
|
|
484
|
-
// Title suggests it's supplementary material
|
|
485
|
-
if (/^supplementary|^supporting information|^appendix/i.test(titleLower)) {
|
|
486
|
-
return true;
|
|
487
|
-
}
|
|
488
|
-
|
|
489
|
-
return false;
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
interface CrossrefItem {
|
|
493
|
-
DOI: string;
|
|
494
|
-
title?: string[];
|
|
495
|
-
author?: Array<{ given?: string; family?: string }>;
|
|
496
|
-
'published-print'?: { 'date-parts': number[][] };
|
|
497
|
-
'published-online'?: { 'date-parts': number[][] };
|
|
498
|
-
'container-title'?: string[];
|
|
499
|
-
score?: number;
|
|
500
|
-
type?: string;
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
/**
|
|
504
|
-
* Search for DOI by title and author using Crossref API (+ DataCite fallback)
|
|
505
|
-
*/
|
|
506
|
-
export async function lookupDoi(
|
|
507
|
-
title: string,
|
|
508
|
-
author: string = '',
|
|
509
|
-
year: number | null = null,
|
|
510
|
-
journal: string = ''
|
|
511
|
-
): Promise<DoiLookupResult> {
|
|
512
|
-
if (!title || title.length < 10) {
|
|
513
|
-
return { found: false, error: 'Title too short for reliable search' };
|
|
514
|
-
}
|
|
515
|
-
|
|
516
|
-
// Check for keywords that suggest Zenodo/DataCite sources
|
|
517
|
-
const likelyZenodo = /\b(IPBES|zenodo|assessment report|secretariat)\b/i.test(title);
|
|
518
|
-
|
|
519
|
-
try {
|
|
520
|
-
// Build query - title is most important, add author and journal if available
|
|
521
|
-
let query = title;
|
|
522
|
-
if (author) {
|
|
523
|
-
query = `${title} ${author}`;
|
|
524
|
-
}
|
|
525
|
-
// Add journal to query for better matching
|
|
526
|
-
if (journal) {
|
|
527
|
-
query = `${query} ${journal}`;
|
|
528
|
-
}
|
|
529
|
-
|
|
530
|
-
let items: CrossrefItem[] = [];
|
|
531
|
-
|
|
532
|
-
// Try structured bibliographic query first (more accurate)
|
|
533
|
-
const structuredParams = new URLSearchParams({
|
|
534
|
-
rows: '10',
|
|
535
|
-
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
536
|
-
});
|
|
537
|
-
structuredParams.set('query.bibliographic', title);
|
|
538
|
-
if (author) {
|
|
539
|
-
structuredParams.set('query.author', author);
|
|
540
|
-
}
|
|
541
|
-
if (journal) {
|
|
542
|
-
structuredParams.set('query.container-title', journal);
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
let response = await crossrefLimiter.fetchWithRetry(
|
|
546
|
-
`https://api.crossref.org/works?${structuredParams}`,
|
|
547
|
-
{
|
|
548
|
-
headers: {
|
|
549
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
550
|
-
},
|
|
551
|
-
}
|
|
552
|
-
);
|
|
553
|
-
|
|
554
|
-
if (response.ok) {
|
|
555
|
-
const data = await response.json() as { message?: { items?: CrossrefItem[] } };
|
|
556
|
-
items = data.message?.items || [];
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
// If structured query found few results, also try query.title (often better for exact matches)
|
|
560
|
-
if (items.length < 5) {
|
|
561
|
-
const titleParams = new URLSearchParams({
|
|
562
|
-
rows: '10',
|
|
563
|
-
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
564
|
-
});
|
|
565
|
-
titleParams.set('query.title', title);
|
|
566
|
-
|
|
567
|
-
const response2 = await crossrefLimiter.fetchWithRetry(
|
|
568
|
-
`https://api.crossref.org/works?${titleParams}`,
|
|
569
|
-
{
|
|
570
|
-
headers: {
|
|
571
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
572
|
-
},
|
|
573
|
-
}
|
|
574
|
-
);
|
|
575
|
-
|
|
576
|
-
if (response2.ok) {
|
|
577
|
-
const data = await response2.json() as { message?: { items?: CrossrefItem[] } };
|
|
578
|
-
const newItems = data.message?.items || [];
|
|
579
|
-
// Merge results, avoiding duplicates
|
|
580
|
-
const existingDois = new Set(items.map(i => i.DOI));
|
|
581
|
-
for (const item of newItems) {
|
|
582
|
-
if (!existingDois.has(item.DOI)) {
|
|
583
|
-
items.push(item);
|
|
584
|
-
}
|
|
585
|
-
}
|
|
586
|
-
}
|
|
587
|
-
}
|
|
588
|
-
|
|
589
|
-
// If still nothing, try basic query (most lenient)
|
|
590
|
-
if (items.length === 0) {
|
|
591
|
-
const basicParams = new URLSearchParams({
|
|
592
|
-
query: query,
|
|
593
|
-
rows: '10',
|
|
594
|
-
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
595
|
-
});
|
|
596
|
-
|
|
597
|
-
response = await crossrefLimiter.fetchWithRetry(
|
|
598
|
-
`https://api.crossref.org/works?${basicParams}`,
|
|
599
|
-
{
|
|
600
|
-
headers: {
|
|
601
|
-
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
602
|
-
},
|
|
603
|
-
}
|
|
604
|
-
);
|
|
605
|
-
|
|
606
|
-
if (response.ok) {
|
|
607
|
-
const data = await response.json() as { message?: { items?: CrossrefItem[] } };
|
|
608
|
-
items = data.message?.items || [];
|
|
609
|
-
}
|
|
610
|
-
}
|
|
611
|
-
|
|
612
|
-
// Also search DataCite for Zenodo/institutional repos
|
|
613
|
-
if (likelyZenodo || items.length === 0) {
|
|
614
|
-
const dataciteItems = await searchDataCite(title, author, year);
|
|
615
|
-
items = [...items, ...dataciteItems];
|
|
616
|
-
}
|
|
617
|
-
|
|
618
|
-
if (items.length === 0) {
|
|
619
|
-
return { found: false, error: 'No results found' };
|
|
620
|
-
}
|
|
621
|
-
|
|
622
|
-
const normalizedSearchTitle = normalizeForMatching(title);
|
|
623
|
-
const normalizedJournal = normalizeForMatching(journal);
|
|
624
|
-
|
|
625
|
-
// Score the results
|
|
626
|
-
const scored = items.map(item => {
|
|
627
|
-
let score = 0;
|
|
628
|
-
const itemTitle = item.title?.[0] || '';
|
|
629
|
-
const itemJournal = item['container-title']?.[0] || '';
|
|
630
|
-
const normalizedItemTitle = normalizeForMatching(itemTitle);
|
|
631
|
-
const normalizedItemJournal = normalizeForMatching(itemJournal);
|
|
632
|
-
|
|
633
|
-
// === PENALTY: Supplement/figure/review DOIs ===
|
|
634
|
-
if (isSupplementOrReview(item.DOI, itemTitle, itemJournal)) {
|
|
635
|
-
score -= 100; // Heavy penalty - almost never want these
|
|
636
|
-
}
|
|
637
|
-
|
|
638
|
-
// === Title similarity (most important) ===
|
|
639
|
-
if (normalizedItemTitle === normalizedSearchTitle) {
|
|
640
|
-
score += 100; // Exact match
|
|
641
|
-
} else if (normalizedItemTitle.includes(normalizedSearchTitle) ||
|
|
642
|
-
normalizedSearchTitle.includes(normalizedItemTitle)) {
|
|
643
|
-
score += 50;
|
|
644
|
-
} else {
|
|
645
|
-
// Check word overlap
|
|
646
|
-
const searchWords = normalizedSearchTitle.split(/\s+/).filter(w => w.length > 3);
|
|
647
|
-
const itemWords = normalizedItemTitle.split(/\s+/).filter(w => w.length > 3);
|
|
648
|
-
const overlap = searchWords.filter(w =>
|
|
649
|
-
itemWords.some(iw => iw.includes(w) || w.includes(iw))
|
|
650
|
-
);
|
|
651
|
-
score += (overlap.length / Math.max(searchWords.length, 1)) * 40;
|
|
652
|
-
}
|
|
653
|
-
|
|
654
|
-
// === Author match ===
|
|
655
|
-
if (author && item.author) {
|
|
656
|
-
const authorLower = author.toLowerCase();
|
|
657
|
-
const hasAuthor = item.author.some(a =>
|
|
658
|
-
(a.family || '').toLowerCase().includes(authorLower) ||
|
|
659
|
-
authorLower.includes((a.family || '').toLowerCase())
|
|
660
|
-
);
|
|
661
|
-
if (hasAuthor) score += 30;
|
|
662
|
-
}
|
|
663
|
-
|
|
664
|
-
// === Journal match (NEW) ===
|
|
665
|
-
if (normalizedJournal && normalizedItemJournal) {
|
|
666
|
-
// Check for journal name match (handles abbreviations)
|
|
667
|
-
const journalWords = normalizedJournal.split(/\s+/).filter(w => w.length > 2);
|
|
668
|
-
const itemJournalWords = normalizedItemJournal.split(/\s+/).filter(w => w.length > 2);
|
|
669
|
-
|
|
670
|
-
// Count matching words
|
|
671
|
-
const journalOverlap = journalWords.filter(w =>
|
|
672
|
-
itemJournalWords.some(iw => iw.includes(w) || w.includes(iw))
|
|
673
|
-
);
|
|
674
|
-
|
|
675
|
-
if (journalOverlap.length >= Math.min(2, journalWords.length)) {
|
|
676
|
-
score += 40; // Good journal match
|
|
677
|
-
} else if (journalOverlap.length >= 1) {
|
|
678
|
-
score += 15; // Partial match
|
|
679
|
-
}
|
|
680
|
-
|
|
681
|
-
// Bonus for exact journal match
|
|
682
|
-
if (normalizedItemJournal === normalizedJournal) {
|
|
683
|
-
score += 20;
|
|
684
|
-
}
|
|
685
|
-
}
|
|
686
|
-
|
|
687
|
-
// === Year match - CRITICAL for accuracy ===
|
|
688
|
-
const itemYear = item['published-print']?.['date-parts']?.[0]?.[0] ||
|
|
689
|
-
item['published-online']?.['date-parts']?.[0]?.[0];
|
|
690
|
-
if (year && itemYear) {
|
|
691
|
-
if (itemYear === year) {
|
|
692
|
-
score += 50; // Exact match - required for high confidence
|
|
693
|
-
} else if (Math.abs(itemYear - year) === 1) {
|
|
694
|
-
score += 20; // Off by one (common for online-first)
|
|
695
|
-
} else {
|
|
696
|
-
score -= 50; // Wrong year = likely wrong paper
|
|
697
|
-
}
|
|
698
|
-
} else if (year && !itemYear) {
|
|
699
|
-
score -= 10; // Can't verify year
|
|
700
|
-
}
|
|
701
|
-
|
|
702
|
-
// Crossref's own relevance score (capped)
|
|
703
|
-
score += Math.min(item.score || 0, 10);
|
|
704
|
-
|
|
705
|
-
return {
|
|
706
|
-
doi: item.DOI,
|
|
707
|
-
title: itemTitle,
|
|
708
|
-
authors: item.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
709
|
-
year: itemYear,
|
|
710
|
-
journal: itemJournal,
|
|
711
|
-
score,
|
|
712
|
-
crossrefScore: item.score,
|
|
713
|
-
isSupplement: isSupplementOrReview(item.DOI, itemTitle, itemJournal),
|
|
714
|
-
};
|
|
715
|
-
});
|
|
716
|
-
|
|
717
|
-
// Sort by our score
|
|
718
|
-
scored.sort((a, b) => b.score - a.score);
|
|
719
|
-
|
|
720
|
-
// Filter out supplements for the "best" pick (but keep in alternatives)
|
|
721
|
-
const mainPapers = scored.filter(s => !s.isSupplement);
|
|
722
|
-
const best = mainPapers.length > 0 ? mainPapers[0] : scored[0];
|
|
723
|
-
|
|
724
|
-
if (!best) {
|
|
725
|
-
return { found: false, error: 'No valid results found' };
|
|
726
|
-
}
|
|
727
|
-
|
|
728
|
-
// Confidence thresholds
|
|
729
|
-
let confidence: 'low' | 'medium' | 'high' = 'low';
|
|
730
|
-
if (best.score >= 120) confidence = 'high';
|
|
731
|
-
else if (best.score >= 70) confidence = 'medium';
|
|
732
|
-
|
|
733
|
-
// === NEW: Try DataCite if Crossref confidence is low ===
|
|
734
|
-
if (confidence === 'low' && !likelyZenodo) {
|
|
735
|
-
const dataciteItems = await searchDataCite(title, author, year);
|
|
736
|
-
if (dataciteItems.length > 0) {
|
|
737
|
-
// Score DataCite results with same logic
|
|
738
|
-
for (const dcItem of dataciteItems) {
|
|
739
|
-
const dcTitle = dcItem.title?.[0] || '';
|
|
740
|
-
const normalizedDcTitle = normalizeForMatching(dcTitle);
|
|
741
|
-
let dcScore = 0;
|
|
742
|
-
|
|
743
|
-
// Title match
|
|
744
|
-
if (normalizedDcTitle === normalizedSearchTitle) {
|
|
745
|
-
dcScore += 100;
|
|
746
|
-
} else if (normalizedDcTitle.includes(normalizedSearchTitle) ||
|
|
747
|
-
normalizedSearchTitle.includes(normalizedDcTitle)) {
|
|
748
|
-
dcScore += 50;
|
|
749
|
-
}
|
|
750
|
-
|
|
751
|
-
// Year match
|
|
752
|
-
const dcYear = dcItem['published-print']?.['date-parts']?.[0]?.[0];
|
|
753
|
-
if (year && dcYear && dcYear === year) {
|
|
754
|
-
dcScore += 50;
|
|
755
|
-
}
|
|
756
|
-
|
|
757
|
-
if (dcScore > best.score) {
|
|
758
|
-
return {
|
|
759
|
-
found: true,
|
|
760
|
-
doi: dcItem.DOI,
|
|
761
|
-
confidence: dcScore >= 120 ? 'high' : dcScore >= 70 ? 'medium' : 'low',
|
|
762
|
-
score: dcScore,
|
|
763
|
-
metadata: {
|
|
764
|
-
title: dcTitle,
|
|
765
|
-
authors: dcItem.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
766
|
-
year: dcYear,
|
|
767
|
-
journal: dcItem['container-title']?.[0] || '',
|
|
768
|
-
},
|
|
769
|
-
alternatives: scored.slice(0, 2),
|
|
770
|
-
};
|
|
771
|
-
}
|
|
772
|
-
}
|
|
773
|
-
}
|
|
774
|
-
}
|
|
775
|
-
|
|
776
|
-
return {
|
|
777
|
-
found: true,
|
|
778
|
-
doi: best.doi,
|
|
779
|
-
confidence,
|
|
780
|
-
score: best.score,
|
|
781
|
-
metadata: {
|
|
782
|
-
title: best.title,
|
|
783
|
-
authors: best.authors,
|
|
784
|
-
year: best.year || 0,
|
|
785
|
-
journal: best.journal,
|
|
786
|
-
},
|
|
787
|
-
alternatives: scored.filter(s => s.doi !== best.doi).slice(0, 3),
|
|
788
|
-
};
|
|
789
|
-
} catch (err) {
|
|
790
|
-
return { found: false, error: (err as Error).message };
|
|
791
|
-
}
|
|
792
|
-
}
|
|
793
|
-
|
|
794
|
-
interface LookupMissingDoisOptions {
|
|
795
|
-
parallel?: number;
|
|
796
|
-
onProgress?: (current: number, total: number) => void;
|
|
797
|
-
}
|
|
798
|
-
|
|
799
|
-
interface LookupMissingDoiResult {
|
|
800
|
-
key: string;
|
|
801
|
-
title: string;
|
|
802
|
-
type: string;
|
|
803
|
-
journal: string;
|
|
804
|
-
result: DoiLookupResult;
|
|
805
|
-
}
|
|
806
|
-
|
|
807
|
-
/**
|
|
808
|
-
* Look up DOIs for all entries missing them in a .bib file
|
|
809
|
-
*/
|
|
810
|
-
export async function lookupMissingDois(
|
|
811
|
-
bibPath: string,
|
|
812
|
-
options: LookupMissingDoisOptions = {}
|
|
813
|
-
): Promise<LookupMissingDoiResult[]> {
|
|
814
|
-
const { parallel = 3, onProgress } = options;
|
|
815
|
-
|
|
816
|
-
const entries = parseBibEntries(bibPath);
|
|
817
|
-
const missing = entries.filter(e =>
|
|
818
|
-
!e.doi &&
|
|
819
|
-
!e.skip &&
|
|
820
|
-
!NO_DOI_TYPES.has(e.type)
|
|
821
|
-
);
|
|
822
|
-
|
|
823
|
-
const results: LookupMissingDoiResult[] = [];
|
|
824
|
-
|
|
825
|
-
for (let i = 0; i < missing.length; i += parallel) {
|
|
826
|
-
const batch = missing.slice(i, i + parallel);
|
|
827
|
-
|
|
828
|
-
const batchResults = await Promise.all(
|
|
829
|
-
batch.map(async (entry) => {
|
|
830
|
-
// Extract first author's last name from the entry
|
|
831
|
-
// This is tricky because BibTeX author format varies
|
|
832
|
-
let author = '';
|
|
833
|
-
if (entry.authorRaw) {
|
|
834
|
-
// Try to get first author's last name
|
|
835
|
-
const firstAuthor = entry.authorRaw.split(' and ')[0];
|
|
836
|
-
if (firstAuthor) {
|
|
837
|
-
const parts = firstAuthor.split(',');
|
|
838
|
-
author = parts[0]?.trim() || '';
|
|
839
|
-
}
|
|
840
|
-
}
|
|
841
|
-
|
|
842
|
-
const result = await lookupDoi(entry.title, author, entry.year, entry.journal);
|
|
843
|
-
|
|
844
|
-
return {
|
|
845
|
-
key: entry.key,
|
|
846
|
-
title: entry.title,
|
|
847
|
-
type: entry.type,
|
|
848
|
-
journal: entry.journal,
|
|
849
|
-
result,
|
|
850
|
-
};
|
|
851
|
-
})
|
|
852
|
-
);
|
|
853
|
-
|
|
854
|
-
results.push(...batchResults);
|
|
855
|
-
|
|
856
|
-
if (onProgress) {
|
|
857
|
-
onProgress(Math.min(i + parallel, missing.length), missing.length);
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
// Rate limiting
|
|
861
|
-
if (i + parallel < missing.length) {
|
|
862
|
-
await new Promise(r => setTimeout(r, 300));
|
|
863
|
-
}
|
|
864
|
-
}
|
|
865
|
-
|
|
866
|
-
return results;
|
|
867
|
-
}
|
|
868
|
-
|
|
869
|
-
interface AddToBibResult {
|
|
870
|
-
success: boolean;
|
|
871
|
-
key?: string;
|
|
872
|
-
error?: string;
|
|
873
|
-
}
|
|
874
|
-
|
|
875
|
-
/**
|
|
876
|
-
* Add a BibTeX entry to a .bib file
|
|
877
|
-
*/
|
|
878
|
-
export function addToBib(bibPath: string, bibtex: string): AddToBibResult {
|
|
879
|
-
// Extract key from BibTeX
|
|
880
|
-
const keyMatch = bibtex.match(/@\w+\s*\{\s*([^,\s]+)/);
|
|
881
|
-
if (!keyMatch) {
|
|
882
|
-
return { success: false, error: 'Could not extract citation key from BibTeX' };
|
|
883
|
-
}
|
|
884
|
-
const key = keyMatch[1];
|
|
885
|
-
|
|
886
|
-
// Check if key already exists
|
|
887
|
-
const existing = fs.existsSync(bibPath) ? fs.readFileSync(bibPath, 'utf-8') : '';
|
|
888
|
-
if (existing.includes(`{${key},`) || existing.includes(`{${key}\n`)) {
|
|
889
|
-
return { success: false, error: `Key "${key}" already exists in ${bibPath}` };
|
|
890
|
-
}
|
|
891
|
-
|
|
892
|
-
// Append to file
|
|
893
|
-
const newContent = existing.trim() + '\n\n' + bibtex + '\n';
|
|
894
|
-
fs.writeFileSync(bibPath, newContent, 'utf-8');
|
|
895
|
-
|
|
896
|
-
return { success: true, key };
|
|
897
|
-
}
|
|
1
|
+
/**
|
|
2
|
+
* DOI validation and fetching utilities
|
|
3
|
+
* Check DOIs in .bib files, fetch BibTeX from DOIs
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import * as fs from 'fs';
|
|
7
|
+
import type { BibEntry, DoiCheckResult, BibtexFetchResult, DoiLookupResult, BibCheckResult } from './types.js';
|
|
8
|
+
import { crossrefLimiter, dataciteLimiter, doiOrgLimiter } from './rate-limiter.js';
|
|
9
|
+
import { getCachedDoi, cacheDoi } from './doi-cache.js';
|
|
10
|
+
|
|
11
|
+
// Entry types that typically don't have DOIs
|
|
12
|
+
const NO_DOI_TYPES = new Set([
|
|
13
|
+
'book', // Books often don't have DOIs (chapters might)
|
|
14
|
+
'inbook', // Book chapters - variable
|
|
15
|
+
'thesis', // Theses rarely have DOIs
|
|
16
|
+
'mastersthesis',
|
|
17
|
+
'phdthesis',
|
|
18
|
+
'misc', // Catch-all, often no DOI
|
|
19
|
+
'unpublished', // By definition
|
|
20
|
+
'manual', // Software manuals
|
|
21
|
+
'techreport', // Some do, many don't
|
|
22
|
+
'booklet',
|
|
23
|
+
]);
|
|
24
|
+
|
|
25
|
+
// Entry types that should have DOIs
|
|
26
|
+
const EXPECT_DOI_TYPES = new Set([
|
|
27
|
+
'article', // Journal articles should have DOIs
|
|
28
|
+
'inproceedings', // Conference papers usually do
|
|
29
|
+
'proceedings',
|
|
30
|
+
'incollection', // Book chapters in collections
|
|
31
|
+
]);
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Parse .bib file and extract entries with DOI info
|
|
35
|
+
*/
|
|
36
|
+
export function parseBibEntries(bibPath: string): BibEntry[] {
|
|
37
|
+
if (!fs.existsSync(bibPath)) {
|
|
38
|
+
return [];
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
const content = fs.readFileSync(bibPath, 'utf-8');
|
|
42
|
+
const entries: BibEntry[] = [];
|
|
43
|
+
const lines = content.split('\n');
|
|
44
|
+
|
|
45
|
+
// Pattern for bib entries: @type{key,
|
|
46
|
+
const entryPattern = /@(\w+)\s*\{\s*([^,\s]+)\s*,/g;
|
|
47
|
+
|
|
48
|
+
let match: RegExpExecArray | null;
|
|
49
|
+
while ((match = entryPattern.exec(content)) !== null) {
|
|
50
|
+
const type = match[1]!.toLowerCase();
|
|
51
|
+
const key = match[2]!;
|
|
52
|
+
const startPos = match.index;
|
|
53
|
+
|
|
54
|
+
// Find the line number
|
|
55
|
+
let line = 1;
|
|
56
|
+
for (let i = 0; i < startPos; i++) {
|
|
57
|
+
if (content[i] === '\n') line++;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// Find the end of this entry (matching closing brace)
|
|
61
|
+
let braceCount = 0;
|
|
62
|
+
let entryEnd = startPos;
|
|
63
|
+
let inEntry = false;
|
|
64
|
+
|
|
65
|
+
for (let i = startPos; i < content.length; i++) {
|
|
66
|
+
if (content[i] === '{') {
|
|
67
|
+
braceCount++;
|
|
68
|
+
inEntry = true;
|
|
69
|
+
} else if (content[i] === '}') {
|
|
70
|
+
braceCount--;
|
|
71
|
+
if (inEntry && braceCount === 0) {
|
|
72
|
+
entryEnd = i + 1;
|
|
73
|
+
break;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
const entryContent = content.slice(startPos, entryEnd);
|
|
79
|
+
|
|
80
|
+
// Extract DOI field
|
|
81
|
+
const doiMatch = entryContent.match(/\bdoi\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
82
|
+
let doi = doiMatch ? doiMatch[1]!.trim() : null;
|
|
83
|
+
|
|
84
|
+
// Clean DOI - remove URL prefix if present
|
|
85
|
+
if (doi) {
|
|
86
|
+
doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Extract title for display
|
|
90
|
+
const titleMatch = entryContent.match(/\btitle\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
91
|
+
const title = titleMatch ? titleMatch[1]!.trim().slice(0, 60) : '';
|
|
92
|
+
|
|
93
|
+
// Extract author for lookup
|
|
94
|
+
const authorMatch = entryContent.match(/\bauthor\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
95
|
+
const authorRaw = authorMatch ? authorMatch[1]!.trim() : '';
|
|
96
|
+
|
|
97
|
+
// Extract year
|
|
98
|
+
const yearMatch = entryContent.match(/\byear\s*=\s*[{"]?(\d{4})[}""]?/i);
|
|
99
|
+
const year = yearMatch ? parseInt(yearMatch[1]!) : null;
|
|
100
|
+
|
|
101
|
+
// Extract journal
|
|
102
|
+
const journalMatch = entryContent.match(/\bjournal\s*=\s*[{"]([^}"]+)[}"]/i);
|
|
103
|
+
const journal = journalMatch ? journalMatch[1]!.trim() : '';
|
|
104
|
+
|
|
105
|
+
// Check for skip marker: nodoi = {true} or nodoi = true
|
|
106
|
+
const skipMatch = entryContent.match(/\bnodoi\s*=\s*[{"]?(true|yes|1)[}""]?/i);
|
|
107
|
+
const skip = !!skipMatch;
|
|
108
|
+
|
|
109
|
+
// Check for comment marker immediately before entry: % no-doi
|
|
110
|
+
// Only look at the text between the last entry end (or start) and this entry
|
|
111
|
+
const linesBefore = content.slice(Math.max(0, startPos - 200), startPos);
|
|
112
|
+
// Find the last closing brace or start of file to avoid matching comments for previous entries
|
|
113
|
+
const lastEntryEnd = linesBefore.lastIndexOf('}');
|
|
114
|
+
const relevantBefore = lastEntryEnd >= 0 ? linesBefore.slice(lastEntryEnd + 1) : linesBefore;
|
|
115
|
+
const commentSkip = /% *no-?doi/i.test(relevantBefore);
|
|
116
|
+
|
|
117
|
+
entries.push({
|
|
118
|
+
key,
|
|
119
|
+
type,
|
|
120
|
+
doi: doi || null,
|
|
121
|
+
title,
|
|
122
|
+
authorRaw,
|
|
123
|
+
year,
|
|
124
|
+
journal,
|
|
125
|
+
skip: skip || commentSkip,
|
|
126
|
+
expectDoi: EXPECT_DOI_TYPES.has(type),
|
|
127
|
+
noDoi: NO_DOI_TYPES.has(type),
|
|
128
|
+
line,
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
return entries;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Validate DOI format
|
|
137
|
+
*/
|
|
138
|
+
export function isValidDoiFormat(doi: string): boolean {
|
|
139
|
+
if (!doi) return false;
|
|
140
|
+
// DOI format: 10.prefix/suffix
|
|
141
|
+
// Prefix is 4+ digits, suffix can contain most characters
|
|
142
|
+
return /^10\.\d{4,}\/[^\s]+$/.test(doi);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Check if DOI resolves via DataCite (for Zenodo, Figshare, etc.)
|
|
147
|
+
*/
|
|
148
|
+
async function checkDoiDataCite(doi: string): Promise<DoiCheckResult> {
|
|
149
|
+
try {
|
|
150
|
+
const response = await dataciteLimiter.fetchWithRetry(
|
|
151
|
+
`https://api.datacite.org/dois/${encodeURIComponent(doi)}`,
|
|
152
|
+
{
|
|
153
|
+
headers: {
|
|
154
|
+
'Accept': 'application/vnd.api+json',
|
|
155
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
156
|
+
},
|
|
157
|
+
}
|
|
158
|
+
);
|
|
159
|
+
|
|
160
|
+
if (response.status === 404) {
|
|
161
|
+
return { valid: false, error: 'DOI not found in DataCite' };
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
if (!response.ok) {
|
|
165
|
+
return { valid: false, error: `HTTP ${response.status}` };
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const data = await response.json() as any;
|
|
169
|
+
const attrs = data.data?.attributes;
|
|
170
|
+
|
|
171
|
+
if (!attrs) {
|
|
172
|
+
return { valid: false, error: 'Invalid DataCite response' };
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
return {
|
|
176
|
+
valid: true,
|
|
177
|
+
source: 'datacite',
|
|
178
|
+
metadata: {
|
|
179
|
+
title: attrs.titles?.[0]?.title || '',
|
|
180
|
+
authors: attrs.creators?.map((c: any) => `${c.givenName || ''} ${c.familyName || ''}`.trim()) || [],
|
|
181
|
+
year: attrs.publicationYear,
|
|
182
|
+
journal: attrs.publisher || '',
|
|
183
|
+
type: attrs.types?.resourceTypeGeneral || '',
|
|
184
|
+
},
|
|
185
|
+
};
|
|
186
|
+
} catch (err) {
|
|
187
|
+
return { valid: false, error: (err as Error).message };
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
interface CheckDoiOptions {
|
|
192
|
+
skipCache?: boolean;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Check if DOI resolves (exists) - tries Crossref first, then DataCite
|
|
197
|
+
* Results are cached for 7 days to reduce API calls.
|
|
198
|
+
*/
|
|
199
|
+
export async function checkDoi(doi: string, options: CheckDoiOptions = {}): Promise<DoiCheckResult & { cached?: boolean }> {
|
|
200
|
+
if (!isValidDoiFormat(doi)) {
|
|
201
|
+
return { valid: false, error: 'Invalid DOI format' };
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// Check cache first (unless skipped)
|
|
205
|
+
if (!options.skipCache) {
|
|
206
|
+
const cached = getCachedDoi(doi);
|
|
207
|
+
if (cached) {
|
|
208
|
+
return { ...cached, cached: true } as DoiCheckResult & { cached?: boolean };
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
// Zenodo DOIs start with 10.5281 - check DataCite first
|
|
213
|
+
const isZenodo = doi.startsWith('10.5281/');
|
|
214
|
+
const isFigshare = doi.startsWith('10.6084/');
|
|
215
|
+
const isDataCiteLikely = isZenodo || isFigshare;
|
|
216
|
+
|
|
217
|
+
if (isDataCiteLikely) {
|
|
218
|
+
const dataciteResult = await checkDoiDataCite(doi);
|
|
219
|
+
if (dataciteResult.valid) {
|
|
220
|
+
cacheDoi(doi, dataciteResult);
|
|
221
|
+
return dataciteResult;
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
try {
|
|
226
|
+
// Use Crossref API to check DOI
|
|
227
|
+
const response = await crossrefLimiter.fetchWithRetry(
|
|
228
|
+
`https://api.crossref.org/works/${encodeURIComponent(doi)}`,
|
|
229
|
+
{
|
|
230
|
+
headers: {
|
|
231
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
232
|
+
},
|
|
233
|
+
}
|
|
234
|
+
);
|
|
235
|
+
|
|
236
|
+
if (response.status === 404) {
|
|
237
|
+
// Try DataCite as fallback (if not already tried)
|
|
238
|
+
if (!isDataCiteLikely) {
|
|
239
|
+
const dataciteResult = await checkDoiDataCite(doi);
|
|
240
|
+
if (dataciteResult.valid) {
|
|
241
|
+
cacheDoi(doi, dataciteResult);
|
|
242
|
+
return dataciteResult;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
const result = { valid: false, error: 'DOI not found' };
|
|
246
|
+
cacheDoi(doi, result);
|
|
247
|
+
return result;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
if (!response.ok) {
|
|
251
|
+
// Don't cache transient errors
|
|
252
|
+
return { valid: false, error: `HTTP ${response.status}` };
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
const data = await response.json() as any;
|
|
256
|
+
const work = data.message;
|
|
257
|
+
|
|
258
|
+
const result: DoiCheckResult = {
|
|
259
|
+
valid: true,
|
|
260
|
+
source: 'crossref',
|
|
261
|
+
metadata: {
|
|
262
|
+
title: work.title?.[0] || '',
|
|
263
|
+
authors: work.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
264
|
+
year: work.published?.['date-parts']?.[0]?.[0] || work.created?.['date-parts']?.[0]?.[0],
|
|
265
|
+
journal: work['container-title']?.[0] || '',
|
|
266
|
+
type: work.type,
|
|
267
|
+
},
|
|
268
|
+
};
|
|
269
|
+
|
|
270
|
+
cacheDoi(doi, result);
|
|
271
|
+
return result;
|
|
272
|
+
} catch (err) {
|
|
273
|
+
// Don't cache network errors
|
|
274
|
+
return { valid: false, error: (err as Error).message };
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Fetch BibTeX from DOI using content negotiation
|
|
280
|
+
*/
|
|
281
|
+
export async function fetchBibtex(doi: string): Promise<BibtexFetchResult> {
|
|
282
|
+
// Clean DOI
|
|
283
|
+
doi = doi.replace(/^https?:\/\/(dx\.)?doi\.org\//i, '');
|
|
284
|
+
|
|
285
|
+
if (!isValidDoiFormat(doi)) {
|
|
286
|
+
return { success: false, error: 'Invalid DOI format' };
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
try {
|
|
290
|
+
const response = await doiOrgLimiter.fetchWithRetry(
|
|
291
|
+
`https://doi.org/${encodeURIComponent(doi)}`,
|
|
292
|
+
{
|
|
293
|
+
headers: {
|
|
294
|
+
'Accept': 'application/x-bibtex',
|
|
295
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
296
|
+
},
|
|
297
|
+
redirect: 'follow',
|
|
298
|
+
}
|
|
299
|
+
);
|
|
300
|
+
|
|
301
|
+
if (!response.ok) {
|
|
302
|
+
return { success: false, error: `HTTP ${response.status}` };
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
const bibtex = await response.text();
|
|
306
|
+
|
|
307
|
+
if (!bibtex.includes('@')) {
|
|
308
|
+
return { success: false, error: 'Invalid BibTeX response' };
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
return { success: true, bibtex: bibtex.trim() };
|
|
312
|
+
} catch (err) {
|
|
313
|
+
return { success: false, error: (err as Error).message };
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
interface CheckBibDoisOptions {
|
|
318
|
+
checkMissing?: boolean;
|
|
319
|
+
parallel?: number;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
/**
|
|
323
|
+
* Check all DOIs in a .bib file
|
|
324
|
+
*/
|
|
325
|
+
export async function checkBibDois(bibPath: string, options: CheckBibDoisOptions = {}): Promise<BibCheckResult> {
|
|
326
|
+
const { checkMissing = false, parallel = 5 } = options;
|
|
327
|
+
|
|
328
|
+
const entries = parseBibEntries(bibPath);
|
|
329
|
+
const results: Array<BibEntry & { status: string; message?: string; metadata?: object }> = [];
|
|
330
|
+
|
|
331
|
+
let valid = 0;
|
|
332
|
+
let invalid = 0;
|
|
333
|
+
let missing = 0;
|
|
334
|
+
let skipped = 0;
|
|
335
|
+
|
|
336
|
+
// Process in batches to avoid rate limiting
|
|
337
|
+
for (let i = 0; i < entries.length; i += parallel) {
|
|
338
|
+
const batch = entries.slice(i, i + parallel);
|
|
339
|
+
|
|
340
|
+
const batchResults = await Promise.all(
|
|
341
|
+
batch.map(async (entry) => {
|
|
342
|
+
// Skip if marked
|
|
343
|
+
if (entry.skip) {
|
|
344
|
+
skipped++;
|
|
345
|
+
return { ...entry, status: 'skipped', message: 'Marked as no-doi' };
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
// No DOI field
|
|
349
|
+
if (!entry.doi) {
|
|
350
|
+
if (entry.noDoi) {
|
|
351
|
+
// Expected - books, theses, etc.
|
|
352
|
+
skipped++;
|
|
353
|
+
return { ...entry, status: 'skipped', message: `${entry.type} typically has no DOI` };
|
|
354
|
+
} else if (entry.expectDoi) {
|
|
355
|
+
// Should have DOI but doesn't
|
|
356
|
+
missing++;
|
|
357
|
+
return { ...entry, status: 'missing', message: 'Expected DOI for article/proceedings' };
|
|
358
|
+
} else {
|
|
359
|
+
skipped++;
|
|
360
|
+
return { ...entry, status: 'skipped', message: 'No DOI field' };
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
// Validate DOI format first
|
|
365
|
+
if (!isValidDoiFormat(entry.doi)) {
|
|
366
|
+
invalid++;
|
|
367
|
+
return { ...entry, status: 'invalid', message: 'Invalid DOI format' };
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// Check if DOI resolves
|
|
371
|
+
const check = await checkDoi(entry.doi);
|
|
372
|
+
if (check.valid) {
|
|
373
|
+
valid++;
|
|
374
|
+
return { ...entry, status: 'valid', metadata: check.metadata };
|
|
375
|
+
} else {
|
|
376
|
+
invalid++;
|
|
377
|
+
return { ...entry, status: 'invalid', message: check.error };
|
|
378
|
+
}
|
|
379
|
+
})
|
|
380
|
+
);
|
|
381
|
+
|
|
382
|
+
results.push(...batchResults);
|
|
383
|
+
|
|
384
|
+
// Small delay between batches to be nice to the API
|
|
385
|
+
if (i + parallel < entries.length) {
|
|
386
|
+
await new Promise(r => setTimeout(r, 200));
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
return { entries: results, valid, invalid, missing, skipped };
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
interface DataCiteItem {
|
|
394
|
+
id: string;
|
|
395
|
+
attributes: {
|
|
396
|
+
titles?: Array<{ title: string }>;
|
|
397
|
+
creators?: Array<{ givenName?: string; familyName?: string }>;
|
|
398
|
+
publicationYear: number;
|
|
399
|
+
publisher?: string;
|
|
400
|
+
};
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
/**
|
|
404
|
+
* Search DataCite API (for Zenodo, Figshare, etc.)
|
|
405
|
+
*/
|
|
406
|
+
async function searchDataCite(title: string, author: string = '', year: number | null = null): Promise<any[]> {
|
|
407
|
+
try {
|
|
408
|
+
// DataCite query syntax
|
|
409
|
+
let query = `titles.title:${title.replace(/[{}]/g, '')}`;
|
|
410
|
+
if (author) {
|
|
411
|
+
query += ` AND creators.name:${author}`;
|
|
412
|
+
}
|
|
413
|
+
if (year) {
|
|
414
|
+
query += ` AND publicationYear:${year}`;
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
const params = new URLSearchParams({
|
|
418
|
+
query: query,
|
|
419
|
+
'page[size]': '5',
|
|
420
|
+
});
|
|
421
|
+
|
|
422
|
+
const response = await dataciteLimiter.fetchWithRetry(
|
|
423
|
+
`https://api.datacite.org/dois?${params}`,
|
|
424
|
+
{
|
|
425
|
+
headers: {
|
|
426
|
+
'Accept': 'application/vnd.api+json',
|
|
427
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev)',
|
|
428
|
+
},
|
|
429
|
+
}
|
|
430
|
+
);
|
|
431
|
+
|
|
432
|
+
if (!response.ok) return [];
|
|
433
|
+
|
|
434
|
+
const data = await response.json() as { data?: DataCiteItem[] };
|
|
435
|
+
const items = data.data || [];
|
|
436
|
+
|
|
437
|
+
return items.map(item => {
|
|
438
|
+
const attrs = item.attributes;
|
|
439
|
+
return {
|
|
440
|
+
DOI: item.id,
|
|
441
|
+
title: [attrs.titles?.[0]?.title || ''],
|
|
442
|
+
author: attrs.creators?.map(c => ({ family: c.familyName, given: c.givenName })) || [],
|
|
443
|
+
'published-print': { 'date-parts': [[attrs.publicationYear]] },
|
|
444
|
+
'container-title': [attrs.publisher || ''],
|
|
445
|
+
score: 50, // Base score for DataCite results
|
|
446
|
+
source: 'datacite',
|
|
447
|
+
};
|
|
448
|
+
});
|
|
449
|
+
} catch {
|
|
450
|
+
return [];
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
/**
|
|
455
|
+
* Normalize text for comparison (lowercase, remove special chars)
|
|
456
|
+
*/
|
|
457
|
+
function normalizeForMatching(text: string): string {
|
|
458
|
+
return (text || '')
|
|
459
|
+
.toLowerCase()
|
|
460
|
+
.replace(/[{}\\]/g, '') // Remove LaTeX braces
|
|
461
|
+
.replace(/[^a-z0-9\s]/g, ' ') // Replace special chars with space
|
|
462
|
+
.replace(/\s+/g, ' ')
|
|
463
|
+
.trim();
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
/**
|
|
467
|
+
* Check if DOI looks like a supplement, figure, or review (not the main paper)
|
|
468
|
+
*/
|
|
469
|
+
function isSupplementOrReview(doi: string, title: string = '', journal: string = ''): boolean {
|
|
470
|
+
const doiLower = (doi || '').toLowerCase();
|
|
471
|
+
const titleLower = (title || '').toLowerCase();
|
|
472
|
+
const journalLower = (journal || '').toLowerCase();
|
|
473
|
+
|
|
474
|
+
// Supplement/figure DOI patterns
|
|
475
|
+
if (/\.suppl|\/suppl|\.figure|\/figure|\.s\d+$|_s\d+$/i.test(doiLower)) {
|
|
476
|
+
return true;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
// F1000/Faculty Opinions (post-publication reviews)
|
|
480
|
+
if (/10\.3410\/f\./i.test(doiLower) || /faculty opinions/i.test(journalLower)) {
|
|
481
|
+
return true;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// Title suggests it's supplementary material
|
|
485
|
+
if (/^supplementary|^supporting information|^appendix/i.test(titleLower)) {
|
|
486
|
+
return true;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
return false;
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
interface CrossrefItem {
|
|
493
|
+
DOI: string;
|
|
494
|
+
title?: string[];
|
|
495
|
+
author?: Array<{ given?: string; family?: string }>;
|
|
496
|
+
'published-print'?: { 'date-parts': number[][] };
|
|
497
|
+
'published-online'?: { 'date-parts': number[][] };
|
|
498
|
+
'container-title'?: string[];
|
|
499
|
+
score?: number;
|
|
500
|
+
type?: string;
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
/**
|
|
504
|
+
* Search for DOI by title and author using Crossref API (+ DataCite fallback)
|
|
505
|
+
*/
|
|
506
|
+
export async function lookupDoi(
|
|
507
|
+
title: string,
|
|
508
|
+
author: string = '',
|
|
509
|
+
year: number | null = null,
|
|
510
|
+
journal: string = ''
|
|
511
|
+
): Promise<DoiLookupResult> {
|
|
512
|
+
if (!title || title.length < 10) {
|
|
513
|
+
return { found: false, error: 'Title too short for reliable search' };
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
// Check for keywords that suggest Zenodo/DataCite sources
|
|
517
|
+
const likelyZenodo = /\b(IPBES|zenodo|assessment report|secretariat)\b/i.test(title);
|
|
518
|
+
|
|
519
|
+
try {
|
|
520
|
+
// Build query - title is most important, add author and journal if available
|
|
521
|
+
let query = title;
|
|
522
|
+
if (author) {
|
|
523
|
+
query = `${title} ${author}`;
|
|
524
|
+
}
|
|
525
|
+
// Add journal to query for better matching
|
|
526
|
+
if (journal) {
|
|
527
|
+
query = `${query} ${journal}`;
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
let items: CrossrefItem[] = [];
|
|
531
|
+
|
|
532
|
+
// Try structured bibliographic query first (more accurate)
|
|
533
|
+
const structuredParams = new URLSearchParams({
|
|
534
|
+
rows: '10',
|
|
535
|
+
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
536
|
+
});
|
|
537
|
+
structuredParams.set('query.bibliographic', title);
|
|
538
|
+
if (author) {
|
|
539
|
+
structuredParams.set('query.author', author);
|
|
540
|
+
}
|
|
541
|
+
if (journal) {
|
|
542
|
+
structuredParams.set('query.container-title', journal);
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
let response = await crossrefLimiter.fetchWithRetry(
|
|
546
|
+
`https://api.crossref.org/works?${structuredParams}`,
|
|
547
|
+
{
|
|
548
|
+
headers: {
|
|
549
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
550
|
+
},
|
|
551
|
+
}
|
|
552
|
+
);
|
|
553
|
+
|
|
554
|
+
if (response.ok) {
|
|
555
|
+
const data = await response.json() as { message?: { items?: CrossrefItem[] } };
|
|
556
|
+
items = data.message?.items || [];
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
// If structured query found few results, also try query.title (often better for exact matches)
|
|
560
|
+
if (items.length < 5) {
|
|
561
|
+
const titleParams = new URLSearchParams({
|
|
562
|
+
rows: '10',
|
|
563
|
+
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
564
|
+
});
|
|
565
|
+
titleParams.set('query.title', title);
|
|
566
|
+
|
|
567
|
+
const response2 = await crossrefLimiter.fetchWithRetry(
|
|
568
|
+
`https://api.crossref.org/works?${titleParams}`,
|
|
569
|
+
{
|
|
570
|
+
headers: {
|
|
571
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
572
|
+
},
|
|
573
|
+
}
|
|
574
|
+
);
|
|
575
|
+
|
|
576
|
+
if (response2.ok) {
|
|
577
|
+
const data = await response2.json() as { message?: { items?: CrossrefItem[] } };
|
|
578
|
+
const newItems = data.message?.items || [];
|
|
579
|
+
// Merge results, avoiding duplicates
|
|
580
|
+
const existingDois = new Set(items.map(i => i.DOI));
|
|
581
|
+
for (const item of newItems) {
|
|
582
|
+
if (!existingDois.has(item.DOI)) {
|
|
583
|
+
items.push(item);
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
// If still nothing, try basic query (most lenient)
|
|
590
|
+
if (items.length === 0) {
|
|
591
|
+
const basicParams = new URLSearchParams({
|
|
592
|
+
query: query,
|
|
593
|
+
rows: '10',
|
|
594
|
+
select: 'DOI,title,author,published-print,published-online,container-title,score,type',
|
|
595
|
+
});
|
|
596
|
+
|
|
597
|
+
response = await crossrefLimiter.fetchWithRetry(
|
|
598
|
+
`https://api.crossref.org/works?${basicParams}`,
|
|
599
|
+
{
|
|
600
|
+
headers: {
|
|
601
|
+
'User-Agent': 'docrev/0.6.0 (https://github.com/gcol33/docrev; mailto:docrev@example.com)',
|
|
602
|
+
},
|
|
603
|
+
}
|
|
604
|
+
);
|
|
605
|
+
|
|
606
|
+
if (response.ok) {
|
|
607
|
+
const data = await response.json() as { message?: { items?: CrossrefItem[] } };
|
|
608
|
+
items = data.message?.items || [];
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
// Also search DataCite for Zenodo/institutional repos
|
|
613
|
+
if (likelyZenodo || items.length === 0) {
|
|
614
|
+
const dataciteItems = await searchDataCite(title, author, year);
|
|
615
|
+
items = [...items, ...dataciteItems];
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
if (items.length === 0) {
|
|
619
|
+
return { found: false, error: 'No results found' };
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
const normalizedSearchTitle = normalizeForMatching(title);
|
|
623
|
+
const normalizedJournal = normalizeForMatching(journal);
|
|
624
|
+
|
|
625
|
+
// Score the results
|
|
626
|
+
const scored = items.map(item => {
|
|
627
|
+
let score = 0;
|
|
628
|
+
const itemTitle = item.title?.[0] || '';
|
|
629
|
+
const itemJournal = item['container-title']?.[0] || '';
|
|
630
|
+
const normalizedItemTitle = normalizeForMatching(itemTitle);
|
|
631
|
+
const normalizedItemJournal = normalizeForMatching(itemJournal);
|
|
632
|
+
|
|
633
|
+
// === PENALTY: Supplement/figure/review DOIs ===
|
|
634
|
+
if (isSupplementOrReview(item.DOI, itemTitle, itemJournal)) {
|
|
635
|
+
score -= 100; // Heavy penalty - almost never want these
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
// === Title similarity (most important) ===
|
|
639
|
+
if (normalizedItemTitle === normalizedSearchTitle) {
|
|
640
|
+
score += 100; // Exact match
|
|
641
|
+
} else if (normalizedItemTitle.includes(normalizedSearchTitle) ||
|
|
642
|
+
normalizedSearchTitle.includes(normalizedItemTitle)) {
|
|
643
|
+
score += 50;
|
|
644
|
+
} else {
|
|
645
|
+
// Check word overlap
|
|
646
|
+
const searchWords = normalizedSearchTitle.split(/\s+/).filter(w => w.length > 3);
|
|
647
|
+
const itemWords = normalizedItemTitle.split(/\s+/).filter(w => w.length > 3);
|
|
648
|
+
const overlap = searchWords.filter(w =>
|
|
649
|
+
itemWords.some(iw => iw.includes(w) || w.includes(iw))
|
|
650
|
+
);
|
|
651
|
+
score += (overlap.length / Math.max(searchWords.length, 1)) * 40;
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
// === Author match ===
|
|
655
|
+
if (author && item.author) {
|
|
656
|
+
const authorLower = author.toLowerCase();
|
|
657
|
+
const hasAuthor = item.author.some(a =>
|
|
658
|
+
(a.family || '').toLowerCase().includes(authorLower) ||
|
|
659
|
+
authorLower.includes((a.family || '').toLowerCase())
|
|
660
|
+
);
|
|
661
|
+
if (hasAuthor) score += 30;
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
// === Journal match (NEW) ===
|
|
665
|
+
if (normalizedJournal && normalizedItemJournal) {
|
|
666
|
+
// Check for journal name match (handles abbreviations)
|
|
667
|
+
const journalWords = normalizedJournal.split(/\s+/).filter(w => w.length > 2);
|
|
668
|
+
const itemJournalWords = normalizedItemJournal.split(/\s+/).filter(w => w.length > 2);
|
|
669
|
+
|
|
670
|
+
// Count matching words
|
|
671
|
+
const journalOverlap = journalWords.filter(w =>
|
|
672
|
+
itemJournalWords.some(iw => iw.includes(w) || w.includes(iw))
|
|
673
|
+
);
|
|
674
|
+
|
|
675
|
+
if (journalOverlap.length >= Math.min(2, journalWords.length)) {
|
|
676
|
+
score += 40; // Good journal match
|
|
677
|
+
} else if (journalOverlap.length >= 1) {
|
|
678
|
+
score += 15; // Partial match
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
// Bonus for exact journal match
|
|
682
|
+
if (normalizedItemJournal === normalizedJournal) {
|
|
683
|
+
score += 20;
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
// === Year match - CRITICAL for accuracy ===
|
|
688
|
+
const itemYear = item['published-print']?.['date-parts']?.[0]?.[0] ||
|
|
689
|
+
item['published-online']?.['date-parts']?.[0]?.[0];
|
|
690
|
+
if (year && itemYear) {
|
|
691
|
+
if (itemYear === year) {
|
|
692
|
+
score += 50; // Exact match - required for high confidence
|
|
693
|
+
} else if (Math.abs(itemYear - year) === 1) {
|
|
694
|
+
score += 20; // Off by one (common for online-first)
|
|
695
|
+
} else {
|
|
696
|
+
score -= 50; // Wrong year = likely wrong paper
|
|
697
|
+
}
|
|
698
|
+
} else if (year && !itemYear) {
|
|
699
|
+
score -= 10; // Can't verify year
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// Crossref's own relevance score (capped)
|
|
703
|
+
score += Math.min(item.score || 0, 10);
|
|
704
|
+
|
|
705
|
+
return {
|
|
706
|
+
doi: item.DOI,
|
|
707
|
+
title: itemTitle,
|
|
708
|
+
authors: item.author?.map(a => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
709
|
+
year: itemYear,
|
|
710
|
+
journal: itemJournal,
|
|
711
|
+
score,
|
|
712
|
+
crossrefScore: item.score,
|
|
713
|
+
isSupplement: isSupplementOrReview(item.DOI, itemTitle, itemJournal),
|
|
714
|
+
};
|
|
715
|
+
});
|
|
716
|
+
|
|
717
|
+
// Sort by our score
|
|
718
|
+
scored.sort((a, b) => b.score - a.score);
|
|
719
|
+
|
|
720
|
+
// Filter out supplements for the "best" pick (but keep in alternatives)
|
|
721
|
+
const mainPapers = scored.filter(s => !s.isSupplement);
|
|
722
|
+
const best = mainPapers.length > 0 ? mainPapers[0] : scored[0];
|
|
723
|
+
|
|
724
|
+
if (!best) {
|
|
725
|
+
return { found: false, error: 'No valid results found' };
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
// Confidence thresholds
|
|
729
|
+
let confidence: 'low' | 'medium' | 'high' = 'low';
|
|
730
|
+
if (best.score >= 120) confidence = 'high';
|
|
731
|
+
else if (best.score >= 70) confidence = 'medium';
|
|
732
|
+
|
|
733
|
+
// === NEW: Try DataCite if Crossref confidence is low ===
|
|
734
|
+
if (confidence === 'low' && !likelyZenodo) {
|
|
735
|
+
const dataciteItems = await searchDataCite(title, author, year);
|
|
736
|
+
if (dataciteItems.length > 0) {
|
|
737
|
+
// Score DataCite results with same logic
|
|
738
|
+
for (const dcItem of dataciteItems) {
|
|
739
|
+
const dcTitle = dcItem.title?.[0] || '';
|
|
740
|
+
const normalizedDcTitle = normalizeForMatching(dcTitle);
|
|
741
|
+
let dcScore = 0;
|
|
742
|
+
|
|
743
|
+
// Title match
|
|
744
|
+
if (normalizedDcTitle === normalizedSearchTitle) {
|
|
745
|
+
dcScore += 100;
|
|
746
|
+
} else if (normalizedDcTitle.includes(normalizedSearchTitle) ||
|
|
747
|
+
normalizedSearchTitle.includes(normalizedDcTitle)) {
|
|
748
|
+
dcScore += 50;
|
|
749
|
+
}
|
|
750
|
+
|
|
751
|
+
// Year match
|
|
752
|
+
const dcYear = dcItem['published-print']?.['date-parts']?.[0]?.[0];
|
|
753
|
+
if (year && dcYear && dcYear === year) {
|
|
754
|
+
dcScore += 50;
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
if (dcScore > best.score) {
|
|
758
|
+
return {
|
|
759
|
+
found: true,
|
|
760
|
+
doi: dcItem.DOI,
|
|
761
|
+
confidence: dcScore >= 120 ? 'high' : dcScore >= 70 ? 'medium' : 'low',
|
|
762
|
+
score: dcScore,
|
|
763
|
+
metadata: {
|
|
764
|
+
title: dcTitle,
|
|
765
|
+
authors: dcItem.author?.map((a: any) => `${a.given || ''} ${a.family || ''}`.trim()) || [],
|
|
766
|
+
year: dcYear,
|
|
767
|
+
journal: dcItem['container-title']?.[0] || '',
|
|
768
|
+
},
|
|
769
|
+
alternatives: scored.slice(0, 2),
|
|
770
|
+
};
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
return {
|
|
777
|
+
found: true,
|
|
778
|
+
doi: best.doi,
|
|
779
|
+
confidence,
|
|
780
|
+
score: best.score,
|
|
781
|
+
metadata: {
|
|
782
|
+
title: best.title,
|
|
783
|
+
authors: best.authors,
|
|
784
|
+
year: best.year || 0,
|
|
785
|
+
journal: best.journal,
|
|
786
|
+
},
|
|
787
|
+
alternatives: scored.filter(s => s.doi !== best.doi).slice(0, 3),
|
|
788
|
+
};
|
|
789
|
+
} catch (err) {
|
|
790
|
+
return { found: false, error: (err as Error).message };
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
/** Options accepted by lookupMissingDois(). */
interface LookupMissingDoisOptions {
  /** Number of entries looked up concurrently per batch (default: 3). */
  parallel?: number;
  /** Invoked after each batch with (entries processed so far, total entries to process). */
  onProgress?: (current: number, total: number) => void;
}
|
|
798
|
+
|
|
799
|
+
/** One per-entry outcome produced by lookupMissingDois(). */
interface LookupMissingDoiResult {
  /** Citation key of the .bib entry the lookup was performed for. */
  key: string;
  /** Entry title that was used as the search query. */
  title: string;
  /** BibTeX entry type (e.g. article, book). */
  type: string;
  /** Journal/container name taken from the entry. */
  journal: string;
  /** The DOI lookup outcome (found flag, DOI, confidence, metadata, or error). */
  result: DoiLookupResult;
}
|
|
806
|
+
|
|
807
|
+
/**
|
|
808
|
+
* Look up DOIs for all entries missing them in a .bib file
|
|
809
|
+
*/
|
|
810
|
+
export async function lookupMissingDois(
|
|
811
|
+
bibPath: string,
|
|
812
|
+
options: LookupMissingDoisOptions = {}
|
|
813
|
+
): Promise<LookupMissingDoiResult[]> {
|
|
814
|
+
const { parallel = 3, onProgress } = options;
|
|
815
|
+
|
|
816
|
+
const entries = parseBibEntries(bibPath);
|
|
817
|
+
const missing = entries.filter(e =>
|
|
818
|
+
!e.doi &&
|
|
819
|
+
!e.skip &&
|
|
820
|
+
!NO_DOI_TYPES.has(e.type)
|
|
821
|
+
);
|
|
822
|
+
|
|
823
|
+
const results: LookupMissingDoiResult[] = [];
|
|
824
|
+
|
|
825
|
+
for (let i = 0; i < missing.length; i += parallel) {
|
|
826
|
+
const batch = missing.slice(i, i + parallel);
|
|
827
|
+
|
|
828
|
+
const batchResults = await Promise.all(
|
|
829
|
+
batch.map(async (entry) => {
|
|
830
|
+
// Extract first author's last name from the entry
|
|
831
|
+
// This is tricky because BibTeX author format varies
|
|
832
|
+
let author = '';
|
|
833
|
+
if (entry.authorRaw) {
|
|
834
|
+
// Try to get first author's last name
|
|
835
|
+
const firstAuthor = entry.authorRaw.split(' and ')[0];
|
|
836
|
+
if (firstAuthor) {
|
|
837
|
+
const parts = firstAuthor.split(',');
|
|
838
|
+
author = parts[0]?.trim() || '';
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
const result = await lookupDoi(entry.title, author, entry.year, entry.journal);
|
|
843
|
+
|
|
844
|
+
return {
|
|
845
|
+
key: entry.key,
|
|
846
|
+
title: entry.title,
|
|
847
|
+
type: entry.type,
|
|
848
|
+
journal: entry.journal,
|
|
849
|
+
result,
|
|
850
|
+
};
|
|
851
|
+
})
|
|
852
|
+
);
|
|
853
|
+
|
|
854
|
+
results.push(...batchResults);
|
|
855
|
+
|
|
856
|
+
if (onProgress) {
|
|
857
|
+
onProgress(Math.min(i + parallel, missing.length), missing.length);
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
// Rate limiting
|
|
861
|
+
if (i + parallel < missing.length) {
|
|
862
|
+
await new Promise(r => setTimeout(r, 300));
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
return results;
|
|
867
|
+
}
|
|
868
|
+
|
|
869
|
+
/** Result of addToBib(). */
interface AddToBibResult {
  /** True when the entry was appended to the file. */
  success: boolean;
  /** Citation key extracted from the BibTeX entry (present on success). */
  key?: string;
  /** Human-readable failure reason (present when success is false). */
  error?: string;
}
|
|
874
|
+
|
|
875
|
+
/**
|
|
876
|
+
* Add a BibTeX entry to a .bib file
|
|
877
|
+
*/
|
|
878
|
+
export function addToBib(bibPath: string, bibtex: string): AddToBibResult {
|
|
879
|
+
// Extract key from BibTeX
|
|
880
|
+
const keyMatch = bibtex.match(/@\w+\s*\{\s*([^,\s]+)/);
|
|
881
|
+
if (!keyMatch) {
|
|
882
|
+
return { success: false, error: 'Could not extract citation key from BibTeX' };
|
|
883
|
+
}
|
|
884
|
+
const key = keyMatch[1];
|
|
885
|
+
|
|
886
|
+
// Check if key already exists
|
|
887
|
+
const existing = fs.existsSync(bibPath) ? fs.readFileSync(bibPath, 'utf-8') : '';
|
|
888
|
+
if (existing.includes(`{${key},`) || existing.includes(`{${key}\n`)) {
|
|
889
|
+
return { success: false, error: `Key "${key}" already exists in ${bibPath}` };
|
|
890
|
+
}
|
|
891
|
+
|
|
892
|
+
// Append to file
|
|
893
|
+
const newContent = existing.trim() + '\n\n' + bibtex + '\n';
|
|
894
|
+
fs.writeFileSync(bibPath, newContent, 'utf-8');
|
|
895
|
+
|
|
896
|
+
return { success: true, key };
|
|
897
|
+
}
|