@de-otio/bibcheck 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +147 -0
- package/dist/cache/fs-cache.d.ts +55 -0
- package/dist/cache/fs-cache.d.ts.map +1 -0
- package/dist/cache/fs-cache.js +264 -0
- package/dist/cache/fs-cache.js.map +1 -0
- package/dist/canonical.d.ts +29 -0
- package/dist/canonical.d.ts.map +1 -0
- package/dist/canonical.js +132 -0
- package/dist/canonical.js.map +1 -0
- package/dist/check.d.ts +140 -0
- package/dist/check.d.ts.map +1 -0
- package/dist/check.js +646 -0
- package/dist/check.js.map +1 -0
- package/dist/cli.d.ts +19 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +357 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +175 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +180 -0
- package/dist/config.js.map +1 -0
- package/dist/databases/crossref.d.ts +53 -0
- package/dist/databases/crossref.d.ts.map +1 -0
- package/dist/databases/crossref.js +138 -0
- package/dist/databases/crossref.js.map +1 -0
- package/dist/databases/index.d.ts +12 -0
- package/dist/databases/index.d.ts.map +1 -0
- package/dist/databases/index.js +9 -0
- package/dist/databases/index.js.map +1 -0
- package/dist/databases/openalex.d.ts +29 -0
- package/dist/databases/openalex.d.ts.map +1 -0
- package/dist/databases/openalex.js +117 -0
- package/dist/databases/openalex.js.map +1 -0
- package/dist/databases/openlibrary.d.ts +26 -0
- package/dist/databases/openlibrary.d.ts.map +1 -0
- package/dist/databases/openlibrary.js +79 -0
- package/dist/databases/openlibrary.js.map +1 -0
- package/dist/databases/worldcat.d.ts +33 -0
- package/dist/databases/worldcat.d.ts.map +1 -0
- package/dist/databases/worldcat.js +145 -0
- package/dist/databases/worldcat.js.map +1 -0
- package/dist/doctor.d.ts +44 -0
- package/dist/doctor.d.ts.map +1 -0
- package/dist/doctor.js +386 -0
- package/dist/doctor.js.map +1 -0
- package/dist/existence.d.ts +70 -0
- package/dist/existence.d.ts.map +1 -0
- package/dist/existence.js +308 -0
- package/dist/existence.js.map +1 -0
- package/dist/http.d.ts +97 -0
- package/dist/http.d.ts.map +1 -0
- package/dist/http.js +543 -0
- package/dist/http.js.map +1 -0
- package/dist/identifiers.d.ts +44 -0
- package/dist/identifiers.d.ts.map +1 -0
- package/dist/identifiers.js +111 -0
- package/dist/identifiers.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/linkage.d.ts +29 -0
- package/dist/linkage.d.ts.map +1 -0
- package/dist/linkage.js +73 -0
- package/dist/linkage.js.map +1 -0
- package/dist/markdown/blocks.d.ts +19 -0
- package/dist/markdown/blocks.d.ts.map +1 -0
- package/dist/markdown/blocks.js +69 -0
- package/dist/markdown/blocks.js.map +1 -0
- package/dist/markdown/citekeys.d.ts +22 -0
- package/dist/markdown/citekeys.d.ts.map +1 -0
- package/dist/markdown/citekeys.js +100 -0
- package/dist/markdown/citekeys.js.map +1 -0
- package/dist/markdown/glob.d.ts +18 -0
- package/dist/markdown/glob.d.ts.map +1 -0
- package/dist/markdown/glob.js +26 -0
- package/dist/markdown/glob.js.map +1 -0
- package/dist/markdown/prose.d.ts +19 -0
- package/dist/markdown/prose.d.ts.map +1 -0
- package/dist/markdown/prose.js +81 -0
- package/dist/markdown/prose.js.map +1 -0
- package/dist/output/json.d.ts +21 -0
- package/dist/output/json.d.ts.map +1 -0
- package/dist/output/json.js +24 -0
- package/dist/output/json.js.map +1 -0
- package/dist/output/markdown.d.ts +21 -0
- package/dist/output/markdown.d.ts.map +1 -0
- package/dist/output/markdown.js +194 -0
- package/dist/output/markdown.js.map +1 -0
- package/dist/output/sarif.d.ts +31 -0
- package/dist/output/sarif.d.ts.map +1 -0
- package/dist/output/sarif.js +322 -0
- package/dist/output/sarif.js.map +1 -0
- package/dist/output/text.d.ts +27 -0
- package/dist/output/text.d.ts.map +1 -0
- package/dist/output/text.js +212 -0
- package/dist/output/text.js.map +1 -0
- package/dist/phrases/load.d.ts +34 -0
- package/dist/phrases/load.d.ts.map +1 -0
- package/dist/phrases/load.js +148 -0
- package/dist/phrases/load.js.map +1 -0
- package/dist/phrases.d.ts +27 -0
- package/dist/phrases.d.ts.map +1 -0
- package/dist/phrases.js +116 -0
- package/dist/phrases.js.map +1 -0
- package/dist/schema/csl.d.ts +429 -0
- package/dist/schema/csl.d.ts.map +1 -0
- package/dist/schema/csl.js +101 -0
- package/dist/schema/csl.js.map +1 -0
- package/dist/schema/output.d.ts +1116 -0
- package/dist/schema/output.d.ts.map +1 -0
- package/dist/schema/output.js +419 -0
- package/dist/schema/output.js.map +1 -0
- package/dist/suppression.d.ts +106 -0
- package/dist/suppression.d.ts.map +1 -0
- package/dist/suppression.js +134 -0
- package/dist/suppression.js.map +1 -0
- package/dist/version.d.ts +11 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +14 -0
- package/dist/version.js.map +1 -0
- package/dist/worklist.d.ts +32 -0
- package/dist/worklist.d.ts.map +1 -0
- package/dist/worklist.js +211 -0
- package/dist/worklist.js.map +1 -0
- package/package.json +82 -0
package/dist/check.js
ADDED
|
@@ -0,0 +1,646 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `bibcheck check` orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Composes all five subcommands (existence, canonical, linkage, phrases,
|
|
5
|
+
* worklist) and assembles the top-level Output.
|
|
6
|
+
*
|
|
7
|
+
* Design notes:
|
|
8
|
+
* - Subcommands run sequentially for deterministic log output.
|
|
9
|
+
* - Each subcommand runs with its own 5-minute deadline.
|
|
10
|
+
* - If a subcommand throws, the error is caught, logged, and a degraded
|
|
11
|
+
* (error-flavored) result is emitted for that layer; the run continues.
|
|
12
|
+
* - The final Output is validated against OutputSchema before return.
|
|
13
|
+
*/
|
|
14
|
+
import { readFile as nodeReadFile } from 'node:fs/promises';
|
|
15
|
+
import { fileURLToPath } from 'node:url';
|
|
16
|
+
import path from 'node:path';
|
|
17
|
+
import { isGated, parseAllowsForBibliography, } from './suppression.js';
|
|
18
|
+
import { OutputSchema, SCHEMA_VERSION } from './schema/output.js';
|
|
19
|
+
import { loadBibliography, BibliographyParseError } from './schema/csl.js';
|
|
20
|
+
import { loadDenylist, PhraseLoaderError } from './phrases/load.js';
|
|
21
|
+
import { createFsCache } from './cache/fs-cache.js';
|
|
22
|
+
import { createHttpClient, isPrivateApiBase } from './http.js';
|
|
23
|
+
import { USER_AGENT_BASE } from './version.js';
|
|
24
|
+
import { createCrossRefClient, createOpenAlexClient, createOpenLibraryClient, } from './databases/index.js';
|
|
25
|
+
import { runExistence } from './existence.js';
|
|
26
|
+
import { runIdentifiers } from './identifiers.js';
|
|
27
|
+
import { runCanonical } from './canonical.js';
|
|
28
|
+
import { runLinkage } from './linkage.js';
|
|
29
|
+
import { runPhrases } from './phrases.js';
|
|
30
|
+
import { runWorklist } from './worklist.js';
|
|
31
|
+
/**
 * Fallback logger whose `info` / `warn` / `error` methods accept any
 * arguments, do nothing, and return `undefined`. Used when the caller of
 * `buildCheckDeps` does not supply a logger.
 */
const noopLogger = {
    info() {
        return undefined;
    },
    warn() {
        return undefined;
    },
    error() {
        return undefined;
    },
};
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// buildCheckDeps
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
/**
 * Builds the dependency bundle consumed by `runCheck`.
 *
 * Steps, in order:
 *   1. Load the bibliography (fatal on failure; the error is logged and
 *      rethrown as a `BibliographyParseError`).
 *   2. Load the phrase denylist when `config.phrases.file` is not null
 *      (failures are logged as warnings and leave the pattern list empty).
 *   3. Create the filesystem cache under `config.cache.dir`, resolved
 *      against `cwd`.
 *   4. Create the HTTP client.
 *
 * @param opts Object with `config`, `cwd`, `signal`, and optional
 *   `userAgent` and `logger` (defaults: `USER_AGENT_BASE`, a no-op logger).
 * @returns `{ config, cwd, bibliography, patterns, http, cache, logger, signal }`.
 * @throws BibliographyParseError when the bibliography cannot be loaded.
 */
export async function buildCheckDeps(opts) {
    const { config, cwd, signal, userAgent } = opts;
    const logger = opts.logger ?? noopLogger;
    const describe = (failure) => (failure instanceof Error ? failure.message : String(failure));

    // --- bibliography (fatal on failure) ---
    let bibliography;
    try {
        bibliography = await loadBibliography({ path: config.bibliography.file, cwd });
    }
    catch (failure) {
        const text = describe(failure);
        logger.error('bibliography.load_failed', { error: text });
        if (failure instanceof BibliographyParseError) {
            throw failure;
        }
        throw new BibliographyParseError(text, failure);
    }

    // --- phrase denylist (non-fatal on failure) ---
    let patterns = [];
    const denylistFile = config.phrases.file;
    if (denylistFile !== null) {
        try {
            patterns = await loadDenylist({ path: denylistFile, cwd });
        }
        catch (failure) {
            const text = describe(failure);
            logger.warn('phrases.load_failed', { error: text });
            // Anything other than the loader's own error type gets a second,
            // distinct warning.
            const isLoaderError = failure instanceof PhraseLoaderError;
            if (!isLoaderError) {
                logger.warn('phrases.unexpected_error', { error: text });
            }
            patterns = [];
        }
    }

    // --- cache ---
    const cacheDir = path.resolve(cwd, config.cache.dir);
    const cache = createFsCache({
        dir: cacheDir,
        maxSizeMb: config.cache.max_size_mb ?? null,
    });

    // --- HTTP client ---
    // If the operator has explicitly pointed any DB API base at a
    // private/loopback host (e.g. a local stub or mirror), honor that
    // deliberate config by allowing private hosts.
    const { apis } = config;
    const allowPrivateHosts = isPrivateApiBase(apis.crossref_base) ||
        isPrivateApiBase(apis.openalex_base) ||
        isPrivateApiBase(apis.openlibrary_base);
    const http = createHttpClient({
        userAgent: userAgent ?? USER_AGENT_BASE,
        defaultTimeoutMs: 10_000,
        maxRetries: 2,
        perOriginConcurrency: 2,
        allowPrivateHosts,
    });

    return { config, cwd, bibliography, patterns, http, cache, logger, signal };
}
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Degraded result builders
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
/**
 * Builds the placeholder existence result recorded for an entry when the
 * existence subcommand itself failed.
 *
 * @param message Error text; stored both as the layer's `error` and inside
 *   the single synthetic `crossref` check's evidence.
 * @returns A fresh object with status and evidence `'unverifiable'`, nothing
 *   in `checkedFor`, and every aspect listed in `notCheckedFor`.
 */
function degradedExistenceLayer(message) {
    const failedCheck = {
        source: 'crossref',
        result: 'error',
        evidence: { error: message },
    };
    const layer = {
        status: 'unverifiable',
        evidence: 'unverifiable',
        checkedFor: [],
        notCheckedFor: ['existence', 'metadata', 'canonical-url', 'claim-support'],
        checks: [failedCheck],
        error: message,
    };
    return layer;
}
|
|
110
|
+
/**
 * Builds the placeholder canonical result recorded for an entry when the
 * canonical subcommand itself failed.
 *
 * @returns A fresh `{ status: 'not-applicable', url: null }` object.
 */
function degradedCanonicalLayer() {
    const layer = {};
    layer.status = 'not-applicable';
    layer.url = null;
    return layer;
}
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
// Tool version
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
/**
 * Resolves the tool version by looking for a `package.json` next to this
 * module and in up to three parent directories (four candidates in total).
 *
 * The first candidate that can be read, parses as JSON, and has a string
 * `version` wins. Unreadable, unparsable, or version-less candidates are
 * skipped.
 *
 * @returns The version string found, or `'0.0.0'` when none is found or
 *   this module's location cannot be resolved.
 */
async function readPackageVersion() {
    const FALLBACK_VERSION = '0.0.0';
    const LEVELS_TO_SEARCH = 4;
    let searchDir;
    try {
        searchDir = path.dirname(fileURLToPath(import.meta.url));
    }
    catch {
        // Module location is not resolvable to a file path.
        return FALLBACK_VERSION;
    }
    let remaining = LEVELS_TO_SEARCH;
    while (remaining > 0) {
        remaining -= 1;
        const manifestPath = path.join(searchDir, 'package.json');
        try {
            const manifest = JSON.parse(await nodeReadFile(manifestPath, 'utf-8'));
            if (typeof manifest.version === 'string') {
                return manifest.version;
            }
        }
        catch {
            // Missing or unreadable at this level; try the parent directory.
        }
        searchDir = path.dirname(searchDir);
    }
    return FALLBACK_VERSION;
}
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
// runCheck
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
/**
 * Orchestrates the check layers and assembles the validated Output.
 *
 * Layers run sequentially: identifiers (local, only when existence is not
 * skipped), existence, canonical, linkage, phrases, worklist. Each
 * network/file layer receives its own abort signal with a 5-minute
 * deadline, combined with `deps.signal` when one is supplied.
 *
 * If a layer throws (including on timeout), the error is caught and logged.
 * Existence and canonical then record a degraded result for every entry;
 * linkage, phrases and worklist keep their empty defaults. The run continues
 * with the remaining layers.
 *
 * @param deps Object with `config`, `cwd`, `bibliography`, `patterns`,
 *   `http`, `cache`, `logger`, optional `signal`, optional `skip` (a set of
 *   layer names to skip), optional `readFile` (defaults to a UTF-8 file
 *   read), and optional `_runExistence` / `_runCanonical` / `_runLinkage` /
 *   `_runPhrases` / `_runWorklist` overrides for the layer runners.
 * @returns The Output document as returned by `OutputSchema.safeParse`.
 * @throws Error when the assembled Output fails schema validation.
 */
export async function runCheck(deps) {
    const {
        config,
        cwd,
        bibliography,
        patterns,
        http,
        cache,
        logger,
        signal,
        skip,
        readFile = (p) => nodeReadFile(p, 'utf-8'),
        _runExistence: doRunExistence = runExistence,
        _runCanonical: doRunCanonical = runCanonical,
        _runLinkage: doRunLinkage = runLinkage,
        _runPhrases: doRunPhrases = runPhrases,
        _runWorklist: doRunWorklist = runWorklist,
    } = deps;
    const SUBCOMMAND_TIMEOUT_MS = 300_000; // 5 minutes
    // Fresh per-layer deadline. `AbortSignal.any` rejects non-signal members,
    // so when the caller supplied no signal the deadline is used on its own
    // instead of throwing (which would silently degrade every layer).
    function subSignal() {
        const deadline = AbortSignal.timeout(SUBCOMMAND_TIMEOUT_MS);
        return signal == null ? deadline : AbortSignal.any([signal, deadline]);
    }
    // Per-entry maps: citekey → layer result
    const identifiersMap = new Map();
    const existenceMap = new Map();
    const canonicalMap = new Map();
    // Pre-populate maps with null (skipped) for all bibliography entries
    for (const entry of bibliography) {
        identifiersMap.set(entry.citekey, null);
        existenceMap.set(entry.citekey, null);
        canonicalMap.set(entry.citekey, null);
    }
    // --- identifiers (Layer 0: pure, local, pre-network well-formedness) ---
    // Always run when existence runs: a malformed/bad-checksum identifier is a
    // cheap fabrication signal that both gates (summary.malformedIdentifiers)
    // and short-circuits the network existence call. Skipped only when the
    // existence layer itself is skipped (no point validating ids we won't use).
    const identifierInvalid = new Set();
    if (!skip?.has('existence')) {
        const idResult = runIdentifiers({ bibliography });
        for (const e of idResult.entries) {
            identifiersMap.set(e.citekey, e.identifiers);
            const ids = e.identifiers;
            // A DOI/ISBN that is present but malformed/bad-checksum cannot be looked
            // up. (A bad URL does not block existence — existence keys off DOI/ISBN/
            // title — but still counts toward malformedIdentifiers in the summary.)
            if (ids.doi === 'malformed' ||
                ids.isbn === 'malformed' ||
                ids.isbn === 'bad-checksum') {
                identifierInvalid.add(e.citekey);
            }
        }
    }
    // --- existence ---
    if (!skip?.has('existence')) {
        try {
            const crossref = createCrossRefClient({
                http,
                cache,
                mailto: config.apis.crossref_mailto ?? undefined,
                baseUrl: config.apis.crossref_base,
            });
            const openalex = createOpenAlexClient({
                http,
                cache,
                mailto: config.apis.openalex_mailto ?? undefined,
                baseUrl: config.apis.openalex_base,
            });
            const openlibrary = createOpenLibraryClient({
                http,
                cache,
                baseUrl: config.apis.openlibrary_base,
            });
            const existenceDeps = {
                bibliography,
                clients: { crossref, openalex, openlibrary },
                identifierInvalid,
                signal: subSignal(),
            };
            const result = await doRunExistence(existenceDeps);
            for (const e of result.entries) {
                existenceMap.set(e.citekey, e.existence);
            }
            // Surface transport failures explicitly. An entry whose existence checks
            // are *all* transport errors (DNS/connect failure, 5xx after retries)
            // must not be silently treated as a clean "unverifiable" pass. We emit a
            // clear, actionable top-level message so the failure is not masked as
            // success. Entries deliberately skipped for a malformed identifier are
            // excluded — their all-error checks are an intentional short-circuit,
            // not a connectivity problem.
            const transportFailed = result.entries.filter((e) => !identifierInvalid.has(e.citekey) &&
                e.existence.checks.length > 0 &&
                e.existence.checks.every((c) => c.result === 'error'));
            if (transportFailed.length > 0) {
                logger.error('existence.transport_failure', {
                    message: 'Could not reach one or more bibliographic databases. ' +
                        'Existence could not be verified — this is a connectivity error, ' +
                        'not a confirmation that the works are absent. Check your network ' +
                        'connection and the [apis] base URLs in bibcheck.toml.',
                    affectedEntries: transportFailed.map((e) => e.citekey),
                });
            }
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger.error('existence.failed', { error: message });
            // Emit degraded existence for all entries
            for (const entry of bibliography) {
                existenceMap.set(entry.citekey, degradedExistenceLayer(message));
            }
        }
    }
    // --- canonical ---
    if (!skip?.has('canonical')) {
        try {
            const canonicalDeps = {
                config,
                bibliography,
                http,
                cache,
                signal: subSignal(),
            };
            const result = await doRunCanonical(canonicalDeps);
            for (const e of result.entries) {
                canonicalMap.set(e.citekey, e.canonical ?? null);
            }
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger.error('canonical.failed', { error: message });
            // Emit degraded canonical for all entries
            for (const entry of bibliography) {
                canonicalMap.set(entry.citekey, degradedCanonicalLayer());
            }
        }
    }
    // --- linkage ---
    let linkageResult = { linkage: [] };
    if (!skip?.has('linkage')) {
        try {
            const linkageDeps = {
                config,
                cwd,
                bibliography,
                readFile,
                signal: subSignal(),
            };
            linkageResult = await doRunLinkage(linkageDeps);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger.error('linkage.failed', { error: message });
        }
    }
    // --- phrases ---
    let phrasesResult = { phraseFlags: [] };
    if (!skip?.has('phrases')) {
        try {
            const phrasesDeps = {
                config,
                cwd,
                patterns,
                readFile,
                signal: subSignal(),
            };
            phrasesResult = await doRunPhrases(phrasesDeps);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger.error('phrases.failed', { error: message });
        }
    }
    // --- worklist ---
    let worklistResult = { worklist: [] };
    if (!skip?.has('worklist')) {
        try {
            const worklistDeps = {
                config,
                cwd,
                bibliography,
                readFile,
                signal: subSignal(),
            };
            worklistResult = await doRunWorklist(worklistDeps);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            logger.error('worklist.failed', { error: message });
        }
    }
    // --- Assemble entries ---
    const entries = bibliography.map((bib) => ({
        citekey: bib.citekey,
        identifiers: identifiersMap.get(bib.citekey) ?? null,
        existence: existenceMap.get(bib.citekey) ?? null,
        canonical: canonicalMap.get(bib.citekey) ?? null,
    }));
    // --- Compute summary ---
    const CANONICAL_ISSUE_STATUSES = new Set([
        'wrong-host',
        'dead-url',
        'live-url-not-archived-snapshot',
        'no-url-on-pre-doi-entry',
    ]);
    const totalEntries = bibliography.length;
    // The four existence buckets PARTITION the entries: every entry lands in
    // exactly one, so they reconcile to totalEntries (T20 invariant, enforced by
    // OutputSchema). An entry whose existence layer was not run (null, e.g.
    // existence skipped) is treated as `unverifiable` for the partition — we
    // could not place it in any database, so it is not verified/mismatched/absent.
    let verified = 0;
    let metadataMismatches = 0;
    let notFoundInDatabases = 0;
    let unverifiable = 0;
    let malformedIdentifiers = 0;
    let canonicalIssues = 0;
    for (const entry of entries) {
        const ex = entry.existence;
        const can = entry.canonical;
        const status = ex?.status ?? 'unverifiable';
        switch (status) {
            case 'verified':
                verified += 1;
                break;
            case 'metadata-mismatch':
                metadataMismatches += 1;
                break;
            case 'not-found-in-databases':
                notFoundInDatabases += 1;
                break;
            case 'unverifiable':
                unverifiable += 1;
                break;
        }
        // Malformed-identifier count (T21): any entry with a malformed/bad-checksum
        // DOI/ISBN/URL. Overlaps the existence buckets (those entries are
        // unverifiable) — it is a separate fabrication-signal counter, not a fifth
        // bucket. Gates by default. Uses the same predicate as the exit gate so
        // the summary count and the gate cannot drift apart.
        if (entryHasMalformedIdentifier(entry)) {
            malformedIdentifiers += 1;
        }
        if (can !== null &&
            CANONICAL_ISSUE_STATUSES.has(can.status)) {
            canonicalIssues += 1;
        }
    }
    const linkageFailures = linkageResult.linkage.filter((l) => l.status === 'unresolved').length;
    // Reverse linkage (H2): bibliography citekeys never referenced in any doc.
    // Informational only — counted for visibility but NOT added to
    // checkExitReasons, so orphans never affect the exit code.
    const orphanedEntries = linkageResult.linkage.filter((l) => l.status === 'orphan').length;
    const phraseFlags = phrasesResult.phraseFlags.filter((f) => f.status === 'flagged').length;
    const worklistItems = worklistResult.worklist.length;
    const summary = {
        totalEntries,
        verified,
        metadataMismatches,
        notFoundInDatabases,
        malformedIdentifiers,
        unverifiable,
        canonicalIssues,
        linkageFailures,
        phraseFlags,
        worklistItems,
        orphanedEntries,
    };
    // --- Tool info ---
    const version = await readPackageVersion();
    const output = {
        schemaVersion: SCHEMA_VERSION,
        tool: { name: 'bibcheck', version },
        summary,
        entries,
        linkage: linkageResult.linkage,
        phraseFlags: phrasesResult.phraseFlags,
        worklist: worklistResult.worklist,
    };
    // --- T23 suppression: warn on reason-less allows + log acknowledged findings ---
    // Reason is MANDATORY: a `bibcheck-allow` with an empty/missing reason does
    // NOT suppress (isGated ignores it); warn so the omission is visible rather
    // than silently ineffective. An unknown finding-type token likewise warns.
    // Suppressed findings stay in the document (totals unchanged) and are logged
    // as informational acknowledgements — never silently dropped.
    {
        const ctx = buildSuppressionContext(config, bibliography);
        const { unknownTypes, reasonless } = parseAllowDiagnostics(bibliography);
        for (const u of reasonless) {
            logger.warn('suppression.allow_missing_reason', {
                citekey: u.citekey,
                findingType: u.findingType,
                message: `bibcheck-allow for '${u.findingType}' on @${u.citekey} has no (reason: ...); ` +
                    'reason is mandatory, so this allow does NOT suppress. Add a reason to silence the finding.',
            });
        }
        for (const u of unknownTypes) {
            logger.warn('suppression.allow_unknown_type', {
                citekey: u.citekey,
                token: u.token,
                message: `bibcheck-allow on @${u.citekey} names unknown finding-type '${u.token}'; ` +
                    'this directive suppresses nothing.',
            });
        }
        for (const ack of collectAcknowledgedFindings(output, ctx)) {
            logger.info('check.acknowledged_finding', {
                citekey: ack.citekey,
                findingType: ack.findingType,
                suppressedBy: ack.reason,
                message: `@${ack.citekey}: '${ack.findingType}' suppressed by ${ack.reason} ` +
                    '(reported as acknowledged, excluded from the build gate).',
            });
        }
    }
    // --- Validate ---
    const parsed = OutputSchema.safeParse(output);
    if (!parsed.success) {
        const firstIssue = parsed.error.issues[0];
        const msg = firstIssue
            ? `${firstIssue.path.join('.')}: ${firstIssue.message}`
            : parsed.error.message;
        logger.error('output.schema_invalid', { error: msg });
        throw new Error(`Output failed schema validation (bibcheck bug): ${msg}`);
    }
    return parsed.data;
}
|
|
466
|
+
// ---------------------------------------------------------------------------
|
|
467
|
+
// CHECK_NON_ZERO_REASON
|
|
468
|
+
// ---------------------------------------------------------------------------
|
|
469
|
+
/**
 * Names of the finding kinds that can cause a non-zero exit. Each key maps
 * to the identical string value.
 */
export const CHECK_NON_ZERO_REASON = Object.fromEntries([
    'flagged_phrase',
    'unresolved_linkage',
    'canonical_issue',
    'metadata_mismatch',
    // NEW in 0.2.0 (T22): secure-default gating (Q1). These two gate by
    // default; T23 layers source-type exemptions and per-finding suppression
    // on top via the optional SuppressionContext passed to checkExitReasons.
    'not_found_in_databases',
    'malformed_identifier',
].map((reason) => [reason, reason]));
|
|
480
|
+
// ---------------------------------------------------------------------------
|
|
481
|
+
// checkExitReasons
|
|
482
|
+
// ---------------------------------------------------------------------------
|
|
483
|
+
/** Canonical-layer statuses that count as a `canonical_issue` exit reason. */
const CANONICAL_EXIT_STATUSES = new Set()
    .add('dead-url')
    .add('wrong-host')
    .add('no-url-on-pre-doi-entry')
    .add('live-url-not-archived-snapshot');
|
|
489
|
+
/**
 * Reports whether an entry carries any malformed or bad-checksum identifier
 * (the gating signal).
 *
 * @param e Entry whose `identifiers` is either null or an object with
 *   `doi`, `isbn` and `url` status fields.
 * @returns `false` when `identifiers` is null; otherwise `true` when the DOI
 *   or URL is `'malformed'`, or the ISBN is `'malformed'` or `'bad-checksum'`.
 */
function entryHasMalformedIdentifier(e) {
    const { identifiers } = e;
    if (identifiers === null) {
        return false;
    }
    const { doi, isbn, url } = identifiers;
    if (doi === 'malformed' || url === 'malformed') {
        return true;
    }
    return isbn === 'malformed' || isbn === 'bad-checksum';
}
|
|
498
|
+
/**
 * Lists the finding kinds that should cause a non-zero exit. An empty array
 * means exit 0.
 *
 * Reasons are appended in this fixed order, each at most once:
 *   1. 'flagged_phrase'         — some phraseFlags[].status === 'flagged'
 *   2. 'unresolved_linkage'     — some linkage[].status === 'unresolved'
 *   3. 'canonical_issue'        — some entry's canonical.status is in
 *                                 CANONICAL_EXIT_STATUSES
 *   4. 'metadata_mismatch'      — some entry's existence.status ===
 *                                 'metadata-mismatch'
 *   5. 'not_found_in_databases' — some gated entry's existence.status ===
 *                                 'not-found-in-databases'
 *   6. 'malformed_identifier'   — some gated entry has a malformed
 *                                 DOI/ISBN/URL
 *
 * Acknowledged phrases, worklist items and unverifiable existence never
 * produce a reason.
 *
 * T23 suppression applies to reasons 5 and 6 only. When `ctx` is undefined
 * every such finding gates. Otherwise each finding is passed to `isGated`
 * and counts only when that call reports `gated`. Suppressed findings are
 * not removed from `output`.
 *
 * @param output The Output document (`phraseFlags`, `linkage`, `entries`).
 * @param ctx Optional suppression context with `config`, `allows` and
 *   `cslTypeByCitekey`.
 * @returns Array of `CHECK_NON_ZERO_REASON` values.
 */
export function checkExitReasons(output, ctx) {
    // Whether a finding of `findingType` on `entry` still counts toward the gate.
    const stillGates = (entry, findingType) => {
        if (ctx === undefined) {
            return true;
        }
        const verdict = isGated({
            citekey: entry.citekey,
            findingType,
            cslType: ctx.cslTypeByCitekey.get(entry.citekey),
            config: ctx.config,
            allows: ctx.allows,
        });
        return verdict.gated;
    };
    // Ordered [reason, predicate] pairs; each predicate runs lazily, in order.
    const rules = [
        [
            CHECK_NON_ZERO_REASON.flagged_phrase,
            () => output.phraseFlags.some((flag) => flag.status === 'flagged'),
        ],
        [
            CHECK_NON_ZERO_REASON.unresolved_linkage,
            () => output.linkage.some((link) => link.status === 'unresolved'),
        ],
        [
            CHECK_NON_ZERO_REASON.canonical_issue,
            () => output.entries.some((entry) => entry.canonical !== null &&
                CANONICAL_EXIT_STATUSES.has(entry.canonical.status)),
        ],
        [
            CHECK_NON_ZERO_REASON.metadata_mismatch,
            () => output.entries.some((entry) => entry.existence !== null &&
                entry.existence.status === 'metadata-mismatch'),
        ],
        // --- Q1 secure default + T23 suppression: not-found + malformed ---
        [
            CHECK_NON_ZERO_REASON.not_found_in_databases,
            () => output.entries.some((entry) => {
                const notFound = entry.existence !== null &&
                    entry.existence.status === 'not-found-in-databases';
                return notFound ? stillGates(entry, 'not-found') : false;
            }),
        ],
        [
            CHECK_NON_ZERO_REASON.malformed_identifier,
            () => output.entries.some((entry) => entryHasMalformedIdentifier(entry)
                ? stillGates(entry, 'malformed-identifier')
                : false),
        ],
    ];
    const reasons = [];
    for (const [reason, applies] of rules) {
        if (applies()) {
            reasons.push(reason);
        }
    }
    return reasons;
}
|
|
578
|
+
/**
 * Collects the not-found / malformed-identifier findings that `isGated`
 * reports as not gated for a reason other than `'default'`, i.e. findings
 * suppressed by an exemption or allow. `output` is not modified.
 *
 * For each entry a not-found acknowledgement (if any) precedes a
 * malformed-identifier acknowledgement (if any).
 *
 * @param output The Output document; only `entries` is read.
 * @param ctx Suppression context with `config`, `allows` and
 *   `cslTypeByCitekey`.
 * @returns Array of `{ citekey, findingType, reason }`, where `reason` is
 *   the reason returned by `isGated`.
 */
export function collectAcknowledgedFindings(output, ctx) {
    const acks = [];
    // Ask `isGated` about one finding; record it when it was suppressed.
    const recordIfSuppressed = (citekey, findingType, cslType) => {
        const verdict = isGated({
            citekey,
            findingType,
            cslType,
            config: ctx.config,
            allows: ctx.allows,
        });
        const suppressed = !verdict.gated && verdict.reason !== 'default';
        if (suppressed) {
            acks.push({ citekey, findingType, reason: verdict.reason });
        }
    };
    for (const entry of output.entries) {
        const { citekey, existence } = entry;
        const cslType = ctx.cslTypeByCitekey.get(citekey);
        const notFound = existence !== null && existence.status === 'not-found-in-databases';
        if (notFound) {
            recordIfSuppressed(citekey, 'not-found', cslType);
        }
        if (entryHasMalformedIdentifier(entry)) {
            recordIfSuppressed(citekey, 'malformed-identifier', cslType);
        }
    }
    return acks;
}
|
|
616
|
+
/**
 * Builds the T23 suppression context from the config and the loaded
 * bibliography.
 *
 * The context bundles the config, a citekey → CSL `type` map built from the
 * bibliography entries, and the `allows` returned by
 * `parseAllowsForBibliography` for each entry's `{ citekey, note }`.
 *
 * @param config Configuration object, passed through unchanged.
 * @param bibliography Array of entries with `citekey`, `type` and `note`.
 * @returns `{ config, cslTypeByCitekey, allows }`.
 */
export function buildSuppressionContext(config, bibliography) {
    const typePairs = [];
    const noteInputs = [];
    for (const { citekey, type, note } of bibliography) {
        typePairs.push([citekey, type]);
        noteInputs.push({ citekey, note });
    }
    const cslTypeByCitekey = new Map(typePairs);
    const parsedAllows = parseAllowsForBibliography(noteInputs);
    return { config, cslTypeByCitekey, allows: parsedAllows.allows };
}
|
|
631
|
+
/**
 * Derives diagnostics from the bibliography's parsed allow directives.
 * These drive the `runCheck` warnings.
 *
 * @param bibliography Array of entries with `citekey` and `note`.
 * @returns `{ unknownTypes, reasonless }`. `unknownTypes` holds a
 *   `{ citekey, token }` for each unknown-type record returned by
 *   `parseAllowsForBibliography`. `reasonless` holds a
 *   `{ citekey, findingType }` for each parsed allow whose `reason` is null.
 */
function parseAllowDiagnostics(bibliography) {
    const noteInputs = bibliography.map(({ citekey, note }) => ({ citekey, note }));
    const parsed = parseAllowsForBibliography(noteInputs);
    const reasonless = [];
    for (const allow of parsed.allows) {
        if (allow.reason === null) {
            reasonless.push({ citekey: allow.citekey, findingType: allow.findingType });
        }
    }
    const unknownTypes = [];
    for (const unknown of parsed.unknownTypes) {
        unknownTypes.push({ citekey: unknown.citekey, token: unknown.token });
    }
    return { unknownTypes, reasonless };
}
|
|
646
|
+
//# sourceMappingURL=check.js.map
|