veracite 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veracite-0.1.1/LICENSE +21 -0
- veracite-0.1.1/PKG-INFO +506 -0
- veracite-0.1.1/README.md +479 -0
- veracite-0.1.1/pyproject.toml +44 -0
- veracite-0.1.1/setup.cfg +4 -0
- veracite-0.1.1/tests/test_bibcheck.py +2410 -0
- veracite-0.1.1/tests/test_catalog.py +133 -0
- veracite-0.1.1/tests/test_checkpoint.py +375 -0
- veracite-0.1.1/tests/test_webcheck.py +180 -0
- veracite-0.1.1/veracite/__init__.py +17 -0
- veracite-0.1.1/veracite/__main__.py +8 -0
- veracite-0.1.1/veracite/catalog.py +170 -0
- veracite-0.1.1/veracite/checkpoint.py +321 -0
- veracite-0.1.1/veracite/cli.py +453 -0
- veracite-0.1.1/veracite/compare.py +466 -0
- veracite-0.1.1/veracite/config.py +163 -0
- veracite-0.1.1/veracite/data/biblatex_datamodel.json +1334 -0
- veracite-0.1.1/veracite/data/journal_abbrev.json +426 -0
- veracite-0.1.1/veracite/datamodel.py +101 -0
- veracite-0.1.1/veracite/http.py +113 -0
- veracite-0.1.1/veracite/identifiers.py +42 -0
- veracite-0.1.1/veracite/llm.py +382 -0
- veracite-0.1.1/veracite/models.py +51 -0
- veracite-0.1.1/veracite/normalize.py +365 -0
- veracite-0.1.1/veracite/parser.py +375 -0
- veracite-0.1.1/veracite/pipeline.py +37 -0
- veracite-0.1.1/veracite/record.py +271 -0
- veracite-0.1.1/veracite/report.py +671 -0
- veracite-0.1.1/veracite/rules.py +849 -0
- veracite-0.1.1/veracite/sources.py +349 -0
- veracite-0.1.1/veracite/titles.py +79 -0
- veracite-0.1.1/veracite/verify.py +389 -0
- veracite-0.1.1/veracite/webcheck.py +189 -0
- veracite-0.1.1/veracite.egg-info/PKG-INFO +506 -0
- veracite-0.1.1/veracite.egg-info/SOURCES.txt +37 -0
- veracite-0.1.1/veracite.egg-info/dependency_links.txt +1 -0
- veracite-0.1.1/veracite.egg-info/entry_points.txt +2 -0
- veracite-0.1.1/veracite.egg-info/requires.txt +6 -0
- veracite-0.1.1/veracite.egg-info/top_level.txt +1 -0
veracite-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shannon Whitlock
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
veracite-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: veracite
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A bibliography health checker for LaTeX projects.
|
|
5
|
+
Author: Shannon Whitlock
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Shannon-Whitlock/veracite
|
|
8
|
+
Project-URL: Repository, https://github.com/Shannon-Whitlock/veracite
|
|
9
|
+
Project-URL: Issues, https://github.com/Shannon-Whitlock/veracite/issues
|
|
10
|
+
Keywords: bibtex,biblatex,bibliography,latex,citations,doi,arxiv
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: LaTeX
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Provides-Extra: http
|
|
23
|
+
Requires-Dist: requests; extra == "http"
|
|
24
|
+
Provides-Extra: test
|
|
25
|
+
Requires-Dist: pytest; extra == "test"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# VeraCite
|
|
29
|
+
|
|
30
|
+
**A lightweight, auditable tool for checking the accuracy and conformity of
|
|
31
|
+
BibTeX/biblatex bibliographies in scientific articles — a deterministic check
|
|
32
|
+
against hallucinated and mangled citations.**
|
|
33
|
+
|
|
34
|
+
VeraCite improves the **veracity** of the bibliographic record in scientific
|
|
35
|
+
papers. Where BibTeX is notoriously tolerant of imperfect entries, VeraCite
|
|
36
|
+
surfaces errors for fast human verification and AI-tool integration, helping
|
|
37
|
+
bibliographic records better satisfy the
|
|
38
|
+
[FAIR](https://www.go-fair.org/fair-principles/) principles (persistent
|
|
39
|
+
identifiers, shared standards, accurate metadata). Because every check is a rule
|
|
40
|
+
or a comparison against an authoritative record — **never** a language model
|
|
41
|
+
guessing — it is exactly the kind of ground-truth gate an AI writing assistant
|
|
42
|
+
needs: it confirms, against Crossref/arXiv and friends, that a reference is real,
|
|
43
|
+
correctly identified, and accurately transcribed, catching the fabricated DOI,
|
|
44
|
+
the invented paper, and the subtly wrong year or author that LLMs introduce.
|
|
45
|
+
|
|
46
|
+
VeraCite is for authors, publishers, and AI assistants who want to vet a
|
|
47
|
+
bibliography *before* publication. It checks a `.bib` file along three levels:
|
|
48
|
+
|
|
49
|
+
- **Syntax** — does it conform to the BibTeX/biblatex datamodel?
|
|
50
|
+
- **Semantics** — is each entry consistent with the authoritative online record
|
|
51
|
+
(Crossref, arXiv, INSPIRE-HEP, OpenAlex, Open Library)?
|
|
52
|
+
- **Context** — (with `--tex`) is each work genuinely cited, and cited
|
|
53
|
+
appropriately, in the manuscript?
|
|
54
|
+
|
|
55
|
+
It produces both a **human-readable** report and a **machine-readable** JSON
|
|
56
|
+
record, each with clear descriptions of every issue and an overall **0–100
|
|
57
|
+
integrity score**.
|
|
58
|
+
|
|
59
|
+
VeraCite **never modifies your bibliography or your LaTeX** — it only *flags*
|
|
60
|
+
issues, with the offending line and (where possible) a suggested fix, for an author
|
|
61
|
+
to inspect and correct. Every finding carries a stable rule **category**
|
|
62
|
+
and, for online checks, a `verify:` link, so the report is auditable rather than
|
|
63
|
+
a black box.
|
|
64
|
+
|
|
65
|
+
### Why VeraCite
|
|
66
|
+
|
|
67
|
+
A bibliography is easy to get wrong and tedious to check by hand: a wrong year,
|
|
68
|
+
a mistyped DOI, a page number that doesn't match the published article, a
|
|
69
|
+
preprint that has since appeared in a journal, or a misplaced citation that
|
|
70
|
+
points to the wrong work. These slip through because BibTeX accepts
|
|
71
|
+
them without complaint — and checking each entry against the real record is slow and error-prone. The same errors, plus outright **fabricated references**, now
|
|
72
|
+
arrive in bulk from LLM-assisted drafting, where a confident-looking citation
|
|
73
|
+
may name a paper that does not exist or attach a real DOI to the wrong work.
|
|
74
|
+
VeraCite does that checking for you — deterministically, against the source of
|
|
75
|
+
record — and is built to be:
|
|
76
|
+
|
|
77
|
+
- **Simple to run** — one small Python program you run from the command line. No
|
|
78
|
+
account, no website, no setup; it works out of the box and needs no extra
|
|
79
|
+
software installed.
|
|
80
|
+
- **Trustworthy** — it doesn't guess. Every issue it reports comes from an
|
|
81
|
+
explicit rule or a comparison against an authoritative record, so you can see
|
|
82
|
+
exactly why each was flagged — which is also what makes it a sound check *on* an
|
|
83
|
+
AI assistant's output rather than another source of guesses. The optional AI
|
|
84
|
+
relevance check is **off by default**.
|
|
85
|
+
- **Standards-based** — it checks your entries against the official BibTeX/biblatex
|
|
86
|
+
rules, standard journal-name abbreviations, and validated identifiers (DOI,
|
|
87
|
+
arXiv, ISBN, ISSN, ORCID).
|
|
88
|
+
- **Private by default** — built to help *you* fix *your own* bibliography before
|
|
89
|
+
submission. Unless you opt in, it **never reads your manuscript and sends
|
|
90
|
+
nothing to any AI service**, so it is safe to run on confidential drafts.
|
|
91
|
+
|
|
92
|
+
### Auditable by design
|
|
93
|
+
|
|
94
|
+
VeraCite's checks are not arbitrary or hidden in model weights. Every rule is a
|
|
95
|
+
small, deterministic piece of Python or generated data that an author, publisher or
|
|
96
|
+
developer can read, correct, and extend. The four places to look:
|
|
97
|
+
|
|
98
|
+
| What | Where | How to inspect / extend |
|
|
99
|
+
|------|-------|-------------------------|
|
|
100
|
+
| **Static checks** (the rule registry) | [`veracite/rules.py`](veracite/rules.py) | Each check is a function decorated `@rule` (per entry) or `@file_rule` (whole file) and appended to a registry the engine iterates. Add a check by writing one function; the module docstring marks it *"the part meant to be read and edited."* |
|
|
101
|
+
| **Structural validity** (legal & mandatory fields) | [`veracite/data/biblatex_datamodel.json`](veracite/data/biblatex_datamodel.json), loaded by [`veracite/datamodel.py`](veracite/datamodel.py) | Generated from biblatex's **own** `blx-dm.def` by [`tools/gen_datamodel.py`](tools/gen_datamodel.py) — not a hand-kept blocklist. Regenerate when biblatex updates. |
|
|
102
|
+
| **Severity, grouping & descriptions** (what's an error vs. a note, the syntax/semantic/context bucket, and the catalog text) | `resolve_severity()`, `CATEGORY_GROUP`, and `CATEGORY_DOC` in [`veracite/report.py`](veracite/report.py); defaults in `DEFAULT_SETTINGS["severity"]` ([`veracite/config.py`](veracite/config.py)) | Every finding carries a stable string **category**. List the whole catalog with `--list-rules`; re-rank any category to `error`/`warning`/`note` via the `severity` block in a settings file (see [Configuration](#configuration)) — no code change needed. |
|
|
103
|
+
| **Integrity score** (the 0–100 roll-up) | `integrity()` in [`veracite/verify.py`](veracite/verify.py) | A transparent weighted formula over explicit counts — `0.50·verification + 0.20·PID + 0.15·DOI + 0.15·(1 − defects)` — **not** a model output. |
|
|
104
|
+
|
|
105
|
+
Start from the catalog — the complete list of every finding category VeraCite can
|
|
106
|
+
emit, with its default severity, group, what supersedes it, and a one-line
|
|
107
|
+
description:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
python -m veracite --list-rules # human-readable table (the audit sheet)
|
|
111
|
+
python -m veracite --list-rules json # same, machine-readable
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
category severity group superseded by description
|
|
116
|
+
------------------------ -------- -------- ------------- ----------------------------------------
|
|
117
|
+
duplicate error syntax - duplicate citation key or DOI ...
|
|
118
|
+
metadata_mismatch warning semantic - author/title/year/vol/pages/journal differ
|
|
119
|
+
preprint_superseded warning context - a published version now exists
|
|
120
|
+
title_case note semantic record layer title looks miscased (mostly UPPERCASE)
|
|
121
|
+
...
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Getting Started
|
|
125
|
+
|
|
126
|
+
Point it at a `.bib` file; it reports structural, stylistic, and record-level
|
|
127
|
+
problems for a human to read and a script to parse. Add `--tex` to also check how
|
|
128
|
+
the bibliography is cited.
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
python -m veracite --bib refs.bib # check every entry; reads no .tex
|
|
132
|
+
python -m veracite --bib refs.bib --tex paper/ # check only cited entries
|
|
133
|
+
python -m veracite --bib refs.bib --offline # static checks only (no network)
|
|
134
|
+
python -m veracite --bib refs.bib --tex paper/ --llm # + LLM relevance sweep
|
|
135
|
+
python -m veracite --bib refs.bib --skipnotes # warnings and errors only
|
|
136
|
+
python -m veracite --bib refs.bib --sort severity # global triage list, errors first
|
|
137
|
+
python -m veracite --bib refs.bib --json report.json
|
|
138
|
+
python -m veracite --list-rules # the rule catalog / audit sheet
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Once installed (`pip install .`), it also exposes a `veracite` command.
|
|
142
|
+
|
|
143
|
+
`--bib FILE` selects the bibliography; if omitted it is auto-discovered under the
|
|
144
|
+
cwd. VeraCite runs in one of two modes:
|
|
145
|
+
|
|
146
|
+
- **bibliography-only** (no `--tex`): every entry is checked. **No `.tex` file is
|
|
147
|
+
ever read** — the default run never touches your manuscript, so it is safe on
|
|
148
|
+
confidential drafts.
|
|
149
|
+
- **citations** (`--tex PATH`, a file or directory, repeatable): only the entries
|
|
150
|
+
cited by those sources are resolved online and (with `--llm`) rated; uncited
|
|
151
|
+
entries are noted and skipped. A cited key with no `.bib` entry is an error.
|
|
152
|
+
|
|
153
|
+
`.tex` is read only when you ask for it with `--tex`; there is no silent
|
|
154
|
+
auto-discovery. Online checks are on by default; `--offline` makes the run fully
|
|
155
|
+
offline. The LLM relevance sweep is **off** unless `--llm` is given, and `--llm`
|
|
156
|
+
**requires `--tex`** (it needs the citation context). Every layer runs per entry
|
|
157
|
+
in bibliography order, so the report is a single list in `.bib` order — each
|
|
158
|
+
entry's findings printed once, followed by a file-level group and the summary.
|
|
159
|
+
|
|
160
|
+
Exit status is non-zero when any error is found, so it can gate CI.
|
|
161
|
+
|
|
162
|
+
## Message types
|
|
163
|
+
|
|
164
|
+
The three levels mean different things and call for different action:
|
|
165
|
+
|
|
166
|
+
- **`[ERROR]`** — must fix. A structural/syntax error that stops BibTeX from
|
|
167
|
+
parsing (unbalanced braces, a missing `=`, an unknown entry type, a dropped
|
|
168
|
+
reference); a duplicate; a retraction; a dead DOI; an id that resolves to a
|
|
169
|
+
different paper (first author **and** title both differ); or an LLM-flagged
|
|
170
|
+
clearly-wrong paper.
|
|
171
|
+
- **`[WARN]`** — investigate. A discrepancy between the record and the bib that
|
|
172
|
+
may or may not be wrong: an author/title/given-name/year/volume/pages field
|
|
173
|
+
differs from the id-resolved record, a non-standard journal abbreviation, a
|
|
174
|
+
preprint with a published version, a linked erratum, or an LLM relevance ≤3.
|
|
175
|
+
Open the `verify:` link and decide.
|
|
176
|
+
- **`[note]`** — stylistic, or filtered by biblatex anyway: casing,
|
|
177
|
+
brace-protection, dashes, a month written as a name, an invalid-for-biblatex field, an
|
|
178
|
+
abbreviated given name, or a registry-parity suggestion. Hide with
|
|
179
|
+
`--skipnotes` (still counted in the summary).
|
|
180
|
+
|
|
181
|
+
### Example output
|
|
182
|
+
|
|
183
|
+
Findings are grouped into one **block per bibliography entry**, in `.bib` order. A
|
|
184
|
+
block opens with a header line that identifies the record and the verification status,
|
|
185
|
+
then lists each finding indented beneath it (in severity order):
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
[ 8/83] amo2009 @article line 96 VERIFIED (confidence 0.75); https://doi.org/10.1038/nature07640
|
|
189
|
+
[WARN] metadata_mismatch (line 98): [crossref] year differs: bib=2009, record=2010
|
|
190
|
+
[note] style (line 101): month 'may' is a name; biblatex will not sort it (suggested: 'may' -> '5')
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
The header carries everything that identifies and verifies the record (an optional
|
|
194
|
+
`[i/N]` progress counter, the key, `@type`, line, status, and confidence with a
|
|
195
|
+
`verify:` link); a clean VERIFIED entry with no other findings prints no block at
|
|
196
|
+
all. Each finding line follows one fixed shape:
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
[SEVERITY] category (line N): message (suggested: 'current' -> 'fixed')
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
- `[SEVERITY]` is `[ERROR]`/`[WARN]`/`[note]`.
|
|
203
|
+
- `category` is the **stable rule code** (see `--list-rules`); every finding has
|
|
204
|
+
one — none falls back to a bare layer name.
|
|
205
|
+
- `(line N)` is the offending field's line in the `.bib`.
|
|
206
|
+
- a fixable finding carries the advisory edit inline as `(suggested: 'current' -> 'fixed')`.
|
|
207
|
+
- a message never wraps: any embedded newline is folded, so one finding = one line.
|
|
208
|
+
|
|
209
|
+
## What it checks
|
|
210
|
+
|
|
211
|
+
Checks run in layers, syntax first.
|
|
212
|
+
|
|
213
|
+
0. **Syntax** — structural validity, so a file BibTeX cannot parse is never
|
|
214
|
+
reported as healthy. Unbalanced braces, a stray extra `}`, a field missing
|
|
215
|
+
its `=`, an unknown entry type, a duplicate field, a file-level brace
|
|
216
|
+
imbalance, and a **cited key with no entry** are each errors. The parser
|
|
217
|
+
recovers at the next `@entry{`, so one broken entry does not hide the others.
|
|
218
|
+
`@string` abbreviations (both `{…}` and `(…)` delimited) and `#` concatenation
|
|
219
|
+
are expanded, so a `journal = prb` macro is checked by its full value, not the
|
|
220
|
+
bare macro name.
|
|
221
|
+
|
|
222
|
+
1. **Static** (offline) — a rule registry (`rules.py`); add a check by writing a
|
|
223
|
+
function and decorating it `@rule`/`@file_rule`. Covers missing fields;
|
|
224
|
+
**biblatex field validity** derived from the standard datamodel (see below);
|
|
225
|
+
title casing/brace-protection; trailing periods; `and others`; arXiv-id
|
|
226
|
+
consistency; page/dash/numpages sanity; encoding; DOI format; duplicate
|
|
227
|
+
keys/DOIs; and file-wide consistency. Uncited entries are noted.
|
|
228
|
+
|
|
229
|
+
2. **Record** (online) — resolve each entry by DOI (Crossref) or arXiv id and
|
|
230
|
+
flag **disagreement** with the record. The authoritative record is the
|
|
231
|
+
**canonical reference**: VeraCite never rewrites your `.bib`, but each flagged
|
|
232
|
+
field carries a **suggested edit that conforms the bib to the record** (e.g.
|
|
233
|
+
`year (suggested: '2009' -> '2010')`), so the fix direction is always toward the
|
|
234
|
+
registry — unless the record itself is clearly broken. **Severity follows
|
|
235
|
+
render-impact:** a field that changes the rendered citation (title, author,
|
|
236
|
+
year, journal, volume, issue, pages) is a **warning**; a purely stylistic
|
|
237
|
+
difference (an abbreviated given name, casing) is a **note**. None of these is a
|
|
238
|
+
wrong-paper claim — name folding handles suffixes (`Jr`/`III`), particles,
|
|
239
|
+
collaborations, and abbreviated given names so these don't misfire, and findings
|
|
240
|
+
show the original, readable names. A journal name matches the record when it is a
|
|
241
|
+
known abbreviation (a small curated physics table in `veracite/data/`) or a valid
|
|
242
|
+
ISO-4 abbreviation (period-insensitive, so `Phys. Rev. B` and `Phys Rev B` both
|
|
243
|
+
match `Physical Review B`); only a genuinely non-standard journal string warns.
|
|
244
|
+
The one identity **error** is when the first author *and* the title both differ
|
|
245
|
+
strongly: the id likely resolves to a different paper (a copy-pasted DOI). A
|
|
246
|
+
`verify:` link is printed for every entry with an online finding.
|
|
247
|
+
|
|
248
|
+
3. **Status** (online) — retraction (via OpenAlex / Retraction Watch), linked
|
|
249
|
+
errata/corrections/comments/replies, and preprints with a published version.
|
|
250
|
+
|
|
251
|
+
4. **Cross-source** (online) — when more than one authoritative source resolves an
|
|
252
|
+
entry (Crossref, **INSPIRE-HEP** for physics, arXiv, Open Library for books),
|
|
253
|
+
their records are compared *against each other*. A data difference (year,
|
|
254
|
+
volume, issue, pages, or a genuinely different journal) is a **warning**
|
|
255
|
+
(`source_conflict`) naming both sources. Purely stylistic differences — title
|
|
256
|
+
casing, or a full journal title vs its ISO-4 abbreviation — are **not** flagged,
|
|
257
|
+
since both forms are valid. This surfaces stale or corrupted registry metadata
|
|
258
|
+
the single-source comparison cannot see.
|
|
259
|
+
|
|
260
|
+
5. **Verification** (online) — each entry gets one of three statuses with a
|
|
261
|
+
**confidence** (0–1, a deterministic function of which sources agreed, not a
|
|
262
|
+
model output). **VERIFIED** — the id resolved and the first author and title
|
|
263
|
+
match; confidence reflects corroboration: **1.0** (clean match across ≥2
|
|
264
|
+
sources), **~0.95** (clean single source), **0.75** (a field disagrees), **0.70**
|
|
265
|
+
(sources disagree, or only arXiv confirms). **UNVERIFIED** — could not confirm:
|
|
266
|
+
no identifier, no record returned, or a DOI that did not resolve (also an error).
|
|
267
|
+
**MISMATCH** — it resolved but the record's identity disagrees (the id may point
|
|
268
|
+
at a different paper). If an entry carries **no identifier at all** (no DOI and no
|
|
269
|
+
arXiv id), VeraCite **searches** for one — first Crossref (title + first author,
|
|
270
|
+
corroborated by journal or a year within ±1), then, failing that, **arXiv by title**
|
|
271
|
+
(title + first-author surname; common for ML/physics works cited by venue only).
|
|
272
|
+
On a strong match it verifies the entry and reports the identifier to add; when an
|
|
273
|
+
arXiv hit links a **published DOI** (its `<arxiv:doi>`), that DOI is preferred and
|
|
274
|
+
suggested instead of the bare preprint id. This search is a last resort: an entry
|
|
275
|
+
that **already** carries a DOI or arXiv id is resolved against *that* and the
|
|
276
|
+
search never runs. A post-2005 article with no findable identifier is flagged;
|
|
277
|
+
pre-2005 work is not penalized; arXiv ids and ISBNs count as PIDs.
|
|
278
|
+
|
|
279
|
+
6. **Integrity score** (online) — a summary roll-up: counts of
|
|
280
|
+
verified (and how many carry a caveat), unverified, mismatch, DOI coverage over eligible (post-2005)
|
|
281
|
+
articles, PID coverage, and a **0–100 integrity score** — a transparent weighted
|
|
282
|
+
blend of verification rate (50%), PID coverage (20%), DOI coverage (15%), and
|
|
283
|
+
freedom from integrity defects (15%). Printed beneath the verdict.
|
|
284
|
+
|
|
285
|
+
7. **LLM** (optional, `--llm`, needs `--tex`) — for each cited entry, a language
|
|
286
|
+
model rates **relevance** (1–5) from the abstract and the surrounding sentences,
|
|
287
|
+
and flags a clear **wrong paper**. For a grouped citation (`\cite{a,b,c}`) it also
|
|
288
|
+
sees the co-cited references and drops a low-relevance (≤3) odd-one-out a further
|
|
289
|
+
point, surfacing an inappropriate citation hidden in a list of relevant ones. A
|
|
290
|
+
wrong-paper flag is an error; relevance ≤3 a warning; **4–5 leaves a `[llm]
|
|
291
|
+
context OK N/5` note**. Because an LLM call costs tokens, every rated citation
|
|
292
|
+
always shows exactly one line in the report (clean pass, weak, wrong paper, or
|
|
293
|
+
rating-unavailable) rather than vanishing silently; the clean-pass note is hidden
|
|
294
|
+
by `--skipnotes` like any other note. Findings are worded as tentative,
|
|
295
|
+
abstract-only opinions to verify, never authoritative judgements. The provider is pluggable (`llm.py`), but **for now the only
|
|
296
|
+
supported backend is Claude Code** (the `claude` CLI, using your existing login),
|
|
297
|
+
and it defaults to **Claude Haiku** for token efficiency — fast and inexpensive
|
|
298
|
+
for a per-citation rating. **Privacy:** `--llm` sends those cited sentences to the
|
|
299
|
+
provider, so it is off by default and prints a warning — do not use it on a
|
|
300
|
+
confidential manuscript.
|
|
301
|
+
|
|
302
|
+
Identifier formats (DOI, arXiv, **ISBN**, **ISSN**, **ORCID**) are checked offline
|
|
303
|
+
by their format and, where the scheme defines one, their check digit. An entry with a structural **syntax error** is reported, and
|
|
304
|
+
the rest of its checks (record, status, cross-source, LLM) are skipped until it
|
|
305
|
+
parses cleanly — comparing a garbled parse against a record only yields false
|
|
306
|
+
mismatches. When `--tex` is given, a multi-key `\cite{}` **group that is not in
|
|
307
|
+
chronological order** gets an advisory note (some bibliography styles cite the
|
|
308
|
+
earliest work first); it is never an error, since grouped-citation order is a
|
|
309
|
+
style choice, not a standard.
|
|
310
|
+
|
|
311
|
+
## Machine-readable report (`--json`)
|
|
312
|
+
|
|
313
|
+
`--json FILE` writes the report as **NDJSON** (newline-delimited JSON): one
|
|
314
|
+
self-contained JSON record per line. Most lines are one bibliography **entry**,
|
|
315
|
+
keyed by its citation key and carrying everything about it — which `phases` have
|
|
316
|
+
been computed (see [Checkpointing](#checkpointing-and-phased-resume)), its
|
|
317
|
+
verification `status`/`confidence`, the `verify` link, its `identifiers`, the
|
|
318
|
+
matched `canonical_record`, the `sources` that resolved it, and its `issues` (that
|
|
319
|
+
entry's findings). Two reserved records close the file: `"<file>"` (file-level
|
|
320
|
+
findings — duplicates, brace balance, dropped cited keys) and `"<summary>"` (the
|
|
321
|
+
integrity roll-up):
|
|
322
|
+
|
|
323
|
+
```jsonc
|
|
324
|
+
{"key": "amo2009", "phases": {"offline": true, "online": true, "llm": false},
|
|
325
|
+
"status": "VERIFIED", "confidence": 1.0, "verify": "https://doi.org/10.1038/nphys1364",
|
|
326
|
+
"identifiers": {"doi": "10.1038/nphys1364", "arxiv": null, "isbn": null},
|
|
327
|
+
"sources": ["crossref", "inspire"], "canonical_record": {"title": "...", "year": 2009},
|
|
328
|
+
"issues": []}
|
|
329
|
+
{"key": "<file>", "issues": []}
|
|
330
|
+
{"key": "<summary>", "veracite_version": "0.1.1",
|
|
331
|
+
"summary": {"checked": 152, "verified": 151, "verified_with_caveat": 8,
|
|
332
|
+
"unverified": 1, "mismatch": 0, "doi_coverage": 0.94, "pid_coverage": 0.97,
|
|
333
|
+
"integrity_score": 97}}
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
Read it line by line (`for line in open(f): json.loads(line)`); the `"<summary>"`
|
|
337
|
+
record holds the metrics and the `veracite_version` that produced the report (so a
|
|
338
|
+
saved or shared report is traceable to the exact tool revision — the version is also
|
|
339
|
+
printed on the terminal `BIBLIOGRAPHY HEALTH` line). Each remaining, non-reserved record
|
|
340
|
+
is one reference. Under
|
|
341
|
+
`--offline` there is no online verification, so the `"<summary>"` record carries the
|
|
342
|
+
offline mode and finding counts with a null score (`{"mode": "offline",
|
|
343
|
+
"integrity_score": null, ...}`) and each entry appears with `phases.offline = true`,
|
|
344
|
+
the rest `false`, and a null `status`/`canonical_record` — enough for a later online
|
|
345
|
+
run to resume it, never a fabricated score.
|
|
346
|
+
|
|
347
|
+
The NDJSON shape is what makes checkpointing cheap and crash-safe: a finished entry
|
|
348
|
+
is one appended line, so an interrupted run leaves every prior line intact and
|
|
349
|
+
loadable (see below).
|
|
350
|
+
|
|
351
|
+
### Using VeraCite as a verification step for an AI assistant
|
|
352
|
+
|
|
353
|
+
VeraCite is deliberately **read-only**: it never edits your `.bib` or `.tex`. That
|
|
354
|
+
is what lets it serve as an independent **verification gate** in a
|
|
355
|
+
human-supervised AI editing loop — the checker has to be separate from whatever is
|
|
356
|
+
doing the writing, including an LLM. **Applying** the suggested edits is left to a
|
|
357
|
+
supervised tool (e.g. an AI assistant the author is driving), so the deterministic
|
|
358
|
+
checker and the judgement-applying editor stay cleanly separated. The NDJSON report
|
|
359
|
+
*is* the integration surface, designed to be consumed by a program, not just read:
|
|
360
|
+
|
|
361
|
+
- **Every finding is grounded, not generated.** A `metadata_mismatch`,
|
|
362
|
+
`dead_doi`, `id_resolves_wrong_record`, or an `UNVERIFIED` status comes from a
|
|
363
|
+
rule or a comparison against Crossref/arXiv/INSPIRE/OpenAlex, each with a
|
|
364
|
+
`verify:` link the agent can check independently. The `confidence` is a
|
|
365
|
+
deterministic function of which sources agreed — **not** a model output — so an
|
|
366
|
+
agent can trust it to gate its own edits without compounding hallucination.
|
|
367
|
+
- **Findings route by `group`, not by learning every category.** Each issue
|
|
368
|
+
carries a `group` of `syntax` / `semantic` / `context`: `syntax` is the
|
|
369
|
+
written form (safe, mechanical fixes); `semantic` is metadata that should be
|
|
370
|
+
reconciled against the source of record before editing; `context` needs
|
|
371
|
+
judgement. An agent can hold three policies instead of ~25 categories.
|
|
372
|
+
- **Fixable findings carry a structured `suggested` patch** —
|
|
373
|
+
`{"field": ..., "from": ..., "to": ...}`, separated from the prose `message` —
|
|
374
|
+
so a tool can apply an edit as data rather than parsing English. The record is
|
|
375
|
+
the canonical reference, so `to` is the value that conforms the bib to it.
|
|
376
|
+
- **The catch is the point.** A hallucinated reference surfaces as `UNVERIFIED`
|
|
377
|
+
with no findable identifier; a real DOI on the wrong paper as
|
|
378
|
+
`id_resolves_wrong_record` (status `MISMATCH`); a corrupted DOI/ISBN/arXiv id
|
|
379
|
+
fails its offline check digit; a subtly-wrong year/venue/author as a
|
|
380
|
+
`metadata_mismatch` with the registry value to adopt. These are exactly the
|
|
381
|
+
failure modes LLM-drafted bibliographies introduce.
|
|
382
|
+
|
|
383
|
+
Schema stability: the entry-record fields (`status`, `confidence`, `phases`,
|
|
384
|
+
`identifiers`, `canonical_record`, `sources`, `issues`) and each issue's
|
|
385
|
+
`severity` / `group` / `category` / `suggested` shape are the supported contract;
|
|
386
|
+
`--list-rules json` enumerates the full category vocabulary, and the
|
|
387
|
+
`veracite_version` on the `"<summary>"` record pins the producing revision so a
|
|
388
|
+
consumer can detect a contract change.
|
|
389
|
+
|
|
390
|
+
## Checkpointing and phased resume
|
|
391
|
+
|
|
392
|
+
For a large bibliography an online run can take a long time (a few paced network
|
|
393
|
+
calls per entry), so a crash partway through should not throw the work away. When
|
|
394
|
+
you pass `--json report.ndjson`, VeraCite **appends each entry's record as it
|
|
395
|
+
finishes** — an O(1) write, so checkpointing after every entry stays cheap even at
|
|
396
|
+
10k references and a crash loses at most the entry in flight. It can then **resume**
|
|
397
|
+
from that file:
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
python -m veracite --bib refs.bib --offline --json report.ndjson # phase 1: fast, no network
|
|
401
|
+
python -m veracite --bib refs.bib --json report.ndjson # phase 2: resume, resolve online
|
|
402
|
+
python -m veracite --bib refs.bib --tex p/ --json report.ndjson --llm # phase 3: add LLM ratings
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
Point VeraCite at an **existing** report and it loads it, replays the work already
|
|
406
|
+
saved, and runs each entry **only for the checks it does not yet have** — so a job
|
|
407
|
+
can be built up in phases or simply restarted after an interruption. A re-run
|
|
408
|
+
appends a fresh record per entry (the **last line for a key wins** on load); at the
|
|
409
|
+
end of a clean run the file is **compacted** once — rewritten atomically with one
|
|
410
|
+
line per key in bibliography order. A partial line from a crash mid-write is simply
|
|
411
|
+
skipped on load. VeraCite prints a NOTE when it is resuming; **choose a different `--json`
|
|
412
|
+
filename to run from scratch.** The update rule per entry:
|
|
413
|
+
|
|
414
|
+
- **offline** (the static/syntax checks) always re-runs — it is cheap and needs no
|
|
415
|
+
network.
|
|
416
|
+
- the **online** layer runs only for entries not already resolved online; an
|
|
417
|
+
already-resolved entry is reused (its record, status and findings) with no network access.
|
|
418
|
+
- **`--llm`** rates only entries not already rated. Because the rating needs the
|
|
419
|
+
work's abstract — an LLM input that is not persisted — rating an entry also
|
|
420
|
+
re-runs its online layer; an entry already rated is reused, spending no tokens.
|
|
421
|
+
|
|
422
|
+
VeraCite also **warns up front** when a run looks expensive: a bibliography of 200+
|
|
423
|
+
entries run online without `--json` prints a recommendation to add it (so the run
|
|
424
|
+
is saved and resumable), and `--llm` prints how many entries it will rate (it uses
|
|
425
|
+
LLM tokens). Both are warnings only — the run proceeds, so scripts and CI are
|
|
426
|
+
unaffected.
|
|
427
|
+
|
|
428
|
+
## Configuration
|
|
429
|
+
|
|
430
|
+
VeraCite runs with no configuration. Optional settings are read from the first
|
|
431
|
+
of `./veracite.json`, `~/.config/veracite/settings.json`, `~/.veracite.json`, or
|
|
432
|
+
a `--settings FILE` path. None is shipped, so the tool carries no personal data.
|
|
433
|
+
Recognized keys (all optional):
|
|
434
|
+
|
|
435
|
+
```json
|
|
436
|
+
{
|
|
437
|
+
"contact_email": "you@example.org",
|
|
438
|
+
"llm_provider": "claude",
|
|
439
|
+
"llm_models": {"claude": "claude-haiku-4-5-20251001"},
|
|
440
|
+
"document_context": "a paper on <your topic>",
|
|
441
|
+
"protected_terms": ["Rydberg", "Yb", "Pulser"],
|
|
442
|
+
"severity": {"preprint_superseded": "error", "biblatex_validity": "note"},
|
|
443
|
+
"request_delay": 0.2,
|
|
444
|
+
"request_timeout": 20,
|
|
445
|
+
"endpoints": {"crossref_work": "https://api.crossref.org/works/{doi}"}
|
|
446
|
+
}
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
- `contact_email` is added to the User-Agent (Crossref/OpenAlex "polite pool");
|
|
450
|
+
may also be set with `VERACITE_CONTACT_EMAIL`.
|
|
451
|
+
- `llm_provider` selects the `--llm` backend. For now the only supported provider
|
|
452
|
+
is `claude` (Claude Code, via the `claude` CLI and your existing login).
|
|
453
|
+
- `llm_models` pins the model used per provider. The default is **Claude Haiku**
|
|
454
|
+
(`claude-haiku-4-5-20251001`) — chosen for token efficiency, ample for a
|
|
455
|
+
per-citation relevance rating. It is a specific, pinned id for reproducible
|
|
456
|
+
ratings; if that model is ever retired, `--llm` will report `rating unavailable:
|
|
457
|
+
claude CLI failed (model '...')` — set `llm_models` to a current id to fix it, no
|
|
458
|
+
code change needed. Point it at a larger model (e.g. Sonnet) for tougher calls.
|
|
459
|
+
- `severity` re-ranks any finding category to `error`/`warning`/`note`.
|
|
460
|
+
- `protected_terms` lists the project's title terms that must stay capitalized.
|
|
461
|
+
- `request_delay`/`request_timeout` set API pacing; `--delay`/`--timeout`
|
|
462
|
+
override them. Pacing is **per service and time-based**: each external service
|
|
463
|
+
has a minimum interval (`request_delay`, default 0.2 s; arXiv is paced at 3 s) and
|
|
464
|
+
a request waits only the *remainder* of that interval — time already spent on
|
|
465
|
+
other services or the rest of the pipeline counts, and a service whose interval
|
|
466
|
+
has elapsed proceeds immediately. So an entry resolved by Crossref never pays an
|
|
467
|
+
arXiv delay, and arXiv's slow limit spaces out across many entries rather than
|
|
468
|
+
blocking each one. Only a real outbound request ever waits.
|
|
469
|
+
- `endpoints` repoints the external API URLs if a service moves.
|
|
470
|
+
|
|
471
|
+
## Layout
|
|
472
|
+
|
|
473
|
+
```
|
|
474
|
+
veracite/ package: config, parser, normalize, datamodel, report,
|
|
475
|
+
rules, record, llm, cli
|
|
476
|
+
tools/ gen_datamodel.py (regenerates the datamodel JSON)
|
|
477
|
+
tests/ pytest suite + .bib fixtures
|
|
478
|
+
```
|
|
479
|
+
|
|
480
|
+
## Requirements
|
|
481
|
+
|
|
482
|
+
- Python 3.8+. Uses `requests` if present, else the stdlib `urllib`.
|
|
483
|
+
- Network (for the online layers): `api.crossref.org`, `export.arxiv.org`,
|
|
484
|
+
`api.openalex.org`, `api.semanticscholar.org`, `inspirehep.net` (physics),
|
|
485
|
+
`openlibrary.org` / `googleapis.com` (ISBN). All optional and degrade
|
|
486
|
+
gracefully — a source that fails to respond is reported as "could not retrieve",
|
|
487
|
+
never a crash, and `--offline` skips them all.
|
|
488
|
+
- For `--llm` with the default provider: the [`claude`
|
|
489
|
+
CLI](https://docs.claude.com/en/docs/claude-code) on `PATH`, **logged in** (run
|
|
490
|
+
`claude` once and sign in; it needs a Claude account). `--llm` probes the
|
|
491
|
+
provider before the run and, if it is missing or not logged in, stops up front
|
|
492
|
+
with how to fix it rather than failing per entry. Everything except `--llm`
|
|
493
|
+
works with no account.
|
|
494
|
+
|
|
495
|
+
## Known limitations
|
|
496
|
+
|
|
497
|
+
VeraCite compares against registry **metadata**; errors in free text or in
|
|
498
|
+
fields no registry encodes are out of reach. Correction/erratum and published-version coverage is best-effort. "No problem found" means no problem in the checkable fields, not
|
|
499
|
+
that every field was verified.
|
|
500
|
+
|
|
501
|
+
## Tests
|
|
502
|
+
|
|
503
|
+
```bash
|
|
504
|
+
pip install pytest
|
|
505
|
+
python -m pytest
|
|
506
|
+
```
|