veracite 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
veracite/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """VeraCite -- a bibliography health checker for LaTeX projects.
2
+
3
+ Public API: parse a .bib, run the checks, render a report. See cli.main for the
4
+ command-line entry point.
5
+ """
6
+
7
+ from .config import VERSION, load_settings
8
+ from .parser import parse_bib
9
+ from .report import Finding, Report, Severity
10
+ from .rules import run_static, syntax_pass
11
+ from .webcheck import check_bib_text
12
+
13
# Re-export the package version under the conventional dunder name.
__version__ = VERSION
# Names exported by `from veracite import *`; every entry is imported above,
# except `__version__`, which is assigned on the line before this list.
__all__ = [
    "parse_bib", "run_static", "syntax_pass", "check_bib_text",
    "Report", "Finding", "Severity", "load_settings", "__version__",
]
veracite/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Enable `python -m veracite`."""
2
+
3
+ import sys
4
+
5
+ from .cli import main
6
+
7
# Only run the CLI when executed as a script/module, not on import; whatever
# main() returns is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
veracite/catalog.py ADDED
@@ -0,0 +1,170 @@
1
+ """The rule catalog: a publisher's audit sheet, derived from the source of truth.
2
+
3
+ `veracite --list-rules` prints, for every finding category VeraCite can emit, its
4
+ default severity, its group (syntax/semantic/context), what (if anything)
5
+ supersedes it, and a one-line description. A publisher reads this table to decide
6
+ where their house standard disagrees, then encodes the disagreements in a
7
+ settings file's `severity` block -- no code change needed.
8
+
9
+ The catalog is *introspected*, never hand-maintained: the set of categories is
10
+ scanned from the `category="..."` literals in the package source, and the four
11
+ columns are joined from the existing tables (DEFAULT_SETTINGS['severity'],
12
+ report.CATEGORY_GROUP, report.SUPERSEDES, report.CATEGORY_DOC). So it cannot drift
13
+ from what the code actually emits -- and `tests/test_catalog.py` asserts exactly
14
+ that, which is what stops the table from ever going stale.
15
+ """
16
+
17
+ import json
18
+ import os
19
+ import re
20
+
21
+ from .config import DEFAULT_SETTINGS
22
+ from .report import (CATEGORY_DOC, CATEGORY_GROUP, finding_group,
23
+ resolve_severity, SEVERITY_NAMES, SUPERSEDES)
24
+
25
# Absolute path of the directory containing this module (the package directory).
_PKG_DIR = os.path.dirname(os.path.abspath(__file__))
# A `category="..."` literal: double-quoted, lowercase letters and underscores
# only; group 1 is the category name.
_CATEGORY_RE = re.compile(r'category="([a-z_]+)"')
# A (possibly `async`) `def` header anchored at line start: group 1 is the
# leading whitespace, group 2 the function name.
_DEF_RE = re.compile(r'^(\s*)(?:async\s+)?def (\w+)\s*\(')

# Categories deliberately NOT pinned in DEFAULT_SETTINGS['severity']: their checks
# emit MORE THAN ONE severity (author_format: a note for ALL-CAPS surnames, a
# warning for an 'and' glued to a name), and pinning a category flattens all its
# findings to one level. Listed here -- the single source of truth, referenced by
# config.py's comment and asserted by tests/test_catalog.py -- so a new
# mixed-severity category is a conscious choice, not an accidental gap.
INTENTIONALLY_UNPINNED = frozenset({"author_format"})
36
+
37
+
38
def category_sources():
    """Index every emittable category by the source locations that emit it.

    Performs one static scan of the package directory: each `category="..."`
    literal found in a `.py` file (other than catalog.py itself) is attributed
    to the innermost enclosing `def`, or to "<module>" when there is none, and
    recorded as a (file, function, line) tuple.

    Returns a dict mapping category name -> sorted list of those tuples. The
    mapping is many-to-many: a function may emit several categories and a
    category may be emitted from several functions.
    """
    found = {}
    # catalog.py is skipped: its own _CATEGORY_RE literal is not an emit site.
    modules = [entry for entry in sorted(os.listdir(_PKG_DIR))
               if entry.endswith(".py") and entry != "catalog.py"]
    for name in modules:
        with open(os.path.join(_PKG_DIR, name), encoding="utf-8") as fh:
            # Enclosing functions, innermost last, tracked purely by
            # indentation: a `def` owns every following line indented deeper
            # than it, until a line at or left of its own indent appears.
            enclosing = []  # (indent, function name)
            for lineno, line in enumerate(fh, 1):
                body = line.lstrip()
                if not body or body.startswith("#"):
                    continue  # blank lines and comments never close a scope
                indent = len(line) - len(body)
                # Any function at this indent or deeper has ended.
                while enclosing and enclosing[-1][0] >= indent:
                    enclosing.pop()
                header = _DEF_RE.match(line)
                if header:
                    enclosing.append((indent, header.group(2)))
                for cat in _CATEGORY_RE.findall(line):
                    owner = enclosing[-1][1] if enclosing else "<module>"
                    found.setdefault(cat, set()).add((name, owner, lineno))
    return {cat: sorted(locations) for cat, locations in found.items()}
75
+
76
+
77
def emitted_categories():
    """Return the set of every category found by the package source scan.

    This is simply the key set of `category_sources()`, so the category list
    and the source index always come from the same scanner.
    """
    by_category = category_sources()
    return set(by_category)
84
+
85
+
86
def default_severity_label(category):
    """Return the label a category resolves to when the user overrides nothing.

    Looks the category up in DEFAULT_SETTINGS['severity']:
      * a configured value whose lowercased string form is in SEVERITY_NAMES
        is returned (lowercased);
      * otherwise 'mixed' if the category is in INTENTIONALLY_UNPINNED;
      * otherwise None -- an unexpected gap left for the caller/test to surface.
    """
    pinned = DEFAULT_SETTINGS.get("severity", {})
    raw = pinned.get(category)
    if raw:
        label = str(raw).lower()
        if label in SEVERITY_NAMES:
            return label
    return "mixed" if category in INTENTIONALLY_UNPINNED else None
98
+
99
+
100
def catalog():
    """Build the full catalog: a list of dicts, one per category, sorted by name.

    Each row has the keys `category`, `default_severity`, `group`,
    `superseded_by` (None when nothing supersedes it), `description` ("" when
    CATEGORY_DOC has no entry) and `sources` -- a list of
    {"function", "file", "line"} dicts pointing at the code that emits the
    category.

    The package source is scanned exactly once: the rows are driven by the keys
    of the `category_sources()` map rather than by a second call through
    `emitted_categories()`, which would re-read every module and yield the same
    key set.
    """
    srcmap = category_sources()
    rows = []
    for cat in sorted(srcmap):
        sup = SUPERSEDES.get(cat)
        rows.append({
            "category": cat,
            "default_severity": default_severity_label(cat),
            "group": finding_group(cat),
            # Element 1 of a SUPERSEDES entry is what this row reports as the
            # superseding category.
            "superseded_by": sup[1] if sup else None,
            "description": CATEGORY_DOC.get(cat, ""),
            "sources": [{"function": fn, "file": f, "line": ln}
                        for (f, fn, ln) in srcmap[cat]],
        })
    return rows
121
+
122
+
123
def _source_functions(row, limit=None):
    """Comma-join the distinct function names listed in a row's `sources`.

    Names keep their first-seen order. With a truthy `limit`, at most `limit`
    names are shown and the remainder is summarised as ' +N more'; without one
    every name is listed.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    names = list(dict.fromkeys(src["function"] for src in row["sources"]))
    if not limit or len(names) <= limit:
        return ", ".join(names)
    hidden = len(names) - limit
    return ", ".join(names[:limit]) + f" +{hidden} more"
135
+
136
+
137
def _fmt_table(rows):
    """Render catalog rows as a plain-text table with a header and a rule line.

    `rows` is a list of catalog row dicts. Columns are left-justified to the
    widest of the header and every cell in that column. Missing/empty values
    are shown as '-'; the 'rules' column lists up to three emitting functions.

    An empty `rows` list yields just the header and the dashed rule (the
    original raised TypeError there, because `max()` received a single int).
    """
    cols = [("category", "category"), ("default_severity", "severity"),
            ("group", "group"), ("superseded_by", "superseded by"),
            ("rules", "rules (in source)"), ("description", "description")]

    def cell(r, key):
        if key == "rules":
            return _source_functions(r, limit=3) or "-"
        v = r.get(key)
        return "-" if v in (None, "") else str(v)

    # Render every cell once; widths and output lines both reuse this grid.
    grid = [[cell(r, k) for k, _ in cols] for r in rows]
    headers = [hdr for _, hdr in cols]
    # max() over a list so that a column with no data rows still has a width.
    widths = [max([len(hdr), *(len(cells[i]) for cells in grid)])
              for i, hdr in enumerate(headers)]

    def fmt(vals):
        return " ".join(v.ljust(w) for v, w in zip(vals, widths))

    out = [fmt(headers), fmt(["-" * w for w in widths])]
    out.extend(fmt(cells) for cells in grid)
    return "\n".join(out)
152
+
153
+
154
def print_catalog(as_json=False, stream=None):
    """Write the rule catalog to `stream` (stdout when none is given).

    With `as_json` the rows are emitted as an indented JSON object under the
    key "rules"; otherwise a text table is written, followed by a short footer
    explaining the 'rules' column and how to re-rank a category.
    """
    import sys
    out = stream or sys.stdout
    rows = catalog()
    if as_json:
        out.write(json.dumps({"rules": rows}, indent=2))
        out.write("\n")
        return
    footer = (
        f"\n{len(rows)} finding categories. The 'rules' column names the function(s) "
        "that emit each\ncategory -- read those in veracite/ for the exact logic "
        "('--list-rules json' gives\nfull file:line). Re-rank any category in a "
        "settings file's \"severity\" block, e.g.\n"
        ' {"severity": {"style": "warning", "title_case": "note"}}\n'
    )
    out.write(_fmt_table(rows) + "\n")
    out.write(footer)
veracite/checkpoint.py ADDED
@@ -0,0 +1,321 @@
1
+ """Checkpointing: persist a (possibly partial) run to the --json report as an
2
+ append-only NDJSON log, and rebuild it on a later run -- so a large or interrupted
3
+ job is resumable and can be completed in phases (offline -> online -> llm).
4
+
5
+ The on-disk format is **NDJSON**: one self-contained JSON record per line. Most
6
+ lines are one bibliography ENTRY, keyed by its citation key and carrying everything
7
+ about it -- which `phases` it has, its verification `status`/`confidence`,
8
+ `identifiers`, the matched `canonical_record`, the `sources`, and its `issues`
9
+ (findings):
10
+
11
+ {"key": "k0", "phases": {...}, "status": "VERIFIED", "issues": [...], ...}
12
+ {"key": "k1", ...}
13
+ {"key": "<file>", "issues": [...]} # file-level findings (duplicates, ...)
14
+ {"key": "<summary>", "summary": {...}} # the integrity roll-up / offline stub
15
+
16
+ Why NDJSON: a new entry is a single O(1) APPEND (no rewrite of the whole growing
17
+ report), so checkpointing after every entry stays cheap even at 10k references, and
18
+ a crash mid-run leaves every prior line intact (a torn final line is just skipped on
19
+ load). Re-running an entry simply APPENDS a fresh record for that key; on load, the
20
+ LAST record per key wins, so an updated entry supersedes its earlier state with no
21
+ in-place edit. At the end of a clean run the file is COMPACTED -- rewritten once,
22
+ atomically, with exactly one line per key in bibliography order.
23
+
24
+ This module owns all of that format knowledge (the per-key record shape, append,
25
+ load with last-wins, compaction) so the CLI driver and report.py stay agnostic.
26
+ """
27
+
28
+ import json
29
+ import os
30
+
31
+ from .models import Record
32
+ from .record import Resolution
33
+ from .report import Finding, Severity
34
+
35
# Phases from weakest to strongest. A run "requests" a set of phases, and an
# entry is (re)processed for each requested phase it does not already have.
PHASES = ("offline", "online", "llm")

# Reserved record keys for the two non-entry records.
FILE_KEY = "<file>"
SUMMARY_KEY = "<summary>"

# Category -> producing phase. Only online/llm categories are listed, because
# those are the findings that must NOT be discarded when resuming a job that
# already paid for them. Any category absent from both sets is treated as
# 'offline' (static/syntax rules): cheap to recompute, never network-dependent.
_ONLINE_CATEGORIES = {
    "metadata_mismatch",
    "source_conflict",
    "record_unresolved",
    "dead_doi",
    "retraction",
    "related_work",
    "preprint_superseded",
    "id_resolves_wrong_record",
    "doi_available",
    "pid_missing",
    "pid_optional",
    "container_granularity",
}
_LLM_CATEGORIES = {
    "llm_relevance",
    "wrong_paper",
    "llm_config",
}
55
+
56
+
57
def finding_phase(f):
    """Classify finding `f` as belonging to the 'llm', 'online' or 'offline' phase.

    The LLM test is applied first (category in _LLM_CATEGORIES, or layer equal
    to "llm"), then the online test (category in _ONLINE_CATEGORIES, or layer
    in Report._ONLINE_LAYERS); anything matching neither is 'offline'.
    """
    from .report import Report  # lazy: avoid an import cycle at module load

    from_llm = f.category in _LLM_CATEGORIES or f.layer == "llm"
    if from_llm:
        return "llm"
    from_online = (f.category in _ONLINE_CATEGORIES
                   or f.layer in Report._ONLINE_LAYERS)
    return "online" if from_online else "offline"
66
+
67
+
68
def requested_phases(online, llm):
    """Return the set of phases this invocation asks to (re)compute.

    'offline' is always included; 'online' is added when `online` is truthy and
    'llm' when `llm` is truthy. Note that this function does not itself add
    'online' when only `llm` is set -- the "--llm implies online" rule is
    presumably applied by the caller (TODO confirm against the CLI).
    """
    optional = (("online", online), ("llm", llm))
    return {"offline"} | {phase for phase, enabled in optional if enabled}
77
+
78
+
79
+ # --- per-entry record (the NDJSON line) ------------------------------------
80
+
81
def entry_record(key, res, status, conf, phases, issues, verify=None):
    """Assemble the persisted NDJSON record for a single bibliography entry.

    `res` is the entry's Resolution, or a falsy value for an offline-only
    entry, in which case the identifiers are all None, `sources` is empty and
    `canonical_record` is None. `phases` is the collection of phase names the
    entry has; it is expanded to a {phase: bool} map over PHASES. `issues` is
    stored as given (a list of finding dicts).
    """
    if res:
        identifiers = {"doi": res.doi or None,
                       "arxiv": res.arxiv_id or None,
                       "isbn": res.isbn or None}
        source_names = sorted(res.sources)
        matched = res.record or {}
    else:
        identifiers = {"doi": None, "arxiv": None, "isbn": None}
        source_names = []
        matched = {}

    # Only the comparable bibliographic fields of the matched record are kept.
    canonical = None
    if matched:
        kept_fields = ("title", "year", "journal", "volume", "number", "pages")
        canonical = {field: matched.get(field) for field in kept_fields}

    return {
        "key": key,
        "phases": {phase: (phase in phases) for phase in PHASES},
        "status": status,
        "confidence": conf,
        "verify": verify,
        "identifiers": identifiers,
        "sources": source_names,
        "canonical_record": canonical,
        "issues": issues,
    }
103
+
104
+
105
def file_record(issues):
    """Build the reserved file-level record, stored under FILE_KEY, holding the
    findings that are not tied to a single entry."""
    record = {"key": FILE_KEY}
    record["issues"] = issues
    return record
109
+
110
+
111
def summary_record(summary):
    """Build the reserved summary record, stored under SUMMARY_KEY.

    The record carries `summary` unchanged plus `veracite_version`, the VERSION
    of the tool that wrote it, so a saved report can be traced to a revision.
    """
    from .config import VERSION
    return dict(key=SUMMARY_KEY, veracite_version=VERSION, summary=summary)
118
+
119
+
120
def append_record(path, record):
    """Append `record` to the checkpoint at `path` as one JSON line.

    The file is opened in append mode, so nothing already written is touched.
    Returns True when the line was written. An OSError is reported as a warning
    on stderr and turned into a False return instead of propagating.
    """
    try:
        with open(path, "a", encoding="utf-8") as fh:
            payload = json.dumps(record, ensure_ascii=False)
            fh.write(f"{payload}\n")
    except OSError as ex:
        import sys
        print(f"\nwarning: could not write checkpoint to {path}: {ex}", file=sys.stderr)
        return False
    else:
        return True
133
+
134
+
135
def compact(path, ordered_keys):
    """Rewrite the checkpoint so that each key appears on exactly one line.

    Output order: the keys of `ordered_keys` that have a record (first
    occurrence only), then the <file> and <summary> records, then any other
    record found in the file, so nothing is silently dropped. The new content
    is written to a per-process temp file and swapped in with os.replace.

    Returns True on success; False when the checkpoint cannot be read or an
    OSError occurs (the latter is also reported on stderr).
    """
    records, _ = _read_records(path)  # last-wins map: key -> record
    if records is None:
        return False

    preferred = [*ordered_keys, FILE_KEY, SUMMARY_KEY]
    # dict.fromkeys keeps first-seen order while dropping repeated keys.
    placed = dict.fromkeys(k for k in preferred if k in records)
    leftovers = [k for k in records if k not in placed]
    lines = [json.dumps(records[k], ensure_ascii=False)
             for k in (*placed, *leftovers)]

    tmp = f"{path}.tmp.{os.getpid()}"
    try:
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write("".join(f"{text}\n" for text in lines))
        os.replace(tmp, path)
    except OSError as ex:
        import sys
        print(f"\nwarning: could not compact checkpoint {path}: {ex}", file=sys.stderr)
        # Best-effort cleanup of the temp file; it may never have been created.
        try:
            os.remove(tmp)
        except OSError:
            pass
        return False
    return True
170
+
171
+
172
def _read_records(path):
    """Read the NDJSON checkpoint into an ordered, last-wins map {key: record}.

    A later line for a key replaces the earlier record but keeps the key's
    first-seen position, so bibliography order is stable. Lines are skipped
    (never fatal) when they are blank, are not valid UTF-8, are not valid JSON,
    are not a JSON object, or carry no string `key`.

    Returns (records, n_lines), where n_lines counts the accepted lines, or
    (None, 0) if `path` is empty, not a regular file, or unreadable.
    """
    if not path or not os.path.isfile(path):
        return None, 0
    records = {}
    n = 0
    try:
        # Read bytes and decode per line: records are written with
        # ensure_ascii=False, so a crash can tear the last line in the middle
        # of a multi-byte character. Decoding the whole file as strict text
        # would then raise UnicodeDecodeError and abort the load, instead of
        # skipping just the torn line.
        with open(path, "rb") as fh:
            for raw in fh:
                try:
                    # UnicodeDecodeError and JSONDecodeError are both
                    # ValueError subclasses; a blank line fails here too.
                    rec = json.loads(raw.decode("utf-8").strip())
                except ValueError:
                    continue  # torn / partial / foreign line -- skip it
                if not isinstance(rec, dict):
                    continue
                key = rec.get("key")
                # Keys are citation keys or the reserved markers, i.e. strings.
                # Anything else (including an unhashable list/dict, which would
                # raise TypeError as a dict key) marks a foreign record.
                if not isinstance(key, str):
                    continue
                n += 1
                records[key] = rec  # last wins; insertion order preserved
    except OSError:
        return None, 0
    return records, n
199
+
200
+
201
class Checkpoint:
    """A prior run reconstructed from an NDJSON checkpoint file.

    After `load`, the instance exposes the replayed findings, per-key
    Resolutions, (status, confidence) pairs and verify links, plus
    `phases_by_key[key]` -- the set of phases already recorded for an entry --
    so a driver can reuse saved work and ask `needs()` what is still missing.
    """

    def __init__(self, path):
        self.path = path
        self.loaded = False
        self.summary = None          # the saved summary payload, if any
        self.findings = []           # replayed Finding objects (all keys)
        self._findings_by_key = {}   # key -> list of replayed Findings
        self.results = {}            # key -> Resolution
        self.statuses = {}           # key -> (status, confidence)
        self.links = {}              # key -> verify url
        self.phases_by_key = {}      # key -> set(phases done)

    @classmethod
    def load(cls, path):
        """Return a Checkpoint (with `.loaded` True) built from `path`, or None
        when the file is absent/unreadable or yields no usable record."""
        records = _read_records(path)[0]
        if not records:
            return None
        checkpoint = cls(path)
        checkpoint._replay(records)
        checkpoint.loaded = True
        return checkpoint

    def _replay(self, records):
        """Populate this instance from the last-wins {key: record} map."""
        for key, rec in records.items():
            if key == SUMMARY_KEY:
                self.summary = rec.get("summary")
                continue
            self._replay_issues(key, rec)
            if key != FILE_KEY:
                self._replay_entry(key, rec)

    def _replay_issues(self, key, rec):
        """Rebuild the record's issues as Findings, indexed globally and by key."""
        for issue in rec.get("issues", []):
            finding = _finding_from_dict(issue, key)
            if finding is None:
                continue
            self.findings.append(finding)
            self._findings_by_key.setdefault(key, []).append(finding)

    def _replay_entry(self, key, rec):
        """Restore the phase coverage, Resolution, status and link of one entry."""
        saved_phases = rec.get("phases") or {}
        self.phases_by_key[key] = {name for name, done in saved_phases.items() if done}
        self.results[key] = _resolution_from_record(rec)
        confidence = float(rec.get("confidence") or 0.0)
        self.statuses[key] = (rec.get("status"), confidence)
        verify = rec.get("verify")
        if verify:
            self.links[key] = verify

    # -- phase coverage -----------------------------------------------------

    def has(self, key, phase):
        """True when `phase` is recorded as done for `key`."""
        done = self.phases_by_key.get(key, frozenset())
        return phase in done

    def needs(self, key, requested):
        """Return the requested phases that `key` has not already computed;
        an empty set means the saved entry already satisfies the request.

        One coupling applies: whenever 'llm' is still to do, 'online' is added
        as well. The LLM rating needs the work's abstract, which is fetched by
        the online layer and is not persisted in the checkpoint.
        """
        missing = set()
        for phase in requested:
            if not self.has(key, phase=phase):
                missing.add(phase)
        if "llm" in missing:
            missing.add("online")
        return missing

    def seed_findings_for(self, key, keep_phases):
        """Return the saved findings of `key` whose producing phase is in
        `keep_phases` -- the findings to replay because this run is not
        recomputing that phase."""
        kept = []
        for finding in self._findings_by_key.get(key, []):
            if finding_phase(finding) in keep_phases:
                kept.append(finding)
        return kept

    def file_findings(self):
        """Return a fresh list of the replayed file-level (<file>) findings."""
        saved = self._findings_by_key.get(FILE_KEY, [])
        return list(saved)
282
+
283
+
284
def _finding_from_dict(fd, key):
    """Turn a persisted issue dict back into a Finding for entry `key`.

    The key is supplied by the enclosing record rather than read from the
    issue. A missing severity, or a name that is not a Severity member, falls
    back to Severity.INFO so that a hand-edited report does not crash the load.
    """
    severity_name = fd.get("severity", "INFO")
    try:
        severity = Severity[severity_name]
    except KeyError:
        severity = Severity.INFO
    fields = dict(
        severity=severity,
        key=key,
        line=int(fd.get("line") or 0),
        message=fd.get("message", ""),
        layer=fd.get("layer", "static"),
        category=fd.get("category", ""),
        suggested=fd.get("suggested"),
    )
    return Finding(**fields)
296
+
297
+
298
def _resolution_from_record(rec):
    """Reconstruct a Resolution from a saved entry record.

    Restores the identifiers, the canonical Record (when one was saved), the
    primary source (the first saved source name, or "" when there are none)
    and the `sources` mapping. Abstracts are not part of the saved record.
    """
    resolution = Resolution()

    identifiers = rec.get("identifiers") or {}
    for attr, field in (("doi", "doi"), ("arxiv_id", "arxiv"), ("isbn", "isbn")):
        setattr(resolution, attr, identifiers.get(field) or "")

    canonical = rec.get("canonical_record")
    if canonical:
        def text(field):
            # Missing/None text fields are restored as empty strings.
            return canonical.get(field) or ""

        resolution.record = Record(title=text("title"),
                                   year=canonical.get("year"),
                                   journal=text("journal"),
                                   volume=text("volume"),
                                   number=text("number"),
                                   pages=text("pages"))

    source_names = rec.get("sources") or []
    resolution.source = source_names[0] if source_names else ""
    for source_name in source_names:
        # Per-source records are not persisted individually. The primary source
        # is mapped to the canonical record and every other source to an empty
        # placeholder, which keeps len(sources) and the names intact.
        if source_name == resolution.source:
            resolution.sources[source_name] = resolution.record
        else:
            resolution.sources[source_name] = {}
    return resolution