veracite 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veracite/__init__.py +17 -0
- veracite/__main__.py +8 -0
- veracite/catalog.py +170 -0
- veracite/checkpoint.py +321 -0
- veracite/cli.py +453 -0
- veracite/compare.py +466 -0
- veracite/config.py +163 -0
- veracite/data/biblatex_datamodel.json +1334 -0
- veracite/data/journal_abbrev.json +426 -0
- veracite/datamodel.py +101 -0
- veracite/http.py +113 -0
- veracite/identifiers.py +42 -0
- veracite/llm.py +382 -0
- veracite/models.py +51 -0
- veracite/normalize.py +365 -0
- veracite/parser.py +375 -0
- veracite/pipeline.py +37 -0
- veracite/record.py +271 -0
- veracite/report.py +671 -0
- veracite/rules.py +849 -0
- veracite/sources.py +349 -0
- veracite/titles.py +79 -0
- veracite/verify.py +389 -0
- veracite/webcheck.py +189 -0
- veracite-0.1.1.dist-info/METADATA +506 -0
- veracite-0.1.1.dist-info/RECORD +30 -0
- veracite-0.1.1.dist-info/WHEEL +5 -0
- veracite-0.1.1.dist-info/entry_points.txt +2 -0
- veracite-0.1.1.dist-info/licenses/LICENSE +21 -0
- veracite-0.1.1.dist-info/top_level.txt +1 -0
veracite/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""VeraCite -- a bibliography health checker for LaTeX projects.
|
|
2
|
+
|
|
3
|
+
Public API: parse a .bib, run the checks, render a report. See cli.main for the
|
|
4
|
+
command-line entry point.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .config import VERSION, load_settings
|
|
8
|
+
from .parser import parse_bib
|
|
9
|
+
from .report import Finding, Report, Severity
|
|
10
|
+
from .rules import run_static, syntax_pass
|
|
11
|
+
from .webcheck import check_bib_text
|
|
12
|
+
|
|
13
|
+
# Expose the version constant imported from .config under the conventional name.
__version__ = VERSION
# The names exported by `from veracite import *`.
__all__ = [
    "parse_bib", "run_static", "syntax_pass", "check_bib_text",
    "Report", "Finding", "Severity", "load_settings", "__version__",
]
|
veracite/__main__.py
ADDED
veracite/catalog.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""The rule catalog: a publisher's audit sheet, derived from the source of truth.
|
|
2
|
+
|
|
3
|
+
`veracite --list-rules` prints, for every finding category VeraCite can emit, its
|
|
4
|
+
default severity, its group (syntax/semantic/context), what (if anything)
|
|
5
|
+
supersedes it, and a one-line description. A publisher reads this table to decide
|
|
6
|
+
where their house standard disagrees, then encodes the disagreements in a
|
|
7
|
+
settings file's `severity` block -- no code change needed.
|
|
8
|
+
|
|
9
|
+
The catalog is *introspected*, never hand-maintained: the set of categories is
|
|
10
|
+
scanned from the `category="..."` literals in the package source, and the four
|
|
11
|
+
columns are joined from the existing tables (DEFAULT_SETTINGS['severity'],
|
|
12
|
+
report.CATEGORY_GROUP, report.SUPERSEDES, report.CATEGORY_DOC). So it cannot drift
|
|
13
|
+
from what the code actually emits -- and `tests/test_catalog.py` asserts exactly
|
|
14
|
+
that, which is what stops the table from ever going stale.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
|
|
21
|
+
from .config import DEFAULT_SETTINGS
|
|
22
|
+
from .report import (CATEGORY_DOC, CATEGORY_GROUP, finding_group,
|
|
23
|
+
resolve_severity, SEVERITY_NAMES, SUPERSEDES)
|
|
24
|
+
|
|
25
|
+
# Absolute directory of this module's file; category_sources() lists and reads
# the *.py files found here.
_PKG_DIR = os.path.dirname(os.path.abspath(__file__))
# A `category="name"` literal; group 1 is the name (lowercase letters and '_').
_CATEGORY_RE = re.compile(r'category="([a-z_]+)"')
# A `def` / `async def` line; group 1 is its leading whitespace, group 2 its name.
_DEF_RE = re.compile(r'^(\s*)(?:async\s+)?def (\w+)\s*\(')

# Categories deliberately NOT pinned in DEFAULT_SETTINGS['severity']: their checks
# emit MORE THAN ONE severity (author_format: a note for ALL-CAPS surnames, a
# warning for an 'and' glued to a name), and pinning a category flattens all its
# findings to one level. Listed here -- the single source of truth, referenced by
# config.py's comment and asserted by tests/test_catalog.py -- so a new
# mixed-severity category is a conscious choice, not an accidental gap.
INTENTIONALLY_UNPINNED = frozenset({"author_format"})
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def category_sources():
    """Index every `category="..."` literal in the package by category name.

    Reads each `*.py` file in the package directory (except catalog.py, whose
    own regex literal is not an emit site) line by line. Blank lines and lines
    whose first non-blank character is `#` are ignored. Each literal is
    attributed to the innermost `def` that encloses it by indentation, or to
    "<module>" when no `def` encloses it.

    Returns a dict mapping category -> sorted list of
    (file name, function name, line number) tuples.
    """
    found = {}
    module_names = sorted(n for n in os.listdir(_PKG_DIR)
                          if n.endswith(".py") and n != "catalog.py")
    for name in module_names:
        with open(os.path.join(_PKG_DIR, name), encoding="utf-8") as fh:
            enclosing = []  # stack of (indent, function name), outermost first
            for lineno, line in enumerate(fh, 1):
                text = line.lstrip()
                if not text or text.startswith("#"):
                    continue
                depth = len(line) - len(text)
                # Any def indented at least as deep as this line has ended.
                while enclosing and enclosing[-1][0] >= depth:
                    enclosing.pop()
                def_match = _DEF_RE.match(line)
                if def_match is not None:
                    enclosing.append((depth, def_match.group(2)))
                owner = enclosing[-1][1] if enclosing else "<module>"
                for cat in _CATEGORY_RE.findall(line):
                    found.setdefault(cat, set()).add((name, owner, lineno))
    return {cat: sorted(locs) for cat, locs in found.items()}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def emitted_categories():
    """Return the set of category names found by the static source scan.

    This is exactly the key set of `category_sources()`, so the category list
    and the source index come from the same scan.
    """
    by_category = category_sources()
    return set(by_category)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def default_severity_label(category):
    """Return the label shown for `category`'s default severity.

    Looks the category up in DEFAULT_SETTINGS['severity']. If a value is
    configured and its lowercased string form is in SEVERITY_NAMES, that
    lowercased name is returned. Otherwise the result is 'mixed' for a category
    listed in INTENTIONALLY_UNPINNED, and None for anything else (an
    unexpected gap that the caller or a test is expected to surface).
    """
    pinned = DEFAULT_SETTINGS.get("severity", {}).get(category)
    if pinned:
        label = str(pinned).lower()
        if label in SEVERITY_NAMES:
            return label
    return "mixed" if category in INTENTIONALLY_UNPINNED else None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def catalog():
    """Build the full catalog: a list of dicts, one per emittable category,
    sorted by category name.

    Each row has the keys `category`, `default_severity`, `group`,
    `superseded_by`, `description` and `sources`. `sources` lists every
    function / file / line that emits the category, so the row points at the
    code to read to see what a check does.

    The package is scanned ONCE: the rows iterate the keys of the source map
    directly instead of calling emitted_categories(), which would re-read every
    module from disk only to produce the same key set.
    """
    srcmap = category_sources()
    rows = []
    for cat in sorted(srcmap):
        sup = SUPERSEDES.get(cat)
        rows.append({
            "category": cat,
            "default_severity": default_severity_label(cat),
            "group": finding_group(cat),
            # The row shows element 1 of the SUPERSEDES entry, when there is one.
            "superseded_by": sup[1] if sup else None,
            "description": CATEGORY_DOC.get(cat, ""),
            "sources": [{"function": fn, "file": f, "line": ln}
                        for (f, fn, ln) in srcmap[cat]],
        })
    return rows
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _source_functions(row, limit=None):
    """Return the distinct emitting function names of a catalog row, joined
    with ", " in first-seen order.

    `row["sources"]` is a list of dicts each carrying a "function" entry. When
    `limit` is truthy and there are more distinct names than `limit`, only the
    first `limit` names are shown, followed by " +N more" for the remainder.
    """
    names = []
    for src in row["sources"]:
        fn = src["function"]
        if fn in names:
            continue
        names.append(fn)
    if limit and len(names) > limit:
        shown = ", ".join(names[:limit])
        hidden = len(names) - limit
        return f"{shown} +{hidden} more"
    return ", ".join(names)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _fmt_table(rows):
    """Render catalog rows as a plain-text table and return it as one string.

    The table has a header line, a dashed rule, then one line per row. Every
    column is left-justified to the width of its widest cell (or its header,
    if that is wider). Empty / None cells are shown as "-"; the "rules" column
    is the row's emitting functions, elided after three names.

    An empty `rows` list yields just the header and the rule. The widths are
    taken with max() over a list, because max(len(hdr), *()) collapses to
    max(<int>) and raises TypeError when there are no rows.
    """
    cols = [("category", "category"), ("default_severity", "severity"),
            ("group", "group"), ("superseded_by", "superseded by"),
            ("rules", "rules (in source)"), ("description", "description")]

    def cell(r, key):
        if key == "rules":
            return _source_functions(r, limit=3) or "-"
        v = r.get(key)
        return "-" if v in (None, "") else str(v)

    # Format every cell once; both the width computation and the output use it.
    table = [[cell(r, k) for k, _ in cols] for r in rows]
    widths = {
        k: max([len(hdr)] + [cells[i] and len(cells[i]) for cells in table])
        for i, (k, hdr) in enumerate(cols)
    }

    def line(vals):
        return " ".join(v.ljust(widths[k]) for (k, _), v in zip(cols, vals))

    out = [line([hdr for _, hdr in cols]),
           line(["-" * widths[k] for k, _ in cols])]
    out.extend(line(cells) for cells in table)
    return "\n".join(out)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def print_catalog(as_json=False, stream=None):
    """Write the catalog to `stream`, falling back to sys.stdout when `stream`
    is falsy.

    With `as_json` the rows are dumped as `{"rules": [...]}` (indent 2) plus a
    trailing newline. Otherwise the text table is written, followed by a short
    footer explaining the 'rules' column and how to re-rank a category.
    """
    import sys
    out = stream or sys.stdout
    rows = catalog()
    if not as_json:
        out.write(_fmt_table(rows) + "\n")
        footer = (
            f"\n{len(rows)} finding categories. The 'rules' column names the function(s) "
            "that emit each\ncategory -- read those in veracite/ for the exact logic "
            "('--list-rules json' gives\nfull file:line). Re-rank any category in a "
            "settings file's \"severity\" block, e.g.\n"
            ' {"severity": {"style": "warning", "title_case": "note"}}\n')
        out.write(footer)
        return
    json.dump({"rules": rows}, out, indent=2)
    out.write("\n")
|
veracite/checkpoint.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""Checkpointing: persist a (possibly partial) run to the --json report as an
|
|
2
|
+
append-only NDJSON log, and rebuild it on a later run -- so a large or interrupted
|
|
3
|
+
job is resumable and can be completed in phases (offline -> online -> llm).
|
|
4
|
+
|
|
5
|
+
The on-disk format is **NDJSON**: one self-contained JSON record per line. Most
|
|
6
|
+
lines are one bibliography ENTRY, keyed by its citation key and carrying everything
|
|
7
|
+
about it -- which `phases` it has, its verification `status`/`confidence`,
|
|
8
|
+
`identifiers`, the matched `canonical_record`, the `sources`, and its `issues`
|
|
9
|
+
(findings):
|
|
10
|
+
|
|
11
|
+
{"key": "k0", "phases": {...}, "status": "VERIFIED", "issues": [...], ...}
|
|
12
|
+
{"key": "k1", ...}
|
|
13
|
+
{"key": "<file>", "issues": [...]} # file-level findings (duplicates, ...)
|
|
14
|
+
{"key": "<summary>", "summary": {...}} # the integrity roll-up / offline stub
|
|
15
|
+
|
|
16
|
+
Why NDJSON: a new entry is a single O(1) APPEND (no rewrite of the whole growing
|
|
17
|
+
report), so checkpointing after every entry stays cheap even at 10k references, and
|
|
18
|
+
a crash mid-run leaves every prior line intact (a torn final line is just skipped on
|
|
19
|
+
load). Re-running an entry simply APPENDS a fresh record for that key; on load, the
|
|
20
|
+
LAST record per key wins, so an updated entry supersedes its earlier state with no
|
|
21
|
+
in-place edit. At the end of a clean run the file is COMPACTED -- rewritten once,
|
|
22
|
+
atomically, with exactly one line per key in bibliography order.
|
|
23
|
+
|
|
24
|
+
This module owns all of that format knowledge (the per-key record shape, append,
|
|
25
|
+
load with last-wins, compaction) so the CLI driver and report.py stay agnostic.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
|
|
31
|
+
from .models import Record
|
|
32
|
+
from .record import Resolution
|
|
33
|
+
from .report import Finding, Severity
|
|
34
|
+
|
|
35
|
+
# The phase order, weakest first. A run "requests" a set of phases; an entry is
# (re)processed for a requested phase it does not already have.
PHASES = ("offline", "online", "llm")

# Reserved keys for the non-entry records.
FILE_KEY = "<file>"
SUMMARY_KEY = "<summary>"

# Which phase produces a finding, by its category. Anything not listed is treated
# as 'offline' (the static/syntax rules), the conservative default -- those
# findings are cheap to recompute and never depend on the network. Only the
# online/llm categories must be named, since those are the ones we must NOT throw
# away when resuming a job that already paid for them.
# finding_phase() maps a category in this set to the 'online' phase.
_ONLINE_CATEGORIES = {
    "metadata_mismatch", "source_conflict", "record_unresolved", "dead_doi",
    "retraction", "related_work", "preprint_superseded",
    "id_resolves_wrong_record", "doi_available", "pid_missing", "pid_optional",
    "container_granularity",
}
# finding_phase() maps a category in this set to the 'llm' phase (checked first).
_LLM_CATEGORIES = {"llm_relevance", "wrong_paper", "llm_config"}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def finding_phase(f):
    """Classify finding `f` as belonging to the 'llm', 'online' or 'offline' phase.

    A finding is 'llm' if its category is in _LLM_CATEGORIES or its layer is
    "llm"; otherwise 'online' if its category is in _ONLINE_CATEGORIES or its
    layer is one of Report._ONLINE_LAYERS; otherwise 'offline'.
    """
    from .report import Report  # lazy: avoid an import cycle at module load
    if f.category in _LLM_CATEGORIES or f.layer == "llm":
        phase = "llm"
    elif f.category in _ONLINE_CATEGORIES or f.layer in Report._ONLINE_LAYERS:
        phase = "online"
    else:
        phase = "offline"
    return phase
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def requested_phases(online, llm):
    """Return the set of phase names this invocation asks for.

    'offline' is always included; 'online' is included when `online` is
    truthy and 'llm' when `llm` is truthy. The two flags are independent here:
    this function does not itself add 'online' when only `llm` is set.
    """
    wanted = (("offline", True), ("online", online), ("llm", llm))
    return {phase for phase, enabled in wanted if enabled}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# --- per-entry record (the NDJSON line) ------------------------------------
|
|
80
|
+
|
|
81
|
+
def entry_record(key, res, status, conf, phases, issues, verify=None):
    """Build the dict persisted as one NDJSON line for a bibliography entry.

    `res` is a Resolution, or a falsy value for an entry with no resolution;
    in that case the identifiers are all None, `sources` is empty and
    `canonical_record` is None. `phases` is the collection of phase names the
    entry has; it is stored as a {phase: bool} map over PHASES. `issues` is
    stored as given (a list of finding dicts).

    Empty identifier strings are stored as None, and the source names are
    stored sorted.
    """
    if res:
        rec = res.record or {}
        identifiers = {"doi": res.doi or None,
                       "arxiv": res.arxiv_id or None,
                       "isbn": res.isbn or None}
        source_names = sorted(res.sources)
    else:
        rec = {}
        identifiers = {"doi": None, "arxiv": None, "isbn": None}
        source_names = []

    canonical = None
    if rec:
        fields = ("title", "year", "journal", "volume", "number", "pages")
        canonical = {k: rec.get(k) for k in fields}

    return {
        "key": key,
        "phases": {p: (p in phases) for p in PHASES},
        "status": status,
        "confidence": conf,
        "verify": verify,
        "identifiers": identifiers,
        "sources": source_names,
        "canonical_record": canonical,
        "issues": issues,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def file_record(issues):
    """Build the reserved file-level record: `issues` stored under FILE_KEY,
    for findings that are not tied to a single entry."""
    record = {"key": FILE_KEY}
    record["issues"] = issues
    return record
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def summary_record(summary):
    """Build the reserved summary record: `summary` stored under SUMMARY_KEY,
    stamped with the VERSION constant from .config so a saved report records
    which tool version produced it."""
    from .config import VERSION
    return dict(key=SUMMARY_KEY, veracite_version=VERSION, summary=summary)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def append_record(path, record):
    """Append `record` to the checkpoint at `path` as one JSON line.

    The file is opened in append mode (UTF-8), so existing content is never
    rewritten; the record is serialized with ensure_ascii=False and terminated
    by a newline. Returns True on success. An OSError is reported as a warning
    on stderr and turned into a False return rather than raised.
    """
    import sys
    try:
        with open(path, "a", encoding="utf-8") as log:
            serialized = json.dumps(record, ensure_ascii=False)
            log.write(serialized + "\n")
    except OSError as ex:
        print(f"\nwarning: could not write checkpoint to {path}: {ex}", file=sys.stderr)
        return False
    return True
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def compact(path, ordered_keys):
    """Rewrite the checkpoint at `path` with exactly one line per key.

    The surviving (last-wins) record of each key is written in `ordered_keys`
    order, then the <file> and <summary> records, then any record whose key was
    in none of those (so nothing is dropped). The new content goes to a
    pid-suffixed temp file that is moved over `path` with os.replace.

    Returns True on success, False if the checkpoint could not be read or an
    OSError occurred while writing (reported on stderr; the temp file is
    removed on a best-effort basis).
    """
    import sys
    records, _ = _read_records(path)  # last-wins map: key -> record
    if records is None:
        return False

    written = set()
    lines = []
    for k in [*ordered_keys, FILE_KEY, SUMMARY_KEY]:
        if k not in records or k in written:
            continue
        written.add(k)
        lines.append(json.dumps(records[k], ensure_ascii=False))
    # Keys absent from `ordered_keys` keep their place at the end.
    lines.extend(json.dumps(rec, ensure_ascii=False)
                 for k, rec in records.items() if k not in written)

    payload = "".join(text + "\n" for text in lines)
    tmp = f"{path}.tmp.{os.getpid()}"
    try:
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write(payload)
        os.replace(tmp, path)
    except OSError as ex:
        print(f"\nwarning: could not compact checkpoint {path}: {ex}", file=sys.stderr)
        try:
            os.remove(tmp)
        except OSError:
            pass
        return False
    return True
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _read_records(path):
    """Read the NDJSON checkpoint into an ORDERED last-wins map {key: record}.

    A later line for a key supersedes an earlier one (an updated entry) while
    the key keeps its first-seen position, so bib order is stable.

    Lines are skipped, never fatal, when they are blank, are not valid UTF-8,
    are not valid JSON, are not a JSON object, or lack a string "key".

    The file is read as bytes and decoded one line at a time. Records are
    written with ensure_ascii=False, so a crash can tear the final line inside
    a multi-byte character; decoding the whole file as text would then raise
    UnicodeDecodeError and lose every intact line before it.

    Returns (records, n_lines), where n_lines counts the accepted records
    (superseded duplicates included), or (None, 0) if `path` is empty, is not a
    file, or cannot be read.
    """
    if not path or not os.path.isfile(path):
        return None, 0
    records = {}
    n = 0
    try:
        with open(path, "rb") as fh:
            for raw in fh:
                try:
                    # UnicodeDecodeError and JSONDecodeError are both ValueErrors.
                    line = raw.decode("utf-8").strip()
                    if not line:
                        continue
                    rec = json.loads(line)
                except ValueError:
                    continue  # torn / partial / foreign line -- skip it
                if not isinstance(rec, dict):
                    continue
                key = rec.get("key")
                # A non-string key is not a record of this format (and an
                # unhashable one could not be stored in the map at all).
                if not isinstance(key, str):
                    continue
                n += 1
                records[key] = rec  # last wins; insertion order preserved
    except OSError:
        return None, 0
    return records, n
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class Checkpoint:
    """A prior run rebuilt from an NDJSON checkpoint file.

    After `load`, the instance exposes the replayed findings, the per-key
    Resolution objects, (status, confidence) pairs and verify links, plus
    `phases_by_key[key]`, the set of phases the saved entry already has.
    `needs(key, requested)` reports which requested phases are still missing,
    so a driver can reuse prior work instead of recomputing it.
    """

    def __init__(self, path):
        self.path = path
        self.findings = []          # replayed Finding objects (all keys)
        self.results = {}           # key -> Resolution
        self.statuses = {}          # key -> (status, confidence)
        self.links = {}             # key -> verify url
        self.phases_by_key = {}     # key -> set(phases done)
        self.summary = None         # the saved summary record, if any
        self._findings_by_key = {}  # key -> replayed Finding objects for that key
        self.loaded = False         # set to True by load() after a successful replay

    @classmethod
    def load(cls, path):
        """Load the checkpoint at `path`.

        Returns a Checkpoint with `.loaded` True, or None when the file is
        absent/unreadable or yields no usable record (so a stray, empty or
        foreign path simply starts fresh).
        """
        records, _ = _read_records(path)
        if not records:
            return None
        cp = cls(path)
        cp._replay(records)
        cp.loaded = True
        return cp

    def _replay(self, records):
        """Populate this instance from the last-wins {key: record} map."""
        for key, rec in records.items():
            if key == SUMMARY_KEY:
                self.summary = rec.get("summary")
                continue
            # `or []` so an explicit `"issues": null` replays as "no findings",
            # matching how `phases` is guarded below.
            for fd in rec.get("issues") or []:
                f = _finding_from_dict(fd, key)
                if f is not None:
                    self.findings.append(f)
                    self._findings_by_key.setdefault(key, []).append(f)
            if key == FILE_KEY:
                continue  # the <file> record carries findings only
            self.phases_by_key[key] = {p for p, on in (rec.get("phases") or {}).items() if on}
            self.results[key] = _resolution_from_record(rec)
            self.statuses[key] = (rec.get("status"),
                                  float(rec.get("confidence") or 0.0))
            if rec.get("verify"):
                self.links[key] = rec["verify"]

    # -- phase coverage -----------------------------------------------------

    def has(self, key, phase):
        """True if the saved entry `key` already has `phase`; False for an
        unknown key."""
        return phase in self.phases_by_key.get(key, set())

    def needs(self, key, requested):
        """The requested phases this key has NOT already computed -- the work left
        to do for it. Empty means the saved entry already satisfies the request.

        The LLM rating needs the work's abstract, which only the online layer
        fetches and which is NOT persisted (it is an LLM input, not a result). So
        when the llm phase must run, the online phase is run with it -- otherwise a
        resumed --llm pass would have no abstract to rate. This is the one phase
        coupling; offline is always independent."""
        todo = {p for p in requested if not self.has(key, phase=p)}
        if "llm" in todo:
            todo.add("online")
        return todo

    def seed_findings_for(self, key, keep_phases):
        """The saved findings for `key` whose phase (per finding_phase) is in
        `keep_phases` -- i.e. the prior findings to replay because this run is
        NOT recomputing their phase."""
        return [f for f in self._findings_by_key.get(key, [])
                if finding_phase(f) in keep_phases]

    def file_findings(self):
        """A new list holding the replayed file-level (<file>) findings."""
        return list(self._findings_by_key.get(FILE_KEY, []))
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _finding_from_dict(fd, key):
    """Rebuild a Finding from the persisted issue dict `fd`.

    `key` is supplied by the enclosing record rather than read from `fd`. A
    severity name that is missing or is not a Severity member falls back to
    Severity.INFO. A missing line becomes 0, a missing message or category
    becomes "", and a missing layer becomes "static".
    """
    severity_name = fd.get("severity", "INFO")
    try:
        sev = Severity[severity_name]
    except KeyError:
        sev = Severity.INFO
    fields = {
        "severity": sev,
        "key": key,
        "line": int(fd.get("line") or 0),
        "message": fd.get("message", ""),
        "layer": fd.get("layer", "static"),
        "category": fd.get("category", ""),
        "suggested": fd.get("suggested"),
    }
    return Finding(**fields)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _resolution_from_record(rec):
    """Rebuild a Resolution from the saved entry record `rec`.

    Restores the three identifiers (missing ones become ""), the canonical
    record when one was saved, and the source names. The first saved source
    name becomes `res.source`; that source maps to the restored record and
    every other source maps to an empty placeholder dict, which keeps the
    number of sources correct. Abstracts are not part of the saved record.
    """
    res = Resolution()

    ids = rec.get("identifiers") or {}
    res.doi = ids.get("doi") or ""
    res.arxiv_id = ids.get("arxiv") or ""
    res.isbn = ids.get("isbn") or ""

    crec = rec.get("canonical_record")
    if crec:
        text_fields = {name: crec.get(name) or ""
                       for name in ("journal", "volume", "number", "pages")}
        res.record = Record(title=crec.get("title") or "", year=crec.get("year"),
                            **text_fields)

    saved_sources = rec.get("sources") or []
    res.source = saved_sources[0] if saved_sources else ""
    for s in saved_sources:
        # Per-source records are not persisted individually, so only the
        # primary source gets the real record.
        res.sources[s] = res.record if s == res.source else {}
    return res
|