ghostcite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ghostcite/__init__.py +1 -0
- ghostcite/__main__.py +5 -0
- ghostcite/_pace.py +52 -0
- ghostcite/cli.py +174 -0
- ghostcite/compare.py +321 -0
- ghostcite/crossref.py +117 -0
- ghostcite/models.py +54 -0
- ghostcite/parsers/__init__.py +34 -0
- ghostcite/parsers/bibtex.py +54 -0
- ghostcite/parsers/doi.py +40 -0
- ghostcite/parsers/markdown.py +45 -0
- ghostcite/pubmed.py +146 -0
- ghostcite/py.typed +0 -0
- ghostcite/report.py +77 -0
- ghostcite-0.1.0.dist-info/METADATA +271 -0
- ghostcite-0.1.0.dist-info/RECORD +19 -0
- ghostcite-0.1.0.dist-info/WHEEL +4 -0
- ghostcite-0.1.0.dist-info/entry_points.txt +2 -0
- ghostcite-0.1.0.dist-info/licenses/LICENSE +21 -0
ghostcite/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
ghostcite/__main__.py
ADDED
ghostcite/_pace.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
|
|
7
|
+
_INTERVAL_RE = re.compile(r"(\d+)\s*s")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _Pacer:
|
|
11
|
+
"""Client-side request pacer enforcing a minimum interval between calls.
|
|
12
|
+
|
|
13
|
+
The floor is the larger of an explicit ``--max-rps`` ceiling and whatever
|
|
14
|
+
the server advertises via ``x-rate-limit-limit`` / ``x-rate-limit-interval``
|
|
15
|
+
response headers, so the more conservative of the two always wins.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def __init__(self, max_rps: float | None) -> None:
|
|
19
|
+
if max_rps is not None and max_rps <= 0:
|
|
20
|
+
raise ValueError("max_rps must be positive")
|
|
21
|
+
self._min_interval = 1.0 / max_rps if max_rps else 0.0
|
|
22
|
+
self._last: float | None = None
|
|
23
|
+
|
|
24
|
+
def update_from_headers(self, headers: Mapping[str, str]) -> None:
|
|
25
|
+
"""Raise the floor from CrossRef rate headers. Malformed/missing → no-op."""
|
|
26
|
+
raw_limit = headers.get("x-rate-limit-limit")
|
|
27
|
+
raw_interval = headers.get("x-rate-limit-interval")
|
|
28
|
+
if raw_limit is None or raw_interval is None:
|
|
29
|
+
return
|
|
30
|
+
try:
|
|
31
|
+
limit = int(raw_limit)
|
|
32
|
+
except (TypeError, ValueError):
|
|
33
|
+
return
|
|
34
|
+
if limit <= 0:
|
|
35
|
+
return
|
|
36
|
+
m = _INTERVAL_RE.fullmatch(raw_interval.strip())
|
|
37
|
+
if not m:
|
|
38
|
+
return
|
|
39
|
+
interval_sec = int(m.group(1))
|
|
40
|
+
self._min_interval = max(self._min_interval, interval_sec / limit)
|
|
41
|
+
|
|
42
|
+
def wait(self) -> None:
|
|
43
|
+
"""Block until at least ``_min_interval`` has elapsed since the last call."""
|
|
44
|
+
now = time.monotonic()
|
|
45
|
+
if self._last is None:
|
|
46
|
+
# First call: prime the clock, don't sleep.
|
|
47
|
+
self._last = now
|
|
48
|
+
return
|
|
49
|
+
delay = self._min_interval - (now - self._last)
|
|
50
|
+
if delay > 0:
|
|
51
|
+
time.sleep(delay)
|
|
52
|
+
self._last = time.monotonic()
|
ghostcite/cli.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from contextlib import ExitStack
|
|
7
|
+
|
|
8
|
+
from ghostcite import __version__
|
|
9
|
+
from ghostcite.compare import cross_check_pubmed, evaluate
|
|
10
|
+
from ghostcite.crossref import CrossRefClient
|
|
11
|
+
from ghostcite.models import Finding, Tier
|
|
12
|
+
from ghostcite.parsers import parse
|
|
13
|
+
from ghostcite.pubmed import PubMedClient
|
|
14
|
+
from ghostcite.report import render_json, render_text
|
|
15
|
+
|
|
16
|
+
# Maps each tier name accepted by --fail-on to the Tier it selects.
_TIER_BY_NAME = {
    "author": Tier.AUTHOR,
    "year": Tier.YEAR,
    "retraction": Tier.RETRACTION,
}


def _parse_args(argv):
    """Parse and validate the ghostcite command line.

    Args:
        argv: list of argument strings (without the program name).

    Returns:
        The ``argparse.Namespace``. ``fail_on`` is normalised to a
        lower-case, whitespace-free comma list so later lookups in
        ``_TIER_BY_NAME`` cannot miss because of case or stray spaces.

    Exits (via ``ArgumentParser.error``, status 2) when ``--max-rps`` is not
    positive or when ``--fail-on`` names a tier that does not exist. Without
    the latter check a typo such as ``--fail-on autor`` would be ignored and
    the run would always exit 0.
    """
    p = argparse.ArgumentParser(
        prog="ghostcite",
        description="Catch ghost citations: cross-check claimed author/year against CrossRef.",
    )
    p.add_argument("--version", action="version", version=f"ghostcite {__version__}")
    p.add_argument(
        "file",
        help="bibliography file (.bib, markdown refs, or DOI list), or '-' for stdin",
    )
    p.add_argument("--format", choices=["auto", "bibtex", "markdown", "doi"], default="auto")
    p.add_argument("--json", action="store_true", help="machine-readable output")
    p.add_argument("--dry-run", action="store_true", help="parse + count only, no network")
    p.add_argument(
        "--fail-on",
        default="author,year,retraction",
        help="comma list of tiers that cause exit 1, or 'none' "
        "(choices: author,year,retraction,none)",
    )
    p.add_argument(
        "--max-rps",
        type=float,
        default=None,
        help="cap outbound requests per second (proactive rate pacing)",
    )
    p.add_argument(
        "--color",
        choices=["auto", "always", "never"],
        default="auto",
        help="colorize tier glyphs (default auto; honors NO_COLOR)",
    )
    p.add_argument(
        "--cross-check",
        choices=["none", "pubmed"],
        default="none",
        help="second source to corroborate findings against (default none)",
    )
    p.add_argument(
        "--ncbi-email",
        default=os.environ.get("NCBI_EMAIL"),
        help="contact email for NCBI E-utilities (or set NCBI_EMAIL)",
    )
    p.add_argument(
        "--ncbi-api-key",
        default=os.environ.get("NCBI_API_KEY"),
        help="NCBI API key for higher rate limits (or set NCBI_API_KEY)",
    )
    args = p.parse_args(argv)
    if args.max_rps is not None and args.max_rps <= 0:
        p.error("--max-rps must be > 0")

    # Validate --fail-on here, next to the other argument checks, so a bad
    # value is reported before any work is done.
    names = [n.strip().lower() for n in args.fail_on.split(",") if n.strip()]
    if names != ["none"]:
        unknown = [n for n in names if n not in _TIER_BY_NAME]
        if unknown:
            p.error(
                f"--fail-on: unknown tier(s): {', '.join(unknown)} "
                "(choices: author,year,retraction,none)"
            )
    args.fail_on = ",".join(names)
    return args
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _want_color(mode: str) -> bool:
|
|
77
|
+
"""Resolve --color {auto,always,never} against NO_COLOR + TTY state.
|
|
78
|
+
|
|
79
|
+
NO_COLOR (presence, any value) disables color even when ``always`` is set,
|
|
80
|
+
per https://no-color.org/.
|
|
81
|
+
"""
|
|
82
|
+
if "NO_COLOR" in os.environ:
|
|
83
|
+
return False
|
|
84
|
+
if mode == "always":
|
|
85
|
+
return True
|
|
86
|
+
if mode == "never":
|
|
87
|
+
return False
|
|
88
|
+
return sys.stdout.isatty() and os.environ.get("TERM") != "dumb"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def main(argv=None) -> int:
    """Run the ghostcite command line and return the process exit code.

    Reads the bibliography (a file, or stdin for ``-``), parses it, looks each
    citation up through CrossRef (by DOI when the citation has one, otherwise
    by bibliographic search), optionally reconciles the result with PubMed,
    prints the report and maps the findings onto an exit code.

    Args:
        argv: argument list; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        ``0`` when no finding is in a ``--fail-on`` tier (also for
        ``--dry-run`` and ``--fail-on none``); ``1`` when at least one finding
        is in a fail tier; ``2`` when the input cannot be read, is empty on
        stdin, cannot be parsed, or a lookup raised (the findings collected so
        far are still printed in that case).
    """
    args = _parse_args(argv if argv is not None else sys.argv[1:])

    from_stdin = args.file == "-"
    try:
        if from_stdin:
            text = sys.stdin.read()
        else:
            with open(args.file, encoding="utf-8") as fh:
                text = fh.read()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError: without it a
        # non-UTF-8 input would escape as a traceback instead of exit code 2.
        source = "stdin" if from_stdin else args.file
        print(f"ghostcite: cannot read {source}: {e}", file=sys.stderr)
        return 2
    if from_stdin and not text.strip():
        print("ghostcite: no input on stdin", file=sys.stderr)
        return 2

    try:
        citations = parse(text, fmt=args.format)
    except ValueError as e:
        print(f"ghostcite: {e}", file=sys.stderr)
        return 2

    # JSON output is never colorized.
    color = False if args.json else _want_color(args.color)
    with_doi = sum(1 for c in citations if c.doi)
    if args.dry_run:
        print(
            f"ghostcite: would check {len(citations)} entries "
            f"({with_doi} via DOI, {len(citations) - with_doi} via search)."
        )
        return 0

    findings: list[Finding] = []

    def _emit_report() -> None:
        # Shared by the success path and the partial-results error path.
        if args.json:
            print(render_json(findings, len(citations), with_doi))
        else:
            print(render_text(findings, len(citations), with_doi, color=color))

    use_pubmed = args.cross_check == "pubmed"
    try:
        with ExitStack() as stack:
            client = stack.enter_context(CrossRefClient(max_rps=args.max_rps))
            pmclient = None
            if use_pubmed:
                pmclient = stack.enter_context(
                    PubMedClient(
                        max_rps=args.max_rps,
                        email=args.ncbi_email,
                        api_key=args.ncbi_api_key,
                    )
                )
            for c in citations:
                if c.doi:
                    rec = client.lookup_by_doi(c.doi)
                else:
                    rec = client.search_bibliographic(
                        c.claimed_first_author, c.claimed_year, c.claimed_title
                    )
                cite_findings = evaluate(c, rec)
                if pmclient is not None:
                    if c.doi:
                        pm = pmclient.lookup_by_doi(c.doi)
                    else:
                        # NOTE(review): despite its name this is called with
                        # author/year/title, not a DOI; confirm against
                        # PubMedClient that this is the intended method.
                        pm = pmclient.lookup_by_doi_meta(
                            c.claimed_first_author, c.claimed_year, c.claimed_title
                        )
                    cross_check_pubmed(c, rec, cite_findings, pm)
                findings.extend(cite_findings)
    except Exception as e:  # fail-loud: surface, keep partial findings
        print(f"ghostcite: cross-check error: {e}", file=sys.stderr)
        _emit_report()
        return 2

    _emit_report()

    if args.fail_on.strip().lower() == "none":
        return 0
    fail_tiers = {
        _TIER_BY_NAME[n.strip()] for n in args.fail_on.split(",") if n.strip() in _TIER_BY_NAME
    }
    return 1 if any(f.tier in fail_tiers for f in findings) else 0
|
ghostcite/compare.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import unicodedata
|
|
4
|
+
|
|
5
|
+
from ghostcite.models import CanonicalRecord, Citation, Finding, PubMedRecord, Tier
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def normalize_surname(name: str | None) -> str:
    """Reduce a surname to a comparison key.

    The name is NFKD-decomposed, combining marks are dropped, the result is
    lower-cased and every non-letter (spaces, hyphens, apostrophes, digits)
    is removed. ``None`` or an empty string gives ``""``.
    """
    if not name:
        return ""
    base_chars = [
        ch for ch in unicodedata.normalize("NFKD", name) if unicodedata.combining(ch) == 0
    ]
    lowered = "".join(base_chars).lower()
    return "".join(filter(str.isalpha, lowered))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_initials(token: str) -> bool:
|
|
18
|
+
"""True if a token is an initials cluster, e.g. "J", "J.", "JA", "J.A.".
|
|
19
|
+
Single letter, or all-uppercase letters after stripping dots — a real
|
|
20
|
+
surname ("Li", "Berg") is mixed-case, not all-caps."""
|
|
21
|
+
bare = token.replace(".", "")
|
|
22
|
+
if not bare or not bare.isalpha():
|
|
23
|
+
return False
|
|
24
|
+
return len(bare) == 1 or bare.isupper()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _surname_tokens(name: str) -> list[str]:
|
|
28
|
+
# A lone token is always the surname, never an initials cluster — keep it
|
|
29
|
+
# even if it looks like initials (e.g. an all-caps "LI"). Only strip
|
|
30
|
+
# initials-looking tokens when there is at least one other token to fall
|
|
31
|
+
# back on ("Smith J" → ["Smith"], "J Smith" → ["Smith"]).
|
|
32
|
+
tokens = name.split()
|
|
33
|
+
if len(tokens) <= 1:
|
|
34
|
+
return tokens
|
|
35
|
+
return [t for t in tokens if not _is_initials(t)]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _surname_key(name: str | None) -> str:
    """Build the comparison key for a free-text author name.

    Initials tokens are removed first ("Smith J", "J Smith", "Smith JA"), then
    the remainder is normalised, so a claimed author written with initials
    still equals a bare family name. ``None`` or empty input gives ``""``.
    """
    if name:
        surname_only = " ".join(_surname_tokens(name))
        return normalize_surname(surname_only)
    return ""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _surname_raw(name: str | None) -> str:
    """Return the claimed surname with diacritics kept, for spelling comparison.

    Initials tokens are removed, the remaining tokens are concatenated without
    spaces and lower-cased. Because initials are gone, "Smith J" compares as
    "smith" and is not mistaken for a spelling difference.
    """
    return "".join(_surname_tokens(name)).lower() if name else ""
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _canonical_raw(name: str | None) -> str:
|
|
58
|
+
"""Like `_surname_raw` but for a CANONICAL family name (CrossRef/PubMed):
|
|
59
|
+
diacritics preserved, lowercased, despaced, NO initials-stripping — an
|
|
60
|
+
all-caps family ("BURGER") is a real surname, not an initials cluster."""
|
|
61
|
+
if not name:
|
|
62
|
+
return ""
|
|
63
|
+
return "".join(name.split()).lower()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _title_tokens(title: str) -> set[str]:
|
|
67
|
+
decomposed = unicodedata.normalize("NFKD", title)
|
|
68
|
+
ascii_only = "".join(c for c in decomposed if not unicodedata.combining(c)).lower()
|
|
69
|
+
words = "".join(c if c.isalnum() else " " for c in ascii_only).split()
|
|
70
|
+
stop = {
|
|
71
|
+
"the",
|
|
72
|
+
"a",
|
|
73
|
+
"an",
|
|
74
|
+
"of",
|
|
75
|
+
"and",
|
|
76
|
+
"in",
|
|
77
|
+
"for",
|
|
78
|
+
"on",
|
|
79
|
+
"to",
|
|
80
|
+
"with",
|
|
81
|
+
"by",
|
|
82
|
+
"reveals",
|
|
83
|
+
}
|
|
84
|
+
return {w for w in words if len(w) > 2 and w not in stop}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def title_similar(a: str | None, b: str | None, threshold: float = 0.4) -> bool:
    """Return True when two titles share enough significant words.

    Similarity is the Jaccard index (shared words / all words) of the two
    token sets and must reach ``threshold``. A missing title, or one with no
    significant words, is never similar. Used to tell a wrong author apart
    from a wrong DOI.
    """
    if a and b:
        left, right = _title_tokens(a), _title_tokens(b)
        if left and right:
            shared = len(left & right)
            combined = len(left | right)
            return (shared / combined) >= threshold
    return False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def evaluate(citation: Citation, canonical: CanonicalRecord | None) -> list[Finding]:
    """Check one claimed citation against its canonical record.

    Returns the list of findings; an empty list means nothing to report.
    With no canonical record the only finding is UNRESOLVABLE. Otherwise a
    retraction (or, failing that, an expression of concern) is reported
    independently of author/year, and author/year are checked only when the
    citation actually claims a first author.
    """
    if canonical is None:
        unresolved = Finding(citation, Tier.UNRESOLVABLE, None, "DOI not found / unresolvable")
        return [unresolved]

    # A retraction takes precedence over an expression of concern.
    if canonical.retracted:
        notice = "RETRACTED per CrossRef"
    elif canonical.eoc:
        notice = "Expression of concern per CrossRef"
    else:
        notice = None

    results: list[Finding] = []
    if notice is not None:
        results.append(Finding(citation, Tier.RETRACTION, canonical, notice))

    # DOI-list input carries no claimed author, so there is nothing to compare.
    if citation.claimed_first_author:
        results += _author_year(citation, canonical)
    return results
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _author_year(citation: Citation, canonical: CanonicalRecord) -> list[Finding]:
    """Compare the claimed first author and year with the canonical record.

    Returns:
        * one UNRESOLVABLE finding when the record lists no authors;
        * ``[]`` when the citation claims no author;
        * when the first author matches: a COSMETIC finding if only the
          spelling/diacritics differ and/or a YEAR finding if the years
          differ (both can be returned together), or ``[]`` if all agree;
        * otherwise exactly one AUTHOR finding, worded according to whether
          the claimed name is a later author, the title does not match
          (possibly a wrong DOI), or simply a different first author.
    """
    families_raw = canonical.authors or []
    if not families_raw:
        # DOI resolved but CrossRef has no author array (some preprints,
        # datasets, protocols). Author can't be verified → warn, don't fail CI.
        return [
            Finding(
                citation,
                Tier.UNRESOLVABLE,
                canonical,
                "CrossRef record has no author data — author not verifiable",
            )
        ]
    if not citation.claimed_first_author:
        # The caller already guards this; kept so the function is safe alone.
        return []

    claimed_raw = citation.claimed_first_author.strip()
    # Asymmetric normalization: the claimed author is free text that may carry
    # initials ("Smith JA") and is initials-stripped. The canonical family
    # name is only diacritic-folded, never initials-stripped, so an all-caps
    # family such as "TURING" is not reduced to an empty key.
    claimed = _surname_key(claimed_raw)
    first_raw = families_raw[0]
    first = normalize_surname(first_raw)
    all_norm = [normalize_surname(a) for a in families_raw]

    conf = " (low-confidence match)" if canonical.low_confidence else ""

    if claimed == first:
        matched: list[Finding] = []
        if _surname_raw(claimed_raw) != _canonical_raw(first_raw):
            matched.append(
                Finding(
                    citation,
                    Tier.COSMETIC,
                    canonical,
                    f'diacritic/spelling: CrossRef has "{first_raw}"{conf}',
                )
            )
        # Check the year even when the spelling differs: a cosmetic finding
        # must not hide a wrong year behind it.
        if citation.claimed_year and canonical.year and citation.claimed_year != canonical.year:
            matched.append(
                Finding(
                    citation,
                    Tier.YEAR,
                    canonical,
                    f"CrossRef year is {canonical.year}{conf}",
                )
            )
        return matched

    # First-author mismatch: is the claimed name a later author?
    if claimed in all_norm:
        idx = all_norm.index(claimed)
        return [
            Finding(
                citation,
                Tier.AUTHOR,
                canonical,
                f"claimed first author is actually author #{idx + 1}; "
                f"CrossRef first author is {first_raw}{conf}",
            )
        ]

    # Not in the author list at all: a different title suggests a wrong DOI.
    if not title_similar(citation.claimed_title, canonical.title):
        return [
            Finding(
                citation,
                Tier.AUTHOR,
                canonical,
                f"DOI resolves to {first_raw} ({canonical.year}) — possibly wrong DOI{conf}",
            )
        ]
    return [
        Finding(
            citation,
            Tier.AUTHOR,
            canonical,
            f"CrossRef first author is {first_raw}{conf}",
        )
    ]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def cross_check_pubmed(
    citation: Citation,
    canonical: CanonicalRecord | None,
    findings: list[Finding],
    pubmed: PubMedRecord | None,
) -> None:
    """Reconcile the CrossRef-derived ``findings`` with a PubMed record, in place.

    Nothing happens when ``pubmed`` is ``None``. Otherwise, in order:

    1. A PubMed retraction / expression of concern adds a RETRACTION finding
       unless one is already present.
    2. If CrossRef had no record, every UNRESOLVABLE finding is annotated as
       resolved via PubMed and processing stops.
    3. Each existing AUTHOR/YEAR finding gets a ``cross_check`` note: a
       conflict warning when PubMed matches the cited value, "corroborated"
       when it does not, or a "not verifiable" note when the needed value is
       missing on either side.
    4. With no AUTHOR finding, a PubMed first author that differs from the
       claim adds a new AUTHOR finding.
    5. With no YEAR finding, a PubMed year that differs from the claim adds a
       new YEAR finding, but only if the authors do not already disagree.

    The function only mutates ``findings`` and its items; it returns ``None``.
    """
    if pubmed is None:
        return

    cited_surname = _surname_key(citation.claimed_first_author)
    # The PubMed surname is folded but not initials-stripped, unlike the claim.
    pubmed_surname = normalize_surname(pubmed.first_author_surname)
    cited_year = citation.claimed_year

    def has_tier(tier) -> bool:
        return any(item.tier is tier for item in findings)

    # 1. Retraction / expression of concern, OR-combined with CrossRef.
    if (pubmed.retracted or pubmed.eoc) and not has_tier(Tier.RETRACTION):
        status = "RETRACTED" if pubmed.retracted else "Expression of concern"
        findings.append(
            Finding(
                citation,
                Tier.RETRACTION,
                canonical,
                f"{status} per PubMed",
                cross_check="raised by PubMed",
            )
        )

    # 2. No CrossRef record: annotate and stop.
    if canonical is None:
        for item in findings:
            if item.tier is Tier.UNRESOLVABLE:
                item.cross_check = "resolved via PubMed; CrossRef had no record (low confidence)"
        return

    # 3. Annotate the AUTHOR/YEAR findings that already exist.
    for item in [f for f in findings if f.tier in (Tier.AUTHOR, Tier.YEAR)]:
        if item.tier is Tier.AUTHOR and pubmed_surname and cited_surname:
            if pubmed_surname == cited_surname:
                note = (
                    "⚠ PubMed agrees with the cited author — CrossRef and PubMed "
                    "conflict; verify manually"
                )
            else:
                note = "corroborated by PubMed"
        elif item.tier is Tier.YEAR and pubmed.year and cited_year:
            if pubmed.year == cited_year:
                note = (
                    "⚠ PubMed agrees with the cited year — CrossRef and PubMed "
                    "conflict; verify manually"
                )
            else:
                note = "corroborated by PubMed"
        else:
            # PubMed was consulted but lacks the value needed for this finding.
            note = "PubMed record had no author/year data — not verifiable"
        item.cross_check = note

    # True only when both surnames are known and they differ.
    surnames_differ = bool(
        cited_surname and pubmed_surname and pubmed_surname != cited_surname
    )

    # 4. New AUTHOR finding raised by PubMed alone.
    if surnames_differ and not has_tier(Tier.AUTHOR):
        findings.append(
            Finding(
                citation,
                Tier.AUTHOR,
                canonical,
                f"PubMed first author is {pubmed.first_author_surname}",
                cross_check="raised by PubMed",
            )
        )

    # 5. New YEAR finding raised by PubMed alone; skipped when the authors
    #    differ, because the author finding already covers that case.
    years_differ = bool(cited_year and pubmed.year and pubmed.year != cited_year)
    if years_differ and not surnames_differ and not has_tier(Tier.YEAR):
        findings.append(
            Finding(
                citation,
                Tier.YEAR,
                canonical,
                f"PubMed year is {pubmed.year}",
                cross_check="raised by PubMed",
            )
        )
|
ghostcite/crossref.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from ghostcite import __version__
|
|
8
|
+
from ghostcite._pace import _Pacer
|
|
9
|
+
from ghostcite.models import CanonicalRecord
|
|
10
|
+
|
|
11
|
+
# Root URL of the CrossRef REST API; request paths are appended to it.
_BASE = "https://api.crossref.org"
# User-Agent header value sent with every request: tool name, version and
# the project URL.
_UA = f"ghostcite/{__version__} (https://github.com/musharna/ghostcite)"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _retraction_flags(message: dict) -> tuple[bool, bool]:
|
|
16
|
+
"""Return (retracted, expression_of_concern) from a CrossRef work message.
|
|
17
|
+
|
|
18
|
+
CrossRef exposes retraction/EoC three ways, and the live schema (verified
|
|
19
|
+
against the Wakefield 1998 Lancet DOI, 2026-06-08) uses ``updated-by``:
|
|
20
|
+
- ``updated-by``: items ON THE RETRACTED WORK pointing forward to the
|
|
21
|
+
notice; each has ``type`` ∈ {retraction, expression_of_concern, ...}.
|
|
22
|
+
This is the canonical "this paper was retracted" signal.
|
|
23
|
+
- ``update-to``: the inverse — items on the NOTICE pointing back to what
|
|
24
|
+
it updates (kept for completeness; often null on the retracted work).
|
|
25
|
+
- ``relation``: keys like ``is-retracted-by`` (rarely populated).
|
|
26
|
+
"""
|
|
27
|
+
retracted = eoc = False
|
|
28
|
+
relation = message.get("relation") or {}
|
|
29
|
+
for key in relation:
|
|
30
|
+
k = key.lower()
|
|
31
|
+
if "retract" in k:
|
|
32
|
+
retracted = True
|
|
33
|
+
if "concern" in k:
|
|
34
|
+
eoc = True
|
|
35
|
+
for field in ("updated-by", "update-to"):
|
|
36
|
+
for upd in message.get(field) or []:
|
|
37
|
+
t = str(upd.get("type", "")).lower()
|
|
38
|
+
if "retract" in t:
|
|
39
|
+
retracted = True
|
|
40
|
+
if "concern" in t:
|
|
41
|
+
eoc = True
|
|
42
|
+
return retracted, eoc
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _year(message: dict) -> int | None:
|
|
46
|
+
for key in ("published", "published-print", "published-online", "issued"):
|
|
47
|
+
parts = (message.get(key) or {}).get("date-parts") or []
|
|
48
|
+
if parts and parts[0] and parts[0][0]:
|
|
49
|
+
return int(parts[0][0])
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _record_from_message(message: dict, low_confidence: bool = False) -> CanonicalRecord:
    """Convert a CrossRef work message into a ``CanonicalRecord``.

    Authors are the stripped ``family`` names of the entries that have one,
    in message order. Title and journal are the first element of their lists
    (``None`` when absent), and the DOI is lower-cased (``None`` when absent).
    ``low_confidence`` is passed through unchanged.
    """
    retracted, eoc = _retraction_flags(message)

    families = []
    for person in message.get("author") or []:
        if person.get("family"):
            families.append(person.get("family", "").strip())

    titles = message.get("title") or [None]
    containers = message.get("container-title") or [None]
    doi = (message.get("DOI") or "").lower() or None

    return CanonicalRecord(
        doi=doi,
        authors=families,
        year=_year(message),
        title=titles[0],
        journal=containers[0],
        retracted=retracted,
        eoc=eoc,
        low_confidence=low_confidence,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class CrossRefClient:
    """Small CrossRef REST client with request pacing and one retry on 429/503.

    Use it as a context manager so the underlying HTTP client is closed.
    """

    def __init__(self, timeout: float = 20.0, max_rps: float | None = None):
        """Create the HTTP client and the request pacer.

        Args:
            timeout: per-request timeout passed to ``httpx.Client``.
            max_rps: optional cap on requests per second, passed to the pacer.
        """
        self._client = httpx.Client(
            timeout=timeout,
            headers={"User-Agent": _UA},
            follow_redirects=True,
        )
        self._pacer = _Pacer(max_rps)

    def __enter__(self) -> CrossRefClient:
        return self

    def __exit__(self, *exc) -> None:
        self._client.close()

    @staticmethod
    def _retry_delay(raw: str | None, default: int = 5, cap: int = 60) -> int:
        """Turn a ``Retry-After`` header value into a sleep time in seconds.

        ``Retry-After`` may be an integer number of seconds or an HTTP-date.
        Only the integer form is interpreted; a missing or non-integer value
        (including the date form) falls back to ``default`` instead of
        raising. The result is clamped to ``0..cap`` so a negative value
        cannot make ``time.sleep`` raise.
        """
        try:
            seconds = int(raw) if raw is not None else default
        except (TypeError, ValueError):
            seconds = default
        return max(0, min(seconds, cap))

    def _get(self, url: str, **kw) -> httpx.Response:
        """GET ``url`` with a single retry on a 429/503 response.

        The retry waits for the ``Retry-After`` delay (capped at 60s, default
        5s). Both attempts go through the pacer, and the pacer is updated
        from each response's headers. Status codes are left to the caller
        (e.g. 404).
        """
        self._pacer.wait()
        r = self._client.get(url, **kw)
        self._pacer.update_from_headers(r.headers)
        if r.status_code in (429, 503):
            time.sleep(self._retry_delay(r.headers.get("Retry-After")))
            self._pacer.wait()
            r = self._client.get(url, **kw)
            self._pacer.update_from_headers(r.headers)
        return r

    def lookup_by_doi(self, doi: str) -> CanonicalRecord | None:
        """Fetch the work for ``doi``; ``None`` on 404.

        Raises:
            httpx.HTTPStatusError: for any other error status.
        """
        from urllib.parse import quote

        # A DOI may contain characters with URL meaning ('#', '?', '%',
        # spaces). Percent-encode it so it stays inside the path; '/' is kept
        # as the prefix/suffix separator.
        r = self._get(f"{_BASE}/works/{quote(doi, safe='/')}")
        if r.status_code == 404:
            return None
        r.raise_for_status()
        return _record_from_message(r.json()["message"])

    def search_bibliographic(
        self, author: str | None, year: int | None, title: str | None
    ) -> CanonicalRecord | None:
        """Search by free-text author/year/title and return the top hit.

        The non-empty parts are joined into one ``query.bibliographic`` query.
        Returns ``None`` when there is nothing to search for or no result;
        a returned record is marked ``low_confidence``.

        Raises:
            httpx.HTTPStatusError: for an error status.
        """
        query = " ".join(str(x) for x in (author, year, title) if x).strip()
        if not query:
            return None
        r = self._get(f"{_BASE}/works", params={"query.bibliographic": query, "rows": 1})
        r.raise_for_status()
        items = r.json().get("message", {}).get("items") or []
        if not items:
            return None
        return _record_from_message(items[0], low_confidence=True)
|