bibcite-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bibcite-cli
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
5
+ Project-URL: Repository, https://github.com/leo1oel/bibcite
5
6
  License-Expression: MIT
6
7
  License-File: LICENSE
7
8
  Keywords: arxiv,bibliography,bibtex,citations,dblp
@@ -32,7 +33,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
32
33
  uv tool install --editable .
33
34
 
34
35
  # from git, no checkout needed
35
- uv tool install git+https://github.com/<you>/bibcite
36
+ uv tool install git+https://github.com/leo1oel/bibcite
36
37
 
37
38
  # once published to PyPI (package name bibcite-cli, command name bibcite)
38
39
  uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
@@ -57,6 +58,13 @@ bibcite add refs.bib 2103.14030 --json
57
58
  # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
58
59
  bibcite add refs.bib --bibtex "$(pbpaste)"
59
60
 
61
+ # Batch add (one query per line; shares rate-limit state, tidies once)
62
+ bibcite add refs.bib --from ids.txt
63
+
64
+ # Overwrite a bad existing entry (keeps its key), or delete one
65
+ bibcite add refs.bib <query> --replace
66
+ bibcite remove refs.bib <key>
67
+
60
68
  # One-shot cleanup: upgrade preprints → tidy → lint
61
69
  bibcite fix refs.bib
62
70
 
@@ -68,8 +76,11 @@ bibcite tidy refs.bib
68
76
  bibcite check refs.bib
69
77
  ```
70
78
 
71
- `--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
79
+ `add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
72
80
  `add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
81
+ Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
82
+ Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
83
+ Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
73
84
 
74
85
  ## For agents
75
86
 
@@ -20,7 +20,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
20
20
  uv tool install --editable .
21
21
 
22
22
  # from git, no checkout needed
23
- uv tool install git+https://github.com/<you>/bibcite
23
+ uv tool install git+https://github.com/leo1oel/bibcite
24
24
 
25
25
  # once published to PyPI (package name bibcite-cli, command name bibcite)
26
26
  uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
@@ -45,6 +45,13 @@ bibcite add refs.bib 2103.14030 --json
45
45
  # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
46
46
  bibcite add refs.bib --bibtex "$(pbpaste)"
47
47
 
48
+ # Batch add (one query per line; shares rate-limit state, tidies once)
49
+ bibcite add refs.bib --from ids.txt
50
+
51
+ # Overwrite a bad existing entry (keeps its key), or delete one
52
+ bibcite add refs.bib <query> --replace
53
+ bibcite remove refs.bib <key>
54
+
48
55
  # One-shot cleanup: upgrade preprints → tidy → lint
49
56
  bibcite fix refs.bib
50
57
 
@@ -56,8 +63,11 @@ bibcite tidy refs.bib
56
63
  bibcite check refs.bib
57
64
  ```
58
65
 
59
- `--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
66
+ `add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
60
67
  `add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
68
+ Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
69
+ Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
70
+ Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
61
71
 
62
72
  ## For agents
63
73
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bibcite-cli"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
5
5
  readme = "Readme.md"
6
6
  license = "MIT"
@@ -11,6 +11,9 @@ dependencies = [
11
11
  ]
12
12
  keywords = ["bibtex", "arxiv", "citations", "dblp", "bibliography"]
13
13
 
14
+ [project.urls]
15
+ Repository = "https://github.com/leo1oel/bibcite"
16
+
14
17
  [project.scripts]
15
18
  bibcite = "bibcite.cli:main"
16
19
 
@@ -1,3 +1,3 @@
1
1
  """bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
2
2
 
3
- __version__ = "0.2.0"
3
+ __version__ = "0.3.0"
@@ -14,7 +14,10 @@ from bibtexparser.bwriter import BibTexWriter
14
14
  from .normalize import norm_title
15
15
 
16
16
  # The exact bibtex-tidy invocation requested by the user; keep in sync with
17
- # their LaTeX workflow.
17
+ # their LaTeX workflow. NOTE: no --generate-keys — bibcite owns key
18
+ # generation (make_key ASCII-folds names, so Hyvärinen -> hyvarinen2000...,
19
+ # where tidy would emit hyv_arinen2000...), and stable keys keep existing
20
+ # \cite{} commands valid.
18
21
  TIDY_ARGS = [
19
22
  "--modify",
20
23
  "--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
@@ -27,10 +30,26 @@ TIDY_ARGS = [
27
30
  "--sort-fields=author,title,booktitle,journal,year,url,pdf",
28
31
  "--strip-enclosing-braces",
29
32
  "--tidy-comments",
30
- "--generate-keys",
31
33
  ]
32
34
 
33
- NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")
35
+ NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref", "month")
36
+
37
+ # BibTeX month macros. bibtexparser's common_strings only defines jan..dec;
38
+ # CrossRef's transform endpoint emits bare full names (month=June), which
39
+ # otherwise KeyError during string interpolation.
40
# Maps both the three-letter macro ("jun") and the bare full name ("june")
# to the capitalized month name. Abbreviations are generated first, then
# full names, so "may" is defined once and keeps its abbreviation position.
MONTH_STRINGS = {
    month[:width]: month.capitalize()
    for width in (3, None)  # 3 -> "jan"-style macro; None -> whole name
    for month in (
        "january february march april may june july august september "
        "october november december"
    ).split()
}
34
53
 
35
54
  ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
36
55
 
@@ -42,11 +61,18 @@ def _log(msg: str):
42
61
def _parser() -> BibTexParser:
    """Return a fresh parser configured for bibcite.

    Non-standard entry types are kept rather than dropped, and the month
    macros from ``MONTH_STRINGS`` are pre-registered alongside bibtexparser's
    common strings so bare names such as ``month=June`` interpolate.
    """
    parser = BibTexParser(common_strings=True)
    parser.ignore_nonstandard_types = False
    parser.bib_database.strings.update(MONTH_STRINGS)
    return parser
46
66
 
47
67
 
48
68
def parse_bib(text: str) -> BibDatabase:
    """Parse ``text`` as BibTeX and return the resulting database.

    Raises:
        ValueError: for any parser failure. The original exception type and
            message are embedded in the text and chained as ``__cause__``.
    """
    try:
        database = bibtexparser.loads(text, parser=_parser())
    except Exception as exc:
        # Undefined @string macros surface as a bare KeyError('macro');
        # rewrapping gives callers a readable message and keeps a KeyError
        # (a LookupError subclass) from being read as "not found" upstream.
        raise ValueError(f"BibTeX parse failed: {type(exc).__name__}: {exc}") from exc
    return database
50
76
 
51
77
 
52
78
  def parse_bibtex_entry(text: str) -> dict:
@@ -117,10 +143,12 @@ def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = ""
117
143
  return None
118
144
 
119
145
 
120
- def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
146
+ def upsert_entry(path: Path, entry: dict, replace: bool = False) -> tuple[str, str]:
121
147
  """Insert or upgrade ``entry`` in ``path``.
122
148
 
123
- Returns (action, key) where action is "added" | "upgraded" | "exists".
149
+ Returns (action, key), action in "added" | "upgraded" | "exists" |
150
+ "replaced". With ``replace``, an existing matching entry is overwritten
151
+ (its citation key is kept so existing \\cite{} commands stay valid).
124
152
  """
125
153
  db = load_bib_file(path)
126
154
  if db is None: # unparseable file: append blindly
@@ -132,13 +160,14 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
132
160
  db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
133
161
  )
134
162
  if existing is not None:
135
- if is_preprint(existing) and not is_preprint(entry):
163
+ upgrade = is_preprint(existing) and not is_preprint(entry)
164
+ if replace or upgrade:
136
165
  key = existing["ID"]
137
166
  existing.clear()
138
- existing.update(entry)
167
+ existing.update({k: str(v) for k, v in entry.items() if v})
139
168
  existing["ID"] = key # keep the key the user may already \cite
140
169
  _write_db(path, db)
141
- return "upgraded", key
170
+ return ("replaced" if replace else "upgraded"), key
142
171
  return "exists", existing["ID"]
143
172
 
144
173
  db.entries.append({k: str(v) for k, v in entry.items() if v})
@@ -146,6 +175,20 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
146
175
  return "added", entry["ID"]
147
176
 
148
177
 
178
def remove_entry(path: Path, key: str) -> bool:
    """Drop every entry whose citation key equals ``key`` from ``path``.

    Returns True when at least one entry was removed and the file was
    rewritten; False when the file could not be loaded or no entry matched
    (in which case the file is left untouched).
    """
    db = load_bib_file(path)
    if db is None:
        return False
    survivors = [entry for entry in db.entries if entry.get("ID") != key]
    if len(survivors) == len(db.entries):
        return False  # nothing matched; skip the write
    db.entries = survivors
    _write_db(path, db)
    return True
190
+
191
+
149
192
  def _write_db(path: Path, db: BibDatabase):
150
193
  writer = BibTexWriter()
151
194
  writer.indent = " "
@@ -0,0 +1,48 @@
1
+ """Local cache of successful publication matches.
2
+
3
+ Keyed by normalized title. Only *published* matches are stored — a paper that
4
+ is published stays published, while a preprint may get published tomorrow, so
5
+ negative/preprint results are never cached. Re-running `fix`/`upgrade` or
6
+ re-adding known papers therefore costs zero API calls.
7
+
8
+ Disable with --no-cache or BIBCITE_NO_CACHE=1. Lives at
9
+ $XDG_CACHE_HOME/bibcite/published.json (~/.cache/bibcite/published.json).
10
+ """
11
+
12
+ import json
13
+ import os
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ DISABLED = os.environ.get("BIBCITE_NO_CACHE", "") == "1"
18
+
19
+
20
+ def _path() -> Path:
21
+ root = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
22
+ return Path(root).expanduser() / "bibcite" / "published.json"
23
+
24
+
25
+ def _load() -> dict:
26
+ try:
27
+ return json.loads(_path().read_text())
28
+ except Exception:
29
+ return {}
30
+
31
+
32
+ def get(key: str) -> dict | None:
33
+ if DISABLED or not key:
34
+ return None
35
+ return _load().get(key)
36
+
37
+
38
+ def put(key: str, value: dict):
39
+ if DISABLED or not key:
40
+ return
41
+ try:
42
+ data = _load()
43
+ data[key] = value
44
+ p = _path()
45
+ p.parent.mkdir(parents=True, exist_ok=True)
46
+ p.write_text(json.dumps(data, ensure_ascii=False))
47
+ except Exception as e: # cache must never break resolution
48
+ print(f"[cache] write failed: {e}", file=sys.stderr)
@@ -11,12 +11,25 @@ import sys
11
11
  import time
12
12
  from pathlib import Path
13
13
 
14
- from . import bibfile
14
+ from . import bibfile, cache
15
15
  from .normalize import first_author_last_name, norm_title
16
- from .resolve import Resolved, guess_entry_type, resolve
16
+ from .resolve import (
17
+ NotFound,
18
+ Resolved,
19
+ SourcesUnavailable,
20
+ guess_entry_type,
21
+ resolve,
22
+ )
17
23
  from .sources import find_published
18
24
  from .venues import canonicalize
19
25
 
26
+ # Exit codes (part of the agent-facing contract):
27
+ # 0 success
28
+ # 2 the paper could not be resolved — ask for a stronger identifier
29
+ # 3 internal/network failure (sources down, unexpected error) — retry later
30
+ EXIT_NOT_FOUND = 2
31
+ EXIT_INTERNAL = 3
32
+
20
33
 
21
34
  def _log(msg: str):
22
35
  print(msg, file=sys.stderr)
@@ -42,21 +55,30 @@ def _emit(payload: dict, as_json: bool = True):
42
55
  # get
43
56
  # ---------------------------------------------------------------------------
44
57
 
45
def _resolve_or_none(query: str, require_published: bool) -> tuple[Resolved | None, int]:
    """Resolve ``query`` and return ``(result, exit_code)``.

    On success the result is paired with 0. On failure the result is None
    and the code tells an agent what to do next: ``EXIT_NOT_FOUND`` (try a
    better identifier) for ``NotFound``/``ValueError``, ``EXIT_INTERNAL``
    (retry later) for ``SourcesUnavailable`` or any unexpected exception.
    The reason is logged to stderr in every failure case.
    """
    try:
        resolved = resolve(query, require_published=require_published)
    except (NotFound, ValueError) as exc:
        message, code = f"[bibcite] {exc}", EXIT_NOT_FOUND
    except SourcesUnavailable as exc:
        message, code = f"[bibcite] sources unavailable: {exc}", EXIT_INTERNAL
    except Exception as exc:
        message = f"[bibcite] internal error: {type(exc).__name__}: {exc}"
        code = EXIT_INTERNAL
    else:
        return resolved, 0
    _log(message)
    return None, code
53
73
 
54
74
 
55
75
  def cmd_get(args) -> int:
56
76
  query = " ".join(args.query)
57
- res = _resolve_or_none(query, args.require_published)
77
+ if args.no_cache:
78
+ cache.DISABLED = True
79
+ res, code = _resolve_or_none(query, args.require_published)
58
80
  if res is None:
59
- return 2
81
+ return code
60
82
  _emit(
61
83
  {
62
84
  "action": "resolved",
@@ -76,48 +98,86 @@ def cmd_get(args) -> int:
76
98
  # add
77
99
  # ---------------------------------------------------------------------------
78
100
 
101
def _resolve_user_bibtex(text: str) -> Resolved:
    """Wrap a user-supplied BibTeX entry as a ``Resolved`` result.

    The venue is read from ``booktitle`` (falling back to ``journal``) and
    canonicalized. When a canonical venue is known, both venue fields are
    replaced by the canonical field/name and the entry type is set to match;
    otherwise the entry is passed through with its raw venue.
    """
    entry = bibfile.parse_bibtex_entry(text)
    raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
    canonical = canonicalize(raw_venue, entry.get("year"))
    if not canonical:
        return Resolved(entry, "user-bibtex", raw_venue, True)

    # Drop both possible venue fields before writing the canonical one.
    for venue_field in ("booktitle", "journal"):
        entry.pop(venue_field, None)
    entry["ENTRYTYPE"] = canonical.entry_type
    entry[canonical.bib_field] = canonical.name
    return Resolved(entry, "user-bibtex", canonical.name, True)
111
+
112
+
79
113
def cmd_add(args) -> int:
    """Resolve one or more papers and upsert them into ``args.file``.

    Input comes from exactly one of, in priority order: ``--bibtex`` (a raw
    entry, ``-`` for stdin), ``--from`` (a file with one query per line;
    blank lines and ``#`` comments are skipped), or the positional query.
    All entries are written first, bibtex-tidy runs once if anything changed
    (unless ``--no-tidy``), and the final keys are read back afterwards.

    A single non-batch add emits one flat JSON object; batch mode emits
    ``{"file", "tidied", "results": [...]}``.

    Returns 0 on success, otherwise the highest exit code of any failed
    query (``EXIT_NOT_FOUND`` or ``EXIT_INTERNAL``). Usage errors (no input,
    unparseable ``--bibtex``, unreadable ``--from`` file) return
    ``EXIT_NOT_FOUND`` without touching the target file.
    """
    path = Path(args.file)
    if args.no_cache:
        cache.DISABLED = True

    # Collect the queries for this invocation (single, --bibtex, or --from).
    if args.bibtex:
        text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
        try:
            resolutions = [("<bibtex>", _resolve_user_bibtex(text), 0)]
        except ValueError as e:
            _log(f"[bibcite] {e}")
            return EXIT_NOT_FOUND
    elif args.from_file:
        try:
            lines = Path(args.from_file).read_text().splitlines()
        except (OSError, UnicodeDecodeError) as e:
            # A missing/unreadable batch file is an input error: report it on
            # stderr like the other usage errors here instead of crashing.
            _log(f"[bibcite] cannot read {args.from_file}: {e}")
            return EXIT_NOT_FOUND
        queries = [q.strip() for q in lines if q.strip() and not q.strip().startswith("#")]
        resolutions = []
        for i, q in enumerate(queries):
            if i:
                time.sleep(1)  # one process shares the rate-limit breaker; stay polite
            _log(f"[bibcite] ({i + 1}/{len(queries)}) {q}")
            res, code = _resolve_or_none(q, args.require_published)
            resolutions.append((q, res, code))
    else:
        if not args.query:
            _log("[bibcite] provide a query (arXiv id / DOI / title), --bibtex, or --from")
            return EXIT_NOT_FOUND
        query = " ".join(args.query)
        res, code = _resolve_or_none(query, args.require_published)
        if res is None:
            return code
        resolutions = [(query, res, 0)]

    # Write all entries first, tidy once, then read back the final keys.
    results = []
    wrote = False
    for query, res, code in resolutions:
        if res is None:
            # Keep going in batch mode; the failure is reported per query.
            results.append({"query": query, "action": "failed", "exit_code": code})
            continue
        action, key = bibfile.upsert_entry(path, res.entry, replace=args.replace)
        wrote = wrote or action != "exists"
        results.append(
            {
                "query": query,
                "action": action,
                "key": key,
                "title": res.entry.get("title", ""),
                "venue": res.venue or "arXiv (preprint)",
                "published": res.published,
                "source": res.source,
            }
        )

    tidied = False
    if wrote and not args.no_tidy:
        tidied = bibfile.run_tidy(path)
    if tidied:
        # Failed results carry no title and are skipped here.
        for r in results:
            if r.get("title"):
                r["key"] = bibfile.key_after_tidy(path, r["title"], r["key"])

    # Only failed results carry "exit_code"; the worst one wins.
    exit_code = max((r.get("exit_code", 0) for r in results), default=0)
    if len(results) == 1 and not args.from_file:
        _emit({**results[0], "file": str(path), "tidied": tidied})
    else:
        _emit({"file": str(path), "tidied": tidied, "results": results})
    return exit_code
121
181
 
122
182
 
123
183
  # ---------------------------------------------------------------------------
@@ -139,6 +199,10 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
139
199
  for entry in db.entries:
140
200
  if not bibfile.is_preprint(entry):
141
201
  continue
202
+ if entry.get("pubstate", "").strip("{}") == "preprint":
203
+ # User-confirmed preprint-only (e.g. never-to-be-published arXiv
204
+ # reports): muted from upgrade and check.
205
+ continue
142
206
  title = entry.get("title", "").replace("{", "").replace("}", "")
143
207
  if not title:
144
208
  continue
@@ -150,9 +214,16 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
150
214
  hint = (
151
215
  first_author_last_name(entry["author"]) if entry.get("author") else ""
152
216
  )
153
- match = find_published(title, entry.get("year", ""), aid, hint)
217
+ match, status = find_published(title, entry.get("year", ""), aid, hint)
154
218
  if not match:
155
- report.append({"key": entry["ID"], "title": title, "matched": False})
219
+ # "no_published_version" is a trustworthy miss; "sources_unavailable"
220
+ # means the sources were down — do not conclude anything.
221
+ reason = (
222
+ "sources_unavailable" if status == "unavailable" else "no_published_version"
223
+ )
224
+ report.append(
225
+ {"key": entry["ID"], "title": title, "matched": False, "reason": reason}
226
+ )
156
227
  continue
157
228
  canonical = canonicalize(match.venue, match.year or entry.get("year"))
158
229
  venue_name = canonical.name if canonical else match.venue
@@ -192,13 +263,15 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
192
263
  matched = sum(1 for r in report if r["matched"])
193
264
  for r in report:
194
265
  mark = "✓" if r["matched"] else "✗"
195
- _log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
266
+ _log(f"{mark} {r['key']}: {r.get('venue') or r.get('reason', 'no match')}")
196
267
  _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if dry_run else ''}")
197
268
  return {"upgraded": changed, "matched": matched, "entries": report}
198
269
 
199
270
 
200
271
  def cmd_upgrade(args) -> int:
201
272
  path = Path(args.file)
273
+ if args.no_cache:
274
+ cache.DISABLED = True
202
275
  result = _upgrade_entries(path, args.dry_run)
203
276
  if result["upgraded"] and not args.no_tidy:
204
277
  bibfile.run_tidy(path)
@@ -230,8 +303,12 @@ def _check_problems(path: Path) -> tuple[int, list] | None:
230
303
  for f in ("author", "title", "year"):
231
304
  if not entry.get(f):
232
305
  problems.append({"key": key, "issue": f"missing {f}"})
233
- if bibfile.is_preprint(entry):
234
- problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})
306
+ if bibfile.is_preprint(entry) and entry.get("pubstate", "").strip("{}") != "preprint":
307
+ problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`, or set pubstate = {preprint} to mute)"})
308
+ author = entry.get("author", "")
309
+ letters = "".join(c for c in author if c.isalpha())
310
+ if letters and letters.isupper():
311
+ problems.append({"key": key, "issue": "author names are ALL CAPS"})
235
312
  for p in problems:
236
313
  _log(f"{p['key']}: {p['issue']}")
237
314
  _log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
@@ -248,9 +325,30 @@ def cmd_check(args) -> int:
248
325
  return 0
249
326
 
250
327
 
328
def cmd_remove(args) -> int:
    """Remove the entry with citation key ``args.key`` from ``args.file``.

    This is the supported alternative to hand-editing the file. A JSON
    result is emitted either way; bibtex-tidy runs only when an entry was
    actually removed and ``--no-tidy`` was not given. Returns 0 when an
    entry was removed, ``EXIT_NOT_FOUND`` otherwise.
    """
    bib_path = Path(args.file)
    was_removed = bibfile.remove_entry(bib_path, args.key)
    should_tidy = was_removed and not args.no_tidy
    did_tidy = bibfile.run_tidy(bib_path) if should_tidy else False
    payload = {
        "action": "removed" if was_removed else "not_found",
        "key": args.key,
        "file": str(bib_path),
        "tidied": did_tidy,
    }
    _emit(payload)
    if was_removed:
        return 0
    return EXIT_NOT_FOUND
345
+
346
+
251
347
  def cmd_fix(args) -> int:
252
348
  """One-shot cleanup: upgrade preprints, always tidy, then re-lint."""
253
349
  path = Path(args.file)
350
+ if args.no_cache:
351
+ cache.DISABLED = True
254
352
  if not path.exists():
255
353
  _log(f"[bibcite] {path} does not exist")
256
354
  return 1
@@ -282,20 +380,31 @@ def main(argv=None) -> int:
282
380
  g.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
283
381
  g.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
284
382
  g.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
383
+ g.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
285
384
  g.set_defaults(fn=cmd_get)
286
385
 
287
386
  a = sub.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
288
387
  a.add_argument("file", help="target .bib file (created if missing)")
289
388
  a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
290
389
  a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
390
+ a.add_argument("--from", dest="from_file", metavar="FILE", help="batch mode: one query per line (shares rate-limit state, tidies once)")
391
+ a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key)")
291
392
  a.add_argument("--no-tidy", action="store_true")
393
+ a.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
292
394
  a.add_argument("--require-published", action="store_true")
293
395
  a.set_defaults(fn=cmd_add)
294
396
 
397
+ rm = sub.add_parser("remove", help="delete an entry by citation key (prints JSON)")
398
+ rm.add_argument("file")
399
+ rm.add_argument("key", help="citation key of the entry to remove")
400
+ rm.add_argument("--no-tidy", action="store_true")
401
+ rm.set_defaults(fn=cmd_remove)
402
+
295
403
  u = sub.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
296
404
  u.add_argument("file")
297
405
  u.add_argument("--dry-run", action="store_true")
298
406
  u.add_argument("--no-tidy", action="store_true")
407
+ u.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
299
408
  u.set_defaults(fn=cmd_upgrade)
300
409
 
301
410
  t = sub.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
@@ -311,6 +420,7 @@ def main(argv=None) -> int:
311
420
  help="one-shot cleanup: upgrade preprints to published versions, tidy, then lint (prints JSON)",
312
421
  )
313
422
  f.add_argument("file")
423
+ f.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
314
424
  f.set_defaults(fn=cmd_fix)
315
425
 
316
426
  args = p.parse_args(argv)
@@ -76,6 +76,28 @@ def first_author_last_name(author_field: str) -> str:
76
76
  return mini_hash(last) or "anon"
77
77
 
78
78
 
79
def fix_author_caps(author_field: str) -> str:
    """Normalize ALL-CAPS author names in a BibTeX ``author`` field.

    Old CrossRef records store e.g. "EPPS, T. W. and PULLEY, LAWRENCE B.".
    Names are split on " and "; a name is touched only when all of its ASCII
    letters are uppercase, and within it a word is re-cased only when it has
    more than 2 ASCII letters, so spaced initials ("T.", "W.") and short
    names survive. Mixed-case names are returned unchanged.

    Re-casing keeps a capital on the first letter of the word and on the
    first letter after each hyphen, apostrophe, or period, so
    "SMITH-JONES" -> "Smith-Jones", "O'BRIEN" -> "O'Brien" and
    "J.R.R." stays "J.R.R.". Leading non-letters (braces, LaTeX accent
    commands) are skipped when looking for the first letter.
    """

    def recase(word: str) -> str:
        rebuilt = []
        capitalize_next = True
        previous = ""
        for ch in word.lower():
            if ch.isalpha():
                rebuilt.append(ch.upper() if capitalize_next else ch)
                capitalize_next = False
            else:
                rebuilt.append(ch)
                # A backslash-prefixed quote/hyphen is a LaTeX command
                # (e.g. \'e), not a name-part separator.
                if ch in "-'." and previous != "\\":
                    capitalize_next = True
            previous = ch
        return "".join(rebuilt)

    def fix_word(w: str) -> str:
        core = re.sub(r"[^A-Za-z]", "", w)
        if len(core) > 2 and core.isupper():
            return recase(w)
        return w

    def fix_name(name: str) -> str:
        letters = re.sub(r"[^A-Za-z]", "", name)
        if not letters.isupper():
            return name  # mixed case already (or no letters) — leave it alone
        return " ".join(fix_word(w) for w in name.split())

    names = re.split(r"\s+and\s+", author_field)
    return " and ".join(fix_name(n) for n in names)
99
+
100
+
79
101
  def make_key(author_field: str, year: str | int, title: str) -> str:
80
102
  """Deterministic citation key: <lastname><year><firstword>.
81
103
 
@@ -10,7 +10,17 @@ import sys
10
10
  from dataclasses import dataclass
11
11
 
12
12
  from .bibfile import NOISE_FIELDS, parse_bibtex_entry
13
- from .normalize import clean_title, first_author_last_name, make_key
13
+ from .normalize import clean_title, first_author_last_name, fix_author_caps, make_key
14
+
15
+
16
class NotFound(Exception):
    """No source could resolve the query — asking for a better identifier is
    the right next step.

    Raised by ``resolve`` for a trustworthy miss (e.g. a DOI unknown to
    CrossRef, or no published version when one is required), as opposed to
    an outage, which is reported with ``SourcesUnavailable``.
    """
19
+
20
+
21
class SourcesUnavailable(Exception):
    """Resolution failed because sources were down/rate-limited, NOT because
    the paper doesn't exist. Retrying later is the right next step.

    Raised by ``resolve`` when metadata could not be fetched at all, or when
    the publication lookup reported status "unavailable" so that a miss
    cannot be trusted.
    """
14
24
  from .sources import (
15
25
  ArxivMeta,
16
26
  Match,
@@ -84,7 +94,10 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
84
94
  if match.bibtex:
85
95
  try:
86
96
  entry = parse_bibtex_entry(match.bibtex)
87
- except ValueError:
97
+ except Exception as e:
98
+ # Bad source bibtex must degrade to field construction, never
99
+ # abort the resolution.
100
+ _log(f"[{match.source}] could not parse its BibTeX ({e}); building from fields")
88
101
  entry = {}
89
102
  if not entry:
90
103
  authors = match.authors or (meta.authors if meta else [])
@@ -100,6 +113,8 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
100
113
  entry.pop(f, None)
101
114
 
102
115
  entry["title"] = clean_title(entry.get("title", ""))
116
+ if entry.get("author"):
117
+ entry["author"] = fix_author_caps(entry["author"])
103
118
  if match.doi and not entry.get("doi"):
104
119
  entry["doi"] = match.doi
105
120
 
@@ -132,8 +147,12 @@ def _finalize(entry: dict, meta: ArxivMeta | None) -> dict:
132
147
  entry["archiveprefix"] = "arXiv"
133
148
  if meta.primary_class:
134
149
  entry["primaryclass"] = meta.primary_class
135
- elif entry.get("doi") and not entry.get("url"):
136
- entry["url"] = f"https://doi.org/{entry['doi']}"
150
+ elif entry.get("doi"):
151
+ url = entry.get("url", "")
152
+ # Modernize legacy resolver links (http://dx.doi.org/...) and fill in
153
+ # a missing url from the DOI.
154
+ if not url or "dx.doi.org" in url:
155
+ entry["url"] = f"https://doi.org/{entry['doi']}"
137
156
  author = entry.get("author", "") or "anonymous"
138
157
  year = entry.get("year", "") or "XXXX"
139
158
  entry["ID"] = make_key(author, year, entry.get("title", ""))
@@ -176,19 +195,23 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
176
195
  if meta is not None:
177
196
  break
178
197
  if meta is None:
179
- raise LookupError(
198
+ raise SourcesUnavailable(
180
199
  f"Could not fetch metadata for arXiv:{value} "
181
200
  "(arXiv API, Semantic Scholar, and arxiv.org all unavailable)"
182
201
  )
183
202
  _log(f"[arxiv] {meta.title} ({meta.year})")
184
203
  hint = first_author_last_name(meta.authors[0]) if meta.authors else ""
185
- match = find_published(meta.title, meta.year, meta.arxiv_id, hint)
204
+ match, status = find_published(meta.title, meta.year, meta.arxiv_id, hint)
186
205
  if match:
187
206
  entry = _entry_from_match(match, meta)
188
207
  venue = entry.pop("__venue", match.venue)
189
208
  return Resolved(_finalize(entry, meta), match.source, venue, True)
190
209
  if require_published:
191
- raise LookupError(f"No published version found for arXiv:{value}")
210
+ if status == "unavailable":
211
+ raise SourcesUnavailable(
212
+ f"Could not check publication status for arXiv:{value} (sources down)"
213
+ )
214
+ raise NotFound(f"No published version found for arXiv:{value}")
192
215
  _log("[bibcite] no published version found; using arXiv preprint entry")
193
216
  entry = _arxiv_only_entry(meta)
194
217
  return Resolved(_finalize(entry, meta), "arxiv", "", False)
@@ -196,7 +219,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
196
219
  if kind == "doi":
197
220
  match = crossref_by_doi(value)
198
221
  if not match or not match.title:
199
- raise LookupError(f"DOI not found on CrossRef: {value}")
222
+ raise NotFound(f"DOI not found on CrossRef: {value}")
200
223
  entry = _entry_from_match(match, None)
201
224
  venue = entry.pop("__venue", match.venue)
202
225
  return Resolved(_finalize(entry, None), match.source, venue, True)
@@ -212,7 +235,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
212
235
  if meta:
213
236
  _log(f"[openalex] metadata: arXiv {meta.arxiv_id or '?'} ({meta.year})")
214
237
  hint = first_author_last_name(meta.authors[0]) if meta and meta.authors else ""
215
- match = find_published(
238
+ match, status = find_published(
216
239
  meta.title if meta else value,
217
240
  meta.year if meta else "",
218
241
  meta.arxiv_id if meta else "",
@@ -224,11 +247,15 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
224
247
  return Resolved(_finalize(entry, meta), match.source, venue, True)
225
248
  if meta and meta.arxiv_id:
226
249
  if require_published:
227
- raise LookupError(f"Only an arXiv preprint was found for: {value}")
250
+ raise NotFound(f"Only an arXiv preprint was found for: {value}")
228
251
  _log("[bibcite] no published version found; using arXiv preprint entry")
229
252
  entry = _arxiv_only_entry(meta)
230
253
  return Resolved(_finalize(entry, meta), "arxiv", "", False)
231
- raise LookupError(f"No match found anywhere for: {value}")
254
+ if status == "unavailable":
255
+ raise SourcesUnavailable(
256
+ f"All sources were rate-limited or down while resolving: {value}"
257
+ )
258
+ raise NotFound(f"No match found anywhere for: {value}")
232
259
 
233
260
 
234
261
  def _openalex_meta(title: str) -> ArxivMeta | None:
@@ -595,19 +595,36 @@ _DISABLED: dict[str, str] = {}
595
595
 
596
596
  def find_published(
597
597
  title: str, year: str = "", arxiv_id: str = "", author_hint: str = ""
598
- ) -> Match | None:
599
- """Try each source in order; first verified hit wins."""
598
+ ) -> tuple[Match | None, str]:
599
+ """Try each source in order; first verified hit wins.
600
+
601
+ Returns (match, status). status distinguishes a trustworthy miss from an
602
+ outage: "found" | "not_found" (>=1 source answered cleanly with no hit) |
603
+ "unavailable" (every source was disabled or errored — do NOT conclude the
604
+ paper is unpublished).
605
+ """
606
+ from . import cache
607
+
608
+ cache_key = norm_title(title)
609
+ cached = cache.get(cache_key)
610
+ if cached:
611
+ _log(f"[cache] hit: {cached.get('venue', '')} ({cached.get('source', '')})")
612
+ return Match(**cached), "found"
613
+
614
+ clean_misses = 0
600
615
  for name, fn in CASCADE:
601
616
  if name in _DISABLED:
602
617
  continue
603
618
  try:
604
619
  m = fn(title, year, arxiv_id, author_hint)
605
620
  if m:
606
- return m
621
+ cache.put(cache_key, m.__dict__)
622
+ return m, "found"
623
+ clean_misses += 1
607
624
  _log(f"[{name}] no publication found")
608
625
  except SourceUnavailable as e:
609
626
  _DISABLED[name] = str(e)
610
627
  _log(f"[{name}] disabled for the rest of this run: {e}")
611
628
  except Exception as e: # network hiccup on one source must not kill the run
612
629
  _log(f"[{name}] error: {type(e).__name__}: {e}")
613
- return None
630
+ return None, ("not_found" if clean_misses else "unavailable")
@@ -0,0 +1,91 @@
1
+ """Regression tests for the first round of real-world bug reports."""
2
+
3
+ from pathlib import Path
4
+
5
+ from bibcite import cache
6
+ from bibcite.bibfile import parse_bibtex_entry, remove_entry, upsert_entry
7
+ from bibcite.normalize import fix_author_caps
8
+
9
+ # CrossRef's transform endpoint emits bare month macros (month=June) that are
10
+ # not in bibtexparser's common_strings — this used to KeyError('june').
11
+ CROSSREF_STYLE = (
12
+ " @article{Hyv_rinen_2000, title={Independent component analysis}, "
13
+ "volume={13}, url={http://dx.doi.org/10.1016/x}, DOI={10.1016/x}, "
14
+ "journal={Neural Networks}, author={Hyvärinen, A. and Oja, E.}, "
15
+ "year={2000}, month=June, pages={411–430} }"
16
+ )
17
+
18
+
19
+ def test_month_macro_full_name_parses():
20
+ entry = parse_bibtex_entry(CROSSREF_STYLE)
21
+ assert entry["title"] == "Independent component analysis"
22
+ assert "month" not in entry # month is a noise field, dropped
23
+
24
+
25
+ def test_month_macro_abbrev_parses():
26
+ entry = parse_bibtex_entry("@article{x, title={T}, year={2000}, month=jun }")
27
+ assert entry["title"] == "T"
28
+
29
+
30
+ def test_unknown_macro_raises_value_error_not_keyerror():
31
+ import pytest
32
+
33
+ with pytest.raises(ValueError, match="BibTeX parse failed"):
34
+ parse_bibtex_entry("@article{x, title = somemacro }")
35
+
36
+
37
+ def test_fix_author_caps():
38
+ assert (
39
+ fix_author_caps("EPPS, T. W. and PULLEY, LAWRENCE B.")
40
+ == "Epps, T. W. and Pulley, Lawrence B."
41
+ )
42
+ # Mixed-case names are never touched.
43
+ assert fix_author_caps("McDonald, J. and van der Berg, A.") == (
44
+ "McDonald, J. and van der Berg, A."
45
+ )
46
+ assert fix_author_caps("Ashish Vaswani") == "Ashish Vaswani"
47
+
48
+
49
+ PUB = {
50
+ "ENTRYTYPE": "inproceedings",
51
+ "ID": "k1",
52
+ "title": "Paper One",
53
+ "author": "A B",
54
+ "booktitle": "Some Conference (SC)",
55
+ "year": "2020",
56
+ }
57
+
58
+
59
+ def test_remove_entry(tmp_path: Path):
60
+ bib = tmp_path / "r.bib"
61
+ upsert_entry(bib, dict(PUB))
62
+ assert remove_entry(bib, "k1") is True
63
+ assert remove_entry(bib, "k1") is False # already gone
64
+ assert "Paper One" not in bib.read_text()
65
+
66
+
67
+ def test_upsert_replace_keeps_key(tmp_path: Path):
68
+ bib = tmp_path / "r.bib"
69
+ upsert_entry(bib, dict(PUB))
70
+ newer = dict(PUB, ID="differentkey", author="Fixed Author")
71
+ action, key = upsert_entry(bib, newer, replace=True)
72
+ assert (action, key) == ("replaced", "k1")
73
+ assert "Fixed Author" in bib.read_text()
74
+ # Without --replace, a published duplicate stays untouched.
75
+ action, key = upsert_entry(bib, newer)
76
+ assert (action, key) == ("exists", "k1")
77
+
78
+
79
+ def test_cache_roundtrip(tmp_path, monkeypatch):
80
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
81
+ monkeypatch.setattr(cache, "DISABLED", False)
82
+ assert cache.get("somekey") is None
83
+ cache.put("somekey", {"source": "dblp", "venue": "SC"})
84
+ assert cache.get("somekey")["venue"] == "SC"
85
+
86
+
87
+ def test_cache_disabled(tmp_path, monkeypatch):
88
+ monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
89
+ monkeypatch.setattr(cache, "DISABLED", True)
90
+ cache.put("k", {"venue": "X"})
91
+ assert cache.get("k") is None
@@ -18,7 +18,7 @@ wheels = [
18
18
 
19
19
  [[package]]
20
20
  name = "bibcite-cli"
21
- version = "0.2.0"
21
+ version = "0.3.0"
22
22
  source = { editable = "." }
23
23
  dependencies = [
24
24
  { name = "bibtexparser" },
File without changes
File without changes