PyPI - bibcite-cli - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

bibcite-cli 0.2.0 (tar.gz) → 0.3.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,8 @@
 Metadata-Version: 2.4
 Name: bibcite-cli
-Version: 0.2.0
+Version: 0.3.0
 Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
+Project-URL: Repository, https://github.com/leo1oel/bibcite
 License-Expression: MIT
 License-File: LICENSE
 Keywords: arxiv,bibliography,bibtex,citations,dblp
@@ -32,7 +33,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
 uv tool install --editable .
 # from git, no checkout needed
-uv tool install git+https://github.com/<you>/bibcite
+uv tool install git+https://github.com/leo1oel/bibcite
 # once published to PyPI (package name bibcite-cli, command name bibcite)
 uv tool install bibcite-cli   # or: uvx --from bibcite-cli bibcite ...
@@ -57,6 +58,13 @@ bibcite add refs.bib 2103.14030 --json
 # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
 bibcite add refs.bib --bibtex "$(pbpaste)"
+# Batch add (one query per line; shares rate-limit state, tidies once)
+bibcite add refs.bib --from ids.txt
+# Overwrite a bad existing entry (keeps its key), or delete one
+bibcite add refs.bib <query> --replace
+bibcite remove refs.bib <key>
 # One-shot cleanup: upgrade preprints → tidy → lint
 bibcite fix refs.bib
@@ -68,8 +76,11 @@ bibcite tidy refs.bib
 bibcite check refs.bib
 ```
-`--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
+`add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
 `add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
+Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
+Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
+Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
 ## For agents

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/Readme.md RENAMED Viewed

@@ -20,7 +20,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
 uv tool install --editable .
 # from git, no checkout needed
-uv tool install git+https://github.com/<you>/bibcite
+uv tool install git+https://github.com/leo1oel/bibcite
 # once published to PyPI (package name bibcite-cli, command name bibcite)
 uv tool install bibcite-cli   # or: uvx --from bibcite-cli bibcite ...
@@ -45,6 +45,13 @@ bibcite add refs.bib 2103.14030 --json
 # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
 bibcite add refs.bib --bibtex "$(pbpaste)"
+# Batch add (one query per line; shares rate-limit state, tidies once)
+bibcite add refs.bib --from ids.txt
+# Overwrite a bad existing entry (keeps its key), or delete one
+bibcite add refs.bib <query> --replace
+bibcite remove refs.bib <key>
 # One-shot cleanup: upgrade preprints → tidy → lint
 bibcite fix refs.bib
@@ -56,8 +63,11 @@ bibcite tidy refs.bib
 bibcite check refs.bib
 ```
-`--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
+`add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
 `add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
+Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
+Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
+Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
 ## For agents

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "bibcite-cli"
-version = "0.2.0"
+version = "0.3.0"
 description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
 readme = "Readme.md"
 license = "MIT"
@@ -11,6 +11,9 @@ dependencies = [
 ]
 keywords = ["bibtex", "arxiv", "citations", "dblp", "bibliography"]
+[project.urls]
+Repository = "https://github.com/leo1oel/bibcite"
 [project.scripts]
 bibcite = "bibcite.cli:main"

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
-__version__ = "0.2.0"
+__version__ = "0.3.0"

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/bibfile.py RENAMED Viewed

@@ -14,7 +14,10 @@ from bibtexparser.bwriter import BibTexWriter
 from .normalize import norm_title
 # The exact bibtex-tidy invocation requested by the user; keep in sync with
-# their LaTeX workflow.
+# their LaTeX workflow. NOTE: no --generate-keys — bibcite owns key
+# generation (make_key ASCII-folds names, so Hyvärinen -> hyvarinen2000...,
+# where tidy would emit hyv_arinen2000...), and stable keys keep existing
+# \cite{} commands valid.
 TIDY_ARGS = [
     "--modify",
     "--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
@@ -27,10 +30,26 @@ TIDY_ARGS = [
     "--sort-fields=author,title,booktitle,journal,year,url,pdf",
     "--strip-enclosing-braces",
     "--tidy-comments",
-    "--generate-keys",
 ]
-NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")
+NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref", "month")
+# BibTeX month macros. bibtexparser's common_strings only defines jan..dec;
+# CrossRef's transform endpoint emits bare full names (month=June), which
+# otherwise KeyError during string interpolation.
+MONTH_STRINGS = {
+    m[:3]: m.capitalize()
+    for m in (
+        "january february march april may june july august september "
+        "october november december"
+    ).split()
+} | {
+    m: m.capitalize()
+    for m in (
+        "january february march april may june july august september "
+        "october november december"
+    ).split()
+}
 ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
@@ -42,11 +61,18 @@ def _log(msg: str):
 def _parser() -> BibTexParser:
     p = BibTexParser(common_strings=True)
     p.ignore_nonstandard_types = False
+    p.bib_database.strings.update(MONTH_STRINGS)
     return p
 def parse_bib(text: str) -> BibDatabase:
-    return bibtexparser.loads(text, parser=_parser())
+    try:
+        return bibtexparser.loads(text, parser=_parser())
+    except Exception as e:
+        # Undefined @string macros raise bare KeyError('macro'); rewrap so
+        # callers see a real message and KeyError never masquerades as a
+        # LookupError "not found" upstream.
+        raise ValueError(f"BibTeX parse failed: {type(e).__name__}: {e}") from e
 def parse_bibtex_entry(text: str) -> dict:
@@ -117,10 +143,12 @@ def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = ""
     return None
-def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
+def upsert_entry(path: Path, entry: dict, replace: bool = False) -> tuple[str, str]:
     """Insert or upgrade ``entry`` in ``path``.
-    Returns (action, key) where action is "added" | "upgraded" | "exists".
+    Returns (action, key), action in "added" | "upgraded" | "exists" |
+    "replaced". With ``replace``, an existing matching entry is overwritten
+    (its citation key is kept so existing \\cite{} commands stay valid).
     """
     db = load_bib_file(path)
     if db is None:  # unparseable file: append blindly
@@ -132,13 +160,14 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
         db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
     )
     if existing is not None:
-        if is_preprint(existing) and not is_preprint(entry):
+        upgrade = is_preprint(existing) and not is_preprint(entry)
+        if replace or upgrade:
             key = existing["ID"]
             existing.clear()
-            existing.update(entry)
+            existing.update({k: str(v) for k, v in entry.items() if v})
             existing["ID"] = key  # keep the key the user may already \cite
             _write_db(path, db)
-            return "upgraded", key
+            return ("replaced" if replace else "upgraded"), key
         return "exists", existing["ID"]
     db.entries.append({k: str(v) for k, v in entry.items() if v})
@@ -146,6 +175,20 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
     return "added", entry["ID"]
+def remove_entry(path: Path, key: str) -> bool:
+    """Delete the entry with citation key ``key``. True if something was
+    removed."""
+    db = load_bib_file(path)
+    if db is None:
+        return False
+    before = len(db.entries)
+    db.entries = [e for e in db.entries if e.get("ID") != key]
+    if len(db.entries) == before:
+        return False
+    _write_db(path, db)
+    return True
 def _write_db(path: Path, db: BibDatabase):
     writer = BibTexWriter()
     writer.indent = "  "

bibcite_cli-0.3.0/src/bibcite/cache.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Local cache of successful publication matches.
+Keyed by normalized title. Only *published* matches are stored — a paper that
+is published stays published, while a preprint may get published tomorrow, so
+negative/preprint results are never cached. Re-running `fix`/`upgrade` or
+re-adding known papers therefore costs zero API calls.
+Disable with --no-cache or BIBCITE_NO_CACHE=1. Lives at
+$XDG_CACHE_HOME/bibcite/published.json (~/.cache/bibcite/published.json).
+"""
+import json
+import os
+import sys
+from pathlib import Path
+DISABLED = os.environ.get("BIBCITE_NO_CACHE", "") == "1"
+def _path() -> Path:
+    root = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
+    return Path(root).expanduser() / "bibcite" / "published.json"
+def _load() -> dict:
+    try:
+        return json.loads(_path().read_text())
+    except Exception:
+        return {}
+def get(key: str) -> dict | None:
+    if DISABLED or not key:
+        return None
+    return _load().get(key)
+def put(key: str, value: dict):
+    if DISABLED or not key:
+        return
+    try:
+        data = _load()
+        data[key] = value
+        p = _path()
+        p.parent.mkdir(parents=True, exist_ok=True)
+        p.write_text(json.dumps(data, ensure_ascii=False))
+    except Exception as e:  # cache must never break resolution
+        print(f"[cache] write failed: {e}", file=sys.stderr)

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/cli.py RENAMED Viewed

@@ -11,12 +11,25 @@ import sys
 import time
 from pathlib import Path
-from . import bibfile
+from . import bibfile, cache
 from .normalize import first_author_last_name, norm_title
-from .resolve import Resolved, guess_entry_type, resolve
+from .resolve import (
+    NotFound,
+    Resolved,
+    SourcesUnavailable,
+    guess_entry_type,
+    resolve,
+)
 from .sources import find_published
 from .venues import canonicalize
+# Exit codes (part of the agent-facing contract):
+#   0 success
+#   2 the paper could not be resolved — ask for a stronger identifier
+#   3 internal/network failure (sources down, unexpected error) — retry later
+EXIT_NOT_FOUND = 2
+EXIT_INTERNAL = 3
 def _log(msg: str):
     print(msg, file=sys.stderr)
@@ -42,21 +55,30 @@ def _emit(payload: dict, as_json: bool = True):
 # get
 # ---------------------------------------------------------------------------
-def _resolve_or_none(query: str, require_published: bool):
+def _resolve_or_none(query: str, require_published: bool) -> tuple[Resolved | None, int]:
+    """(result, exit_code). Distinguishes 'not found' (2) from 'tool/source
+    failure' (3) so agents know whether to retry with a better identifier or
+    just retry later."""
     try:
-        return resolve(query, require_published=require_published)
-    except (LookupError, ValueError) as e:
+        return resolve(query, require_published=require_published), 0
+    except (NotFound, ValueError) as e:
         _log(f"[bibcite] {e}")
+        return None, EXIT_NOT_FOUND
+    except SourcesUnavailable as e:
+        _log(f"[bibcite] sources unavailable: {e}")
+        return None, EXIT_INTERNAL
     except Exception as e:
-        _log(f"[bibcite] network error: {type(e).__name__}: {e}")
-    return None
+        _log(f"[bibcite] internal error: {type(e).__name__}: {e}")
+        return None, EXIT_INTERNAL
 def cmd_get(args) -> int:
     query = " ".join(args.query)
-    res = _resolve_or_none(query, args.require_published)
+    if args.no_cache:
+        cache.DISABLED = True
+    res, code = _resolve_or_none(query, args.require_published)
     if res is None:
-        return 2
+        return code
     _emit(
         {
             "action": "resolved",
@@ -76,48 +98,86 @@ def cmd_get(args) -> int:
 # add
 # ---------------------------------------------------------------------------
+def _resolve_user_bibtex(text: str) -> Resolved:
+    entry = bibfile.parse_bibtex_entry(text)
+    raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
+    canonical = canonicalize(raw_venue, entry.get("year"))
+    if canonical:
+        entry.pop("booktitle", None)
+        entry.pop("journal", None)
+        entry["ENTRYTYPE"] = canonical.entry_type
+        entry[canonical.bib_field] = canonical.name
+    return Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
 def cmd_add(args) -> int:
     path = Path(args.file)
+    if args.no_cache:
+        cache.DISABLED = True
+    # Collect the queries for this invocation (single, --bibtex, or --from).
     if args.bibtex:
         text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
-        entry = bibfile.parse_bibtex_entry(text)
-        raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
-        canonical = canonicalize(raw_venue, entry.get("year"))
-        if canonical:
-            entry.pop("booktitle", None)
-            entry.pop("journal", None)
-            entry["ENTRYTYPE"] = canonical.entry_type
-            entry[canonical.bib_field] = canonical.name
-        res = Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
+        try:
+            resolutions = [("<bibtex>", _resolve_user_bibtex(text), 0)]
+        except ValueError as e:
+            _log(f"[bibcite] {e}")
+            return EXIT_NOT_FOUND
+    elif args.from_file:
+        lines = Path(args.from_file).read_text().splitlines()
+        queries = [q.strip() for q in lines if q.strip() and not q.strip().startswith("#")]
+        resolutions = []
+        for i, q in enumerate(queries):
+            if i:
+                time.sleep(1)  # one process shares the rate-limit breaker; stay polite
+            _log(f"[bibcite] ({i + 1}/{len(queries)}) {q}")
+            res, code = _resolve_or_none(q, args.require_published)
+            resolutions.append((q, res, code))
     else:
         if not args.query:
-            _log("[bibcite] provide a query (arXiv id / DOI / title) or --bibtex")
-            return 2
+            _log("[bibcite] provide a query (arXiv id / DOI / title), --bibtex, or --from")
+            return EXIT_NOT_FOUND
         query = " ".join(args.query)
-        res = _resolve_or_none(query, args.require_published)
+        res, code = _resolve_or_none(query, args.require_published)
+        if res is None:
+            return code
+        resolutions = [(query, res, 0)]
+    # Write all entries first, tidy once, then read back the final keys.
+    results = []
+    wrote = False
+    for query, res, code in resolutions:
         if res is None:
-            return 2
+            results.append({"query": query, "action": "failed", "exit_code": code})
+            continue
+        action, key = bibfile.upsert_entry(path, res.entry, replace=args.replace)
+        wrote = wrote or action != "exists"
+        results.append(
+            {
+                "query": query,
+                "action": action,
+                "key": key,
+                "title": res.entry.get("title", ""),
+                "venue": res.venue or "arXiv (preprint)",
+                "published": res.published,
+                "source": res.source,
+            }
+        )
-    action, key = bibfile.upsert_entry(path, res.entry)
     tidied = False
-    if action != "exists" and not args.no_tidy:
+    if wrote and not args.no_tidy:
         tidied = bibfile.run_tidy(path)
         if tidied:
-            key = bibfile.key_after_tidy(path, res.entry.get("title", ""), key)
+            for r in results:
+                if r.get("title"):
+                    r["key"] = bibfile.key_after_tidy(path, r["title"], r["key"])
-    _emit(
-        {
-            "action": action,
-            "key": key,
-            "title": res.entry.get("title", ""),
-            "venue": res.venue or "arXiv (preprint)",
-            "published": res.published,
-            "source": res.source,
-            "file": str(path),
-            "tidied": tidied,
-        }
-    )
-    return 0
+    exit_code = max((r.get("exit_code", 0) for r in results), default=0)
+    if len(results) == 1 and not args.from_file:
+        _emit({**results[0], "file": str(path), "tidied": tidied})
+    else:
+        _emit({"file": str(path), "tidied": tidied, "results": results})
+    return exit_code
 # ---------------------------------------------------------------------------
@@ -139,6 +199,10 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
     for entry in db.entries:
         if not bibfile.is_preprint(entry):
             continue
+        if entry.get("pubstate", "").strip("{}") == "preprint":
+            # User-confirmed preprint-only (e.g. never-to-be-published arXiv
+            # reports): muted from upgrade and check.
+            continue
         title = entry.get("title", "").replace("{", "").replace("}", "")
         if not title:
             continue
@@ -150,9 +214,16 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
         hint = (
             first_author_last_name(entry["author"]) if entry.get("author") else ""
         )
-        match = find_published(title, entry.get("year", ""), aid, hint)
+        match, status = find_published(title, entry.get("year", ""), aid, hint)
         if not match:
-            report.append({"key": entry["ID"], "title": title, "matched": False})
+            # "no_published_version" is a trustworthy miss; "sources_unavailable"
+            # means the sources were down — do not conclude anything.
+            reason = (
+                "sources_unavailable" if status == "unavailable" else "no_published_version"
+            )
+            report.append(
+                {"key": entry["ID"], "title": title, "matched": False, "reason": reason}
+            )
             continue
         canonical = canonicalize(match.venue, match.year or entry.get("year"))
         venue_name = canonical.name if canonical else match.venue
@@ -192,13 +263,15 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
     matched = sum(1 for r in report if r["matched"])
     for r in report:
         mark = "✓" if r["matched"] else "✗"
-        _log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
+        _log(f"{mark} {r['key']}: {r.get('venue') or r.get('reason', 'no match')}")
     _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if dry_run else ''}")
     return {"upgraded": changed, "matched": matched, "entries": report}
 def cmd_upgrade(args) -> int:
     path = Path(args.file)
+    if args.no_cache:
+        cache.DISABLED = True
     result = _upgrade_entries(path, args.dry_run)
     if result["upgraded"] and not args.no_tidy:
         bibfile.run_tidy(path)
@@ -230,8 +303,12 @@ def _check_problems(path: Path) -> tuple[int, list] | None:
         for f in ("author", "title", "year"):
             if not entry.get(f):
                 problems.append({"key": key, "issue": f"missing {f}"})
-        if bibfile.is_preprint(entry):
-            problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})
+        if bibfile.is_preprint(entry) and entry.get("pubstate", "").strip("{}") != "preprint":
+            problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`, or set pubstate = {preprint} to mute)"})
+        author = entry.get("author", "")
+        letters = "".join(c for c in author if c.isalpha())
+        if letters and letters.isupper():
+            problems.append({"key": key, "issue": "author names are ALL CAPS"})
     for p in problems:
         _log(f"{p['key']}: {p['issue']}")
     _log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
@@ -248,9 +325,30 @@ def cmd_check(args) -> int:
     return 0
+def cmd_remove(args) -> int:
+    """Delete an entry by citation key — the sanctioned way to drop a bad
+    entry without hand-editing the file."""
+    path = Path(args.file)
+    removed = bibfile.remove_entry(path, args.key)
+    tidied = False
+    if removed and not args.no_tidy:
+        tidied = bibfile.run_tidy(path)
+    _emit(
+        {
+            "action": "removed" if removed else "not_found",
+            "key": args.key,
+            "file": str(path),
+            "tidied": tidied,
+        }
+    )
+    return 0 if removed else EXIT_NOT_FOUND
 def cmd_fix(args) -> int:
     """One-shot cleanup: upgrade preprints, always tidy, then re-lint."""
     path = Path(args.file)
+    if args.no_cache:
+        cache.DISABLED = True
     if not path.exists():
         _log(f"[bibcite] {path} does not exist")
         return 1
@@ -282,20 +380,31 @@ def main(argv=None) -> int:
     g.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
     g.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
     g.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
+    g.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
     g.set_defaults(fn=cmd_get)
     a = sub.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
     a.add_argument("file", help="target .bib file (created if missing)")
     a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
     a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
+    a.add_argument("--from", dest="from_file", metavar="FILE", help="batch mode: one query per line (shares rate-limit state, tidies once)")
+    a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key)")
     a.add_argument("--no-tidy", action="store_true")
+    a.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
     a.add_argument("--require-published", action="store_true")
     a.set_defaults(fn=cmd_add)
+    rm = sub.add_parser("remove", help="delete an entry by citation key (prints JSON)")
+    rm.add_argument("file")
+    rm.add_argument("key", help="citation key of the entry to remove")
+    rm.add_argument("--no-tidy", action="store_true")
+    rm.set_defaults(fn=cmd_remove)
     u = sub.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
     u.add_argument("file")
     u.add_argument("--dry-run", action="store_true")
     u.add_argument("--no-tidy", action="store_true")
+    u.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
     u.set_defaults(fn=cmd_upgrade)
     t = sub.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
@@ -311,6 +420,7 @@ def main(argv=None) -> int:
         help="one-shot cleanup: upgrade preprints to published versions, tidy, then lint (prints JSON)",
     )
     f.add_argument("file")
+    f.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
     f.set_defaults(fn=cmd_fix)
     args = p.parse_args(argv)

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/normalize.py RENAMED Viewed

@@ -76,6 +76,28 @@ def first_author_last_name(author_field: str) -> str:
     return mini_hash(last) or "anon"
+def fix_author_caps(author_field: str) -> str:
+    """Normalize ALL-CAPS author names (old CrossRef records store e.g.
+    "EPPS, T. W. and PULLEY, LAWRENCE B."). A word is re-cased only when it
+    is fully uppercase and longer than 2 letters, so initials ("T.", "W.")
+    and legitimately-capitalized short names survive."""
+    def fix_word(w: str) -> str:
+        core = re.sub(r"[^A-Za-z]", "", w)
+        if len(core) > 2 and core.isupper():
+            return w.capitalize()
+        return w
+    def fix_name(name: str) -> str:
+        letters = re.sub(r"[^A-Za-z]", "", name)
+        if not letters.isupper():
+            return name  # mixed case already — leave it alone
+        return " ".join(fix_word(w) for w in name.split())
+    names = re.split(r"\s+and\s+", author_field)
+    return " and ".join(fix_name(n) for n in names)
 def make_key(author_field: str, year: str | int, title: str) -> str:
     """Deterministic citation key: <lastname><year><firstword>.

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/resolve.py RENAMED Viewed

@@ -10,7 +10,17 @@ import sys
 from dataclasses import dataclass
 from .bibfile import NOISE_FIELDS, parse_bibtex_entry
-from .normalize import clean_title, first_author_last_name, make_key
+from .normalize import clean_title, first_author_last_name, fix_author_caps, make_key
+class NotFound(Exception):
+    """No source could resolve the query — asking for a better identifier is
+    the right next step."""
+class SourcesUnavailable(Exception):
+    """Resolution failed because sources were down/rate-limited, NOT because
+    the paper doesn't exist. Retrying later is the right next step."""
 from .sources import (
     ArxivMeta,
     Match,
@@ -84,7 +94,10 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
     if match.bibtex:
         try:
             entry = parse_bibtex_entry(match.bibtex)
-        except ValueError:
+        except Exception as e:
+            # Bad source bibtex must degrade to field construction, never
+            # abort the resolution.
+            _log(f"[{match.source}] could not parse its BibTeX ({e}); building from fields")
             entry = {}
     if not entry:
         authors = match.authors or (meta.authors if meta else [])
@@ -100,6 +113,8 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
         entry.pop(f, None)
     entry["title"] = clean_title(entry.get("title", ""))
+    if entry.get("author"):
+        entry["author"] = fix_author_caps(entry["author"])
     if match.doi and not entry.get("doi"):
         entry["doi"] = match.doi
@@ -132,8 +147,12 @@ def _finalize(entry: dict, meta: ArxivMeta | None) -> dict:
         entry["archiveprefix"] = "arXiv"
         if meta.primary_class:
             entry["primaryclass"] = meta.primary_class
-    elif entry.get("doi") and not entry.get("url"):
-        entry["url"] = f"https://doi.org/{entry['doi']}"
+    elif entry.get("doi"):
+        url = entry.get("url", "")
+        # Modernize legacy resolver links (http://dx.doi.org/...) and fill in
+        # a missing url from the DOI.
+        if not url or "dx.doi.org" in url:
+            entry["url"] = f"https://doi.org/{entry['doi']}"
     author = entry.get("author", "") or "anonymous"
     year = entry.get("year", "") or "XXXX"
     entry["ID"] = make_key(author, year, entry.get("title", ""))
@@ -176,19 +195,23 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
                 if meta is not None:
                     break
             if meta is None:
-                raise LookupError(
+                raise SourcesUnavailable(
                     f"Could not fetch metadata for arXiv:{value} "
                     "(arXiv API, Semantic Scholar, and arxiv.org all unavailable)"
                 )
         _log(f"[arxiv] {meta.title} ({meta.year})")
         hint = first_author_last_name(meta.authors[0]) if meta.authors else ""
-        match = find_published(meta.title, meta.year, meta.arxiv_id, hint)
+        match, status = find_published(meta.title, meta.year, meta.arxiv_id, hint)
         if match:
             entry = _entry_from_match(match, meta)
             venue = entry.pop("__venue", match.venue)
             return Resolved(_finalize(entry, meta), match.source, venue, True)
         if require_published:
-            raise LookupError(f"No published version found for arXiv:{value}")
+            if status == "unavailable":
+                raise SourcesUnavailable(
+                    f"Could not check publication status for arXiv:{value} (sources down)"
+                )
+            raise NotFound(f"No published version found for arXiv:{value}")
         _log("[bibcite] no published version found; using arXiv preprint entry")
         entry = _arxiv_only_entry(meta)
         return Resolved(_finalize(entry, meta), "arxiv", "", False)
@@ -196,7 +219,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
     if kind == "doi":
         match = crossref_by_doi(value)
         if not match or not match.title:
-            raise LookupError(f"DOI not found on CrossRef: {value}")
+            raise NotFound(f"DOI not found on CrossRef: {value}")
         entry = _entry_from_match(match, None)
         venue = entry.pop("__venue", match.venue)
         return Resolved(_finalize(entry, None), match.source, venue, True)
@@ -212,7 +235,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
         if meta:
             _log(f"[openalex] metadata: arXiv {meta.arxiv_id or '?'} ({meta.year})")
     hint = first_author_last_name(meta.authors[0]) if meta and meta.authors else ""
-    match = find_published(
+    match, status = find_published(
         meta.title if meta else value,
         meta.year if meta else "",
         meta.arxiv_id if meta else "",
@@ -224,11 +247,15 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
         return Resolved(_finalize(entry, meta), match.source, venue, True)
     if meta and meta.arxiv_id:
         if require_published:
-            raise LookupError(f"Only an arXiv preprint was found for: {value}")
+            raise NotFound(f"Only an arXiv preprint was found for: {value}")
         _log("[bibcite] no published version found; using arXiv preprint entry")
         entry = _arxiv_only_entry(meta)
         return Resolved(_finalize(entry, meta), "arxiv", "", False)
-    raise LookupError(f"No match found anywhere for: {value}")
+    if status == "unavailable":
+        raise SourcesUnavailable(
+            f"All sources were rate-limited or down while resolving: {value}"
+        )
+    raise NotFound(f"No match found anywhere for: {value}")
 def _openalex_meta(title: str) -> ArxivMeta | None:

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/sources.py RENAMED Viewed

@@ -595,19 +595,36 @@ _DISABLED: dict[str, str] = {}
 def find_published(
     title: str, year: str = "", arxiv_id: str = "", author_hint: str = ""
-) -> Match | None:
-    """Try each source in order; first verified hit wins."""
+) -> tuple[Match | None, str]:
+    """Try each source in order; first verified hit wins.
+    Returns (match, status). status distinguishes a trustworthy miss from an
+    outage: "found" | "not_found" (>=1 source answered cleanly with no hit) |
+    "unavailable" (every source was disabled or errored — do NOT conclude the
+    paper is unpublished).
+    """
+    from . import cache
+    cache_key = norm_title(title)
+    cached = cache.get(cache_key)
+    if cached:
+        _log(f"[cache] hit: {cached.get('venue', '')} ({cached.get('source', '')})")
+        return Match(**cached), "found"
+    clean_misses = 0
     for name, fn in CASCADE:
         if name in _DISABLED:
             continue
         try:
             m = fn(title, year, arxiv_id, author_hint)
             if m:
-                return m
+                cache.put(cache_key, m.__dict__)
+                return m, "found"
+            clean_misses += 1
             _log(f"[{name}] no publication found")
         except SourceUnavailable as e:
             _DISABLED[name] = str(e)
             _log(f"[{name}] disabled for the rest of this run: {e}")
         except Exception as e:  # network hiccup on one source must not kill the run
             _log(f"[{name}] error: {type(e).__name__}: {e}")
-    return None
+    return None, ("not_found" if clean_misses else "unavailable")

bibcite_cli-0.3.0/tests/test_bugfixes.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""Regression tests for the first round of real-world bug reports."""
+from pathlib import Path
+from bibcite import cache
+from bibcite.bibfile import parse_bibtex_entry, remove_entry, upsert_entry
+from bibcite.normalize import fix_author_caps
+# CrossRef's transform endpoint emits bare month macros (month=June) that are
+# not in bibtexparser's common_strings — this used to KeyError('june').
+CROSSREF_STYLE = (
+    " @article{Hyv_rinen_2000, title={Independent component analysis}, "
+    "volume={13}, url={http://dx.doi.org/10.1016/x}, DOI={10.1016/x}, "
+    "journal={Neural Networks}, author={Hyvärinen, A. and Oja, E.}, "
+    "year={2000}, month=June, pages={411–430} }"
+)
+def test_month_macro_full_name_parses():
+    entry = parse_bibtex_entry(CROSSREF_STYLE)
+    assert entry["title"] == "Independent component analysis"
+    assert "month" not in entry  # month is a noise field, dropped
+def test_month_macro_abbrev_parses():
+    entry = parse_bibtex_entry("@article{x, title={T}, year={2000}, month=jun }")
+    assert entry["title"] == "T"
+def test_unknown_macro_raises_value_error_not_keyerror():
+    import pytest
+    with pytest.raises(ValueError, match="BibTeX parse failed"):
+        parse_bibtex_entry("@article{x, title = somemacro }")
+def test_fix_author_caps():
+    assert (
+        fix_author_caps("EPPS, T. W. and PULLEY, LAWRENCE B.")
+        == "Epps, T. W. and Pulley, Lawrence B."
+    )
+    # Mixed-case names are never touched.
+    assert fix_author_caps("McDonald, J. and van der Berg, A.") == (
+        "McDonald, J. and van der Berg, A."
+    )
+    assert fix_author_caps("Ashish Vaswani") == "Ashish Vaswani"
+PUB = {
+    "ENTRYTYPE": "inproceedings",
+    "ID": "k1",
+    "title": "Paper One",
+    "author": "A B",
+    "booktitle": "Some Conference (SC)",
+    "year": "2020",
+}
+def test_remove_entry(tmp_path: Path):
+    bib = tmp_path / "r.bib"
+    upsert_entry(bib, dict(PUB))
+    assert remove_entry(bib, "k1") is True
+    assert remove_entry(bib, "k1") is False  # already gone
+    assert "Paper One" not in bib.read_text()
+def test_upsert_replace_keeps_key(tmp_path: Path):
+    bib = tmp_path / "r.bib"
+    upsert_entry(bib, dict(PUB))
+    newer = dict(PUB, ID="differentkey", author="Fixed Author")
+    action, key = upsert_entry(bib, newer, replace=True)
+    assert (action, key) == ("replaced", "k1")
+    assert "Fixed Author" in bib.read_text()
+    # Without --replace, a published duplicate stays untouched.
+    action, key = upsert_entry(bib, newer)
+    assert (action, key) == ("exists", "k1")
+def test_cache_roundtrip(tmp_path, monkeypatch):
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
+    monkeypatch.setattr(cache, "DISABLED", False)
+    assert cache.get("somekey") is None
+    cache.put("somekey", {"source": "dblp", "venue": "SC"})
+    assert cache.get("somekey")["venue"] == "SC"
+def test_cache_disabled(tmp_path, monkeypatch):
+    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
+    monkeypatch.setattr(cache, "DISABLED", True)
+    cache.put("k", {"venue": "X"})
+    assert cache.get("k") is None

{bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/uv.lock RENAMED Viewed

@@ -18,7 +18,7 @@ wheels = [
 [[package]]
 name = "bibcite-cli"
-version = "0.2.0"
+version = "0.3.0"
 source = { editable = "." }
 dependencies = [
     { name = "bibtexparser" },