PyPI - bibcite-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

bibcite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

bibcite/__init__.py +3 -0
bibcite/bibfile.py +194 -0
bibcite/cli.py +272 -0
bibcite/data/strings.bib +352 -0
bibcite/normalize.py +86 -0
bibcite/resolve.py +289 -0
bibcite/sources.py +593 -0
bibcite/venues.py +241 -0
bibcite_cli-0.1.0.dist-info/METADATA +74 -0
bibcite_cli-0.1.0.dist-info/RECORD +13 -0
bibcite_cli-0.1.0.dist-info/WHEEL +4 -0
bibcite_cli-0.1.0.dist-info/entry_points.txt +2 -0
bibcite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0

bibcite/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
+__version__ = "0.1.0"

bibcite/bibfile.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""Reading/writing .bib files, deduplication, and the bibtex-tidy runner."""
+import re
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+import bibtexparser
+from bibtexparser.bibdatabase import BibDatabase
+from bibtexparser.bparser import BibTexParser
+from bibtexparser.bwriter import BibTexWriter
+from .normalize import norm_title
+# The exact bibtex-tidy invocation requested by the user; keep in sync with
+# their LaTeX workflow.
+TIDY_ARGS = [
+    "--modify",
+    "--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
+    "--curly",
+    "--blank-lines",
+    "--trailing-commas",
+    "--sort=-year",
+    "--duplicates=citation",
+    "--merge=first",
+    "--sort-fields=author,title,booktitle,journal,year,url,pdf",
+    "--strip-enclosing-braces",
+    "--tidy-comments",
+    "--generate-keys",
+]
+# Field names that parse_bibtex_entry() removes from the entry it returns.
+NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")
+# Matches four digits, a dot, then four or five digits (e.g. 1706.03762),
+# optionally followed by a version suffix such as "v2". Group 1 is the
+# identifier without the version suffix. The pattern is unanchored.
+ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
+def _log(msg: str):
+    print(msg, file=sys.stderr)
+def _parser() -> BibTexParser:
+    p = BibTexParser(common_strings=True)
+    p.ignore_nonstandard_types = False
+    return p
+def parse_bib(text: str) -> BibDatabase:
+    return bibtexparser.loads(text, parser=_parser())
+def parse_bibtex_entry(text: str) -> dict:
+    """First entry of a bibtex string as a dict (fields + ID + ENTRYTYPE)."""
+    db = parse_bib(text)
+    if not db.entries:
+        raise ValueError("No BibTeX entry could be parsed")
+    entry = dict(db.entries[0])
+    for f in NOISE_FIELDS:
+        entry.pop(f, None)
+    return entry
+def entry_to_bibtex(entry: dict) -> str:
+    db = BibDatabase()
+    db.entries = [{k: str(v) for k, v in entry.items() if v}]
+    writer = BibTexWriter()
+    writer.indent = "  "
+    return bibtexparser.dumps(db, writer).strip() + "\n"
+def entry_arxiv_id(entry: dict) -> str:
+    """Extract an arXiv id from eprint/url/journal/note fields, if any."""
+    for f in ("eprint", "url", "journal", "note", "doi"):
+        v = entry.get(f, "")
+        if "arxiv" in v.lower() or f == "eprint":
+            m = ARXIV_ID_RE.search(v)
+            if m:
+                return m.group(1)
+    return ""
+def is_preprint(entry: dict) -> bool:
+    """Preprint = the venue fields say arXiv/preprint, or there is no venue.
+    eprint/archiveprefix/url fields do NOT count: published entries keep
+    their arXiv pointers.
+    """
+    venue = " ".join(
+        str(entry.get(f, "")) for f in ("journal", "booktitle", "howpublished")
+    ).lower()
+    if "arxiv" in venue or "preprint" in venue or "corr" in venue.split():
+        return True
+    return not entry.get("journal") and not entry.get("booktitle")
+def load_bib_file(path: Path) -> BibDatabase | None:
+    """Parse an existing .bib file; None when it cannot be parsed (we then
+    degrade to append-only mode)."""
+    if not path.exists() or not path.read_text().strip():
+        return BibDatabase()
+    try:
+        return parse_bib(path.read_text())
+    except Exception as e:
+        _log(f"[bibcite] warning: could not parse {path} ({e}); appending without dedup")
+        return None
+def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = "") -> dict | None:
+    ref = norm_title(title)
+    for entry in db.entries:
+        if arxiv_id and entry_arxiv_id(entry) == arxiv_id:
+            return entry
+        if doi and entry.get("doi", "").lower() == doi.lower():
+            return entry
+        if ref and norm_title(entry.get("title", "")) == ref:
+            return entry
+    return None
+def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
+    """Insert or upgrade ``entry`` in ``path``.
+    Returns (action, key) where action is "added" | "upgraded" | "exists".
+    """
+    db = load_bib_file(path)
+    if db is None:  # unparseable file: append blindly
+        with path.open("a") as f:
+            f.write("\n" + entry_to_bibtex(entry))
+        return "added", entry["ID"]
+    existing = find_existing(
+        db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
+    )
+    if existing is not None:
+        if is_preprint(existing) and not is_preprint(entry):
+            key = existing["ID"]
+            existing.clear()
+            existing.update(entry)
+            existing["ID"] = key  # keep the key the user may already \cite
+            _write_db(path, db)
+            return "upgraded", key
+        return "exists", existing["ID"]
+    db.entries.append({k: str(v) for k, v in entry.items() if v})
+    _write_db(path, db)
+    return "added", entry["ID"]
+def _write_db(path: Path, db: BibDatabase):
+    writer = BibTexWriter()
+    writer.indent = "  "
+    writer.order_entries_by = None  # preserve file order; tidy re-sorts anyway
+    path.write_text(bibtexparser.dumps(db, writer))
+# ---------------------------------------------------------------------------
+# bibtex-tidy
+# ---------------------------------------------------------------------------
+def tidy_command() -> list[str] | None:
+    exe = shutil.which("bibtex-tidy")
+    if exe:
+        return [exe]
+    if shutil.which("npx"):
+        return ["npx", "--yes", "bibtex-tidy"]
+    return None
+def run_tidy(path: Path) -> bool:
+    cmd = tidy_command()
+    if cmd is None:
+        _log("[bibcite] bibtex-tidy not found (npm i -g bibtex-tidy); skipping tidy")
+        return False
+    proc = subprocess.run(
+        cmd + [str(path)] + TIDY_ARGS, capture_output=True, text=True
+    )
+    if proc.returncode != 0:
+        _log(f"[bibcite] bibtex-tidy failed:\n{proc.stderr.strip()}")
+        return False
+    _log(f"[bibcite] bibtex-tidy: {proc.stdout.strip().splitlines()[-1] if proc.stdout.strip() else 'ok'}")
+    return True
+def key_after_tidy(path: Path, title: str, fallback_key: str) -> str:
+    """bibtex-tidy --generate-keys rewrites keys; re-read the file to report
+    the final key for the entry with this title."""
+    db = load_bib_file(path)
+    if db is None:
+        return fallback_key
+    ref = norm_title(title)
+    for entry in db.entries:
+        if norm_title(entry.get("title", "")) == ref:
+            return entry["ID"]
+    return fallback_key

bibcite/cli.py ADDED Viewed

@@ -0,0 +1,272 @@
+"""bibcite CLI.
+Designed to be called by agents: never hand-edit a .bib file — let
+``bibcite add`` resolve, canonicalize, dedupe, write, and tidy, then use the
+citation key it prints.
+"""
+import argparse
+import json
+import sys
+from pathlib import Path
+from . import bibfile
+from .normalize import first_author_last_name, norm_title
+from .resolve import Resolved, guess_entry_type, resolve
+from .sources import find_published
+from .venues import canonicalize
+def _log(msg: str):
+    print(msg, file=sys.stderr)
+def _emit(payload: dict, as_json: bool = True):
+    """File-mutating commands always print one JSON object on stdout — the
+    agent-facing contract. Only `get` has a plain mode (BibTeX on stdout for
+    previewing/piping)."""
+    if as_json:
+        print(json.dumps(payload, ensure_ascii=False, indent=2))
+    else:
+        for k, v in payload.items():
+            if k != "bibtex":
+                _log(f"{k}: {v}")
+        if payload.get("bibtex"):
+            print(payload["bibtex"], end="")
+        elif payload.get("key"):
+            print(payload["key"])
+# ---------------------------------------------------------------------------
+# get
+# ---------------------------------------------------------------------------
+def _resolve_or_none(query: str, require_published: bool):
+    try:
+        return resolve(query, require_published=require_published)
+    except (LookupError, ValueError) as e:
+        _log(f"[bibcite] {e}")
+    except Exception as e:
+        _log(f"[bibcite] network error: {type(e).__name__}: {e}")
+    return None
+def cmd_get(args) -> int:
+    query = " ".join(args.query)
+    res = _resolve_or_none(query, args.require_published)
+    if res is None:
+        return 2
+    _emit(
+        {
+            "action": "resolved",
+            "key": res.entry["ID"],
+            "title": res.entry.get("title", ""),
+            "venue": res.venue or "arXiv (preprint, no published venue found)",
+            "published": res.published,
+            "source": res.source,
+            "bibtex": res.bibtex,
+        },
+        args.json,
+    )
+    return 0
+# ---------------------------------------------------------------------------
+# add
+# ---------------------------------------------------------------------------
+def cmd_add(args) -> int:
+    path = Path(args.file)
+    if args.bibtex:
+        text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
+        entry = bibfile.parse_bibtex_entry(text)
+        raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
+        canonical = canonicalize(raw_venue, entry.get("year"))
+        if canonical:
+            entry.pop("booktitle", None)
+            entry.pop("journal", None)
+            entry["ENTRYTYPE"] = canonical.entry_type
+            entry[canonical.bib_field] = canonical.name
+        res = Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
+    else:
+        if not args.query:
+            _log("[bibcite] provide a query (arXiv id / DOI / title) or --bibtex")
+            return 2
+        query = " ".join(args.query)
+        res = _resolve_or_none(query, args.require_published)
+        if res is None:
+            return 2
+    action, key = bibfile.upsert_entry(path, res.entry)
+    tidied = False
+    if action != "exists" and not args.no_tidy:
+        tidied = bibfile.run_tidy(path)
+        if tidied:
+            key = bibfile.key_after_tidy(path, res.entry.get("title", ""), key)
+    _emit(
+        {
+            "action": action,
+            "key": key,
+            "title": res.entry.get("title", ""),
+            "venue": res.venue or "arXiv (preprint)",
+            "published": res.published,
+            "source": res.source,
+            "file": str(path),
+            "tidied": tidied,
+        }
+    )
+    return 0
+# ---------------------------------------------------------------------------
+# upgrade: batch-match arXiv entries in an existing file (bibMatcher, CLI-style)
+# ---------------------------------------------------------------------------
+def cmd_upgrade(args) -> int:
+    path = Path(args.file)
+    db = bibfile.load_bib_file(path)
+    if db is None or not db.entries:
+        _log(f"[bibcite] nothing to do in {path}")
+        return 0
+    report = []
+    changed = 0
+    for entry in db.entries:
+        if not bibfile.is_preprint(entry):
+            continue
+        title = entry.get("title", "").replace("{", "").replace("}", "")
+        if not title:
+            continue
+        _log(f"[upgrade] matching: {title[:80]}")
+        aid = bibfile.entry_arxiv_id(entry)
+        hint = (
+            first_author_last_name(entry["author"]) if entry.get("author") else ""
+        )
+        match = find_published(title, entry.get("year", ""), aid, hint)
+        if not match:
+            report.append({"key": entry["ID"], "title": title, "matched": False})
+            continue
+        canonical = canonicalize(match.venue, match.year or entry.get("year"))
+        venue_name = canonical.name if canonical else match.venue
+        if not args.dry_run:
+            entry.pop("journal", None)
+            entry.pop("booktitle", None)
+            entry.pop("howpublished", None)
+            if canonical:
+                entry["ENTRYTYPE"] = canonical.entry_type
+                entry[canonical.bib_field] = canonical.name
+            else:
+                entry["ENTRYTYPE"] = guess_entry_type(match.venue)
+                field = (
+                    "booktitle"
+                    if entry["ENTRYTYPE"] == "inproceedings"
+                    else "journal"
+                )
+                entry[field] = match.venue
+            if match.year:
+                entry["year"] = match.year
+            if match.doi and not entry.get("doi"):
+                entry["doi"] = match.doi
+            changed += 1
+        report.append(
+            {
+                "key": entry["ID"],
+                "title": title,
+                "matched": True,
+                "venue": venue_name,
+                "source": match.source,
+            }
+        )
+    if changed and not args.dry_run:
+        bibfile._write_db(path, db)
+        if not args.no_tidy:
+            bibfile.run_tidy(path)
+    matched = sum(1 for r in report if r["matched"])
+    for r in report:
+        mark = "✓" if r["matched"] else "✗"
+        _log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
+    _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if args.dry_run else ''}")
+    _emit({"upgraded": changed, "matched": matched, "dry_run": args.dry_run, "entries": report})
+    return 0
+# ---------------------------------------------------------------------------
+# tidy / check
+# ---------------------------------------------------------------------------
+def cmd_tidy(args) -> int:
+    return 0 if bibfile.run_tidy(Path(args.file)) else 1
+def cmd_check(args) -> int:
+    path = Path(args.file)
+    db = bibfile.load_bib_file(path)
+    if db is None:
+        _log(f"[bibcite] {path} could not be parsed")
+        return 1
+    problems = []
+    seen_titles: dict[str, str] = {}
+    for entry in db.entries:
+        key = entry.get("ID", "?")
+        nt = norm_title(entry.get("title", ""))
+        if nt and nt in seen_titles:
+            problems.append({"key": key, "issue": f"duplicate title of {seen_titles[nt]}"})
+        seen_titles.setdefault(nt, key)
+        for f in ("author", "title", "year"):
+            if not entry.get(f):
+                problems.append({"key": key, "issue": f"missing {f}"})
+        if bibfile.is_preprint(entry):
+            problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})
+    for p in problems:
+        _log(f"{p['key']}: {p['issue']}")
+    _log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
+    _emit({"entries": len(db.entries), "problems": problems})
+    return 0
+# ---------------------------------------------------------------------------
+def main(argv=None) -> int:
+    p = argparse.ArgumentParser(
+        prog="bibcite",
+        description="Resolve papers to canonical BibTeX and manage .bib files (agents: use `add`, never hand-edit).",
+    )
+    sub = p.add_subparsers(dest="cmd", required=True)
+    g = sub.add_parser("get", help="resolve a query and print BibTeX to stdout")
+    g.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
+    g.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
+    g.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
+    g.set_defaults(fn=cmd_get)
+    a = sub.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
+    a.add_argument("file", help="target .bib file (created if missing)")
+    a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
+    a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
+    a.add_argument("--no-tidy", action="store_true")
+    a.add_argument("--require-published", action="store_true")
+    a.set_defaults(fn=cmd_add)
+    u = sub.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
+    u.add_argument("file")
+    u.add_argument("--dry-run", action="store_true")
+    u.add_argument("--no-tidy", action="store_true")
+    u.set_defaults(fn=cmd_upgrade)
+    t = sub.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
+    t.add_argument("file")
+    t.set_defaults(fn=cmd_tidy)
+    c = sub.add_parser("check", help="offline sanity check of a .bib file (prints JSON)")
+    c.add_argument("file")
+    c.set_defaults(fn=cmd_check)
+    args = p.parse_args(argv)
+    return args.fn(args)
+if __name__ == "__main__":
+    raise SystemExit(main())