PyPI - bibcite-cli - Version diff - 0.3.0.tar.gz → 0.4.0.tar.gz - Mend

bibcite-cli 0.3.0 (tar.gz) → 0.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bibcite-cli
-Version: 0.3.0
+Version: 0.4.0
 Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
 Project-URL: Repository, https://github.com/leo1oel/bibcite
 License-Expression: MIT

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "bibcite-cli"
-version = "0.3.0"
+version = "0.4.0"
 description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
 readme = "Readme.md"
 license = "MIT"

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
-__version__ = "0.3.0"
+__version__ = "0.4.0"

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/bibfile.py RENAMED Viewed

@@ -20,14 +20,16 @@ from .normalize import norm_title
 # \cite{} commands valid.
 TIDY_ARGS = [
     "--modify",
-    "--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
+    # volume/number/pages/doi are kept (bibliographic substance the user
+    # asked to retain); the omit list drops only true noise.
+    "--omit=publisher,timestamp,biburl,bibsource,abstract,month,series,editor,note,date,address",
     "--curly",
     "--blank-lines",
     "--trailing-commas",
     "--sort=-year",
     "--duplicates=citation",
     "--merge=first",
-    "--sort-fields=author,title,booktitle,journal,year,url,pdf",
+    "--sort-fields=author,title,booktitle,journal,volume,number,pages,year,doi,url,pdf",
     "--strip-enclosing-braces",
     "--tidy-comments",
 ]
@@ -143,33 +145,48 @@ def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = ""
     return None
-def upsert_entry(path: Path, entry: dict, replace: bool = False) -> tuple[str, str]:
+def upsert_entry(
+    path: Path, entry: dict, replace: bool = False, replace_key: str = ""
+) -> tuple[str, str]:
     """Insert or upgrade ``entry`` in ``path``.
     Returns (action, key), action in "added" | "upgraded" | "exists" |
-    "replaced". With ``replace``, an existing matching entry is overwritten
-    (its citation key is kept so existing \\cite{} commands stay valid).
+    "replaced" | "no_match_to_replace". With ``replace``, an existing
+    matching entry is overwritten; ``replace_key`` targets a specific entry
+    by citation key (for when title drift defeats the automatic match). The
+    existing key is always kept so \\cite{} commands stay valid. A replace
+    that matches nothing is an ERROR, not a silent add — that is how
+    duplicate entries sneak into a file.
     """
     db = load_bib_file(path)
     if db is None:  # unparseable file: append blindly
+        if replace or replace_key:
+            return "no_match_to_replace", replace_key or entry["ID"]
         with path.open("a") as f:
             f.write("\n" + entry_to_bibtex(entry))
         return "added", entry["ID"]
-    existing = find_existing(
-        db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
-    )
+    if replace_key:
+        existing = next((e for e in db.entries if e.get("ID") == replace_key), None)
+    else:
+        existing = find_existing(
+            db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
+        )
     if existing is not None:
         upgrade = is_preprint(existing) and not is_preprint(entry)
-        if replace or upgrade:
+        if replace or replace_key or upgrade:
             key = existing["ID"]
             existing.clear()
             existing.update({k: str(v) for k, v in entry.items() if v})
             existing["ID"] = key  # keep the key the user may already \cite
             _write_db(path, db)
-            return ("replaced" if replace else "upgraded"), key
+            return ("replaced" if (replace or replace_key) else "upgraded"), key
         return "exists", existing["ID"]
+    if replace or replace_key:
+        return "no_match_to_replace", replace_key or entry["ID"]
     db.entries.append({k: str(v) for k, v in entry.items() if v})
     _write_db(path, db)
     return "added", entry["ID"]
@@ -190,6 +207,12 @@ def remove_entry(path: Path, key: str) -> bool:
 def _write_db(path: Path, db: BibDatabase):
+    # Never write our injected month macros back out as @string blocks (they
+    # exist only so parsing month=June doesn't crash); this also scrubs any
+    # that leaked into a file before this guard existed. User-defined
+    # @strings are untouched.
+    for k in MONTH_STRINGS:
+        db.strings.pop(k, None)
     writer = BibTexWriter()
     writer.indent = "  "
     writer.order_entries_by = None  # preserve file order; tidy re-sorts anyway

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/cli.py RENAMED Viewed

@@ -12,7 +12,8 @@ import time
 from pathlib import Path
 from . import bibfile, cache
-from .normalize import first_author_last_name, norm_title
+from .normalize import first_author_last_name, norm_title, titles_similar
+from .resolve import classify
 from .resolve import (
     NotFound,
     Resolved,
@@ -107,50 +108,107 @@ def _resolve_user_bibtex(text: str) -> Resolved:
         entry.pop("journal", None)
         entry["ENTRYTYPE"] = canonical.entry_type
         entry[canonical.bib_field] = canonical.name
-    return Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
+    published = not bibfile.is_preprint(entry)
+    return Resolved(
+        entry,
+        "user-bibtex",
+        (canonical.name if canonical else raw_venue) if published else "",
+        published,
+    )
+def _local_exists(path: Path, query: str) -> str | None:
+    """Local pre-check: if the query is already in the file as a PUBLISHED
+    entry, skip the network entirely (makes --from re-runs and repeated adds
+    near-instant). Preprints still resolve online — they may be upgradable."""
+    db = bibfile.load_bib_file(path)
+    if db is None or not db.entries:
+        return None
+    kind, value = classify(query)
+    if kind == "arxiv":
+        existing = bibfile.find_existing(db, "", arxiv_id=value)
+    elif kind == "doi":
+        existing = bibfile.find_existing(db, "", doi=value)
+    else:
+        existing = bibfile.find_existing(db, value)
+    if existing is not None and not bibfile.is_preprint(existing):
+        return existing["ID"]
+    return None
 def cmd_add(args) -> int:
     path = Path(args.file)
     if args.no_cache:
         cache.DISABLED = True
+    targeting = args.replace or bool(args.key)
+    if args.key and args.from_file:
+        _log("[bibcite] --key targets one entry; it cannot be combined with --from")
+        return EXIT_NOT_FOUND
     # Collect the queries for this invocation (single, --bibtex, or --from).
+    # Each item: (query, resolved_or_None, exit_code, local_exists_key).
+    items: list[tuple[str, Resolved | None, int, str]] = []
     if args.bibtex:
         text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
         try:
-            resolutions = [("<bibtex>", _resolve_user_bibtex(text), 0)]
+            items.append(("<bibtex>", _resolve_user_bibtex(text), 0, ""))
         except ValueError as e:
             _log(f"[bibcite] {e}")
             return EXIT_NOT_FOUND
     elif args.from_file:
         lines = Path(args.from_file).read_text().splitlines()
         queries = [q.strip() for q in lines if q.strip() and not q.strip().startswith("#")]
-        resolutions = []
+        resolved_any = False
         for i, q in enumerate(queries):
-            if i:
+            local = None if targeting else _local_exists(path, q)
+            if local:
+                _log(f"[bibcite] ({i + 1}/{len(queries)}) {q} — already in file: {local}")
+                items.append((q, None, 0, local))
+                continue
+            if resolved_any:
                 time.sleep(1)  # one process shares the rate-limit breaker; stay polite
+            resolved_any = True
             _log(f"[bibcite] ({i + 1}/{len(queries)}) {q}")
             res, code = _resolve_or_none(q, args.require_published)
-            resolutions.append((q, res, code))
+            items.append((q, res, code, ""))
     else:
         if not args.query:
             _log("[bibcite] provide a query (arXiv id / DOI / title), --bibtex, or --from")
             return EXIT_NOT_FOUND
         query = " ".join(args.query)
+        local = None if targeting else _local_exists(path, query)
+        if local:
+            _log(f"[bibcite] already in file (matched locally, no network): {local}")
+            _emit({"action": "exists", "key": local, "file": str(path), "tidied": False})
+            return 0
         res, code = _resolve_or_none(query, args.require_published)
         if res is None:
             return code
-        resolutions = [(query, res, 0)]
+        items.append((query, res, 0, ""))
     # Write all entries first, tidy once, then read back the final keys.
     results = []
     wrote = False
-    for query, res, code in resolutions:
+    for query, res, code, local_key in items:
+        if local_key:
+            results.append({"query": query, "action": "exists", "key": local_key})
+            continue
         if res is None:
             results.append({"query": query, "action": "failed", "exit_code": code})
             continue
-        action, key = bibfile.upsert_entry(path, res.entry, replace=args.replace)
+        action, key = bibfile.upsert_entry(
+            path, res.entry, replace=args.replace, replace_key=args.key or ""
+        )
+        if action == "no_match_to_replace":
+            # A replace that matches nothing must fail loudly, never silently
+            # add a duplicate entry.
+            _log(
+                f"[bibcite] no matching entry to replace for '{query}'"
+                + (f" (key: {args.key})" if args.key else "")
+                + " — nothing written. Use `bibcite add --key <existing-key>` to target one."
+            )
+            results.append({"query": query, "action": action, "exit_code": EXIT_NOT_FOUND})
+            continue
         wrote = wrote or action != "exists"
         results.append(
             {
@@ -246,6 +304,10 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
                 entry["year"] = match.year
             if match.doi and not entry.get("doi"):
                 entry["doi"] = match.doi
+            if match.title:
+                # Camera-ready titles drift from arXiv ones; the published
+                # title is the correct one to cite.
+                entry["title"] = match.title
             changed += 1
         report.append(
             {
@@ -294,11 +356,30 @@ def _check_problems(path: Path) -> tuple[int, list] | None:
         return None
     problems = []
     seen_titles: dict[str, str] = {}
+    by_author: dict[str, list[tuple[str, str]]] = {}  # lastname -> [(key, title)]
     for entry in db.entries:
         key = entry.get("ID", "?")
         nt = norm_title(entry.get("title", ""))
         if nt and nt in seen_titles:
             problems.append({"key": key, "issue": f"duplicate title of {seen_titles[nt]}"})
+        elif nt:
+            # Near-duplicates (title drift: same first author, similar title)
+            # slip past exact matching — exactly how a failed replace plus a
+            # re-add pollutes a file.
+            last = (
+                first_author_last_name(entry["author"]) if entry.get("author") else ""
+            )
+            for other_key, other_title in by_author.get(last, []):
+                if titles_similar(entry.get("title", ""), other_title):
+                    problems.append(
+                        {
+                            "key": key,
+                            "issue": f"near-duplicate of {other_key} (title drift?)",
+                        }
+                    )
+                    break
+            if last:
+                by_author.setdefault(last, []).append((key, entry.get("title", "")))
         seen_titles.setdefault(nt, key)
         for f in ("author", "title", "year"):
             if not entry.get(f):
@@ -388,7 +469,8 @@ def main(argv=None) -> int:
     a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
     a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
     a.add_argument("--from", dest="from_file", metavar="FILE", help="batch mode: one query per line (shares rate-limit state, tidies once)")
-    a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key)")
+    a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key); errors if nothing matches")
+    a.add_argument("--key", metavar="KEY", help="replace exactly the entry with this citation key (for title drift)")
     a.add_argument("--no-tidy", action="store_true")
     a.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
     a.add_argument("--require-published", action="store_true")

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/normalize.py RENAMED Viewed

@@ -76,6 +76,22 @@ def first_author_last_name(author_field: str) -> str:
     return mini_hash(last) or "anon"
+def sig_tokens(title: str) -> set[str]:
+    """Significant title tokens: folded, alphanumeric, stopwords removed."""
+    tokens = re.split(r"[^a-z0-9]+", fold_ascii(title).lower())
+    return {t for t in tokens if len(t) > 2 and t not in ENGLISH_STOPWORDS}
+def titles_similar(a: str, b: str, threshold: float = 0.7) -> bool:
+    """Token-Jaccard similarity — catches preprint→camera-ready title drift
+    ("Information-Theoretic Perspective" vs "Information Theory Perspective")
+    without matching genuinely different papers."""
+    ta, tb = sig_tokens(a), sig_tokens(b)
+    if not ta or not tb:
+        return False
+    return len(ta & tb) / len(ta | tb) >= threshold
 def fix_author_caps(author_field: str) -> str:
     """Normalize ALL-CAPS author names (old CrossRef records store e.g.
     "EPPS, T. W. and PULLEY, LAWRENCE B."). A word is re-cased only when it

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/sources.py RENAMED Viewed

@@ -16,7 +16,7 @@ from dataclasses import dataclass, field
 import httpx
-from .normalize import clean_title, mini_hash, norm_title
+from .normalize import clean_title, mini_hash, norm_title, sig_tokens, titles_similar
 UA = "bibcite/0.1 (https://github.com/leonardo/bibcite; mailto:bibcite@gmail.com)"
 BROWSER_UA = (
@@ -198,6 +198,73 @@ def try_dblp(title: str, author_hint: str = "") -> Match | None:
     return None
+def _dblp_hit_authors(info: dict) -> list[str]:
+    authors = (info.get("authors") or {}).get("author") or []
+    if isinstance(authors, dict):
+        authors = [authors]
+    return [a.get("text", "") for a in authors if isinstance(a, dict)]
+def try_dblp_fuzzy(title: str, author_hint: str, year: str = "") -> Match | None:
+    """Title-drift fallback: camera-ready titles often differ from the arXiv
+    ones ("Information-Theoretic" -> "Information Theory"), and DBLP's
+    token-AND search then misses entirely. Query author + the most
+    distinctive title tokens instead, and accept token-Jaccard-similar
+    titles — guarded by author and year so different papers can't sneak in.
+    """
+    if not author_hint:
+        return None
+    tokens = sorted(sig_tokens(title), key=len, reverse=True)[:3]
+    if not tokens:
+        return None
+    q = " ".join([author_hint] + tokens)
+    with _client() as c:
+        r = c.get(
+            "https://dblp.org/search/publ/api",
+            params={"q": q, "format": "json", "h": 100},
+        )
+        if r.status_code == 429:
+            raise SourceUnavailable("DBLP rate-limited (429)")
+        r.raise_for_status()
+        hits = r.json().get("result", {}).get("hits", {}).get("hit", []) or []
+        hits.sort(key=lambda h: int(h.get("info", {}).get("year", 9999)))
+        for hit in hits:
+            info = hit.get("info", {})
+            hit_title = clean_title(html.unescape(info.get("title", "")))
+            if info.get("venue") == "CoRR" or not info.get("venue"):
+                continue
+            if not titles_similar(hit_title, title):
+                continue
+            if year and info.get("year"):
+                if abs(int(info["year"]) - int(year)) > 2:
+                    continue
+            hit_authors = mini_hash(" ".join(_dblp_hit_authors(info)))
+            if author_hint not in hit_authors:
+                continue
+            venue = info["venue"]
+            if isinstance(venue, list):
+                venue = venue[0]
+            bibtex = ""
+            if info.get("url"):
+                br = c.get(info["url"] + ".bib")
+                if br.status_code == 200:
+                    bibtex = br.text
+            _log(
+                f"[dblp-fuzzy] match with title drift: '{hit_title}' "
+                f"@ {venue} {info.get('year', '')}"
+            )
+            return Match(
+                source="dblp-fuzzy",
+                venue=str(venue),
+                title=hit_title,
+                year=str(info.get("year", "")),
+                doi=info.get("doi", ""),
+                bibtex=bibtex,
+                url=info.get("ee", "") or info.get("url", ""),
+            )
+    return None
 # ---------------------------------------------------------------------------
 # Semantic Scholar
 # ---------------------------------------------------------------------------
@@ -627,4 +694,19 @@ def find_published(
             _log(f"[{name}] disabled for the rest of this run: {e}")
         except Exception as e:  # network hiccup on one source must not kill the run
             _log(f"[{name}] error: {type(e).__name__}: {e}")
+    # Exact-title search missed everywhere. Before concluding "no published
+    # version", try the title-drift fallback — camera-ready titles frequently
+    # differ from the arXiv ones, which is precisely the upgrade scenario.
+    if author_hint and "dblp" not in _DISABLED:
+        try:
+            m = try_dblp_fuzzy(title, author_hint, year)
+            if m:
+                cache.put(cache_key, m.__dict__)
+                return m, "found"
+            clean_misses += 1
+        except SourceUnavailable as e:
+            _DISABLED["dblp"] = str(e)
+        except Exception as e:
+            _log(f"[dblp-fuzzy] error: {type(e).__name__}: {e}")
     return None, ("not_found" if clean_misses else "unavailable")

bibcite_cli-0.4.0/tests/test_round2.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""Regression tests for the second round of field-use bug reports."""
+from pathlib import Path
+from bibcite.bibfile import MONTH_STRINGS, load_bib_file, upsert_entry, _write_db
+from bibcite.normalize import titles_similar
+ARXIV_TITLE = "An Information-Theoretic Perspective on Variance-Invariance-Covariance Regularization"
+PUBLISHED_TITLE = "An Information Theory Perspective on Variance-Invariance-Covariance Regularization"
+def test_titles_similar_catches_camera_ready_drift():
+    assert titles_similar(ARXIV_TITLE, PUBLISHED_TITLE)
+def test_titles_similar_rejects_different_papers():
+    assert not titles_similar(
+        "Attention Is All You Need",
+        "An Image is Worth 16x16 Words: Transformers for Image Recognition",
+    )
+    assert not titles_similar("Deep Residual Learning", "")
+ENTRY = {
+    "ENTRYTYPE": "inproceedings",
+    "ID": "k1",
+    "title": "Paper One",
+    "author": "A B",
+    "booktitle": "Some Conference (SC)",
+    "year": "2020",
+}
+def test_replace_without_match_errors_instead_of_adding(tmp_path: Path):
+    bib = tmp_path / "r.bib"
+    upsert_entry(bib, dict(ENTRY))
+    stranger = dict(ENTRY, ID="k2", title="A Totally Different Paper")
+    action, key = upsert_entry(bib, stranger, replace=True)
+    assert action == "no_match_to_replace"
+    assert "Totally Different" not in bib.read_text()  # nothing was written
+def test_replace_key_targets_specific_entry(tmp_path: Path):
+    bib = tmp_path / "r.bib"
+    upsert_entry(bib, dict(ENTRY))
+    drifted = dict(ENTRY, ID="whatever", title="Paper One Revised Title")
+    action, key = upsert_entry(bib, drifted, replace_key="k1")
+    assert (action, key) == ("replaced", "k1")
+    assert "Paper One Revised Title" in bib.read_text()
+    action, _ = upsert_entry(bib, drifted, replace_key="nonexistent")
+    assert action == "no_match_to_replace"
+def test_month_strings_never_written_to_file(tmp_path: Path):
+    bib = tmp_path / "m.bib"
+    # Simulate a file polluted by the old bug: @string month macros present.
+    bib.write_text(
+        '@string{january = {January}}\n'
+        '@article{x, title = {T}, author = {A B}, year = {2000}, month = january }\n'
+    )
+    db = load_bib_file(bib)
+    _write_db(bib, db)
+    text = bib.read_text()
+    assert "@string" not in text  # scrubbed on write
+    assert "title" in text
+def test_month_strings_cover_all_months():
+    for m in ("january", "may", "june", "december", "jan", "jun", "dec"):
+        assert m in MONTH_STRINGS

{bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/uv.lock RENAMED Viewed

@@ -18,7 +18,7 @@ wheels = [
 [[package]]
 name = "bibcite-cli"
-version = "0.3.0"
+version = "0.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "bibtexparser" },