bibcite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bibcite/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
2
+
3
+ __version__ = "0.1.0"
bibcite/bibfile.py ADDED
@@ -0,0 +1,194 @@
1
+ """Reading/writing .bib files, deduplication, and the bibtex-tidy runner."""
2
+
3
+ import re
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import bibtexparser
10
+ from bibtexparser.bibdatabase import BibDatabase
11
+ from bibtexparser.bparser import BibTexParser
12
+ from bibtexparser.bwriter import BibTexWriter
13
+
14
+ from .normalize import norm_title
15
+
16
# The exact bibtex-tidy invocation requested by the user; keep in sync with
# their LaTeX workflow. run_tidy() passes these flags after the file path.
TIDY_ARGS = [
    "--modify",
    "--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
    "--curly",
    "--blank-lines",
    "--trailing-commas",
    "--sort=-year",
    "--duplicates=citation",
    "--merge=first",
    "--sort-fields=author,title,booktitle,journal,year,url,pdf",
    "--strip-enclosing-braces",
    "--tidy-comments",
    "--generate-keys",
]

# Fields that parse_bibtex_entry() removes from every entry it returns.
NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")

# Four digits, a dot, then four or five digits, with an optional "vN" version
# suffix. Group 1 is the identifier without the version suffix.
ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
36
+
37
+
38
def _log(msg: str):
    """Print ``msg`` on standard error, keeping stdout free for results."""
    stream = sys.stderr
    print(msg, file=stream)
40
+
41
+
42
def _parser() -> BibTexParser:
    """Return a new BibTexParser configured the way this module needs it."""
    parser = BibTexParser(common_strings=True)
    # Do not ignore entries whose type is non-standard.
    parser.ignore_nonstandard_types = False
    return parser
46
+
47
+
48
def parse_bib(text: str) -> BibDatabase:
    """Parse BibTeX source ``text`` into a BibDatabase using a fresh parser."""
    parser = _parser()
    return bibtexparser.loads(text, parser=parser)
50
+
51
+
52
def parse_bibtex_entry(text: str) -> dict:
    """Return the first entry of a BibTeX string as a plain dict.

    The dict holds the entry's fields plus ``ID`` and ``ENTRYTYPE``; every
    field listed in ``NOISE_FIELDS`` is left out.

    Raises:
        ValueError: if ``text`` contains no parsable entry.
    """
    entries = parse_bib(text).entries
    if not entries:
        raise ValueError("No BibTeX entry could be parsed")
    first = dict(entries[0])
    return {name: value for name, value in first.items() if name not in NOISE_FIELDS}
61
+
62
+
63
def entry_to_bibtex(entry: dict) -> str:
    """Serialize one entry dict to BibTeX text ending in a single newline.

    Fields with a falsy value are omitted; all other values are converted
    with ``str`` before writing.
    """
    fields = {}
    for name, value in entry.items():
        if value:
            fields[name] = str(value)

    db = BibDatabase()
    db.entries = [fields]

    writer = BibTexWriter()
    writer.indent = " "
    rendered = bibtexparser.dumps(db, writer)
    return rendered.strip() + "\n"
69
+
70
+
71
def entry_arxiv_id(entry: dict) -> str:
    """Return the arXiv id found in ``entry``, or "" when there is none.

    The eprint, url, journal, note and doi fields are scanned in that order.
    ``eprint`` is always searched; the other fields are searched only when
    their text mentions "arxiv" (case-insensitive). The version suffix
    ("v2") is not part of the returned id.
    """
    for field in ("eprint", "url", "journal", "note", "doi"):
        value = entry.get(field, "")
        if "arxiv" not in value.lower() and field != "eprint":
            continue
        found = ARXIV_ID_RE.search(value)
        if found is not None:
            return found.group(1)
    return ""
80
+
81
+
82
def is_preprint(entry: dict) -> bool:
    """Tell whether ``entry`` should be treated as a preprint.

    An entry is a preprint when its journal/booktitle/howpublished text
    mentions "arxiv" or "preprint", or contains the standalone word "corr",
    or when it has neither a journal nor a booktitle.

    eprint/archiveprefix/url fields are deliberately ignored: published
    entries keep their arXiv pointers.
    """
    venue_parts = [
        str(entry.get(name, "")) for name in ("journal", "booktitle", "howpublished")
    ]
    venue = " ".join(venue_parts).lower()

    if "arxiv" in venue:
        return True
    if "preprint" in venue:
        return True
    if "corr" in venue.split():  # whole-word match only
        return True

    has_venue = bool(entry.get("journal")) or bool(entry.get("booktitle"))
    return not has_venue
94
+
95
+
96
def load_bib_file(path: Path) -> BibDatabase | None:
    """Parse an existing .bib file.

    Returns:
        An empty BibDatabase when ``path`` does not exist or is blank, the
        parsed database on success, and None when the file cannot be decoded
        or parsed (callers then degrade to append-only mode).
    """
    if not path.exists():
        return BibDatabase()

    # Read once, and treat an undecodable file like an unparsable one instead
    # of letting UnicodeDecodeError escape before the parse guard is reached.
    try:
        text = path.read_text()
    except UnicodeDecodeError as e:
        _log(f"[bibcite] warning: could not parse {path} ({e}); appending without dedup")
        return None

    if not text.strip():
        return BibDatabase()
    try:
        return parse_bib(text)
    except Exception as e:
        _log(f"[bibcite] warning: could not parse {path} ({e}); appending without dedup")
        return None
106
+
107
+
108
def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = "") -> dict | None:
    """Return the first entry of ``db`` that refers to the same paper.

    An entry matches when its arXiv id equals ``arxiv_id``, its doi equals
    ``doi`` ignoring case, or its normalized title equals the normalized
    ``title``. Empty criteria are skipped. Returns None when nothing matches.
    """
    ref = norm_title(title)

    def _same_paper(candidate: dict) -> bool:
        if arxiv_id and entry_arxiv_id(candidate) == arxiv_id:
            return True
        if doi and candidate.get("doi", "").lower() == doi.lower():
            return True
        return bool(ref) and norm_title(candidate.get("title", "")) == ref

    return next((candidate for candidate in db.entries if _same_paper(candidate)), None)
118
+
119
+
120
def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
    """Insert or upgrade ``entry`` in ``path``.

    Returns (action, key) where action is "added" | "upgraded" | "exists".

    * File cannot be parsed: the entry is appended as text without dedup.
    * A matching entry exists, is a preprint, and ``entry`` is not: the
      existing entry is replaced in place but keeps its citation key.
    * A matching entry exists otherwise: nothing is written.
    * No match: the entry is appended to the database.
    """
    db = load_bib_file(path)
    if db is None:  # unparseable file: append blindly
        with path.open("a") as f:
            f.write("\n" + entry_to_bibtex(entry))
        return "added", entry["ID"]

    # Drop empty fields and stringify values once, so the "upgraded" path
    # stores exactly what the "added" path (and entry_to_bibtex) would.
    cleaned = {k: str(v) for k, v in entry.items() if v}

    existing = find_existing(
        db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
    )
    if existing is not None:
        if is_preprint(existing) and not is_preprint(entry):
            key = existing["ID"]
            existing.clear()
            existing.update(cleaned)
            existing["ID"] = key  # keep the key the user may already \cite
            _write_db(path, db)
            return "upgraded", key
        return "exists", existing["ID"]

    db.entries.append(cleaned)
    _write_db(path, db)
    return "added", entry["ID"]
147
+
148
+
149
def _write_db(path: Path, db: BibDatabase):
    """Serialize ``db`` and overwrite ``path`` with the result."""
    writer = BibTexWriter()
    writer.indent = " "
    # preserve file order; tidy re-sorts anyway
    writer.order_entries_by = None
    serialized = bibtexparser.dumps(db, writer)
    path.write_text(serialized)
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # bibtex-tidy
158
+ # ---------------------------------------------------------------------------
159
+
160
+ def tidy_command() -> list[str] | None:
161
+ exe = shutil.which("bibtex-tidy")
162
+ if exe:
163
+ return [exe]
164
+ if shutil.which("npx"):
165
+ return ["npx", "--yes", "bibtex-tidy"]
166
+ return None
167
+
168
+
169
def run_tidy(path: Path) -> bool:
    """Run bibtex-tidy on ``path`` with ``TIDY_ARGS``.

    Returns True when the tool ran and exited with status 0. Returns False,
    after logging the reason, when the tool is not installed or exits
    non-zero. On success the last line of its stdout (or "ok") is logged.
    """
    cmd = tidy_command()
    if cmd is None:
        _log("[bibcite] bibtex-tidy not found (npm i -g bibtex-tidy); skipping tidy")
        return False

    argv = [*cmd, str(path), *TIDY_ARGS]
    proc = subprocess.run(argv, capture_output=True, text=True)
    if proc.returncode != 0:
        _log(f"[bibcite] bibtex-tidy failed:\n{proc.stderr.strip()}")
        return False

    output = proc.stdout.strip()
    summary = output.splitlines()[-1] if output else "ok"
    _log(f"[bibcite] bibtex-tidy: {summary}")
    return True
182
+
183
+
184
def key_after_tidy(path: Path, title: str, fallback_key: str) -> str:
    """Report the citation key an entry has after bibtex-tidy ran.

    bibtex-tidy --generate-keys rewrites keys, so the file is re-read and the
    first entry whose normalized title equals that of ``title`` supplies the
    key. ``fallback_key`` is returned when the file cannot be parsed or no
    entry matches.
    """
    db = load_bib_file(path)
    if db is not None:
        wanted = norm_title(title)
        for candidate in db.entries:
            if norm_title(candidate.get("title", "")) == wanted:
                return candidate["ID"]
    return fallback_key
bibcite/cli.py ADDED
@@ -0,0 +1,272 @@
1
+ """bibcite CLI.
2
+
3
+ Designed to be called by agents: never hand-edit a .bib file — let
4
+ ``bibcite add`` resolve, canonicalize, dedupe, write, and tidy, then use the
5
+ citation key it prints.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ from . import bibfile
14
+ from .normalize import first_author_last_name, norm_title
15
+ from .resolve import Resolved, guess_entry_type, resolve
16
+ from .sources import find_published
17
+ from .venues import canonicalize
18
+
19
+
20
def _log(msg: str):
    """Print ``msg`` on standard error; stdout is reserved for results."""
    stream = sys.stderr
    print(msg, file=stream)
22
+
23
+
24
def _emit(payload: dict, as_json: bool = True):
    """Print a command result on stdout.

    File-mutating commands always print one JSON object on stdout — the
    agent-facing contract. Only `get` has a plain mode (BibTeX on stdout for
    previewing/piping): every key except "bibtex" is logged to stderr, then
    the BibTeX text is printed, or the key when there is no BibTeX.
    """
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    for name, value in payload.items():
        if name == "bibtex":
            continue
        _log(f"{name}: {value}")

    if payload.get("bibtex"):
        print(payload["bibtex"], end="")
        return
    if payload.get("key"):
        print(payload["key"])
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # get
42
+ # ---------------------------------------------------------------------------
43
+
44
def _resolve_or_none(query: str, require_published: bool):
    """Resolve ``query``; on any failure log it and return None.

    LookupError and ValueError are logged with their message as-is; every
    other exception is logged as a network error with its type name.
    """
    try:
        resolved = resolve(query, require_published=require_published)
    except (LookupError, ValueError) as e:
        _log(f"[bibcite] {e}")
        return None
    except Exception as e:
        _log(f"[bibcite] network error: {type(e).__name__}: {e}")
        return None
    return resolved
52
+
53
+
54
def cmd_get(args) -> int:
    """Resolve the query and print the result without touching any file.

    Returns 2 when resolution fails, otherwise 0.
    """
    res = _resolve_or_none(" ".join(args.query), args.require_published)
    if res is None:
        return 2

    payload = {
        "action": "resolved",
        "key": res.entry["ID"],
        "title": res.entry.get("title", ""),
        "venue": res.venue or "arXiv (preprint, no published venue found)",
        "published": res.published,
        "source": res.source,
        "bibtex": res.bibtex,
    }
    _emit(payload, args.json)
    return 0
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # add
76
+ # ---------------------------------------------------------------------------
77
+
78
def cmd_add(args) -> int:
    """Resolve or parse an entry, write it into the .bib file, then tidy.

    With ``--bibtex`` the given text ('-' reads stdin) is parsed and its
    venue canonicalized when possible; otherwise the query is resolved.
    One JSON object describing the outcome is printed on stdout.

    Returns 2 when no usable input was given, the BibTeX text could not be
    parsed, or resolution failed; otherwise 0.
    """
    path = Path(args.file)
    if args.bibtex:
        text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
        # Unparsable BibTeX is reported like a failed query (log + exit
        # code 2) rather than escaping as a traceback.
        try:
            entry = bibfile.parse_bibtex_entry(text)
        except ValueError as e:
            _log(f"[bibcite] {e}")
            return 2
        raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
        canonical = canonicalize(raw_venue, entry.get("year"))
        if canonical:
            # Replace whichever venue field was present with the canonical one.
            entry.pop("booktitle", None)
            entry.pop("journal", None)
            entry["ENTRYTYPE"] = canonical.entry_type
            entry[canonical.bib_field] = canonical.name
        res = Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
    else:
        if not args.query:
            _log("[bibcite] provide a query (arXiv id / DOI / title) or --bibtex")
            return 2
        query = " ".join(args.query)
        res = _resolve_or_none(query, args.require_published)
        if res is None:
            return 2

    action, key = bibfile.upsert_entry(path, res.entry)
    tidied = False
    if action != "exists" and not args.no_tidy:
        tidied = bibfile.run_tidy(path)
        if tidied:
            # tidy regenerates keys, so report the key as it is in the file now
            key = bibfile.key_after_tidy(path, res.entry.get("title", ""), key)

    _emit(
        {
            "action": action,
            "key": key,
            "title": res.entry.get("title", ""),
            "venue": res.venue or "arXiv (preprint)",
            "published": res.published,
            "source": res.source,
            "file": str(path),
            "tidied": tidied,
        }
    )
    return 0
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # upgrade: batch-match arXiv entries in an existing file (bibMatcher, CLI-style)
124
+ # ---------------------------------------------------------------------------
125
+
126
def cmd_upgrade(args) -> int:
    """Match every preprint entry of a .bib file to a published version.

    Each entry that is_preprint() accepts and that has a title is looked up
    with find_published(). Unless ``--dry-run`` is set, matched entries get
    their venue, entry type, year and (if missing) doi updated in place; the
    file is rewritten and tidied when at least one entry changed. A per-entry
    report is logged and one JSON summary is printed. Always returns 0.
    """
    path = Path(args.file)
    db = bibfile.load_bib_file(path)
    if db is None or not db.entries:
        _log(f"[bibcite] nothing to do in {path}")
        return 0

    strip_braces = str.maketrans("", "", "{}")
    report = []
    changed = 0
    for entry in db.entries:
        if not bibfile.is_preprint(entry):
            continue
        title = entry.get("title", "").translate(strip_braces)
        if not title:
            continue
        _log(f"[upgrade] matching: {title[:80]}")

        aid = bibfile.entry_arxiv_id(entry)
        if entry.get("author"):
            hint = first_author_last_name(entry["author"])
        else:
            hint = ""
        match = find_published(title, entry.get("year", ""), aid, hint)
        if not match:
            report.append({"key": entry["ID"], "title": title, "matched": False})
            continue

        canonical = canonicalize(match.venue, match.year or entry.get("year"))
        venue_name = canonical.name if canonical else match.venue
        if not args.dry_run:
            # Remove the preprint venue fields before writing the new one.
            for stale in ("journal", "booktitle", "howpublished"):
                entry.pop(stale, None)
            if canonical:
                entry["ENTRYTYPE"] = canonical.entry_type
                entry[canonical.bib_field] = canonical.name
            else:
                guessed = guess_entry_type(match.venue)
                entry["ENTRYTYPE"] = guessed
                field = "booktitle" if guessed == "inproceedings" else "journal"
                entry[field] = match.venue
            if match.year:
                entry["year"] = match.year
            if match.doi and not entry.get("doi"):
                entry["doi"] = match.doi
            changed += 1
        report.append(
            {
                "key": entry["ID"],
                "title": title,
                "matched": True,
                "venue": venue_name,
                "source": match.source,
            }
        )

    if changed and not args.dry_run:
        bibfile._write_db(path, db)
        if not args.no_tidy:
            bibfile.run_tidy(path)

    matched = len([row for row in report if row["matched"]])
    for row in report:
        mark = "✓" if row["matched"] else "✗"
        _log(f"{mark} {row['key']}: {row.get('venue', 'no match')}")
    _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if args.dry_run else ''}")
    _emit({"upgraded": changed, "matched": matched, "dry_run": args.dry_run, "entries": report})
    return 0
194
+
195
+
196
+ # ---------------------------------------------------------------------------
197
+ # tidy / check
198
+ # ---------------------------------------------------------------------------
199
+
200
def cmd_tidy(args) -> int:
    """Run bibtex-tidy on the given file; exit code 0 on success, else 1."""
    succeeded = bibfile.run_tidy(Path(args.file))
    if succeeded:
        return 0
    return 1
202
+
203
+
204
def cmd_check(args) -> int:
    """Report problems found in a .bib file.

    Flags entries whose normalized title repeats an earlier entry, entries
    missing author/title/year, and entries that is_preprint() accepts. Each
    problem is logged and one JSON summary is printed.

    Returns 1 when the file cannot be parsed, otherwise 0 (even when
    problems were found).
    """
    path = Path(args.file)
    db = bibfile.load_bib_file(path)
    if db is None:
        _log(f"[bibcite] {path} could not be parsed")
        return 1

    problems = []
    first_key_by_title: dict[str, str] = {}
    for entry in db.entries:
        key = entry.get("ID", "?")

        normalized = norm_title(entry.get("title", ""))
        if normalized and normalized in first_key_by_title:
            problems.append(
                {"key": key, "issue": f"duplicate title of {first_key_by_title[normalized]}"}
            )
        # Remember only the first key seen for each title.
        first_key_by_title.setdefault(normalized, key)

        missing = [name for name in ("author", "title", "year") if not entry.get(name)]
        problems.extend({"key": key, "issue": f"missing {name}"} for name in missing)

        if bibfile.is_preprint(entry):
            problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})

    for problem in problems:
        _log(f"{problem['key']}: {problem['issue']}")
    _log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
    _emit({"entries": len(db.entries), "problems": problems})
    return 0
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+
232
def main(argv=None) -> int:
    """Parse ``argv`` and run the selected sub-command.

    Sub-commands: get, add, upgrade, tidy, check. Returns the exit code of
    the sub-command's handler.
    """
    parser = argparse.ArgumentParser(
        prog="bibcite",
        description="Resolve papers to canonical BibTeX and manage .bib files (agents: use `add`, never hand-edit).",
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    get_cmd = commands.add_parser("get", help="resolve a query and print BibTeX to stdout")
    get_cmd.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
    get_cmd.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
    get_cmd.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
    get_cmd.set_defaults(fn=cmd_get)

    add_cmd = commands.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
    add_cmd.add_argument("file", help="target .bib file (created if missing)")
    add_cmd.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
    add_cmd.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
    add_cmd.add_argument("--no-tidy", action="store_true")
    add_cmd.add_argument("--require-published", action="store_true")
    add_cmd.set_defaults(fn=cmd_add)

    upgrade_cmd = commands.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
    upgrade_cmd.add_argument("file")
    upgrade_cmd.add_argument("--dry-run", action="store_true")
    upgrade_cmd.add_argument("--no-tidy", action="store_true")
    upgrade_cmd.set_defaults(fn=cmd_upgrade)

    tidy_cmd = commands.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
    tidy_cmd.add_argument("file")
    tidy_cmd.set_defaults(fn=cmd_tidy)

    check_cmd = commands.add_parser("check", help="offline sanity check of a .bib file (prints JSON)")
    check_cmd.add_argument("file")
    check_cmd.set_defaults(fn=cmd_check)

    args = parser.parse_args(argv)
    handler = args.fn
    return handler(args)


if __name__ == "__main__":
    raise SystemExit(main())