bibcite-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bibcite_cli-0.1.0/.gitignore +5 -0
- bibcite_cli-0.1.0/LICENSE +21 -0
- bibcite_cli-0.1.0/PKG-INFO +74 -0
- bibcite_cli-0.1.0/Readme.md +62 -0
- bibcite_cli-0.1.0/pyproject.toml +27 -0
- bibcite_cli-0.1.0/src/bibcite/__init__.py +3 -0
- bibcite_cli-0.1.0/src/bibcite/bibfile.py +194 -0
- bibcite_cli-0.1.0/src/bibcite/cli.py +272 -0
- bibcite_cli-0.1.0/src/bibcite/data/strings.bib +352 -0
- bibcite_cli-0.1.0/src/bibcite/normalize.py +86 -0
- bibcite_cli-0.1.0/src/bibcite/resolve.py +289 -0
- bibcite_cli-0.1.0/src/bibcite/sources.py +593 -0
- bibcite_cli-0.1.0/src/bibcite/venues.py +241 -0
- bibcite_cli-0.1.0/tests/test_bibfile.py +79 -0
- bibcite_cli-0.1.0/tests/test_entry_types.py +29 -0
- bibcite_cli-0.1.0/tests/test_normalize.py +38 -0
- bibcite_cli-0.1.0/tests/test_strings_override.py +21 -0
- bibcite_cli-0.1.0/tests/test_venues.py +76 -0
- bibcite_cli-0.1.0/uv.lock +251 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Leonardo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bibcite-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: arxiv,bibliography,bibtex,citations,dblp
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: bibtexparser<2,>=1.4
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# bibcite
|
|
14
|
+
|
|
15
|
+
Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX, and manage `.bib` files so agents never hand-edit them.
|
|
16
|
+
|
|
17
|
+
The publication-matching cascade is ported from [PaperMemory](https://github.com/vict0rsch/PaperMemory)'s bibMatcher:
|
|
18
|
+
DBLP → Semantic Scholar → Google Scholar → CrossRef → Unpaywall.
|
|
19
|
+
A match must have an identical normalized title, a plausible year, and a non-preprint venue.
|
|
20
|
+
|
|
21
|
+
Venue names are canonicalized against the `@string` table vendored in `src/bibcite/data/strings.bib` (journals / conferences / workshops), including year-aware rules (NIPS before 2018 vs NeurIPS, WACV before 2017).
|
|
22
|
+
|
|
23
|
+
Entry types are strict: conference/workshop papers become `@inproceedings` + `booktitle`, journal papers `@article` + `journal`, and unpublished arXiv preprints `@misc` + `howpublished = {arXiv preprint arXiv:ID}`.
|
|
24
|
+
Types coming from authoritative source BibTeX (DBLP) are preserved.
|
|
25
|
+
|
|
26
|
+
After every write, the file is formatted with [bibtex-tidy](https://github.com/FlamingTempura/bibtex-tidy) using the canonical flags in `bibfile.TIDY_ARGS` (requires `bibtex-tidy` on PATH or `npx`).
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# from a local checkout (development)
|
|
32
|
+
uv tool install --editable .
|
|
33
|
+
|
|
34
|
+
# from git, no checkout needed
|
|
35
|
+
uv tool install git+https://github.com/<you>/bibcite
|
|
36
|
+
|
|
37
|
+
# once published to PyPI (package name bibcite-cli, command name bibcite)
|
|
38
|
+
uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
|
|
39
|
+
|
|
40
|
+
# plus, once (required for the tidy step):
|
|
41
|
+
npm install -g bibtex-tidy
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
To use your own venue table instead of the vendored one, set `BIBCITE_STRINGS=/path/to/strings.bib` or place it at `~/.config/bibcite/strings.bib`.
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Preview the BibTeX for a paper (nothing written)
|
|
50
|
+
bibcite get 1706.03762
|
|
51
|
+
bibcite get "Attention is all you need"
|
|
52
|
+
bibcite get 10.1109/CVPR52688.2022.01167
|
|
53
|
+
|
|
54
|
+
# Resolve and write into a .bib file, dedupe, then bibtex-tidy; prints the final key
|
|
55
|
+
bibcite add refs.bib 2103.14030 --json
|
|
56
|
+
|
|
57
|
+
# Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
|
|
58
|
+
bibcite add refs.bib --bibtex "$(pbpaste)"
|
|
59
|
+
|
|
60
|
+
# Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
|
|
61
|
+
bibcite upgrade refs.bib --dry-run
|
|
62
|
+
|
|
63
|
+
# Just format, or just lint
|
|
64
|
+
bibcite tidy refs.bib
|
|
65
|
+
bibcite check refs.bib
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
|
|
69
|
+
`add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
|
|
70
|
+
|
|
71
|
+
## For agents
|
|
72
|
+
|
|
73
|
+
Never edit `.bib` files by hand.
|
|
74
|
+
Call `bibcite add <file> <query> --json` and use the returned `key` in `\cite{...}`.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# bibcite
|
|
2
|
+
|
|
3
|
+
Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX, and manage `.bib` files so agents never hand-edit them.
|
|
4
|
+
|
|
5
|
+
The publication-matching cascade is ported from [PaperMemory](https://github.com/vict0rsch/PaperMemory)'s bibMatcher:
|
|
6
|
+
DBLP → Semantic Scholar → Google Scholar → CrossRef → Unpaywall.
|
|
7
|
+
A match must have an identical normalized title, a plausible year, and a non-preprint venue.
|
|
8
|
+
|
|
9
|
+
Venue names are canonicalized against the `@string` table vendored in `src/bibcite/data/strings.bib` (journals / conferences / workshops), including year-aware rules (NIPS before 2018 vs NeurIPS, WACV before 2017).
|
|
10
|
+
|
|
11
|
+
Entry types are strict: conference/workshop papers become `@inproceedings` + `booktitle`, journal papers `@article` + `journal`, and unpublished arXiv preprints `@misc` + `howpublished = {arXiv preprint arXiv:ID}`.
|
|
12
|
+
Types coming from authoritative source BibTeX (DBLP) are preserved.
|
|
13
|
+
|
|
14
|
+
After every write, the file is formatted with [bibtex-tidy](https://github.com/FlamingTempura/bibtex-tidy) using the canonical flags in `bibfile.TIDY_ARGS` (requires `bibtex-tidy` on PATH or `npx`).
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# from a local checkout (development)
|
|
20
|
+
uv tool install --editable .
|
|
21
|
+
|
|
22
|
+
# from git, no checkout needed
|
|
23
|
+
uv tool install git+https://github.com/<you>/bibcite
|
|
24
|
+
|
|
25
|
+
# once published to PyPI (package name bibcite-cli, command name bibcite)
|
|
26
|
+
uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
|
|
27
|
+
|
|
28
|
+
# plus, once (required for the tidy step):
|
|
29
|
+
npm install -g bibtex-tidy
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
To use your own venue table instead of the vendored one, set `BIBCITE_STRINGS=/path/to/strings.bib` or place it at `~/.config/bibcite/strings.bib`.
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Preview the BibTeX for a paper (nothing written)
|
|
38
|
+
bibcite get 1706.03762
|
|
39
|
+
bibcite get "Attention is all you need"
|
|
40
|
+
bibcite get 10.1109/CVPR52688.2022.01167
|
|
41
|
+
|
|
42
|
+
# Resolve and write into a .bib file, dedupe, then bibtex-tidy; prints the final key
|
|
43
|
+
bibcite add refs.bib 2103.14030 --json
|
|
44
|
+
|
|
45
|
+
# Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
|
|
46
|
+
bibcite add refs.bib --bibtex "$(pbpaste)"
|
|
47
|
+
|
|
48
|
+
# Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
|
|
49
|
+
bibcite upgrade refs.bib --dry-run
|
|
50
|
+
|
|
51
|
+
# Just format, or just lint
|
|
52
|
+
bibcite tidy refs.bib
|
|
53
|
+
bibcite check refs.bib
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
`--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
|
|
57
|
+
`add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
|
|
58
|
+
|
|
59
|
+
## For agents
|
|
60
|
+
|
|
61
|
+
Never edit `.bib` files by hand.
|
|
62
|
+
Call `bibcite add <file> <query> --json` and use the returned `key` in `\cite{...}`.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "bibcite-cli"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
|
|
5
|
+
readme = "Readme.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"bibtexparser>=1.4,<2",
|
|
10
|
+
"httpx>=0.27",
|
|
11
|
+
]
|
|
12
|
+
keywords = ["bibtex", "arxiv", "citations", "dblp", "bibliography"]
|
|
13
|
+
|
|
14
|
+
[project.scripts]
|
|
15
|
+
bibcite = "bibcite.cli:main"
|
|
16
|
+
|
|
17
|
+
[dependency-groups]
|
|
18
|
+
dev = [
|
|
19
|
+
"pytest>=8",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
25
|
+
|
|
26
|
+
[tool.hatch.build.targets.wheel]
|
|
27
|
+
packages = ["src/bibcite"]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Reading/writing .bib files, deduplication, and the bibtex-tidy runner."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import bibtexparser
|
|
10
|
+
from bibtexparser.bibdatabase import BibDatabase
|
|
11
|
+
from bibtexparser.bparser import BibTexParser
|
|
12
|
+
from bibtexparser.bwriter import BibTexWriter
|
|
13
|
+
|
|
14
|
+
from .normalize import norm_title
|
|
15
|
+
|
|
16
|
+
# Canonical bibtex-tidy flags, exactly as requested by the user; keep them in
# sync with their LaTeX workflow.
_TIDY_OMITTED_FIELDS = (
    "pages",
    "publisher",
    "doi",
    "timestamp",
    "biburl",
    "bibsource",
    "abstract",
    "month",
    "series",
    "volume",
    "editor",
    "note",
    "date",
    "number",
    "address",
)
_TIDY_FIELD_ORDER = ("author", "title", "booktitle", "journal", "year", "url", "pdf")

TIDY_ARGS = [
    "--modify",
    "--omit=" + ",".join(_TIDY_OMITTED_FIELDS),
    "--curly",
    "--blank-lines",
    "--trailing-commas",
    "--sort=-year",
    "--duplicates=citation",
    "--merge=first",
    "--sort-fields=" + ",".join(_TIDY_FIELD_ORDER),
    "--strip-enclosing-braces",
    "--tidy-comments",
    "--generate-keys",
]

# Fields dropped from every entry parsed by parse_bibtex_entry.
NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")

# New-style arXiv identifier: group 1 is the id, group 2 the optional version.
ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _log(msg: str):
    """Write one diagnostic line to stderr."""
    sys.stderr.write(f"{msg}\n")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _parser() -> BibTexParser:
    """Build a fresh parser configured the way this module reads .bib text.

    The parser is created with ``common_strings=True`` and with
    ``ignore_nonstandard_types`` switched off.
    """
    parser = BibTexParser(common_strings=True)
    parser.ignore_nonstandard_types = False
    return parser
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_bib(text: str) -> BibDatabase:
    """Parse BibTeX source text into a database using this module's parser."""
    configured = _parser()
    return bibtexparser.loads(text, parser=configured)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def parse_bibtex_entry(text: str) -> dict:
    """Return the first entry of a BibTeX string as a plain dict.

    The dict carries the entry's fields plus ``ID`` and ``ENTRYTYPE``; any
    field listed in ``NOISE_FIELDS`` is left out.

    Raises:
        ValueError: if the text contains no parseable entry.
    """
    parsed_entries = parse_bib(text).entries
    if not parsed_entries:
        raise ValueError("No BibTeX entry could be parsed")
    first = parsed_entries[0]
    return {name: value for name, value in first.items() if name not in NOISE_FIELDS}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def entry_to_bibtex(entry: dict) -> str:
    """Serialize a single entry dict to BibTeX text ending in one newline.

    Fields with falsy values are dropped and every remaining value is
    converted to ``str`` before writing.
    """
    cleaned = {}
    for name, value in entry.items():
        if value:
            cleaned[name] = str(value)

    database = BibDatabase()
    database.entries = [cleaned]

    writer = BibTexWriter()
    writer.indent = " "
    rendered = bibtexparser.dumps(database, writer)
    return rendered.strip() + "\n"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def entry_arxiv_id(entry: dict) -> str:
    """Return the arXiv id found in an entry (version suffix excluded), or "".

    Fields are scanned in this order: eprint, url, journal, note, doi. The
    ``eprint`` field is always searched; the others only when their value
    mentions "arxiv" (case-insensitive).
    """
    for field in ("eprint", "url", "journal", "note", "doi"):
        value = entry.get(field, "")
        if "arxiv" not in value.lower() and field != "eprint":
            continue
        found = ARXIV_ID_RE.search(value)
        if found is not None:
            return found.group(1)
    return ""
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def is_preprint(entry: dict) -> bool:
    """Tell whether an entry is a preprint.

    An entry counts as a preprint when its journal/booktitle/howpublished
    text mentions "arxiv" or "preprint", or contains the standalone word
    "corr", or when it has neither a journal nor a booktitle.

    eprint/archiveprefix/url fields are deliberately ignored: published
    entries keep their arXiv pointers.
    """
    venue_fields = ("journal", "booktitle", "howpublished")
    venue_text = " ".join(str(entry.get(name, "")) for name in venue_fields).lower()

    if any(marker in venue_text for marker in ("arxiv", "preprint")):
        return True
    if "corr" in venue_text.split():
        return True

    has_venue = bool(entry.get("journal")) or bool(entry.get("booktitle"))
    return not has_venue
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def load_bib_file(path: Path) -> BibDatabase | None:
    """Load and parse an existing .bib file.

    Args:
        path: Location of the .bib file.

    Returns:
        An empty database when the file is missing or blank, the parsed
        database otherwise, or ``None`` when the file cannot be read or
        parsed (callers then degrade to append-only mode).
    """
    if not path.exists():
        return BibDatabase()
    try:
        # Read once, inside the try: a file that cannot be read or decoded is
        # reported the same way as one that cannot be parsed, and the content
        # that was checked for blankness is the content that gets parsed.
        text = path.read_text()
        if not text.strip():
            return BibDatabase()
        return parse_bib(text)
    except Exception as e:
        _log(f"[bibcite] warning: could not parse {path} ({e}); appending without dedup")
        return None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = "") -> dict | None:
    """Return the first entry of ``db`` that refers to the same paper, if any.

    An entry matches when it has the same arXiv id (only checked when
    ``arxiv_id`` is given), the same DOI ignoring case (only checked when
    ``doi`` is given), or the same non-empty normalized title.
    """
    wanted_title = norm_title(title)

    def _same_paper(candidate: dict) -> bool:
        if arxiv_id and entry_arxiv_id(candidate) == arxiv_id:
            return True
        if doi and candidate.get("doi", "").lower() == doi.lower():
            return True
        return bool(wanted_title) and norm_title(candidate.get("title", "")) == wanted_title

    return next((candidate for candidate in db.entries if _same_paper(candidate)), None)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
    """Insert or upgrade ``entry`` in ``path``.

    Args:
        path: Target .bib file.
        entry: Entry dict; must contain ``ID``.

    Returns:
        ``(action, key)`` where action is "added" | "upgraded" | "exists".
        "upgraded" means an existing preprint entry was replaced by a
        non-preprint one while keeping its original key.
    """
    db = load_bib_file(path)
    if db is None:  # unparseable file: append blindly
        with path.open("a") as f:
            f.write("\n" + entry_to_bibtex(entry))
        return "added", entry["ID"]

    # Both write paths store the same sanitised form: empty fields dropped and
    # every value stringified.
    cleaned = {k: str(v) for k, v in entry.items() if v}

    existing = find_existing(
        db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
    )
    if existing is not None:
        if is_preprint(existing) and not is_preprint(entry):
            key = existing["ID"]
            # Mutate in place so the entry keeps its position in the database.
            existing.clear()
            existing.update(cleaned)
            existing["ID"] = key  # keep the key the user may already \cite
            _write_db(path, db)
            return "upgraded", key
        return "exists", existing["ID"]

    db.entries.append(cleaned)
    _write_db(path, db)
    return "added", entry["ID"]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _write_db(path: Path, db: BibDatabase):
    """Serialize ``db`` and overwrite ``path`` with the result."""
    writer = BibTexWriter()
    writer.order_entries_by = None  # preserve file order; tidy re-sorts anyway
    writer.indent = " "
    serialized = bibtexparser.dumps(db, writer)
    path.write_text(serialized)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
# bibtex-tidy
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
|
|
160
|
+
def tidy_command() -> list[str] | None:
|
|
161
|
+
exe = shutil.which("bibtex-tidy")
|
|
162
|
+
if exe:
|
|
163
|
+
return [exe]
|
|
164
|
+
if shutil.which("npx"):
|
|
165
|
+
return ["npx", "--yes", "bibtex-tidy"]
|
|
166
|
+
return None
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def run_tidy(path: Path) -> bool:
    """Run bibtex-tidy on ``path`` with ``TIDY_ARGS``.

    Returns True when the tool ran and exited with status 0; False when it
    is not available or exited with a non-zero status. Outcomes are
    reported through ``_log``.
    """
    launcher = tidy_command()
    if launcher is None:
        _log("[bibcite] bibtex-tidy not found (npm i -g bibtex-tidy); skipping tidy")
        return False

    argv = [*launcher, str(path), *TIDY_ARGS]
    result = subprocess.run(argv, capture_output=True, text=True)
    if result.returncode != 0:
        _log(f"[bibcite] bibtex-tidy failed:\n{result.stderr.strip()}")
        return False

    # Report only the last line of the tool's output, or "ok" if it was silent.
    output = result.stdout.strip()
    summary = output.splitlines()[-1] if output else "ok"
    _log(f"[bibcite] bibtex-tidy: {summary}")
    return True
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def key_after_tidy(path: Path, title: str, fallback_key: str) -> str:
    """Report the citation key an entry has after bibtex-tidy ran.

    bibtex-tidy --generate-keys rewrites keys, so the file is re-read and the
    key of the first entry whose normalized title equals that of ``title``
    is returned. ``fallback_key`` is returned when the file cannot be loaded
    or no entry matches.
    """
    db = load_bib_file(path)
    if db is None:
        return fallback_key
    wanted = norm_title(title)
    matching_keys = (
        candidate["ID"]
        for candidate in db.entries
        if norm_title(candidate.get("title", "")) == wanted
    )
    return next(matching_keys, fallback_key)
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""bibcite CLI.
|
|
2
|
+
|
|
3
|
+
Designed to be called by agents: never hand-edit a .bib file — let
|
|
4
|
+
``bibcite add`` resolve, canonicalize, dedupe, write, and tidy, then use the
|
|
5
|
+
citation key it prints.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from . import bibfile
|
|
14
|
+
from .normalize import first_author_last_name, norm_title
|
|
15
|
+
from .resolve import Resolved, guess_entry_type, resolve
|
|
16
|
+
from .sources import find_published
|
|
17
|
+
from .venues import canonicalize
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _log(msg: str):
    """Write one diagnostic line to stderr, leaving stdout for results."""
    sys.stderr.write(f"{msg}\n")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _emit(payload: dict, as_json: bool = True):
    """Print a command result.

    In JSON mode (the default) the payload is printed as one indented JSON
    object on stdout — the agent-facing contract of the file-mutating
    commands. In plain mode, used by `get` for previewing/piping, every
    field except ``bibtex`` is logged to stderr and stdout receives the
    BibTeX text, or the key when there is no BibTeX.
    """
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    for name, value in payload.items():
        if name == "bibtex":
            continue
        _log(f"{name}: {value}")

    if payload.get("bibtex"):
        print(payload["bibtex"], end="")
        return
    if payload.get("key"):
        print(payload["key"])
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# get
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
def _resolve_or_none(query: str, require_published: bool):
    """Resolve ``query``, logging any failure instead of raising.

    Returns whatever ``resolve`` returns, or ``None`` after logging when it
    raises. ``LookupError``/``ValueError`` are logged with their message
    alone; any other exception is logged as a network error together with
    its type name.
    """
    try:
        resolved = resolve(query, require_published=require_published)
    except (LookupError, ValueError) as exc:
        _log(f"[bibcite] {exc}")
        return None
    except Exception as exc:
        _log(f"[bibcite] network error: {type(exc).__name__}: {exc}")
        return None
    return resolved
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def cmd_get(args) -> int:
    """Handle `get`: resolve a query and print it without touching any file.

    Returns 0 on success and 2 when the query could not be resolved.
    """
    resolved = _resolve_or_none(" ".join(args.query), args.require_published)
    if resolved is None:
        return 2

    payload = {
        "action": "resolved",
        "key": resolved.entry["ID"],
        "title": resolved.entry.get("title", ""),
        "venue": resolved.venue or "arXiv (preprint, no published venue found)",
        "published": resolved.published,
        "source": resolved.source,
        "bibtex": resolved.bibtex,
    }
    _emit(payload, args.json)
    return 0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# add
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
def cmd_add(args) -> int:
    """Handle `add`: resolve or parse one entry, write it to a .bib file, tidy.

    With ``--bibtex`` the given text (or stdin when it is "-") is parsed and
    its venue canonicalized; otherwise the positional query is resolved.
    The entry is then upserted, bibtex-tidy is run unless disabled or
    nothing changed, and one JSON object describing the result is printed.

    Returns:
        0 on success; 2 when no usable input was given, the BibTeX could not
        be parsed, or the query could not be resolved.
    """
    path = Path(args.file)
    if args.bibtex:
        text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
        try:
            entry = bibfile.parse_bibtex_entry(text)
        except ValueError as e:
            # Report unparseable input like a failed query: message on
            # stderr and exit status 2, not a traceback.
            _log(f"[bibcite] {e}")
            return 2
        raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
        canonical = canonicalize(raw_venue, entry.get("year"))
        if canonical:
            # Replace whichever venue field was present with the canonical
            # field/name pair and align the entry type with it.
            entry.pop("booktitle", None)
            entry.pop("journal", None)
            entry["ENTRYTYPE"] = canonical.entry_type
            entry[canonical.bib_field] = canonical.name
        res = Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
    else:
        if not args.query:
            _log("[bibcite] provide a query (arXiv id / DOI / title) or --bibtex")
            return 2
        query = " ".join(args.query)
        res = _resolve_or_none(query, args.require_published)
        if res is None:
            return 2

    action, key = bibfile.upsert_entry(path, res.entry)
    tidied = False
    if action != "exists" and not args.no_tidy:
        tidied = bibfile.run_tidy(path)
        if tidied:
            # tidy may have regenerated keys; report the one now in the file.
            key = bibfile.key_after_tidy(path, res.entry.get("title", ""), key)

    _emit(
        {
            "action": action,
            "key": key,
            "title": res.entry.get("title", ""),
            "venue": res.venue or "arXiv (preprint)",
            "published": res.published,
            "source": res.source,
            "file": str(path),
            "tidied": tidied,
        }
    )
    return 0
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# upgrade: batch-match arXiv entries in an existing file (bibMatcher, CLI-style)
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def cmd_upgrade(args) -> int:
    """Handle `upgrade`: match every preprint entry of a file to a published venue.

    Each entry that ``bibfile.is_preprint`` flags and that has a title is
    looked up with ``find_published``. Unless ``--dry-run`` is set, matched
    entries get their venue field, entry type, year and (if missing) DOI
    updated in place, the file is rewritten, and bibtex-tidy is run unless
    ``--no-tidy`` is set. A per-entry report goes to stderr and one JSON
    summary object is printed on stdout.

    Returns:
        Always 0.
    """
    path = Path(args.file)
    db = bibfile.load_bib_file(path)
    if db is None or not db.entries:
        _log(f"[bibcite] nothing to do in {path}")
        # Keep the JSON-on-stdout contract so callers parsing stdout receive
        # an object even when there is nothing to process.
        _emit({"upgraded": 0, "matched": 0, "dry_run": args.dry_run, "entries": []})
        return 0

    report = []
    changed = 0
    for entry in db.entries:
        if not bibfile.is_preprint(entry):
            continue
        title = entry.get("title", "").replace("{", "").replace("}", "")
        if not title:
            continue
        _log(f"[upgrade] matching: {title[:80]}")
        aid = bibfile.entry_arxiv_id(entry)
        hint = first_author_last_name(entry["author"]) if entry.get("author") else ""
        match = find_published(title, entry.get("year", ""), aid, hint)
        if not match:
            report.append({"key": entry["ID"], "title": title, "matched": False})
            continue

        canonical = canonicalize(match.venue, match.year or entry.get("year"))
        venue_name = canonical.name if canonical else match.venue
        if not args.dry_run:
            # Drop every previous venue field before writing the new one.
            entry.pop("journal", None)
            entry.pop("booktitle", None)
            entry.pop("howpublished", None)
            if canonical:
                entry["ENTRYTYPE"] = canonical.entry_type
                entry[canonical.bib_field] = canonical.name
            else:
                # Venue is not in the canonical table: guess the entry type
                # and store the raw venue in the matching field.
                entry["ENTRYTYPE"] = guess_entry_type(match.venue)
                field = "booktitle" if entry["ENTRYTYPE"] == "inproceedings" else "journal"
                entry[field] = match.venue
            if match.year:
                entry["year"] = match.year
            if match.doi and not entry.get("doi"):
                entry["doi"] = match.doi
            # Counts entries actually modified, so a dry run reports 0 here.
            changed += 1
        report.append(
            {
                "key": entry["ID"],
                "title": title,
                "matched": True,
                "venue": venue_name,
                "source": match.source,
            }
        )

    if changed and not args.dry_run:
        bibfile._write_db(path, db)
        if not args.no_tidy:
            bibfile.run_tidy(path)

    matched = sum(1 for r in report if r["matched"])
    for r in report:
        mark = "✓" if r["matched"] else "✗"
        _log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
    _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if args.dry_run else ''}")
    _emit({"upgraded": changed, "matched": matched, "dry_run": args.dry_run, "entries": report})
    return 0
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ---------------------------------------------------------------------------
|
|
197
|
+
# tidy / check
|
|
198
|
+
# ---------------------------------------------------------------------------
|
|
199
|
+
|
|
200
|
+
def cmd_tidy(args) -> int:
    """Handle `tidy`: run bibtex-tidy on the file; 0 on success, 1 otherwise."""
    succeeded = bibfile.run_tidy(Path(args.file))
    if succeeded:
        return 0
    return 1
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def cmd_check(args) -> int:
    """Handle `check`: report problems found in a .bib file.

    Flags entries whose normalized title repeats an earlier entry's, entries
    missing author/title/year, and entries that ``bibfile.is_preprint``
    classifies as preprints. Problems are logged to stderr and printed as a
    JSON object on stdout.

    Returns 1 when the file could not be loaded, 0 otherwise.
    """
    path = Path(args.file)
    db = bibfile.load_bib_file(path)
    if db is None:
        _log(f"[bibcite] {path} could not be parsed")
        return 1

    problems = []
    first_key_by_title: dict[str, str] = {}
    for entry in db.entries:
        key = entry.get("ID", "?")

        normalized = norm_title(entry.get("title", ""))
        if normalized and normalized in first_key_by_title:
            problems.append(
                {"key": key, "issue": f"duplicate title of {first_key_by_title[normalized]}"}
            )
        first_key_by_title.setdefault(normalized, key)

        problems.extend(
            {"key": key, "issue": f"missing {required}"}
            for required in ("author", "title", "year")
            if not entry.get(required)
        )

        if bibfile.is_preprint(entry):
            problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})

    for problem in problems:
        _log(f"{problem['key']}: {problem['issue']}")
    total = len(db.entries)
    _log(f"[bibcite] {total} entries, {len(problems)} issues")
    _emit({"entries": total, "problems": problems})
    return 0
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
|
|
232
|
+
def main(argv=None) -> int:
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Args:
        argv: Argument list without the program name; ``None`` lets argparse
            read the process arguments.

    Returns:
        The exit status returned by the subcommand handler.
    """
    p = argparse.ArgumentParser(
        prog="bibcite",
        description="Resolve papers to canonical BibTeX and manage .bib files (agents: use `add`, never hand-edit).",
    )
    sub = p.add_subparsers(dest="cmd", required=True)

    # `add`, `upgrade` and `check` always print JSON. They still accept
    # `--json` as a no-op so an invocation such as
    # `bibcite add <file> <query> --json` is not rejected by argparse.
    json_noop_help = "accepted for compatibility; this command always prints JSON"

    g = sub.add_parser("get", help="resolve a query and print BibTeX to stdout")
    g.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
    g.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
    g.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
    g.set_defaults(fn=cmd_get)

    a = sub.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
    a.add_argument("file", help="target .bib file (created if missing)")
    a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
    a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
    a.add_argument("--no-tidy", action="store_true")
    a.add_argument("--require-published", action="store_true")
    a.add_argument("--json", action="store_true", help=json_noop_help)
    a.set_defaults(fn=cmd_add)

    u = sub.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
    u.add_argument("file")
    u.add_argument("--dry-run", action="store_true")
    u.add_argument("--no-tidy", action="store_true")
    u.add_argument("--json", action="store_true", help=json_noop_help)
    u.set_defaults(fn=cmd_upgrade)

    t = sub.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
    t.add_argument("file")
    t.set_defaults(fn=cmd_tidy)

    c = sub.add_parser("check", help="offline sanity check of a .bib file (prints JSON)")
    c.add_argument("file")
    c.add_argument("--json", action="store_true", help=json_noop_help)
    c.set_defaults(fn=cmd_check)

    args = p.parse_args(argv)
    return args.fn(args)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
|