bibcite-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bibcite/__init__.py +3 -0
- bibcite/bibfile.py +194 -0
- bibcite/cli.py +272 -0
- bibcite/data/strings.bib +352 -0
- bibcite/normalize.py +86 -0
- bibcite/resolve.py +289 -0
- bibcite/sources.py +593 -0
- bibcite/venues.py +241 -0
- bibcite_cli-0.1.0.dist-info/METADATA +74 -0
- bibcite_cli-0.1.0.dist-info/RECORD +13 -0
- bibcite_cli-0.1.0.dist-info/WHEEL +4 -0
- bibcite_cli-0.1.0.dist-info/entry_points.txt +2 -0
- bibcite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
bibcite/venues.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Canonical venue names.
|
|
2
|
+
|
|
3
|
+
Parses the vendored ``data/strings.bib`` @string table (journals /
|
|
4
|
+
conferences / workshops) and maps venue strings returned by DBLP, Semantic
|
|
5
|
+
Scholar, Google Scholar, CrossRef, Unpaywall, etc. onto the canonical names.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from importlib import resources
|
|
11
|
+
|
|
12
|
+
from .normalize import fold_ascii
|
|
13
|
+
|
|
14
|
+
# Section header inside strings.bib, e.g. "%%%% Journals %%%%".  Group 1 is the
# section name; it decides the category of the @string entries that follow.
CATEGORY_HEADER = re.compile(r"%{2,}\s*(Journals|Conferences|Workshops)", re.I)
# Single-line macro definition: @string{MACRO = "Full venue name"}.
# Whitespace around the braces is tolerated so hand-edited user tables
# (e.g. `@string { CVPR = "..." }`) are not silently skipped.
STRING_DEF = re.compile(r'@string\s*\{\s*(\w+)\s*=\s*"([^"]+)"\s*\}', re.I)

# Entries whose section in strings.bib does not reflect how they are cited.
CATEGORY_OVERRIDES = {
    "RSS": "conference",  # listed among journals
    "TMLR": "journal",  # listed among conferences
    "SIGGRAPH": "journal",  # cited as ACM TOG articles
    "SIGGRAPHAsia": "journal",
}

# Tokens dropped on BOTH sides before comparing venue names.
DROP_TOKENS = frozenset(
    "ieee cvf acm rsj the annual proceedings proc of on in".split()
)

# Hand-written aliases (normalized form -> macro) for spellings the automatic
# alias generation cannot derive, e.g. DBLP's abbreviated journal names.
# Keys must already be in normalized form (lower-case, DROP_TOKENS removed).
EXTRA_ALIASES = {
    "neural information processing systems": "NeurIPS",
    "nips": "NIPS",
    "trans pattern anal mach intell": "TPAMI",
    "pattern analysis and machine intelligence": "TPAMI",
    "j mach learn res": "JMLR",
    "int j comput vis": "IJCV",
    "trans mach learn res": "TMLR",
    "transactions machine learning research": "TMLR",
    "robotics autom lett": "RAL",
    "robotics and automation letters": "RAL",
    "computer vision and pattern recognition": "CVPR",
    "winter applications computer vision": "WACV",
    "aaai artificial intelligence": "AAAI",
    "national aaai": "AAAI",
    "aaai": "AAAI",
    "acl": "ACL",
    "emnlp": "EMNLP",
    "naacl": "NAACL",
    "naacl hlt": "NAACL",
    "north american chapter association for computational linguistics": "NAACL",
    "empirical methods natural language processing": "EMNLP",
    "association for computational linguistics": "ACL",
    "robotics science and systems": "RSS",
    "3dv": "TDV",
    "colt": "COLT92",
    "computational learning theory": "COLT92",
    "knowledge discovery and data mining": "KDD",
    "int j robotics res": "IJRR",
    # NOTE(review): "J. Field Robotics" looks like a different journal from
    # IJRR -- confirm this mapping is intended.
    "j field robotics": "IJRR",
}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True)
class Venue:
    """One canonical venue taken from the ``@string`` table.

    Instances are immutable and hashable.  The derived properties translate
    the venue category into the BibTeX entry type and field to use.
    """

    macro: str  # @string macro name (the key in the venue table)
    name: str  # canonical full name, kept as written in the table (LaTeX escapes included)
    category: str  # "journal" | "conference" | "workshop"

    @property
    def is_journal(self) -> bool:
        """True when the venue's category is ``"journal"``."""
        return self.category == "journal"

    @property
    def bib_field(self) -> str:
        """BibTeX field holding the venue: ``journal`` or ``booktitle``."""
        return "journal" if self.is_journal else "booktitle"

    @property
    def entry_type(self) -> str:
        """BibTeX entry type: ``article`` for journals, else ``inproceedings``."""
        return "article" if self.is_journal else "inproceedings"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _norm(s: str) -> str:
    """Normalize a venue string for comparison.

    Steps, in order: ``fold_ascii`` and lower-casing, removal of backslash
    escapes (``\\&`` -> ``&``), ``&`` spelled out as ``and``, parenthesized
    chunks removed, then tokenization on anything that is not ``[a-z0-9]``.
    Tokens in ``DROP_TOKENS``, pure numbers and ordinals such as ``38th`` are
    discarded.

    Returns:
        The surviving tokens joined by single spaces; may be empty.
    """
    s = fold_ascii(s).lower()
    s = re.sub(r"\\(.)", r"\1", s)  # \& -> &
    s = s.replace("&", " and ")
    s = re.sub(r"\(.*?\)", " ", s)  # drop parentheticals (acronyms handled separately)
    kept = []
    for token in re.split(r"[^a-z0-9]+", s):
        if not token or token in DROP_TOKENS:
            continue
        # Years and edition numbers ("2016", "38th") do not identify a venue.
        if token.isdigit() or re.fullmatch(r"\d+(st|nd|rd|th)", token):
            continue
        kept.append(token)
    return " ".join(kept)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _acronyms(s: str) -> list[str]:
    """Return candidate acronyms found in a raw venue string.

    Candidates are, in this order, the contents of parenthesized chunks and
    then standalone ALL-CAPS tokens of length >= 2 (split on whitespace,
    commas, periods and colons).  Each candidate is "mini-hashed": passed
    through ``fold_ascii``, lower-cased and stripped of everything outside
    ``[a-z0-9]``.

    Returns:
        Non-empty mini-hashed candidates, de-duplicated, first-seen order.
    """
    candidates = re.findall(r"\(([^()]+)\)", s)
    candidates.extend(
        tok for tok in re.split(r"[\s,.:]+", s) if len(tok) >= 2 and tok.isupper()
    )
    hashed = (re.sub(r"[^a-z0-9]", "", fold_ascii(c).lower()) for c in candidates)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(h for h in hashed if h))
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class VenueTable:
|
|
115
|
+
def __init__(self, text: str):
|
|
116
|
+
self.venues: dict[str, Venue] = {}
|
|
117
|
+
self._exact: dict[str, str] = {} # normalized string -> macro
|
|
118
|
+
self._acr: dict[str, str] = {} # minihashed acronym -> macro
|
|
119
|
+
self._containment: list[tuple[str, str]] = [] # (normalized full, macro)
|
|
120
|
+
self._parse(text)
|
|
121
|
+
self._build_aliases()
|
|
122
|
+
|
|
123
|
+
def _parse(self, text: str):
|
|
124
|
+
category = "journal"
|
|
125
|
+
for line in text.splitlines():
|
|
126
|
+
m = CATEGORY_HEADER.search(line)
|
|
127
|
+
if m:
|
|
128
|
+
category = m.group(1).lower().rstrip("s") # journal/conference/workshop
|
|
129
|
+
continue
|
|
130
|
+
m = STRING_DEF.search(line)
|
|
131
|
+
if not m:
|
|
132
|
+
continue
|
|
133
|
+
macro, name = m.group(1), m.group(2)
|
|
134
|
+
cat = CATEGORY_OVERRIDES.get(macro, category)
|
|
135
|
+
self.venues[macro] = Venue(macro=macro, name=name, category=cat)
|
|
136
|
+
|
|
137
|
+
def _build_aliases(self):
|
|
138
|
+
for macro, v in self.venues.items():
|
|
139
|
+
mini = re.sub(r"[^a-z0-9]", "", macro.lower())
|
|
140
|
+
self._acr.setdefault(mini, macro)
|
|
141
|
+
for acr in _acronyms(v.name):
|
|
142
|
+
self._acr.setdefault(acr, macro)
|
|
143
|
+
# Workshop variants: "ICCV Workshops" style aliases.
|
|
144
|
+
if v.category == "workshop" and acr.endswith("w"):
|
|
145
|
+
self._exact.setdefault(f"{acr[:-1]} workshops", macro)
|
|
146
|
+
n = _norm(v.name)
|
|
147
|
+
if n:
|
|
148
|
+
self._exact.setdefault(n, macro)
|
|
149
|
+
if len(n.split()) >= 3:
|
|
150
|
+
self._containment.append((n, macro))
|
|
151
|
+
for alias, macro in EXTRA_ALIASES.items():
|
|
152
|
+
self._exact[alias] = macro
|
|
153
|
+
# Longest names first so e.g. ICCVW ("... computer vision workshops")
|
|
154
|
+
# wins over ICCV on containment.
|
|
155
|
+
self._containment.sort(key=lambda p: -len(p[0]))
|
|
156
|
+
|
|
157
|
+
# ------------------------------------------------------------------
|
|
158
|
+
def canonicalize(self, venue: str, year: int | str | None = None) -> Venue | None:
|
|
159
|
+
"""Map an arbitrary venue string to a canonical Venue, or None."""
|
|
160
|
+
if not venue or not venue.strip():
|
|
161
|
+
return None
|
|
162
|
+
raw = venue.strip()
|
|
163
|
+
n = _norm(raw)
|
|
164
|
+
lowered = re.sub(r"[^a-z0-9]", "", fold_ascii(raw).lower())
|
|
165
|
+
is_workshoppy = "workshop" in n
|
|
166
|
+
|
|
167
|
+
macro = None
|
|
168
|
+
# 1. exact normalized-name / alias match
|
|
169
|
+
if n in self._exact:
|
|
170
|
+
macro = self._exact[n]
|
|
171
|
+
# 2. the whole string is an acronym (e.g. DBLP venue "CVPR")
|
|
172
|
+
if macro is None and lowered in self._acr:
|
|
173
|
+
macro = self._acr[lowered]
|
|
174
|
+
# 3. embedded acronyms — prefer workshop variants when applicable
|
|
175
|
+
if macro is None:
|
|
176
|
+
for acr in _acronyms(raw):
|
|
177
|
+
if is_workshoppy and acr + "w" in self._acr:
|
|
178
|
+
macro = self._acr[acr + "w"]
|
|
179
|
+
break
|
|
180
|
+
if acr in self._acr:
|
|
181
|
+
macro = self._acr[acr]
|
|
182
|
+
break
|
|
183
|
+
# 4. containment of the canonical full name in the venue string
|
|
184
|
+
if macro is None:
|
|
185
|
+
for full, m in self._containment:
|
|
186
|
+
if full in n:
|
|
187
|
+
macro = m
|
|
188
|
+
break
|
|
189
|
+
if macro is None:
|
|
190
|
+
return None
|
|
191
|
+
macro = self._apply_year_rules(macro, year)
|
|
192
|
+
return self.venues[macro]
|
|
193
|
+
|
|
194
|
+
@staticmethod
|
|
195
|
+
def _year_int(year) -> int | None:
|
|
196
|
+
try:
|
|
197
|
+
return int(str(year)[:4])
|
|
198
|
+
except (TypeError, ValueError):
|
|
199
|
+
return None
|
|
200
|
+
|
|
201
|
+
def _apply_year_rules(self, macro: str, year) -> str:
|
|
202
|
+
y = self._year_int(year)
|
|
203
|
+
if macro in ("NIPS", "NeurIPS"):
|
|
204
|
+
if y is None:
|
|
205
|
+
return "NeurIPS"
|
|
206
|
+
return "NeurIPS" if y >= 2018 else "NIPS"
|
|
207
|
+
if macro in ("WACV", "WACV_until_2016"):
|
|
208
|
+
if y is not None and y <= 2016:
|
|
209
|
+
return "WACV_until_2016"
|
|
210
|
+
return "WACV"
|
|
211
|
+
return macro
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# Module-wide VenueTable cache; stays None until get_table() first builds it.
_table: VenueTable | None = None
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _strings_text() -> str:
    """Return the text of the @string table to parse.

    The table is overridable so other people can use their own.  Lookup
    order, first hit wins:

    1. the file named by ``$BIBCITE_STRINGS`` (a leading ``~`` is expanded);
    2. ``~/.config/bibcite/strings.bib``, if it exists;
    3. the ``data/strings.bib`` vendored inside the ``bibcite`` package.

    Files are always decoded as UTF-8 rather than the platform's locale
    encoding, so a table reads the same on every system.

    Raises:
        OSError: the selected file cannot be read (for instance
            ``$BIBCITE_STRINGS`` names a missing file).
        UnicodeDecodeError: the selected file is not valid UTF-8.
    """
    import os
    from pathlib import Path

    env = os.environ.get("BIBCITE_STRINGS")
    if env:
        return Path(env).expanduser().read_text(encoding="utf-8")
    user = Path.home() / ".config" / "bibcite" / "strings.bib"
    if user.exists():
        return user.read_text(encoding="utf-8")
    vendored = resources.files("bibcite") / "data" / "strings.bib"
    return vendored.read_text(encoding="utf-8")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def get_table() -> VenueTable:
    """Return the shared ``VenueTable``, building it on first use.

    The table is created from ``_strings_text()`` the first time this is
    called and cached in the module-level ``_table``; later calls return the
    same object without re-reading the file.
    """
    global _table
    if _table is None:
        _table = VenueTable(_strings_text())
    return _table
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def canonicalize(venue: str, year=None) -> Venue | None:
    """Map *venue* to a canonical ``Venue`` using the shared table.

    Convenience wrapper around ``get_table().canonicalize(venue, year)``.

    Args:
        venue: Venue string as returned by a metadata source.
        year: Optional publication year, forwarded unchanged.

    Returns:
        The matching ``Venue``, or ``None`` when nothing matches.
    """
    return get_table().canonicalize(venue, year)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bibcite-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: arxiv,bibliography,bibtex,citations,dblp
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: bibtexparser<2,>=1.4
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# bibcite
|
|
14
|
+
|
|
15
|
+
Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX, and manage `.bib` files so agents never hand-edit them.
|
|
16
|
+
|
|
17
|
+
The publication-matching cascade is ported from [PaperMemory](https://github.com/vict0rsch/PaperMemory)'s bibMatcher:
|
|
18
|
+
DBLP → Semantic Scholar → Google Scholar → CrossRef → Unpaywall.
|
|
19
|
+
A match must have an identical normalized title, a plausible year, and a non-preprint venue.
|
|
20
|
+
|
|
21
|
+
Venue names are canonicalized against the `@string` table vendored in `src/bibcite/data/strings.bib` (journals / conferences / workshops), including year-aware rules: NIPS for papers before 2018 and NeurIPS from 2018 onward (or when the year is unknown), and the older WACV name for papers up to and including 2016.
|
|
22
|
+
|
|
23
|
+
Entry types are strict: conference/workshop papers become `@inproceedings` + `booktitle`, journal papers `@article` + `journal`, and unpublished arXiv preprints `@misc` + `howpublished = {arXiv preprint arXiv:ID}`.
|
|
24
|
+
Types coming from authoritative source BibTeX (DBLP) are preserved.
|
|
25
|
+
|
|
26
|
+
After every write, the file is formatted with [bibtex-tidy](https://github.com/FlamingTempura/bibtex-tidy) using the canonical flags in `bibfile.TIDY_ARGS` (requires `bibtex-tidy` on PATH or `npx`).
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# from a local checkout (development)
|
|
32
|
+
uv tool install --editable .
|
|
33
|
+
|
|
34
|
+
# from git, no checkout needed
|
|
35
|
+
uv tool install git+https://github.com/<you>/bibcite
|
|
36
|
+
|
|
37
|
+
# once published to PyPI (package name bibcite-cli, command name bibcite)
|
|
38
|
+
uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
|
|
39
|
+
|
|
40
|
+
# plus, once (required for the tidy step):
|
|
41
|
+
npm install -g bibtex-tidy
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
To use your own venue table instead of the vendored one, set `BIBCITE_STRINGS=/path/to/strings.bib` or place it at `~/.config/bibcite/strings.bib`.
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Preview the BibTeX for a paper (nothing written)
|
|
50
|
+
bibcite get 1706.03762
|
|
51
|
+
bibcite get "Attention is all you need"
|
|
52
|
+
bibcite get 10.1109/CVPR52688.2022.01167
|
|
53
|
+
|
|
54
|
+
# Resolve and write into a .bib file, dedupe, then bibtex-tidy; prints the final key
|
|
55
|
+
bibcite add refs.bib 2103.14030 --json
|
|
56
|
+
|
|
57
|
+
# Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
|
|
58
|
+
bibcite add refs.bib --bibtex "$(pbpaste)"
|
|
59
|
+
|
|
60
|
+
# Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
|
|
61
|
+
bibcite upgrade refs.bib --dry-run
|
|
62
|
+
|
|
63
|
+
# Just format, or just lint
|
|
64
|
+
bibcite tidy refs.bib
|
|
65
|
+
bibcite check refs.bib
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
|
|
69
|
+
`add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
|
|
70
|
+
|
|
71
|
+
## For agents
|
|
72
|
+
|
|
73
|
+
Never edit `.bib` files by hand.
|
|
74
|
+
Call `bibcite add <file> <query> --json` and use the returned `key` in `\cite{...}`.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
bibcite/__init__.py,sha256=LlyEv0C1TqMB-9wU1T35LcRK2J090totd8_QA1lS5Uk,103
|
|
2
|
+
bibcite/bibfile.py,sha256=2MYJ4ptanwKsmwywV5WiVe1y70vQ6_PQuCW-U74J3sI,6183
|
|
3
|
+
bibcite/cli.py,sha256=tt9G67JBW5Dimy-MBN6A0OyzL0-wHSCZzgRTPI8e2ls,9811
|
|
4
|
+
bibcite/normalize.py,sha256=FOPh678mb87hULJa4W8nZa259M0a8HnniS-icPVnrk8,3303
|
|
5
|
+
bibcite/resolve.py,sha256=nxQMHx3M3X3D9amQHokfWSOX-Dey0GRG45wyBh3zfH4,10867
|
|
6
|
+
bibcite/sources.py,sha256=MqqOagO3VcVcZZftAKRXwhFEKeOIOZb2p57u77Ng1oc,21108
|
|
7
|
+
bibcite/venues.py,sha256=vINYhIvCHv2UwyMBXMvuncmX6bSb-Wp91dqp7qLmx-0,8553
|
|
8
|
+
bibcite/data/strings.bib,sha256=OokHlBtiExDmBRo3NTYZGgVtKYb4TLQ1-U2VDNZcEP8,12381
|
|
9
|
+
bibcite_cli-0.1.0.dist-info/METADATA,sha256=RzF9Ycn4YWBtNiFt5GMv7zOSZxgaOGhDlVhZ9fpgIFg,3129
|
|
10
|
+
bibcite_cli-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
bibcite_cli-0.1.0.dist-info/entry_points.txt,sha256=EKrDiAWGIjWyS_XTRAC9A0_FQL73xZjXjso6Fjf3w00,45
|
|
12
|
+
bibcite_cli-0.1.0.dist-info/licenses/LICENSE,sha256=TKJKgSgrKkEWgJwigqLIGNf0tUaT-2NsM25jMwdgszU,1065
|
|
13
|
+
bibcite_cli-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Leonardo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|