bibcite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bibcite/venues.py ADDED
@@ -0,0 +1,241 @@
1
+ """Canonical venue names.
2
+
3
+ Parses the vendored ``data/strings.bib`` @string table (journals /
4
+ conferences / workshops) and maps venue strings returned by DBLP, Semantic
5
+ Scholar, Google Scholar, CrossRef, Unpaywall, etc. onto the canonical names.
6
+ """
7
+
8
+ import re
9
+ from dataclasses import dataclass
10
+ from importlib import resources
11
+
12
+ from .normalize import fold_ascii
13
+
14
# Section banner in strings.bib, e.g. "%% Journals". The captured word selects
# the category applied to the @string definitions that follow it.
CATEGORY_HEADER = re.compile(
    r"%{2,}\s*(Journals|Conferences|Workshops)",
    flags=re.IGNORECASE,
)

# A one-line macro definition: @string{MACRO = "Full venue name"}.
STRING_DEF = re.compile(
    r'@string\{(\w+)\s*=\s*"([^"]+)"\}',
    flags=re.IGNORECASE,
)

# Macros whose section in strings.bib does not reflect how they are cited.
CATEGORY_OVERRIDES = dict(
    RSS="conference",  # listed among journals
    TMLR="journal",  # listed among conferences
    SIGGRAPH="journal",  # cited as ACM TOG articles
    SIGGRAPHAsia="journal",
)

# Tokens removed from BOTH sides before venue names are compared.
DROP_TOKENS = frozenset(
    {
        "ieee",
        "cvf",
        "acm",
        "rsj",
        "the",
        "annual",
        "proceedings",
        "proc",
        "of",
        "on",
        "in",
    }
)

# Hand-written aliases (normalized form -> macro) for spellings that cannot be
# derived automatically from the canonical names, e.g. DBLP's abbreviated
# journal titles. Keys must already be in normalized form.
EXTRA_ALIASES = {
    # --- journals ---
    "trans pattern anal mach intell": "TPAMI",
    "pattern analysis and machine intelligence": "TPAMI",
    "j mach learn res": "JMLR",
    "int j comput vis": "IJCV",
    "trans mach learn res": "TMLR",
    "transactions machine learning research": "TMLR",
    "robotics autom lett": "RAL",
    "robotics and automation letters": "RAL",
    "int j robotics res": "IJRR",
    # NOTE(review): the Journal of Field Robotics is a different journal from
    # IJRR -- confirm that folding it into IJRR is intentional.
    "j field robotics": "IJRR",
    # --- machine learning / data mining ---
    "neural information processing systems": "NeurIPS",
    "nips": "NIPS",
    "colt": "COLT92",
    "computational learning theory": "COLT92",
    "knowledge discovery and data mining": "KDD",
    "aaai artificial intelligence": "AAAI",
    "national aaai": "AAAI",
    "aaai": "AAAI",
    # --- vision / robotics ---
    "computer vision and pattern recognition": "CVPR",
    "winter applications computer vision": "WACV",
    "3dv": "TDV",
    "robotics science and systems": "RSS",
    # --- natural language processing ---
    "acl": "ACL",
    "association for computational linguistics": "ACL",
    "emnlp": "EMNLP",
    "empirical methods natural language processing": "EMNLP",
    "naacl": "NAACL",
    "naacl hlt": "NAACL",
    "north american chapter association for computational linguistics": "NAACL",
}
63
+
64
+
65
@dataclass(frozen=True)
class Venue:
    """One canonical venue: its ``@string`` macro, full name and category.

    The category decides how a paper at this venue is written to BibTeX:
    journals become ``@article`` with a ``journal`` field, everything else
    becomes ``@inproceedings`` with a ``booktitle`` field.
    """

    macro: str  # @string macro name
    name: str  # canonical full name, kept as given (this class does no LaTeX unescaping)
    category: str  # "journal" | "conference" | "workshop"

    @property
    def is_journal(self) -> bool:
        """Whether the venue's category is ``"journal"``."""
        return "journal" == self.category

    @property
    def bib_field(self) -> str:
        """Name of the BibTeX field that holds the venue."""
        if self.category == "journal":
            return "journal"
        return "booktitle"

    @property
    def entry_type(self) -> str:
        """BibTeX entry type to use for papers at this venue."""
        if self.category == "journal":
            return "article"
        return "inproceedings"
82
+
83
+
84
def _norm(s: str) -> str:
    """Reduce a venue string to the space-joined tokens used for comparison.

    The string is passed through ``fold_ascii`` and lowercased, LaTeX
    backslash escapes are unwrapped, ``&`` becomes ``and`` and parenthesized
    chunks are removed. The remainder is split on non-alphanumerics, and
    tokens that are empty, listed in ``DROP_TOKENS``, purely numeric, or
    ordinals such as ``37th`` are discarded.
    """
    text = fold_ascii(s).lower()
    text = re.sub(r"\\(.)", r"\1", text)  # \& -> &
    text = text.replace("&", " and ")
    # Parentheticals are dropped here; acronyms are extracted separately.
    text = re.sub(r"\(.*?\)", " ", text)

    def keep(token: str) -> bool:
        if not token or token in DROP_TOKENS:
            return False
        # Years, edition numbers and ordinals are discarded.
        is_number = token.isdigit() or re.fullmatch(r"\d+(st|nd|rd|th)", token)
        return not is_number

    return " ".join(tok for tok in re.split(r"[^a-z0-9]+", text) if keep(tok))
99
+
100
+
101
def _acronyms(s: str) -> list[str]:
    """Return the candidate venue acronyms found in a raw venue string.

    Candidates are, in this order, the parenthesized chunks (``(CVPR)``) and
    the standalone ALL-CAPS tokens of at least two characters. Each candidate
    is passed through ``fold_ascii``, lowercased and stripped to ``[a-z0-9]``;
    duplicates are removed, keeping the first occurrence.

    Candidates made up only of ``DROP_TOKENS`` words -- ``IEEE``, ``ACM``,
    ``IEEE/CVF``, an upper-cased ``OF`` -- are skipped. They name a publisher
    or are filler, not a venue; treating them as acronyms would let any string
    that merely mentions ``IEEE`` resolve to whichever venue registered that
    token first.
    """
    candidates = re.findall(r"\(([^()]+)\)", s)
    candidates += [t for t in re.split(r"[\s,.:]+", s) if len(t) >= 2 and t.isupper()]
    out: list[str] = []
    for cand in candidates:
        parts = [p for p in re.split(r"[^a-z0-9]+", fold_ascii(cand).lower()) if p]
        if not parts:
            continue  # nothing alphanumeric left
        if all(p in DROP_TOKENS for p in parts):
            continue  # publisher / filler words only
        key = "".join(parts)
        if key not in out:
            out.append(key)
    return out
112
+
113
+
114
+ class VenueTable:
115
+ def __init__(self, text: str):
116
+ self.venues: dict[str, Venue] = {}
117
+ self._exact: dict[str, str] = {} # normalized string -> macro
118
+ self._acr: dict[str, str] = {} # minihashed acronym -> macro
119
+ self._containment: list[tuple[str, str]] = [] # (normalized full, macro)
120
+ self._parse(text)
121
+ self._build_aliases()
122
+
123
+ def _parse(self, text: str):
124
+ category = "journal"
125
+ for line in text.splitlines():
126
+ m = CATEGORY_HEADER.search(line)
127
+ if m:
128
+ category = m.group(1).lower().rstrip("s") # journal/conference/workshop
129
+ continue
130
+ m = STRING_DEF.search(line)
131
+ if not m:
132
+ continue
133
+ macro, name = m.group(1), m.group(2)
134
+ cat = CATEGORY_OVERRIDES.get(macro, category)
135
+ self.venues[macro] = Venue(macro=macro, name=name, category=cat)
136
+
137
+ def _build_aliases(self):
138
+ for macro, v in self.venues.items():
139
+ mini = re.sub(r"[^a-z0-9]", "", macro.lower())
140
+ self._acr.setdefault(mini, macro)
141
+ for acr in _acronyms(v.name):
142
+ self._acr.setdefault(acr, macro)
143
+ # Workshop variants: "ICCV Workshops" style aliases.
144
+ if v.category == "workshop" and acr.endswith("w"):
145
+ self._exact.setdefault(f"{acr[:-1]} workshops", macro)
146
+ n = _norm(v.name)
147
+ if n:
148
+ self._exact.setdefault(n, macro)
149
+ if len(n.split()) >= 3:
150
+ self._containment.append((n, macro))
151
+ for alias, macro in EXTRA_ALIASES.items():
152
+ self._exact[alias] = macro
153
+ # Longest names first so e.g. ICCVW ("... computer vision workshops")
154
+ # wins over ICCV on containment.
155
+ self._containment.sort(key=lambda p: -len(p[0]))
156
+
157
+ # ------------------------------------------------------------------
158
+ def canonicalize(self, venue: str, year: int | str | None = None) -> Venue | None:
159
+ """Map an arbitrary venue string to a canonical Venue, or None."""
160
+ if not venue or not venue.strip():
161
+ return None
162
+ raw = venue.strip()
163
+ n = _norm(raw)
164
+ lowered = re.sub(r"[^a-z0-9]", "", fold_ascii(raw).lower())
165
+ is_workshoppy = "workshop" in n
166
+
167
+ macro = None
168
+ # 1. exact normalized-name / alias match
169
+ if n in self._exact:
170
+ macro = self._exact[n]
171
+ # 2. the whole string is an acronym (e.g. DBLP venue "CVPR")
172
+ if macro is None and lowered in self._acr:
173
+ macro = self._acr[lowered]
174
+ # 3. embedded acronyms — prefer workshop variants when applicable
175
+ if macro is None:
176
+ for acr in _acronyms(raw):
177
+ if is_workshoppy and acr + "w" in self._acr:
178
+ macro = self._acr[acr + "w"]
179
+ break
180
+ if acr in self._acr:
181
+ macro = self._acr[acr]
182
+ break
183
+ # 4. containment of the canonical full name in the venue string
184
+ if macro is None:
185
+ for full, m in self._containment:
186
+ if full in n:
187
+ macro = m
188
+ break
189
+ if macro is None:
190
+ return None
191
+ macro = self._apply_year_rules(macro, year)
192
+ return self.venues[macro]
193
+
194
+ @staticmethod
195
+ def _year_int(year) -> int | None:
196
+ try:
197
+ return int(str(year)[:4])
198
+ except (TypeError, ValueError):
199
+ return None
200
+
201
+ def _apply_year_rules(self, macro: str, year) -> str:
202
+ y = self._year_int(year)
203
+ if macro in ("NIPS", "NeurIPS"):
204
+ if y is None:
205
+ return "NeurIPS"
206
+ return "NeurIPS" if y >= 2018 else "NIPS"
207
+ if macro in ("WACV", "WACV_until_2016"):
208
+ if y is not None and y <= 2016:
209
+ return "WACV_until_2016"
210
+ return "WACV"
211
+ return macro
212
+
213
+
214
+ _table: VenueTable | None = None
215
+
216
+
217
+ def _strings_text() -> str:
218
+ """The @string table, overridable so other people can use their own:
219
+ $BIBCITE_STRINGS, then ~/.config/bibcite/strings.bib, then the vendored
220
+ default."""
221
+ import os
222
+ from pathlib import Path
223
+
224
+ env = os.environ.get("BIBCITE_STRINGS")
225
+ if env:
226
+ return Path(env).read_text()
227
+ user = Path.home() / ".config" / "bibcite" / "strings.bib"
228
+ if user.exists():
229
+ return user.read_text()
230
+ return (resources.files("bibcite") / "data" / "strings.bib").read_text()
231
+
232
+
233
def get_table() -> VenueTable:
    """Return the shared ``VenueTable``, building it on the first call.

    The instance is kept in the module-level ``_table`` and reused by every
    later call; the table text is read once, by ``_strings_text()``.
    """
    global _table
    table = _table
    if table is None:
        table = VenueTable(_strings_text())
        _table = table
    return table
238
+
239
+
240
def canonicalize(venue: str, year=None) -> Venue | None:
    """Canonicalize *venue* against the shared table from ``get_table()``.

    Thin module-level wrapper around ``VenueTable.canonicalize``: returns
    whatever that method returns for the same *venue* and *year*.
    """
    table = get_table()
    return table.canonicalize(venue, year)
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.4
2
+ Name: bibcite-cli
3
+ Version: 0.1.0
4
+ Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Keywords: arxiv,bibliography,bibtex,citations,dblp
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: bibtexparser<2,>=1.4
10
+ Requires-Dist: httpx>=0.27
11
+ Description-Content-Type: text/markdown
12
+
13
+ # bibcite
14
+
15
+ Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX, and manage `.bib` files so agents never hand-edit them.
16
+
17
+ The publication-matching cascade is ported from [PaperMemory](https://github.com/vict0rsch/PaperMemory)'s bibMatcher:
18
+ DBLP → Semantic Scholar → Google Scholar → CrossRef → Unpaywall.
19
+ A match must have an identical normalized title, a plausible year, and a non-preprint venue.
20
+
21
+ Venue names are canonicalized against the `@string` table vendored in `src/bibcite/data/strings.bib` (journals / conferences / workshops), including year-aware rules (NIPS before 2018 vs NeurIPS, WACV before 2017).
22
+
23
+ Entry types are strict: conference/workshop papers become `@inproceedings` + `booktitle`, journal papers `@article` + `journal`, and unpublished arXiv preprints `@misc` + `howpublished = {arXiv preprint arXiv:ID}`.
24
+ Types coming from authoritative source BibTeX (DBLP) are preserved.
25
+
26
+ After every write, the file is formatted with [bibtex-tidy](https://github.com/FlamingTempura/bibtex-tidy) using the canonical flags in `bibfile.TIDY_ARGS` (requires `bibtex-tidy` on PATH or `npx`).
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ # from a local checkout (development)
32
+ uv tool install --editable .
33
+
34
+ # from git, no checkout needed
35
+ uv tool install git+https://github.com/<you>/bibcite
36
+
37
+ # once published to PyPI (package name bibcite-cli, command name bibcite)
38
+ uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
39
+
40
+ # plus, once (required for the tidy step):
41
+ npm install -g bibtex-tidy
42
+ ```
43
+
44
+ To use your own venue table instead of the vendored one, set `BIBCITE_STRINGS=/path/to/strings.bib` or place it at `~/.config/bibcite/strings.bib`.
45
+
46
+ ## Usage
47
+
48
+ ```bash
49
+ # Preview the BibTeX for a paper (nothing written)
50
+ bibcite get 1706.03762
51
+ bibcite get "Attention is all you need"
52
+ bibcite get 10.1109/CVPR52688.2022.01167
53
+
54
+ # Resolve and write into a .bib file, dedupe, then bibtex-tidy; prints the final key
55
+ bibcite add refs.bib 2103.14030 --json
56
+
57
+ # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
58
+ bibcite add refs.bib --bibtex "$(pbpaste)"
59
+
60
+ # Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
61
+ bibcite upgrade refs.bib --dry-run
62
+
63
+ # Just format, or just lint
64
+ bibcite tidy refs.bib
65
+ bibcite check refs.bib
66
+ ```
67
+
68
+ `--json` prints a machine-readable result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
69
+ `add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
70
+
71
+ ## For agents
72
+
73
+ Never edit `.bib` files by hand.
74
+ Call `bibcite add <file> <query> --json` and use the returned `key` in `\cite{...}`.
@@ -0,0 +1,13 @@
1
+ bibcite/__init__.py,sha256=LlyEv0C1TqMB-9wU1T35LcRK2J090totd8_QA1lS5Uk,103
2
+ bibcite/bibfile.py,sha256=2MYJ4ptanwKsmwywV5WiVe1y70vQ6_PQuCW-U74J3sI,6183
3
+ bibcite/cli.py,sha256=tt9G67JBW5Dimy-MBN6A0OyzL0-wHSCZzgRTPI8e2ls,9811
4
+ bibcite/normalize.py,sha256=FOPh678mb87hULJa4W8nZa259M0a8HnniS-icPVnrk8,3303
5
+ bibcite/resolve.py,sha256=nxQMHx3M3X3D9amQHokfWSOX-Dey0GRG45wyBh3zfH4,10867
6
+ bibcite/sources.py,sha256=MqqOagO3VcVcZZftAKRXwhFEKeOIOZb2p57u77Ng1oc,21108
7
+ bibcite/venues.py,sha256=vINYhIvCHv2UwyMBXMvuncmX6bSb-Wp91dqp7qLmx-0,8553
8
+ bibcite/data/strings.bib,sha256=OokHlBtiExDmBRo3NTYZGgVtKYb4TLQ1-U2VDNZcEP8,12381
9
+ bibcite_cli-0.1.0.dist-info/METADATA,sha256=RzF9Ycn4YWBtNiFt5GMv7zOSZxgaOGhDlVhZ9fpgIFg,3129
10
+ bibcite_cli-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ bibcite_cli-0.1.0.dist-info/entry_points.txt,sha256=EKrDiAWGIjWyS_XTRAC9A0_FQL73xZjXjso6Fjf3w00,45
12
+ bibcite_cli-0.1.0.dist-info/licenses/LICENSE,sha256=TKJKgSgrKkEWgJwigqLIGNf0tUaT-2NsM25jMwdgszU,1065
13
+ bibcite_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ bibcite = bibcite.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Leonardo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.