bibcite-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bibcite/__init__.py +3 -0
- bibcite/bibfile.py +194 -0
- bibcite/cli.py +272 -0
- bibcite/data/strings.bib +352 -0
- bibcite/normalize.py +86 -0
- bibcite/resolve.py +289 -0
- bibcite/sources.py +593 -0
- bibcite/venues.py +241 -0
- bibcite_cli-0.1.0.dist-info/METADATA +74 -0
- bibcite_cli-0.1.0.dist-info/RECORD +13 -0
- bibcite_cli-0.1.0.dist-info/WHEEL +4 -0
- bibcite_cli-0.1.0.dist-info/entry_points.txt +2 -0
- bibcite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
bibcite/resolve.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""Query classification and entry building.
|
|
2
|
+
|
|
3
|
+
resolve(query) turns "1706.03762" / an arXiv URL / a DOI / a free-form title
|
|
4
|
+
into a normalized BibTeX entry dict, with the venue canonicalized against the
|
|
5
|
+
vendored strings.bib table.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
import sys
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from .bibfile import NOISE_FIELDS, parse_bibtex_entry
|
|
13
|
+
from .normalize import clean_title, first_author_last_name, make_key
|
|
14
|
+
from .sources import (
|
|
15
|
+
ArxivMeta,
|
|
16
|
+
Match,
|
|
17
|
+
arxiv_metadata,
|
|
18
|
+
crossref_by_doi,
|
|
19
|
+
find_published,
|
|
20
|
+
)
|
|
21
|
+
from .venues import canonicalize
|
|
22
|
+
|
|
23
|
+
# Bare new-style arXiv identifier: four digits, a dot, then four or five
# digits, with an optional "arxiv:" prefix and an optional version suffix
# ("v2"). Group 1 is the identifier without the version. Case-insensitive.
ARXIV_NEW = re.compile(r"^(?:arxiv:)?(\d{4}\.\d{4,5})(v\d+)?$", re.I)
# Bare old-style arXiv identifier: an archive name, an optional two-letter
# subject class, a slash and seven digits (e.g. "hep-th/9901001"). Same
# optional prefix/version handling as ARXIV_NEW; group 1 is the identifier.
ARXIV_OLD = re.compile(r"^(?:arxiv:)?([a-z-]+(?:\.[A-Z]{2})?/\d{7})(v\d+)?$", re.I)
# arXiv abs/pdf/html URL carrying either identifier form. The pattern is
# unanchored, so it is meant for .search(); group 1 is the identifier and any
# trailing version or file extension is simply not captured.
ARXIV_URL = re.compile(
    r"arxiv\.org/(?:abs|pdf|html)/(\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Z]{2})?/\d{7})",
    re.I,
)
# DOI embedded in a doi.org URL (unanchored); group 1 is everything from "10."
# up to the next whitespace character.
DOI_URL = re.compile(r"doi\.org/(10\.\S+)", re.I)
# Bare DOI: "10.", a 4-9 digit registrant code, a slash and a non-blank suffix.
DOI_RE = re.compile(r"^10\.\d{4,9}/\S+$")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _log(msg: str):
    """Write one diagnostic line to stderr, keeping stdout free for output."""
    sys.stderr.write(f"{msg}\n")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def classify(query: str) -> tuple[str, str]:
    """Decide what kind of reference *query* is.

    Returns a ``(kind, value)`` pair where ``kind`` is ``"arxiv"``, ``"doi"``
    or ``"title"``:

    * ``"arxiv"`` - ``value`` is the identifier captured from an arXiv URL or
      from a bare new-/old-style identifier (version suffix not included).
    * ``"doi"`` - ``value`` is the DOI taken from a doi.org URL, or the query
      itself when it is a bare DOI.
    * ``"title"`` - anything else; ``value`` is the whitespace-stripped query.
    """
    stripped = query.strip()
    # Trailing periods are dropped for identifier matching only; a free-form
    # title keeps them (see the final return).
    candidate = stripped.rstrip(".")

    # URL form first (unanchored search), then the two anchored bare forms.
    arxiv_hit = (
        ARXIV_URL.search(candidate)
        or ARXIV_NEW.match(candidate)
        or ARXIV_OLD.match(candidate)
    )
    if arxiv_hit is not None:
        return "arxiv", arxiv_hit.group(1)

    doi_url_hit = DOI_URL.search(candidate)
    if doi_url_hit is not None:
        return "doi", doi_url_hit.group(1)

    if DOI_RE.match(candidate) is not None:
        return "doi", candidate

    return "title", stripped
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class Resolved:
    """Outcome of resolving one query: the entry plus where it came from."""

    # bibtexparser-style dict: the BibTeX fields plus "ID" and "ENTRYTYPE".
    entry: dict
    # Name of the source that supplied the publication info.
    source: str
    # Final venue string; "" when the result is a preprint.
    venue: str
    # Whether a published version was found.
    published: bool

    @property
    def bibtex(self) -> str:
        """Return ``entry`` rendered by ``bibfile.entry_to_bibtex``."""
        from .bibfile import entry_to_bibtex as render_entry

        return render_entry(self.entry)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def guess_entry_type(venue: str) -> str:
    """Entry type for a venue that is NOT in the canonical table.

    Sources without bibtex only give us a venue string; a conference-sounding
    name must become @inproceedings, not a sloppy @article.

    Args:
        venue: Raw venue name. An empty or missing (``None``) venue is
            accepted and treated as "not a conference".

    Returns:
        ``"inproceedings"`` when the lower-cased venue contains one of the
        conference words below, otherwise ``"article"``.
    """
    # Tolerate a missing venue: callers treat the source's venue as possibly
    # falsy, and a None here used to raise AttributeError.
    v = (venue or "").lower()
    # NOTE: "proceedings" alone is NOT conclusive — PNAS and Proceedings of
    # the IEEE are journals. Real conference names carry one of these words.
    conference_words = ("conference", "workshop", "symposium", "meeting", "congress")
    return "inproceedings" if any(w in v for w in conference_words) else "article"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
    """Build the entry dict for a published match.

    The source's own BibTeX is parsed when the match carries one. If there is
    none, or parsing raises ``ValueError``, the entry is assembled from the
    match's structured fields, with ``meta`` filling in authors, title and
    year where the match has none.

    The result has ``NOISE_FIELDS`` removed, the title passed through
    ``clean_title``, the match's DOI added when the entry lacks one, and the
    venue replaced by its canonical form when ``canonicalize`` finds one. The
    final venue string is stored under the temporary ``"__venue"`` key for the
    caller to pop.
    """
    entry: dict = {}
    if match.bibtex:
        try:
            entry = parse_bibtex_entry(match.bibtex)
        except ValueError:
            pass  # unparseable source bibtex: fall through to the structured build

    if not entry:
        fallback_authors = match.authors or (meta.authors if meta else [])
        kind = guess_entry_type(match.venue)
        venue_field = "booktitle" if kind == "inproceedings" else "journal"
        entry = {
            "ENTRYTYPE": kind,
            "author": " and ".join(fallback_authors),
            "title": match.title or (meta.title if meta else ""),
            venue_field: match.venue,
            "year": match.year or (meta.year if meta else ""),
        }

    for noise in NOISE_FIELDS:
        entry.pop(noise, None)

    entry["title"] = clean_title(entry.get("title", ""))
    if match.doi and not entry.get("doi"):
        entry["doi"] = match.doi

    # Canonicalize the venue against the strings.bib table: try the match's
    # venue first, then whatever venue the entry itself carries.
    entry_venue = entry.get("booktitle", "") or entry.get("journal", "")
    raw_venue = match.venue or entry_venue
    year = entry.get("year", "") or match.year
    canonical = canonicalize(raw_venue, year)
    if not canonical:
        canonical = canonicalize(entry_venue, year)

    if not canonical:
        _log(f"[venues] no canonical mapping for '{raw_venue}' (kept as-is)")
        entry["__venue"] = raw_venue
        return entry

    # The canonical record decides both the entry type and which field
    # (booktitle or journal) holds the venue, so drop whichever was there.
    entry.pop("booktitle", None)
    entry.pop("journal", None)
    entry["ENTRYTYPE"] = canonical.entry_type
    entry[canonical.bib_field] = canonical.name
    _log(f"[venues] '{raw_venue}' -> {canonical.macro} ({canonical.name})")
    entry["__venue"] = canonical.name
    return entry
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _finalize(entry: dict, meta: ArxivMeta | None) -> dict:
    """Add URL / eprint fields, assign the citation key and clean up.

    Mutates and returns *entry*. When ``meta`` carries an arXiv id the arXiv
    link and eprint fields are written (overwriting any existing ``url``);
    otherwise a doi.org URL is derived from ``doi`` if no ``url`` is present.
    ``ID`` is always regenerated with ``make_key`` and the temporary
    ``"__venue"`` key is removed.
    """
    arxiv_id = meta.arxiv_id if meta else None
    if arxiv_id:
        # prefer the arXiv link for access
        entry.update(url=meta.abs_url, eprint=arxiv_id, archiveprefix="arXiv")
        if meta.primary_class:
            entry["primaryclass"] = meta.primary_class
    elif entry.get("doi") and not entry.get("url"):
        entry["url"] = f"https://doi.org/{entry['doi']}"

    # Placeholders keep the key well-formed when author or year is missing.
    key_author = entry.get("author") or "anonymous"
    key_year = entry.get("year") or "XXXX"
    entry["ID"] = make_key(key_author, key_year, entry.get("title", ""))

    entry.pop("__venue", None)
    return entry
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _arxiv_only_entry(meta: ArxivMeta) -> dict:
    """Entry for an unpublished preprint.

    Uses @misc, per arXiv's own recommendation, never @article with a fake
    journal. The howpublished field keeps the arXiv pointer visible under
    classic BibTeX styles that ignore eprint fields.
    """
    joined_authors = " and ".join(meta.authors)
    pointer = f"arXiv preprint arXiv:{meta.arxiv_id}"
    return dict(
        ENTRYTYPE="misc",
        author=joined_authors,
        title=meta.title,
        howpublished=pointer,
        year=meta.year,
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def resolve(query: str, require_published: bool = False) -> Resolved:
    """Resolve *query* into a finalized BibTeX entry.

    The query is classified as an arXiv id/URL, a DOI, or a free-form title:

    * arXiv: fetch the preprint metadata, then look for a published version;
      fall back to an arXiv-only @misc entry when none is found.
    * DOI: look the DOI up on CrossRef.
    * title: try to locate the paper on arXiv (then OpenAlex) for author/year
      metadata, search for a published version, and fall back to the arXiv
      preprint when one was located.

    Args:
        query: arXiv id or URL, DOI or doi.org URL, or a paper title.
        require_published: When true, raise instead of returning an
            arXiv-only preprint entry.

    Returns:
        A ``Resolved`` whose ``published`` flag tells whether a published
        version was found.

    Raises:
        LookupError: nothing usable was found, the DOI is unknown to CrossRef,
            all arXiv metadata sources failed, or only a preprint exists while
            ``require_published`` is set.
        ValueError: propagated unchanged from ``arxiv_metadata``
            (presumably an invalid/unknown identifier; confirm in sources).
    """
    kind, value = classify(query)
    _log(f"[bibcite] query understood as {kind}: {value}")

    if kind == "arxiv":
        meta = _fetch_arxiv_meta(value)
        _log(f"[arxiv] {meta.title} ({meta.year})")
        hint = first_author_last_name(meta.authors[0]) if meta.authors else ""
        match = find_published(meta.title, meta.year, meta.arxiv_id, hint)
        if match:
            return _published_result(match, meta)
        if require_published:
            raise LookupError(f"No published version found for arXiv:{value}")
        return _preprint_result(meta)

    if kind == "doi":
        match = crossref_by_doi(value)
        if not match or not match.title:
            raise LookupError(f"DOI not found on CrossRef: {value}")
        return _published_result(match, None)

    # Free-form title: locate it on arXiv first — the authors sharpen the
    # DBLP query (generic titles drown in DBLP's ranking) and we gain the
    # eprint/url fields; papers not on arXiv still go through the cascade.
    meta = _arxiv_search_title(value)
    if meta:
        _log(f"[arxiv] found on arXiv: {meta.arxiv_id} ({meta.year})")
    else:
        meta = _openalex_meta(value)  # arXiv API throttled/paper not found
        if meta:
            _log(f"[openalex] metadata: arXiv {meta.arxiv_id or '?'} ({meta.year})")

    hint = first_author_last_name(meta.authors[0]) if meta and meta.authors else ""
    match = find_published(
        meta.title if meta else value,
        meta.year if meta else "",
        meta.arxiv_id if meta else "",
        hint,
    )
    if match:
        return _published_result(match, meta)
    if meta and meta.arxiv_id:
        if require_published:
            raise LookupError(f"Only an arXiv preprint was found for: {value}")
        return _preprint_result(meta)
    raise LookupError(f"No match found anywhere for: {value}")


def _fetch_arxiv_meta(arxiv_id: str) -> ArxivMeta:
    """Fetch arXiv metadata, trying the fallback sources if the API call fails."""
    try:
        return arxiv_metadata(arxiv_id)
    except ValueError:
        # Not an availability problem: let the caller see it unchanged.
        raise
    except Exception as e:
        _log(f"[arxiv] API unavailable ({e}); trying fallback metadata sources")
        from .sources import arxiv_abs_metadata, s2_arxiv_metadata

        for fallback in (s2_arxiv_metadata, arxiv_abs_metadata):
            try:
                meta = fallback(arxiv_id)
            except Exception as fe:
                _log(f"[arxiv-fallback] {fallback.__name__}: {fe}")
                continue
            if meta is not None:
                return meta
        # Chain explicitly so the original API failure stays attached.
        raise LookupError(
            f"Could not fetch metadata for arXiv:{arxiv_id} "
            "(arXiv API, Semantic Scholar, and arxiv.org all unavailable)"
        ) from e


def _published_result(match: Match, meta: ArxivMeta | None) -> Resolved:
    """Wrap a published match (plus optional arXiv metadata) as a Resolved."""
    entry = _entry_from_match(match, meta)
    # Pop the venue before _finalize drops the temporary key.
    venue = entry.pop("__venue", match.venue)
    return Resolved(_finalize(entry, meta), match.source, venue, True)


def _preprint_result(meta: ArxivMeta) -> Resolved:
    """Wrap arXiv-only metadata as an unpublished Resolved, logging the fallback."""
    _log("[bibcite] no published version found; using arXiv preprint entry")
    return Resolved(_finalize(_arxiv_only_entry(meta), meta), "arxiv", "", False)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _openalex_meta(title: str) -> ArxivMeta | None:
    """Author/year/arXiv-id metadata via OpenAlex when the arXiv API is down.

    Returns ``None`` when the OpenAlex search raises (the error is logged) or
    yields nothing. The returned ``ArxivMeta`` may have no arXiv id, in which
    case ``abs_url`` is the empty string.
    """
    from .sources import openalex_arxiv_id, openalex_authors, openalex_search

    try:
        hit = openalex_search(title)
    except Exception as exc:
        _log(f"[openalex] unavailable: {exc}")
        return None
    if not hit:
        return None

    arxiv_id = openalex_arxiv_id(hit)
    # Fall back to the caller's title when the work record has none.
    cleaned_title = clean_title(hit.get("title") or title)
    author_names = openalex_authors(hit)
    pub_year = str(hit.get("publication_year") or "")
    link = f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else ""
    return ArxivMeta(
        arxiv_id=arxiv_id,
        title=cleaned_title,
        authors=author_names,
        year=pub_year,
        abs_url=link,
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _arxiv_search_title(title: str) -> ArxivMeta | None:
    """Find *title* on arXiv and return metadata for an exact title match.

    Queries the arXiv API for the title as a phrase (at most 5 results) and
    returns the first result whose normalized title equals the normalized
    query. The version suffix is stripped from the returned identifier.

    Returns:
        An ``ArxivMeta`` for the matching entry, or ``None`` when the API call
        fails, the response cannot be processed (both are logged), or no
        result has the same normalized title.
    """
    import xml.etree.ElementTree as ET

    from .normalize import norm_title
    from .sources import ATOM, ARXIV_NS, arxiv_api_get

    # A double quote inside the title would close the ti:"..." phrase early
    # and corrupt the query. Blank it out for the search only; the exact-title
    # filter below still uses the original title.
    phrase = title.replace('"', " ")
    try:
        response = arxiv_api_get({"search_query": f'ti:"{phrase}"', "max_results": 5})
    except Exception as exc:
        _log(f"[arxiv-search] unavailable: {exc}")
        return None

    try:
        wanted = norm_title(title)  # loop-invariant: normalize the query once
        # NOTE(review): remote XML is parsed with the stdlib parser, which is
        # not hardened against malicious input; acceptable only while the
        # endpoint is trusted.
        root = ET.fromstring(response.text)
        for node in root.findall(f"{ATOM}entry"):
            found_title = clean_title(node.findtext(f"{ATOM}title") or "")
            if norm_title(found_title) != wanted:
                continue
            # The Atom id is the abs URL; keep what follows "/abs/" and drop
            # the trailing version ("v3").
            aid = (node.findtext(f"{ATOM}id") or "").split("/abs/")[-1]
            aid = re.sub(r"v\d+$", "", aid)
            primary = node.find(f"{ARXIV_NS}primary_category")
            names = (
                (author.findtext(f"{ATOM}name") or "").strip()
                for author in node.findall(f"{ATOM}author")
            )
            return ArxivMeta(
                arxiv_id=aid,
                title=found_title,
                authors=[name for name in names if name],
                year=(node.findtext(f"{ATOM}published") or "")[:4],
                abs_url=f"https://arxiv.org/abs/{aid}",
                primary_class=primary.get("term") if primary is not None else "",
            )
    except Exception as exc:
        _log(f"[arxiv-search] error: {exc}")
    return None
|