bibcite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bibcite/resolve.py ADDED
@@ -0,0 +1,289 @@
1
+ """Query classification and entry building.
2
+
3
+ resolve(query) turns "1706.03762" / an arXiv URL / a DOI / a free-form title
4
+ into a normalized BibTeX entry dict, with the venue canonicalized against the
5
+ vendored strings.bib table.
6
+ """
7
+
8
+ import re
9
+ import sys
10
+ from dataclasses import dataclass
11
+
12
+ from .bibfile import NOISE_FIELDS, parse_bibtex_entry
13
+ from .normalize import clean_title, first_author_last_name, make_key
14
+ from .sources import (
15
+ ArxivMeta,
16
+ Match,
17
+ arxiv_metadata,
18
+ crossref_by_doi,
19
+ find_published,
20
+ )
21
+ from .venues import canonicalize
22
+
23
# Bare arXiv identifiers. Both patterns tolerate an optional "arxiv:" prefix and
# an optional version suffix; group 1 is the version-less id, group 2 the
# "vN" suffix when present.
#   new style: four digits, a dot, four or five digits      (1706.03762)
#   old style: archive[.XX]/seven digits                    (hep-th/9901001)
ARXIV_NEW = re.compile(
    r"^(?:arxiv:)?(\d{4}\.\d{4,5})(v\d+)?$",
    flags=re.IGNORECASE,
)
ARXIV_OLD = re.compile(
    r"^(?:arxiv:)?([a-z-]+(?:\.[A-Z]{2})?/\d{7})(v\d+)?$",
    flags=re.IGNORECASE,
)

# An arxiv.org abs/pdf/html link anywhere in the text. Group 1 is the id in
# either style; a trailing version or ".pdf" is simply left unmatched.
ARXIV_URL = re.compile(
    r"arxiv\.org/(?:abs|pdf|html)/(\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Z]{2})?/\d{7})",
    flags=re.IGNORECASE,
)

# A doi.org link anywhere in the text; group 1 is everything from "10." up to
# the next whitespace.
DOI_URL = re.compile(r"doi\.org/(10\.\S+)", flags=re.IGNORECASE)

# A bare DOI occupying the whole string: "10.", a 4-9 digit registrant code,
# a slash, then a whitespace-free suffix.
DOI_RE = re.compile(r"^10\.\d{4,9}/\S+$")
31
+
32
+
33
def _log(msg: str) -> None:
    """Print *msg* as one line on standard error."""
    stream = sys.stderr
    print(msg, file=stream)
35
+
36
+
37
def classify(query: str) -> tuple[str, str]:
    """Decide what kind of identifier *query* is.

    Returns a ``(kind, value)`` pair where ``kind`` is one of:

    * ``"arxiv"`` - ``value`` is the version-less arXiv id, taken from an
      arxiv.org URL, a bare id (optionally ``arxiv:``-prefixed), or an
      arXiv-issued DOI (``10.48550/arXiv.<id>``);
    * ``"doi"`` - ``value`` is the bare DOI, taken from a doi.org URL, a
      ``doi:``-prefixed string, or a plain DOI;
    * ``"title"`` - anything else; ``value`` is the stripped query.
    """
    # A trailing period is dropped for identifier matching only; a free-form
    # title is returned with its own punctuation intact.
    q = query.strip().rstrip(".")

    m = ARXIV_URL.search(q)
    if m:
        return "arxiv", m.group(1)
    m = ARXIV_NEW.match(q) or ARXIV_OLD.match(q)
    if m:
        return "arxiv", m.group(1)

    m = DOI_URL.search(q)
    if m:
        doi = m.group(1)
    else:
        # Accept the common "doi:10.xxxx/..." spelling, mirroring the optional
        # "arxiv:" prefix the arXiv patterns already tolerate.
        doi = q[4:].lstrip() if q[:4].lower() == "doi:" else q
        if not DOI_RE.match(doi):
            return "title", query.strip()

    # arXiv mints its own DOIs as 10.48550/arXiv.<id>. They are registered
    # with DataCite rather than Crossref, so a Crossref lookup cannot resolve
    # them; route them to the arXiv path instead.
    arxiv_doi_prefix = "10.48550/arxiv."
    if doi.lower().startswith(arxiv_doi_prefix):
        arxiv_id = doi[len(arxiv_doi_prefix):]
        m = ARXIV_NEW.match(arxiv_id) or ARXIV_OLD.match(arxiv_id)
        if m:
            return "arxiv", m.group(1)
    return "doi", doi
51
+
52
+
53
@dataclass
class Resolved:
    """What ``resolve()`` hands back for a query.

    Attributes:
        entry: bibtexparser-style dict (the BibTeX fields plus ``ID`` and
            ``ENTRYTYPE``).
        source: where the publication info came from.
        venue: final venue string; ``""`` for a preprint.
        published: whether the entry describes a published version.
    """

    entry: dict
    source: str
    venue: str
    published: bool

    @property
    def bibtex(self) -> str:
        """The entry rendered by ``bibfile.entry_to_bibtex``."""
        # Imported at call time rather than at module import.
        from .bibfile import entry_to_bibtex as render

        return render(self.entry)
65
+
66
+
67
def guess_entry_type(venue: str) -> str:
    """Pick a BibTeX entry type for a venue missing from the canonical table.

    When a source supplies only a venue string (no BibTeX), a name that reads
    like a conference should yield ``"inproceedings"``; everything else falls
    back to ``"article"``.
    """
    lowered = venue.lower()
    # "proceedings" is deliberately absent: PNAS and Proceedings of the IEEE
    # are journals. Genuine conference names contain one of these markers.
    for marker in ("conference", "workshop", "symposium", "meeting", "congress"):
        if marker in lowered:
            return "inproceedings"
    return "article"
78
+
79
+
80
def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
    """Build the entry dict for a published match.

    The source's own BibTeX is used when it is present and parses; otherwise
    the entry is assembled from the match's structured fields, with ``meta``
    supplying authors/title/year the match lacks. Fields listed in
    ``NOISE_FIELDS`` are dropped, the title is cleaned, the venue is
    canonicalized, and the final venue string is left under the temporary
    ``"__venue"`` key for the caller to pop.
    """
    parsed: dict = {}
    if match.bibtex:
        try:
            parsed = parse_bibtex_entry(match.bibtex)
        except ValueError:
            # Unparseable source BibTeX: fall through to the structured fields.
            parsed = {}

    if parsed:
        entry = parsed
    else:
        author_list = match.authors or (meta.authors if meta else [])
        kind = guess_entry_type(match.venue)
        venue_field = "booktitle" if kind == "inproceedings" else "journal"
        entry = {
            "ENTRYTYPE": kind,
            "author": " and ".join(author_list),
            "title": match.title or (meta.title if meta else ""),
            venue_field: match.venue,
            "year": match.year or (meta.year if meta else ""),
        }

    for noisy in NOISE_FIELDS:
        entry.pop(noisy, None)

    entry["title"] = clean_title(entry.get("title", ""))
    if match.doi and not entry.get("doi"):
        entry["doi"] = match.doi

    # Canonicalize the venue against the strings.bib table: first the venue
    # reported by the match, then whatever the entry itself carries.
    entry_venue = entry.get("booktitle", "") or entry.get("journal", "")
    raw_venue = match.venue or entry_venue
    year = entry.get("year", "") or match.year
    canonical = canonicalize(raw_venue, year)
    if not canonical:
        canonical = canonicalize(entry_venue, year)

    if not canonical:
        _log(f"[venues] no canonical mapping for '{raw_venue}' (kept as-is)")
        entry["__venue"] = raw_venue
        return entry

    # Replace whichever venue field was present with the canonical one.
    entry.pop("booktitle", None)
    entry.pop("journal", None)
    entry["ENTRYTYPE"] = canonical.entry_type
    entry[canonical.bib_field] = canonical.name
    _log(f"[venues] '{raw_venue}' -> {canonical.macro} ({canonical.name})")
    entry["__venue"] = canonical.name
    return entry
125
+
126
+
127
def _finalize(entry: dict, meta: ArxivMeta | None) -> dict:
    """Fill in URL/eprint fields, assign the citation key, and tidy up.

    ``entry`` is modified in place and also returned.
    """
    on_arxiv = bool(meta and meta.arxiv_id)
    if on_arxiv:
        # The arXiv link is preferred for access, overriding any existing url.
        entry.update(
            url=meta.abs_url,
            eprint=meta.arxiv_id,
            archiveprefix="arXiv",
        )
        if meta.primary_class:
            entry["primaryclass"] = meta.primary_class
    elif entry.get("doi") and not entry.get("url"):
        entry["url"] = f"https://doi.org/{entry['doi']}"

    # Placeholders keep the key well-formed when author or year is missing.
    key_author = entry.get("author", "") or "anonymous"
    key_year = entry.get("year", "") or "XXXX"
    entry["ID"] = make_key(key_author, key_year, entry.get("title", ""))

    # The temporary venue marker must not leak into the final entry.
    if "__venue" in entry:
        del entry["__venue"]
    return entry
142
+
143
+
144
def _arxiv_only_entry(meta: ArxivMeta) -> dict:
    """Entry for a preprint with no published version.

    Uses @misc (arXiv's own recommendation) rather than an @article with a
    made-up journal. The ``howpublished`` field keeps the arXiv pointer
    visible under classic BibTeX styles that ignore eprint fields.
    """
    entry: dict = {"ENTRYTYPE": "misc"}
    entry["author"] = " and ".join(meta.authors)
    entry["title"] = meta.title
    entry["howpublished"] = f"arXiv preprint arXiv:{meta.arxiv_id}"
    entry["year"] = meta.year
    return entry
155
+
156
+
157
def resolve(query: str, require_published: bool = False) -> Resolved:
    """Turn *query* into a normalized BibTeX entry.

    The query is classified as an arXiv id/URL, a DOI, or a free-form title:

    * arXiv: fetch the arXiv metadata, then look for a published version;
      fall back to an @misc preprint entry when none is found.
    * DOI: look the DOI up on CrossRef.
    * title: try to locate the paper on arXiv (then OpenAlex) to obtain
      authors/year/arXiv id, then look for a published version.

    With ``require_published=True`` a preprint-only result raises instead of
    being returned.

    Raises:
        LookupError: nothing usable was found (no metadata, unknown DOI, no
            match, or only a preprint while ``require_published`` is set).
        ValueError: propagated unchanged from ``arxiv_metadata``.
    """
    kind, value = classify(query)
    _log(f"[bibcite] query understood as {kind}: {value}")

    if kind == "arxiv":
        meta = _fetch_arxiv_meta(value)
        _log(f"[arxiv] {meta.title} ({meta.year})")
        hint = first_author_last_name(meta.authors[0]) if meta.authors else ""
        match = find_published(meta.title, meta.year, meta.arxiv_id, hint)
        if match:
            return _published_result(match, meta)
        if require_published:
            raise LookupError(f"No published version found for arXiv:{value}")
        return _preprint_result(meta)

    if kind == "doi":
        match = crossref_by_doi(value)
        if not match or not match.title:
            raise LookupError(f"DOI not found on CrossRef: {value}")
        return _published_result(match, None)

    # Free-form title: locate it on arXiv first — the authors sharpen the
    # DBLP query (generic titles drown in DBLP's ranking) and we gain the
    # eprint/url fields; papers not on arXiv still go through the cascade.
    meta = _arxiv_search_title(value)
    if meta:
        _log(f"[arxiv] found on arXiv: {meta.arxiv_id} ({meta.year})")
    else:
        meta = _openalex_meta(value)  # arXiv API throttled/paper not found
        if meta:
            _log(f"[openalex] metadata: arXiv {meta.arxiv_id or '?'} ({meta.year})")
    hint = first_author_last_name(meta.authors[0]) if meta and meta.authors else ""
    match = find_published(
        meta.title if meta else value,
        meta.year if meta else "",
        meta.arxiv_id if meta else "",
        hint,
    )
    if match:
        return _published_result(match, meta)
    if meta and meta.arxiv_id:
        if require_published:
            raise LookupError(f"Only an arXiv preprint was found for: {value}")
        return _preprint_result(meta)
    raise LookupError(f"No match found anywhere for: {value}")


def _fetch_arxiv_meta(arxiv_id: str) -> ArxivMeta:
    """Fetch arXiv metadata, trying the fallback sources if the API call fails."""
    try:
        return arxiv_metadata(arxiv_id)
    except ValueError:
        # Re-raised as-is; only other failures trigger the fallbacks.
        # NOTE(review): presumably signals a bad/unknown id — confirm in sources.
        raise
    except Exception as e:
        _log(f"[arxiv] API unavailable ({e}); trying fallback metadata sources")
        from .sources import arxiv_abs_metadata, s2_arxiv_metadata

        for fallback in (s2_arxiv_metadata, arxiv_abs_metadata):
            try:
                meta = fallback(arxiv_id)
            except Exception as fe:
                _log(f"[arxiv-fallback] {fallback.__name__}: {fe}")
                continue
            if meta is not None:
                return meta
        # Chain explicitly so the original API failure stays attached.
        raise LookupError(
            f"Could not fetch metadata for arXiv:{arxiv_id} "
            "(arXiv API, Semantic Scholar, and arxiv.org all unavailable)"
        ) from e


def _published_result(match: Match, meta: ArxivMeta | None) -> Resolved:
    """Wrap a published match (plus optional arXiv metadata) as a Resolved."""
    entry = _entry_from_match(match, meta)
    venue = entry.pop("__venue", match.venue)
    return Resolved(_finalize(entry, meta), match.source, venue, True)


def _preprint_result(meta: ArxivMeta) -> Resolved:
    """Wrap arXiv-only metadata as an unpublished Resolved, logging the fallback."""
    _log("[bibcite] no published version found; using arXiv preprint entry")
    entry = _arxiv_only_entry(meta)
    return Resolved(_finalize(entry, meta), "arxiv", "", False)
232
+
233
+
234
def _openalex_meta(title: str) -> ArxivMeta | None:
    """Look *title* up on OpenAlex and repackage the hit as an ``ArxivMeta``.

    Used when the arXiv title search yields nothing. Returns ``None`` when
    OpenAlex fails or has no hit. The arXiv id (and therefore ``abs_url``) may
    be empty in the returned object.
    """
    from .sources import openalex_arxiv_id, openalex_authors, openalex_search

    try:
        hit = openalex_search(title)
    except Exception as err:
        # Best effort: an OpenAlex outage must not abort resolution.
        _log(f"[openalex] unavailable: {err}")
        return None
    if not hit:
        return None

    arxiv_id = openalex_arxiv_id(hit)
    found_title = clean_title(hit.get("title") or title)
    people = openalex_authors(hit)
    pub_year = str(hit.get("publication_year") or "")
    if arxiv_id:
        link = f"https://arxiv.org/abs/{arxiv_id}"
    else:
        link = ""
    return ArxivMeta(
        arxiv_id=arxiv_id,
        title=found_title,
        authors=people,
        year=pub_year,
        abs_url=link,
    )
253
+
254
+
255
def _arxiv_search_title(title: str) -> ArxivMeta | None:
    """Find *title* on arXiv via a title-phrase search.

    Only a result whose normalized title equals the normalized query is
    accepted; the first such result is returned as an ``ArxivMeta`` with the
    version suffix stripped from its id. Returns ``None`` when the API call
    fails, the response cannot be processed, or no result matches; failures
    are logged rather than raised.
    """
    import xml.etree.ElementTree as ET

    from .normalize import norm_title
    from .sources import ATOM, ARXIV_NS, arxiv_api_get

    # The title is sent as a quoted phrase, so a literal double quote inside it
    # (e.g. '"Why Should I Trust You?": ...') would end the phrase early and
    # corrupt the query. Blank such quotes out for the search only; candidates
    # are still compared against the untouched title below.
    phrase = title.replace('"', " ")
    try:
        r = arxiv_api_get({"search_query": f'ti:"{phrase}"', "max_results": 5})
    except Exception as e:
        _log(f"[arxiv-search] unavailable: {e}")
        return None

    try:
        wanted = norm_title(title)  # loop-invariant: normalize the query once
        root = ET.fromstring(r.text)
        for item in root.findall(f"{ATOM}entry"):
            found = clean_title(item.findtext(f"{ATOM}title") or "")
            if norm_title(found) != wanted:
                continue
            # The Atom <id> is the abs URL; keep only the id, minus "vN".
            aid = (item.findtext(f"{ATOM}id") or "").split("/abs/")[-1]
            aid = re.sub(r"v\d+$", "", aid)
            primary = item.find(f"{ARXIV_NS}primary_category")
            names = (
                (a.findtext(f"{ATOM}name") or "").strip()
                for a in item.findall(f"{ATOM}author")
            )
            return ArxivMeta(
                arxiv_id=aid,
                title=found,
                authors=[name for name in names if name],
                year=(item.findtext(f"{ATOM}published") or "")[:4],
                abs_url=f"https://arxiv.org/abs/{aid}",
                primary_class=primary.get("term") if primary is not None else "",
            )
    except Exception as e:
        # Best effort: a malformed response is logged and treated as "not found".
        _log(f"[arxiv-search] error: {e}")
    return None