bibcite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bibcite/sources.py ADDED
@@ -0,0 +1,593 @@
1
+ """API clients for the publication-matching cascade.
2
+
3
+ Order and matching rules ported from PaperMemory's bibMatcher:
4
+ DBLP -> Semantic Scholar -> Google Scholar -> CrossRef -> Unpaywall.
5
+ All matchers verify identity via normalized-title equality and reject
6
+ preprint venues (arXiv / CoRR / bioRxiv / ...).
7
+ """
8
+
9
+ import html
10
+ import re
11
+ import sys
12
+ import time
13
+ import xml.etree.ElementTree as ET
14
+ from dataclasses import dataclass, field
15
+
16
+ import httpx
17
+
18
+ from .normalize import clean_title, mini_hash, norm_title
19
+
20
# Identifying User-Agent used by clients created with browser=False.
UA = "bibcite/0.1 (https://github.com/leonardo/bibcite; mailto:bibcite@gmail.com)"
# Desktop-browser User-Agent used by clients created with browser=True.
BROWSER_UA = " ".join(
    [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    ]
)
# Default timeout handed to every httpx.Client built by _client().
TIMEOUT = 20.0
26
+
27
# Venue names that indicate a preprint server rather than a publication.
# "corr" is anchored on word boundaries so that real venues which merely
# contain those letters (e.g. "Corrosion Science") are not rejected; the
# other alternatives keep their substring semantics.
PREPRINT_VENUES = re.compile(
    r"arxiv|\bcorr\b|biorxiv|medrxiv|chemrxiv|ssrn|preprint", re.I
)
# DOI prefix the matchers test for in order to drop or skip arXiv DOIs.
ARXIV_DOI = re.compile(r"^10\.48550/", re.I)
29
+
30
+
31
def _log(msg: str):
    """Emit one diagnostic line on standard error."""
    sys.stderr.write(f"{msg}\n")
33
+
34
+
35
class SourceUnavailable(Exception):
    """Signals that a source is rate-limiting or blocking our requests.

    The cascade treats it as "skip this source" rather than as a failure.
    """
37
+
38
+
39
def _client(browser: bool = False) -> httpx.Client:
    """Build a redirect-following httpx client using the module TIMEOUT.

    With ``browser=True`` the client sends BROWSER_UA as its User-Agent;
    otherwise it sends UA.
    """
    user_agent = UA
    if browser:
        user_agent = BROWSER_UA
    return httpx.Client(
        follow_redirects=True,
        timeout=TIMEOUT,
        headers={"User-Agent": user_agent},
    )
45
+
46
+
47
@dataclass
class Match:
    """One published version of a paper, as reported by a single source."""

    # Required: which source produced the match, and the venue it reported.
    source: str
    venue: str
    # Optional details; a source fills in whatever it has.
    title: str = ""
    year: str = ""
    authors: list[str] = field(default_factory=list)
    doi: str = ""
    bibtex: str = ""  # raw bibtex when the source provides one
    url: str = ""
57
+
58
+
59
@dataclass
class ArxivMeta:
    """Metadata describing one arXiv paper."""

    # Always supplied by the code that builds the record.
    arxiv_id: str
    title: str
    authors: list[str]
    year: str
    abs_url: str
    # Left empty by builders that do not have these values.
    primary_class: str = ""
    doi: str = ""
68
+
69
+
70
def _is_published_venue(venue: str) -> bool:
    """True for a non-empty venue that PREPRINT_VENUES does not match."""
    if not venue:
        return False
    return PREPRINT_VENUES.search(venue) is None
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # arXiv metadata
76
+ # ---------------------------------------------------------------------------
77
+
78
# XML namespace prefixes in ElementTree's "{uri}tag" form; arxiv_metadata
# prepends them to tag names when searching the feed returned by the API.
ATOM = "{http://www.w3.org/2005/Atom}"
ARXIV_NS = "{http://arxiv.org/schemas/atom}"
80
+
81
+
82
def arxiv_api_get(params: dict) -> httpx.Response:
    """Query the arXiv export API, making at most three attempts.

    export.arxiv.org allows ~1 request / 3s, so the second and third attempts
    are preceded by pauses of 3 s and 6 s. A 429 answer or any
    ``httpx.HTTPError`` (including an error status) counts as a failed
    attempt; once every attempt has failed the most recent failure is raised.
    """
    failure: Exception | None = None
    for pause in (0, 3, 6):
        if pause:
            time.sleep(pause)
        try:
            with _client() as session:
                response = session.get(
                    "https://export.arxiv.org/api/query",
                    params=params,
                    timeout=30.0,
                )
            if response.status_code != 429:
                response.raise_for_status()
                return response
            failure = SourceUnavailable("arXiv API rate-limited (429)")
        except httpx.HTTPError as exc:
            failure = exc
    if failure:
        raise failure
    raise SourceUnavailable("arXiv API unavailable")
103
+
104
+
105
def arxiv_metadata(arxiv_id: str) -> ArxivMeta:
    """Fetch title, authors, year, primary category and DOI for an arXiv id.

    The data comes from the first Atom ``entry`` of the export API response.

    Raises:
        ValueError: the feed has no entry with a title, or the entry's title
            is "Error" (case-insensitive); both are reported as "id not
            found".
        Exception: whatever ``arxiv_api_get`` raises when the request fails.
    """
    response = arxiv_api_get({"id_list": arxiv_id})
    entry = ET.fromstring(response.text).find(f"{ATOM}entry")
    title_el = entry.find(f"{ATOM}title") if entry is not None else None
    if title_el is None:
        raise ValueError(f"arXiv id not found: {arxiv_id}")
    title = clean_title(title_el.text or "")
    if title.lower() == "error":
        raise ValueError(f"arXiv id not found: {arxiv_id}")

    # Keep only authors that have a non-blank <name> element.
    authors = []
    for author_el in entry.findall(f"{ATOM}author"):
        name_el = author_el.find(f"{ATOM}name")
        if name_el is None:
            continue
        name = (name_el.text or "").strip()
        if name:
            authors.append(name)

    published = entry.find(f"{ATOM}published")
    year = (published.text or "")[:4] if published is not None else ""

    primary = entry.find(f"{ARXIV_NS}primary_category")
    # Element.get() yields None when the attribute is absent; the dataclass
    # field is declared as str, so fall back to "".
    primary_class = (primary.get("term") or "") if primary is not None else ""

    doi_el = entry.find(f"{ARXIV_NS}doi")
    doi = (doi_el.text or "") if doi_el is not None else ""

    return ArxivMeta(
        arxiv_id=arxiv_id,
        title=title,
        authors=authors,
        year=year,
        abs_url=f"https://arxiv.org/abs/{arxiv_id}",
        primary_class=primary_class,
        doi=doi,
    )
135
+
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # DBLP
139
+ # ---------------------------------------------------------------------------
140
+
141
def try_dblp(title: str, author_hint: str = "") -> Match | None:
    """Search DBLP for a published version of ``title``.

    Generic titles ("X is all you need") drown in DBLP's ranking, so when
    ``author_hint`` is given the first query is "<title> <author_hint>" and
    the bare title is only tried afterwards.

    A hit is accepted when its normalized title equals the normalized
    ``title`` and it names at least one venue other than "CoRR". Hits are
    examined earliest year first.

    Returns:
        A Match with source "dblp", or None when no query yields a hit.

    Raises:
        SourceUnavailable: DBLP answered a search request with 429.
        httpx.HTTPStatusError: any other error status on a search request.
    """

    def hit_year(hit: dict) -> int:
        # A missing or malformed year sorts last instead of aborting the
        # whole lookup with a ValueError.
        try:
            return int(hit.get("info", {}).get("year", 9999))
        except (TypeError, ValueError):
            return 9999

    queries = [f"{title} {author_hint}", title] if author_hint else [title]
    ref = norm_title(title)
    with _client() as c:
        for q in queries:
            r = c.get(
                "https://dblp.org/search/publ/api",
                params={"q": q, "format": "json", "h": 100},
            )
            if r.status_code == 429:
                raise SourceUnavailable("DBLP rate-limited (429)")
            r.raise_for_status()
            hits = (
                r.json().get("result", {}).get("hits", {}).get("hit", []) or []
            )
            # Earliest year first: prefer the original conference publication
            # over later journal extensions (same heuristic as PaperMemory).
            hits.sort(key=hit_year)
            for hit in hits:
                info = hit.get("info", {})
                found_title = html.unescape(info.get("title", ""))
                if norm_title(found_title) != ref:
                    continue
                # "venue" may be a single string or a list of strings. Apply
                # the CoRR/empty filter to every candidate so a list-valued
                # venue cannot smuggle a CoRR entry past the check.
                raw_venue = info.get("venue")
                candidates = raw_venue if isinstance(raw_venue, list) else [raw_venue]
                venue = next((v for v in candidates if v and v != "CoRR"), None)
                if venue is None:
                    continue
                bibtex = ""
                if info.get("url"):
                    # The BibTeX export is optional: a transport failure here
                    # must not discard an already verified match.
                    try:
                        br = c.get(info["url"] + ".bib")
                    except httpx.HTTPError as e:
                        _log(f"[dblp] bibtex fetch failed: {type(e).__name__}: {e}")
                    else:
                        if br.status_code == 200:
                            bibtex = br.text
                _log(f"[dblp] match: {venue} {info.get('year', '')}")
                return Match(
                    source="dblp",
                    venue=str(venue),
                    title=clean_title(found_title),
                    year=str(info.get("year", "")),
                    doi=info.get("doi", ""),
                    bibtex=bibtex,
                    url=info.get("ee", "") or info.get("url", ""),
                )
    return None
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # Semantic Scholar
194
+ # ---------------------------------------------------------------------------
195
+
196
# Value of the "fields" parameter sent to the Semantic Scholar Graph API by
# try_semantic_scholar; these are the keys _s2_to_match reads from a record.
S2_FIELDS = "title,venue,year,authors,externalIds,url"
197
+
198
+
199
def _s2_to_match(data: dict, ref_title: str, ref_year: str) -> Match | None:
    """Convert one Semantic Scholar paper record into a Match, or None.

    The record is rejected when:
      * its venue is empty or a preprint venue,
      * its normalized title differs from the normalized ``ref_title``, or
      * both years are known and lie three or more years apart.

    JSON ``null`` values (which arrive as None) are treated as empty strings
    so that one incomplete record cannot raise.
    """
    venue = (data.get("venue") or "").strip()
    if not _is_published_venue(venue):
        return None
    found_title = data.get("title") or ""
    if norm_title(found_title) != norm_title(ref_title):
        return None
    year = data.get("year")
    if ref_year and year:
        try:
            gap = abs(int(year) - int(ref_year))
        except (TypeError, ValueError):
            gap = 0  # an unparseable year cannot disqualify the record
        if gap >= 3:
            return None
    # Drop a leading four-digit year from the venue string, then upper-case
    # venues that consist of a single word.
    venue = re.sub(r"^\d{4}\s*", "", venue).strip()
    if " " not in venue:
        venue = venue.upper()
    doi = (data.get("externalIds") or {}).get("DOI", "") or ""
    if ARXIV_DOI.match(doi):
        doi = ""
    _log(f"[semanticscholar] match: {venue} {year}")
    return Match(
        source="semanticscholar",
        venue=venue,
        title=clean_title(found_title),
        year=str(year or ""),
        authors=[a["name"] for a in data.get("authors") or [] if a.get("name")],
        doi=doi,
        url=data.get("url") or "",
    )
224
+
225
+
226
def arxiv_abs_metadata(arxiv_id: str) -> ArxivMeta | None:
    """Build ArxivMeta from the ``citation_*`` meta tags of the arXiv abs page.

    The abs pages stay up when the export API throttles. Returns None when
    the page does not answer 200 or has no ``citation_title`` tag.
    """
    abs_url = f"https://arxiv.org/abs/{arxiv_id}"
    with _client(browser=True) as session:
        response = session.get(abs_url)
    if response.status_code != 200:
        return None
    markup = response.text

    def meta_values(name: str) -> list[str]:
        # Every content="..." value of <meta name="{name}">, HTML-unescaped.
        pattern = rf'<meta\s+name="{name}"\s+content="([^"]*)"'
        return list(map(html.unescape, re.findall(pattern, markup)))

    title_values = meta_values("citation_title")
    if not title_values:
        return None
    date_values = meta_values("citation_date")
    year = date_values[0][:4] if date_values else ""
    return ArxivMeta(
        arxiv_id=arxiv_id,
        title=clean_title(title_values[0]),
        authors=meta_values("citation_author"),
        year=year,
        abs_url=abs_url,
    )
254
+
255
+
256
def s2_arxiv_metadata(arxiv_id: str) -> ArxivMeta | None:
    """Title/authors/year for an arXiv id, read from Semantic Scholar.

    This is the fallback when export.arxiv.org itself is throttled. Returns
    None on any non-200 answer or when the record carries no title.
    """
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}"
    with _client() as session:
        response = session.get(endpoint, params={"fields": "title,year,authors"})
    if response.status_code != 200:
        return None
    record = response.json()
    if not record.get("title"):
        return None
    author_names = [author["name"] for author in record.get("authors") or []]
    return ArxivMeta(
        arxiv_id=arxiv_id,
        title=clean_title(record["title"]),
        authors=author_names,
        year=str(record.get("year") or ""),
        abs_url=f"https://arxiv.org/abs/{arxiv_id}",
    )
276
+
277
+
278
def try_semantic_scholar(
    title: str, year: str = "", arxiv_id: str = ""
) -> Match | None:
    """Find a published version via Semantic Scholar.

    When ``arxiv_id`` is given the paper is first looked up directly by id;
    if that yields no acceptable record, a title search (5 results) follows.
    Every candidate is vetted by ``_s2_to_match``.

    Raises:
        SourceUnavailable: either request was answered with 429.
        httpx.HTTPStatusError: the title search returned another error status.
    """
    with _client() as session:

        def fetch(url: str, params: dict):
            # Shared GET that turns a 429 into SourceUnavailable.
            response = session.get(url, params=params)
            if response.status_code == 429:
                raise SourceUnavailable("Semantic Scholar rate-limited (429)")
            return response

        # Direct id lookup first: unambiguous, no title-search needed.
        if arxiv_id:
            by_id = fetch(
                f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}",
                {"fields": S2_FIELDS},
            )
            if by_id.status_code == 200:
                direct = _s2_to_match(by_id.json(), title, year)
                if direct:
                    return direct

        search = fetch(
            "https://api.semanticscholar.org/graph/v1/paper/search",
            {"query": title, "fields": S2_FIELDS, "limit": 5},
        )
        search.raise_for_status()
        candidates = search.json().get("data") or []
        # Lazy: conversion stops at the first record that yields a Match.
        converted = (_s2_to_match(item, title, year) for item in candidates)
        return next((match for match in converted if match), None)
306
+
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # Google Scholar (port of PaperMemory's background fetchGSData)
310
+ # ---------------------------------------------------------------------------
311
+
312
def try_google_scholar(title: str) -> Match | None:
    """Find a published version by scraping Google Scholar.

    Steps: run the search, locate the result anchor whose text has the same
    normalized title, open that result's citation dialog, follow its BibTeX
    link, and read the venue from the ``journal`` or ``booktitle`` field.

    Returns None when any step finds nothing, or when the venue is empty,
    ends in "xiv", or contains "preprint".

    Raises:
        SourceUnavailable: the search answered 429 or its first 5000
            characters mention a captcha.
        httpx.HTTPStatusError: the search returned another error status.
    """
    with _client(browser=True) as session:
        search = session.get(
            "https://scholar.google.com/scholar",
            params={"q": title, "hl": "en"},
        )
        blocked = search.status_code == 429 or "captcha" in search.text.lower()[:5000]
        if blocked:
            raise SourceUnavailable("Google Scholar is blocking requests (captcha/429)")
        search.raise_for_status()

        sections = search.text.split("gs_res_ccl_mid")
        if len(sections) < 2:
            return None
        results_html = sections[1]

        def anchor_text(inner: str) -> str:
            # Strip nested tags (e.g. <b> highlights), then decode entities.
            return html.unescape(re.sub(r"<[^>]+>", "", inner))

        # Each result title anchor looks like <a id="DATAID" href=...>Title</a>.
        anchors = re.finditer(
            r'<a[^>]*\bid="([\w-]{6,40})"[^>]*>(.*?)</a>', results_html, re.S
        )
        data_id = next(
            (
                anchor.group(1)
                for anchor in anchors
                if norm_title(anchor_text(anchor.group(2))) == norm_title(title)
            ),
            "",
        )
        if not data_id:
            return None

        cite_url = (
            "https://scholar.google.com/scholar?q=info:"
            f"{data_id}:scholar.google.com/&output=cite&scirp=0&hl=en"
        )
        cite_page = session.get(cite_url).text
        bib_link = re.search(
            r'<a[^>]*href="([^">]+)"[^>]*>BibTex</a>', cite_page, re.I
        )
        if not bib_link:
            return None
        bib_url = re.sub(r"\s+", "", bib_link.group(1).replace("&amp;", "&"))
        bibtex = session.get(bib_url).text

    from .bibfile import parse_bibtex_entry  # local import to avoid cycle

    fields = parse_bibtex_entry(bibtex)
    venue = fields.get("journal", "") or fields.get("booktitle", "")
    if not venue or venue.lower().endswith("xiv") or "preprint" in venue.lower():
        return None
    _log(f"[googlescholar] match: {venue}")
    return Match(
        source="googlescholar",
        venue=venue,
        title=clean_title(fields.get("title", title)),
        year=fields.get("year", ""),
        bibtex=bibtex,
    )
361
+
362
+
363
+ # ---------------------------------------------------------------------------
364
+ # CrossRef
365
+ # ---------------------------------------------------------------------------
366
+
367
def try_crossref(title: str) -> Match | None:
    """Find a published version via a CrossRef title query (top 3 rows).

    An item is accepted when its first title has the same normalized form as
    ``title``, its venue (event name, else first container title) is a
    published venue, and its DOI is not an arXiv DOI. For an accepted item
    with a DOI, the BibTeX rendering is fetched as well on a best-effort
    basis.

    Returns:
        A Match with source "crossref", or None.

    Raises:
        SourceUnavailable: the query was answered with 429.
        httpx.HTTPStatusError: the query returned another error status.
    """
    from urllib.parse import quote  # only needed for the DOI path segment

    ref = norm_title(title)
    with _client() as c:
        r = c.get(
            "https://api.crossref.org/works",
            params={
                "rows": 3,
                "query.title": title,
                "select": "title,event,container-title,DOI,issued",
                "mailto": "bibcite@gmail.com",
            },
        )
        if r.status_code == 429:
            raise SourceUnavailable("CrossRef rate-limited (429)")
        r.raise_for_status()
        payload = r.json()
        if payload.get("status") != "ok":
            return None
        # Tolerate a missing/null "message" instead of raising KeyError.
        for item in (payload.get("message") or {}).get("items") or []:
            titles = item.get("title") or []
            if not titles or norm_title(titles[0]) != ref:
                continue
            event = (item.get("event") or {}).get("name", "")
            container = (item.get("container-title") or [""])[0]
            venue = (event or container or "").strip()
            if not _is_published_venue(venue):
                continue
            doi = item.get("DOI") or ""
            if ARXIV_DOI.match(doi):
                continue
            year = ""
            parts = (item.get("issued") or {}).get("date-parts") or []
            if parts and parts[0]:
                year = str(parts[0][0])
            bibtex = ""
            if doi:
                # DOIs may contain characters such as "#", "?" or "<" that
                # are not valid in a URL path, so percent-encode the DOI.
                bib_url = (
                    f"https://api.crossref.org/works/{quote(doi)}"
                    "/transform/application/x-bibtex"
                )
                # The BibTeX rendering is optional: a transport failure here
                # must not discard an already verified match.
                try:
                    br = c.get(bib_url)
                except httpx.HTTPError as e:
                    _log(f"[crossref] bibtex fetch failed: {type(e).__name__}: {e}")
                else:
                    if br.status_code == 200:
                        bibtex = br.text
            _log(f"[crossref] match: {venue} {year}")
            return Match(
                source="crossref",
                venue=venue,
                title=clean_title(titles[0]),
                year=year,
                doi=doi,
                bibtex=bibtex,
            )
    return None
418
+
419
+
420
+ # ---------------------------------------------------------------------------
421
+ # Unpaywall
422
+ # ---------------------------------------------------------------------------
423
+
424
def try_unpaywall(title: str) -> Match | None:
    """Find a published version via the Unpaywall title search.

    A result is accepted when its normalized title equals the normalized
    ``title``, its ``journal_name`` is a published venue, and its DOI is not
    an arXiv DOI. Null fields in a result are treated as empty so that one
    incomplete record does not abort the whole source.

    Returns:
        A Match with source "unpaywall", or None.

    Raises:
        SourceUnavailable: the search was answered with 429.
        httpx.HTTPStatusError: the search returned another error status.
    """
    with _client() as c:
        r = c.get(
            "https://api.unpaywall.org/v2/search",
            params={"query": title, "is_oa": "true", "email": "bibcite@gmail.com"},
        )
    if r.status_code == 429:
        raise SourceUnavailable("Unpaywall rate-limited (429)")
    r.raise_for_status()
    ref = norm_title(title)
    for res in r.json().get("results") or []:
        resp = res.get("response") or {}
        # `or ""` rather than a .get() default: the default is not used when
        # the key is present with a null value.
        found_title = resp.get("title") or ""
        if norm_title(found_title) != ref:
            continue
        venue = (resp.get("journal_name") or "").strip()
        if not _is_published_venue(venue):
            continue
        doi = resp.get("doi") or ""
        if ARXIV_DOI.match(doi):
            continue
        authors = [
            " ".join(filter(None, [a.get("given"), a.get("family")]))
            for a in resp.get("z_authors") or []
        ]
        _log(f"[unpaywall] match: {venue} {resp.get('year', '')}")
        return Match(
            source="unpaywall",
            venue=venue,
            title=clean_title(found_title),
            year=str(resp.get("year") or ""),
            authors=[a for a in authors if a],
            doi=doi,
        )
    return None
458
+
459
+
460
+ # ---------------------------------------------------------------------------
461
+ # OpenAlex (not in PaperMemory; unauthenticated with generous rate limits, so
462
+ # it doubles as the metadata fallback when the arXiv API / S2 are throttled)
463
+ # ---------------------------------------------------------------------------
464
+
465
def openalex_search(title: str) -> dict | None:
    """Return the first of five OpenAlex search results whose normalized
    title equals the normalized ``title``; None when there is none.

    Raises:
        SourceUnavailable: OpenAlex answered with 429.
        httpx.HTTPStatusError: OpenAlex returned another error status.
    """
    with _client() as session:
        response = session.get(
            "https://api.openalex.org/works",
            params={"search": title, "per-page": 5, "mailto": "bibcite@gmail.com"},
        )
    if response.status_code == 429:
        raise SourceUnavailable("OpenAlex rate-limited (429)")
    response.raise_for_status()
    wanted = norm_title(title)
    works = response.json().get("results") or []
    return next(
        (work for work in works if norm_title(work.get("title") or "") == wanted),
        None,
    )
480
+
481
+
482
def openalex_arxiv_id(work: dict) -> str:
    """Return the first new-style arXiv id (``NNNN.NNNN[N]``) found in the
    landing-page or PDF URL of any of the work's locations, else ""."""
    pattern = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})")
    # Locations in order; within one location the landing page comes first.
    urls = (
        location.get(key) or ""
        for location in work.get("locations") or []
        for key in ("landing_page_url", "pdf_url")
    )
    for url in urls:
        found = pattern.search(url)
        if found:
            return found.group(1)
    return ""
491
+
492
+
493
def openalex_authors(work: dict) -> list[str]:
    """Return the display names of the work's authors, in authorship order.

    Authorships without an author object or without a non-empty
    ``display_name`` are skipped. A null ``author`` is handled explicitly:
    a ``.get("author", {})`` default is not used when the key is present
    with a null value.
    """
    names = []
    for authorship in work.get("authorships") or []:
        author = (authorship or {}).get("author") or {}
        name = author.get("display_name")
        if name:
            names.append(name)
    return names
499
+
500
+
501
def try_openalex(title: str) -> Match | None:
    """Find a published version via OpenAlex.

    The venue is the display name of the first location (primary location
    first, then the others) whose source is not of type "repository" and
    whose name is a published venue. Returns None when the search finds no
    work or no location qualifies. An arXiv DOI is blanked, not rejected.
    """
    work = openalex_search(title)
    if not work:
        return None

    def venue_of(location: dict) -> str:
        # Published-venue name of one location, or "" when it has none.
        source = location.get("source") or {}
        name = (source.get("display_name") or "").strip()
        if source.get("type") == "repository" or not _is_published_venue(name):
            return ""
        return name

    candidates = [work.get("primary_location") or {}, *(work.get("locations") or [])]
    venue = next(filter(None, map(venue_of, candidates)), "")
    if not venue:
        return None

    doi = re.sub(r"^https://doi\.org/", "", work.get("doi") or "")
    if ARXIV_DOI.match(doi):
        doi = ""
    _log(f"[openalex] match: {venue} {work.get('publication_year', '')}")
    return Match(
        source="openalex",
        venue=venue,
        title=clean_title(work.get("title") or ""),
        year=str(work.get("publication_year") or ""),
        authors=openalex_authors(work),
        doi=doi,
    )
527
+
528
+
529
+ # ---------------------------------------------------------------------------
530
+ # CrossRef by DOI (for `bibcite add file 10.xxxx/yyy`)
531
+ # ---------------------------------------------------------------------------
532
+
533
def crossref_by_doi(doi: str) -> Match | None:
    """Fetch CrossRef metadata (and BibTeX, when available) for one DOI.

    Unlike the title matchers this performs no title or venue verification:
    the DOI identifies the work. The venue is the event name, else the first
    container title.

    Returns:
        A Match with source "crossref", or None when CrossRef does not
        answer the metadata request with 200.
    """
    from urllib.parse import quote  # only needed for the DOI path segment

    # The DOI comes from the caller and may contain characters such as "#",
    # "?" or "<" that are not valid in a URL path, so percent-encode it.
    works_url = f"https://api.crossref.org/works/{quote(doi)}"
    with _client() as c:
        r = c.get(works_url, params={"mailto": "bibcite@gmail.com"})
        if r.status_code != 200:
            return None
        data = r.json().get("message") or {}
        titles = data.get("title") or []
        event = (data.get("event") or {}).get("name", "")
        container = (data.get("container-title") or [""])[0]
        year = ""
        parts = (data.get("issued") or {}).get("date-parts") or []
        if parts and parts[0]:
            year = str(parts[0][0])
        bibtex = ""
        # The BibTeX rendering is optional: a transport failure here must not
        # throw away the metadata that was already fetched.
        try:
            br = c.get(f"{works_url}/transform/application/x-bibtex")
        except httpx.HTTPError as e:
            _log(f"[crossref] bibtex fetch failed: {type(e).__name__}: {e}")
        else:
            if br.status_code == 200:
                bibtex = br.text
    authors = [
        " ".join(filter(None, [a.get("given"), a.get("family")]))
        for a in data.get("author") or []
    ]
    return Match(
        source="crossref",
        venue=(event or container or "").strip(),
        title=clean_title(titles[0]) if titles else "",
        year=year,
        authors=[a for a in authors if a],
        doi=doi,
        bibtex=bibtex,
    )
563
+
564
+
565
+ # ---------------------------------------------------------------------------
566
+ # The cascade
567
+ # ---------------------------------------------------------------------------
568
+
569
# Ordered (source name, matcher) pairs. Every matcher is called positionally
# with (title, year, arxiv_id, author_hint) and forwards only the arguments
# its source uses.
CASCADE = (
    ("dblp", lambda title, _year, _arxiv_id, author: try_dblp(title, author)),
    (
        "semanticscholar",
        lambda title, year, arxiv_id, _author: try_semantic_scholar(
            title, year, arxiv_id
        ),
    ),
    ("googlescholar", lambda title, _year, _arxiv_id, _author: try_google_scholar(title)),
    ("crossref", lambda title, _year, _arxiv_id, _author: try_crossref(title)),
    ("unpaywall", lambda title, _year, _arxiv_id, _author: try_unpaywall(title)),
    ("openalex", lambda title, _year, _arxiv_id, _author: try_openalex(title)),
)
577
+
578
+
579
def find_published(
    title: str, year: str = "", arxiv_id: str = "", author_hint: str = ""
) -> Match | None:
    """Walk CASCADE in order and return the first Match any source yields.

    A source that raises is logged and skipped, whether it signals
    SourceUnavailable or fails with any other exception, so a single
    misbehaving source never aborts the run. Returns None when no source
    produces a match.
    """
    for source_name, matcher in CASCADE:
        try:
            found = matcher(title, year, arxiv_id, author_hint)
            if not found:
                _log(f"[{source_name}] no publication found")
                continue
            return found
        except SourceUnavailable as blocked:
            _log(f"[{source_name}] skipped: {blocked}")
        except Exception as failure:  # network hiccup on one source must not kill the run
            _log(f"[{source_name}] error: {type(failure).__name__}: {failure}")
    return None