bibcite-cli 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bibcite-cli
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
@@ -57,10 +57,13 @@ bibcite add refs.bib 2103.14030 --json
57
57
  # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
58
58
  bibcite add refs.bib --bibtex "$(pbpaste)"
59
59
 
60
+ # One-shot cleanup: upgrade preprints → tidy → lint
61
+ bibcite fix refs.bib
62
+
60
63
  # Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
61
64
  bibcite upgrade refs.bib --dry-run
62
65
 
63
- # Just format, or just lint
66
+ # Just format, or just lint (check is read-only)
64
67
  bibcite tidy refs.bib
65
68
  bibcite check refs.bib
66
69
  ```
@@ -45,10 +45,13 @@ bibcite add refs.bib 2103.14030 --json
45
45
  # Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
46
46
  bibcite add refs.bib --bibtex "$(pbpaste)"
47
47
 
48
+ # One-shot cleanup: upgrade preprints → tidy → lint
49
+ bibcite fix refs.bib
50
+
48
51
  # Upgrade every arXiv entry in a file to its published version (bibMatcher, CLI-style)
49
52
  bibcite upgrade refs.bib --dry-run
50
53
 
51
- # Just format, or just lint
54
+ # Just format, or just lint (check is read-only)
52
55
  bibcite tidy refs.bib
53
56
  bibcite check refs.bib
54
57
  ```
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bibcite-cli"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
5
5
  readme = "Readme.md"
6
6
  license = "MIT"
@@ -1,3 +1,3 @@
1
1
  """bibcite: canonical BibTeX resolution for papers (arXiv id / DOI / title)."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.2.0"
@@ -8,6 +8,7 @@ citation key it prints.
8
8
  import argparse
9
9
  import json
10
10
  import sys
11
+ import time
11
12
  from pathlib import Path
12
13
 
13
14
  from . import bibfile
@@ -123,21 +124,27 @@ def cmd_add(args) -> int:
123
124
  # upgrade: batch-match arXiv entries in an existing file (bibMatcher, CLI-style)
124
125
  # ---------------------------------------------------------------------------
125
126
 
126
- def cmd_upgrade(args) -> int:
127
- path = Path(args.file)
127
+ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
128
+ """Match every preprint entry in ``path`` to its published version and
129
+ rewrite it in place (unless dry_run). Returns the report; does NOT tidy —
130
+ callers decide."""
128
131
  db = bibfile.load_bib_file(path)
129
132
  if db is None or not db.entries:
130
133
  _log(f"[bibcite] nothing to do in {path}")
131
- return 0
134
+ return {"upgraded": 0, "matched": 0, "entries": []}
132
135
 
133
136
  report = []
134
137
  changed = 0
138
+ processed = 0
135
139
  for entry in db.entries:
136
140
  if not bibfile.is_preprint(entry):
137
141
  continue
138
142
  title = entry.get("title", "").replace("{", "").replace("}", "")
139
143
  if not title:
140
144
  continue
145
+ if processed:
146
+ time.sleep(1) # be polite to the APIs on batch runs
147
+ processed += 1
141
148
  _log(f"[upgrade] matching: {title[:80]}")
142
149
  aid = bibfile.entry_arxiv_id(entry)
143
150
  hint = (
@@ -149,7 +156,7 @@ def cmd_upgrade(args) -> int:
149
156
  continue
150
157
  canonical = canonicalize(match.venue, match.year or entry.get("year"))
151
158
  venue_name = canonical.name if canonical else match.venue
152
- if not args.dry_run:
159
+ if not dry_run:
153
160
  entry.pop("journal", None)
154
161
  entry.pop("booktitle", None)
155
162
  entry.pop("howpublished", None)
@@ -179,17 +186,23 @@ def cmd_upgrade(args) -> int:
179
186
  }
180
187
  )
181
188
 
182
- if changed and not args.dry_run:
189
+ if changed and not dry_run:
183
190
  bibfile._write_db(path, db)
184
- if not args.no_tidy:
185
- bibfile.run_tidy(path)
186
191
 
187
192
  matched = sum(1 for r in report if r["matched"])
188
193
  for r in report:
189
194
  mark = "✓" if r["matched"] else "✗"
190
195
  _log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
191
- _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if args.dry_run else ''}")
192
- _emit({"upgraded": changed, "matched": matched, "dry_run": args.dry_run, "entries": report})
196
+ _log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if dry_run else ''}")
197
+ return {"upgraded": changed, "matched": matched, "entries": report}
198
+
199
+
200
+ def cmd_upgrade(args) -> int:
201
+ path = Path(args.file)
202
+ result = _upgrade_entries(path, args.dry_run)
203
+ if result["upgraded"] and not args.no_tidy:
204
+ bibfile.run_tidy(path)
205
+ _emit({**result, "dry_run": args.dry_run})
193
206
  return 0
194
207
 
195
208
 
@@ -201,12 +214,11 @@ def cmd_tidy(args) -> int:
201
214
  return 0 if bibfile.run_tidy(Path(args.file)) else 1
202
215
 
203
216
 
204
- def cmd_check(args) -> int:
205
- path = Path(args.file)
217
+ def _check_problems(path: Path) -> tuple[int, list] | None:
218
+ """(entry count, problem list) for a .bib file, or None if unparseable."""
206
219
  db = bibfile.load_bib_file(path)
207
220
  if db is None:
208
- _log(f"[bibcite] {path} could not be parsed")
209
- return 1
221
+ return None
210
222
  problems = []
211
223
  seen_titles: dict[str, str] = {}
212
224
  for entry in db.entries:
@@ -223,7 +235,37 @@ def cmd_check(args) -> int:
223
235
  for p in problems:
224
236
  _log(f"{p['key']}: {p['issue']}")
225
237
  _log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
226
- _emit({"entries": len(db.entries), "problems": problems})
238
+ return len(db.entries), problems
239
+
240
+
241
+ def cmd_check(args) -> int:
242
+ checked = _check_problems(Path(args.file))
243
+ if checked is None:
244
+ _log(f"[bibcite] {args.file} could not be parsed")
245
+ return 1
246
+ entries, problems = checked
247
+ _emit({"entries": entries, "problems": problems})
248
+ return 0
249
+
250
+
251
+ def cmd_fix(args) -> int:
252
+ """One-shot cleanup: upgrade preprints, always tidy, then re-lint."""
253
+ path = Path(args.file)
254
+ if not path.exists():
255
+ _log(f"[bibcite] {path} does not exist")
256
+ return 1
257
+ result = _upgrade_entries(path, dry_run=False)
258
+ tidied = bibfile.run_tidy(path)
259
+ checked = _check_problems(path)
260
+ entries, problems = checked if checked else (0, [])
261
+ _emit(
262
+ {
263
+ **result,
264
+ "tidied": tidied,
265
+ "entries_total": entries,
266
+ "remaining_problems": problems,
267
+ }
268
+ )
227
269
  return 0
228
270
 
229
271
 
@@ -260,10 +302,17 @@ def main(argv=None) -> int:
260
302
  t.add_argument("file")
261
303
  t.set_defaults(fn=cmd_tidy)
262
304
 
263
- c = sub.add_parser("check", help="offline sanity check of a .bib file (prints JSON)")
305
+ c = sub.add_parser("check", help="offline read-only lint of a .bib file (prints JSON)")
264
306
  c.add_argument("file")
265
307
  c.set_defaults(fn=cmd_check)
266
308
 
309
+ f = sub.add_parser(
310
+ "fix",
311
+ help="one-shot cleanup: upgrade preprints to published versions, tidy, then lint (prints JSON)",
312
+ )
313
+ f.add_argument("file")
314
+ f.set_defaults(fn=cmd_fix)
315
+
267
316
  args = p.parse_args(argv)
268
317
  return args.fn(args)
269
318
 
@@ -7,6 +7,7 @@ preprint venues (arXiv / CoRR / bioRxiv / ...).
7
7
  """
8
8
 
9
9
  import html
10
+ import os
10
11
  import re
11
12
  import sys
12
13
  import time
@@ -44,6 +45,14 @@ def _client(browser: bool = False) -> httpx.Client:
44
45
  )
45
46
 
46
47
 
48
+ def _s2_headers() -> dict:
49
+ """Semantic Scholar's unauthenticated pool is shared globally and 429s
50
+ often; a free API key (https://api.semanticscholar.org) gets a private
51
+ quota. Set S2_API_KEY (or SEMANTIC_SCHOLAR_API_KEY)."""
52
+ key = os.environ.get("S2_API_KEY") or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
53
+ return {"x-api-key": key} if key else {}
54
+
55
+
47
56
  @dataclass
48
57
  class Match:
49
58
  source: str
@@ -260,6 +269,7 @@ def s2_arxiv_metadata(arxiv_id: str) -> ArxivMeta | None:
260
269
  r = c.get(
261
270
  f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}",
262
271
  params={"fields": "title,year,authors"},
272
+ headers=_s2_headers(),
263
273
  )
264
274
  if r.status_code != 200:
265
275
  return None
@@ -284,6 +294,7 @@ def try_semantic_scholar(
284
294
  r = c.get(
285
295
  f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}",
286
296
  params={"fields": S2_FIELDS},
297
+ headers=_s2_headers(),
287
298
  )
288
299
  if r.status_code == 429:
289
300
  raise SourceUnavailable("Semantic Scholar rate-limited (429)")
@@ -294,6 +305,7 @@ def try_semantic_scholar(
294
305
  r = c.get(
295
306
  "https://api.semanticscholar.org/graph/v1/paper/search",
296
307
  params={"query": title, "fields": S2_FIELDS, "limit": 5},
308
+ headers=_s2_headers(),
297
309
  )
298
310
  if r.status_code == 429:
299
311
  raise SourceUnavailable("Semantic Scholar rate-limited (429)")
@@ -575,19 +587,27 @@ CASCADE = (
575
587
  ("openalex", lambda t, y, a, au: try_openalex(t)),
576
588
  )
577
589
 
590
+ # Sources that rate-limited/blocked us this process: skip them for the rest of
591
+ # the run instead of hammering them once per entry during batch `upgrade`
592
+ # (PaperMemory's DISABLE_MATCH, ported).
593
+ _DISABLED: dict[str, str] = {}
594
+
578
595
 
579
596
  def find_published(
580
597
  title: str, year: str = "", arxiv_id: str = "", author_hint: str = ""
581
598
  ) -> Match | None:
582
599
  """Try each source in order; first verified hit wins."""
583
600
  for name, fn in CASCADE:
601
+ if name in _DISABLED:
602
+ continue
584
603
  try:
585
604
  m = fn(title, year, arxiv_id, author_hint)
586
605
  if m:
587
606
  return m
588
607
  _log(f"[{name}] no publication found")
589
608
  except SourceUnavailable as e:
590
- _log(f"[{name}] skipped: {e}")
609
+ _DISABLED[name] = str(e)
610
+ _log(f"[{name}] disabled for the rest of this run: {e}")
591
611
  except Exception as e: # network hiccup on one source must not kill the run
592
612
  _log(f"[{name}] error: {type(e).__name__}: {e}")
593
613
  return None
@@ -18,7 +18,7 @@ wheels = [
18
18
 
19
19
  [[package]]
20
20
  name = "bibcite-cli"
21
- version = "0.1.0"
21
+ version = "0.2.0"
22
22
  source = { editable = "." }
23
23
  dependencies = [
24
24
  { name = "bibtexparser" },
File without changes
File without changes