bibcite-cli 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/PKG-INFO +14 -3
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/Readme.md +12 -2
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/pyproject.toml +4 -1
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/__init__.py +1 -1
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/bibfile.py +52 -9
- bibcite_cli-0.3.0/src/bibcite/cache.py +48 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/cli.py +153 -43
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/normalize.py +22 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/resolve.py +38 -11
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/sources.py +21 -4
- bibcite_cli-0.3.0/tests/test_bugfixes.py +91 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/uv.lock +1 -1
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/.gitignore +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/LICENSE +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/data/strings.bib +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/src/bibcite/venues.py +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/tests/test_bibfile.py +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/tests/test_entry_types.py +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/tests/test_normalize.py +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/tests/test_strings_override.py +0 -0
- {bibcite_cli-0.2.0 → bibcite_cli-0.3.0}/tests/test_venues.py +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bibcite-cli
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
|
|
5
|
+
Project-URL: Repository, https://github.com/leo1oel/bibcite
|
|
5
6
|
License-Expression: MIT
|
|
6
7
|
License-File: LICENSE
|
|
7
8
|
Keywords: arxiv,bibliography,bibtex,citations,dblp
|
|
@@ -32,7 +33,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
|
|
|
32
33
|
uv tool install --editable .
|
|
33
34
|
|
|
34
35
|
# from git, no checkout needed
|
|
35
|
-
uv tool install git+https://github.com
|
|
36
|
+
uv tool install git+https://github.com/leo1oel/bibcite
|
|
36
37
|
|
|
37
38
|
# once published to PyPI (package name bibcite-cli, command name bibcite)
|
|
38
39
|
uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
|
|
@@ -57,6 +58,13 @@ bibcite add refs.bib 2103.14030 --json
|
|
|
57
58
|
# Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
|
|
58
59
|
bibcite add refs.bib --bibtex "$(pbpaste)"
|
|
59
60
|
|
|
61
|
+
# Batch add (one query per line; shares rate-limit state, tidies once)
|
|
62
|
+
bibcite add refs.bib --from ids.txt
|
|
63
|
+
|
|
64
|
+
# Overwrite a bad existing entry (keeps its key), or delete one
|
|
65
|
+
bibcite add refs.bib <query> --replace
|
|
66
|
+
bibcite remove refs.bib <key>
|
|
67
|
+
|
|
60
68
|
# One-shot cleanup: upgrade preprints → tidy → lint
|
|
61
69
|
bibcite fix refs.bib
|
|
62
70
|
|
|
@@ -68,8 +76,11 @@ bibcite tidy refs.bib
|
|
|
68
76
|
bibcite check refs.bib
|
|
69
77
|
```
|
|
70
78
|
|
|
71
|
-
|
|
79
|
+
`add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
|
|
72
80
|
`add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
|
|
81
|
+
Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
|
|
82
|
+
Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
|
|
83
|
+
Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
|
|
73
84
|
|
|
74
85
|
## For agents
|
|
75
86
|
|
|
@@ -20,7 +20,7 @@ After every write, the file is formatted with [bibtex-tidy](https://github.com/F
|
|
|
20
20
|
uv tool install --editable .
|
|
21
21
|
|
|
22
22
|
# from git, no checkout needed
|
|
23
|
-
uv tool install git+https://github.com
|
|
23
|
+
uv tool install git+https://github.com/leo1oel/bibcite
|
|
24
24
|
|
|
25
25
|
# once published to PyPI (package name bibcite-cli, command name bibcite)
|
|
26
26
|
uv tool install bibcite-cli # or: uvx --from bibcite-cli bibcite ...
|
|
@@ -45,6 +45,13 @@ bibcite add refs.bib 2103.14030 --json
|
|
|
45
45
|
# Add a raw BibTeX entry you already have (venue still canonicalized, file still tidied)
|
|
46
46
|
bibcite add refs.bib --bibtex "$(pbpaste)"
|
|
47
47
|
|
|
48
|
+
# Batch add (one query per line; shares rate-limit state, tidies once)
|
|
49
|
+
bibcite add refs.bib --from ids.txt
|
|
50
|
+
|
|
51
|
+
# Overwrite a bad existing entry (keeps its key), or delete one
|
|
52
|
+
bibcite add refs.bib <query> --replace
|
|
53
|
+
bibcite remove refs.bib <key>
|
|
54
|
+
|
|
48
55
|
# One-shot cleanup: upgrade preprints → tidy → lint
|
|
49
56
|
bibcite fix refs.bib
|
|
50
57
|
|
|
@@ -56,8 +63,11 @@ bibcite tidy refs.bib
|
|
|
56
63
|
bibcite check refs.bib
|
|
57
64
|
```
|
|
58
65
|
|
|
59
|
-
|
|
66
|
+
`add`/`upgrade`/`check`/`fix`/`remove` print a machine-readable JSON result on stdout (`action`, `key`, `venue`, `source`, ...); all diagnostics go to stderr.
|
|
60
67
|
`add` is idempotent: an existing entry returns `action: exists` with its key, and an existing arXiv entry matched to a published version is upgraded in place, keeping its citation key.
|
|
68
|
+
Exit codes: 0 success, 2 paper not found (ask for a better identifier), 3 sources/tool failure (retry later).
|
|
69
|
+
Successful matches are cached at `~/.cache/bibcite/published.json` (published papers only — preprint status is never cached); bypass with `--no-cache` or `BIBCITE_NO_CACHE=1`.
|
|
70
|
+
Entries marked `pubstate = {preprint}` are treated as confirmed preprint-only and muted from `check`/`upgrade`.
|
|
61
71
|
|
|
62
72
|
## For agents
|
|
63
73
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bibcite-cli"
|
|
3
|
-
version = "0.2.0"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans"
|
|
5
5
|
readme = "Readme.md"
|
|
6
6
|
license = "MIT"
|
|
@@ -11,6 +11,9 @@ dependencies = [
|
|
|
11
11
|
]
|
|
12
12
|
keywords = ["bibtex", "arxiv", "citations", "dblp", "bibliography"]
|
|
13
13
|
|
|
14
|
+
[project.urls]
|
|
15
|
+
Repository = "https://github.com/leo1oel/bibcite"
|
|
16
|
+
|
|
14
17
|
[project.scripts]
|
|
15
18
|
bibcite = "bibcite.cli:main"
|
|
16
19
|
|
|
@@ -14,7 +14,10 @@ from bibtexparser.bwriter import BibTexWriter
|
|
|
14
14
|
from .normalize import norm_title
|
|
15
15
|
|
|
16
16
|
# The exact bibtex-tidy invocation requested by the user; keep in sync with
|
|
17
|
-
# their LaTeX workflow.
|
|
17
|
+
# their LaTeX workflow. NOTE: no --generate-keys — bibcite owns key
|
|
18
|
+
# generation (make_key ASCII-folds names, so Hyvärinen -> hyvarinen2000...,
|
|
19
|
+
# where tidy would emit hyv_arinen2000...), and stable keys keep existing
|
|
20
|
+
# \cite{} commands valid.
|
|
18
21
|
TIDY_ARGS = [
|
|
19
22
|
"--modify",
|
|
20
23
|
"--omit=pages,publisher,doi,timestamp,biburl,bibsource,abstract,month,series,volume,editor,note,date,number,address",
|
|
@@ -27,10 +30,26 @@ TIDY_ARGS = [
|
|
|
27
30
|
"--sort-fields=author,title,booktitle,journal,year,url,pdf",
|
|
28
31
|
"--strip-enclosing-braces",
|
|
29
32
|
"--tidy-comments",
|
|
30
|
-
"--generate-keys",
|
|
31
33
|
]
|
|
32
34
|
|
|
33
|
-
NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref")
|
|
35
|
+
NOISE_FIELDS = ("timestamp", "biburl", "bibsource", "crossref", "month")
|
|
36
|
+
|
|
37
|
+
# BibTeX month macros. bibtexparser's common_strings only defines jan..dec;
|
|
38
|
+
# CrossRef's transform endpoint emits bare full names (month=June), which
|
|
39
|
+
# otherwise KeyError during string interpolation.
|
|
40
|
+
MONTH_STRINGS = {
|
|
41
|
+
m[:3]: m.capitalize()
|
|
42
|
+
for m in (
|
|
43
|
+
"january february march april may june july august september "
|
|
44
|
+
"october november december"
|
|
45
|
+
).split()
|
|
46
|
+
} | {
|
|
47
|
+
m: m.capitalize()
|
|
48
|
+
for m in (
|
|
49
|
+
"january february march april may june july august september "
|
|
50
|
+
"october november december"
|
|
51
|
+
).split()
|
|
52
|
+
}
|
|
34
53
|
|
|
35
54
|
ARXIV_ID_RE = re.compile(r"(\d{4}\.\d{4,5})(v\d+)?")
|
|
36
55
|
|
|
@@ -42,11 +61,18 @@ def _log(msg: str):
|
|
|
42
61
|
def _parser() -> BibTexParser:
|
|
43
62
|
p = BibTexParser(common_strings=True)
|
|
44
63
|
p.ignore_nonstandard_types = False
|
|
64
|
+
p.bib_database.strings.update(MONTH_STRINGS)
|
|
45
65
|
return p
|
|
46
66
|
|
|
47
67
|
|
|
48
68
|
def parse_bib(text: str) -> BibDatabase:
|
|
49
|
-
|
|
69
|
+
try:
|
|
70
|
+
return bibtexparser.loads(text, parser=_parser())
|
|
71
|
+
except Exception as e:
|
|
72
|
+
# Undefined @string macros raise bare KeyError('macro'); rewrap so
|
|
73
|
+
# callers see a real message and KeyError never masquerades as a
|
|
74
|
+
# LookupError "not found" upstream.
|
|
75
|
+
raise ValueError(f"BibTeX parse failed: {type(e).__name__}: {e}") from e
|
|
50
76
|
|
|
51
77
|
|
|
52
78
|
def parse_bibtex_entry(text: str) -> dict:
|
|
@@ -117,10 +143,12 @@ def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = ""
|
|
|
117
143
|
return None
|
|
118
144
|
|
|
119
145
|
|
|
120
|
-
def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
|
|
146
|
+
def upsert_entry(path: Path, entry: dict, replace: bool = False) -> tuple[str, str]:
|
|
121
147
|
"""Insert or upgrade ``entry`` in ``path``.
|
|
122
148
|
|
|
123
|
-
Returns (action, key)
|
|
149
|
+
Returns (action, key), action in "added" | "upgraded" | "exists" |
|
|
150
|
+
"replaced". With ``replace``, an existing matching entry is overwritten
|
|
151
|
+
(its citation key is kept so existing \\cite{} commands stay valid).
|
|
124
152
|
"""
|
|
125
153
|
db = load_bib_file(path)
|
|
126
154
|
if db is None: # unparseable file: append blindly
|
|
@@ -132,13 +160,14 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
|
|
|
132
160
|
db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
|
|
133
161
|
)
|
|
134
162
|
if existing is not None:
|
|
135
|
-
|
|
163
|
+
upgrade = is_preprint(existing) and not is_preprint(entry)
|
|
164
|
+
if replace or upgrade:
|
|
136
165
|
key = existing["ID"]
|
|
137
166
|
existing.clear()
|
|
138
|
-
existing.update(entry)
|
|
167
|
+
existing.update({k: str(v) for k, v in entry.items() if v})
|
|
139
168
|
existing["ID"] = key # keep the key the user may already \cite
|
|
140
169
|
_write_db(path, db)
|
|
141
|
-
return "upgraded", key
|
|
170
|
+
return ("replaced" if replace else "upgraded"), key
|
|
142
171
|
return "exists", existing["ID"]
|
|
143
172
|
|
|
144
173
|
db.entries.append({k: str(v) for k, v in entry.items() if v})
|
|
@@ -146,6 +175,20 @@ def upsert_entry(path: Path, entry: dict) -> tuple[str, str]:
|
|
|
146
175
|
return "added", entry["ID"]
|
|
147
176
|
|
|
148
177
|
|
|
178
|
+
def remove_entry(path: Path, key: str) -> bool:
|
|
179
|
+
"""Delete the entry with citation key ``key``. True if something was
|
|
180
|
+
removed."""
|
|
181
|
+
db = load_bib_file(path)
|
|
182
|
+
if db is None:
|
|
183
|
+
return False
|
|
184
|
+
before = len(db.entries)
|
|
185
|
+
db.entries = [e for e in db.entries if e.get("ID") != key]
|
|
186
|
+
if len(db.entries) == before:
|
|
187
|
+
return False
|
|
188
|
+
_write_db(path, db)
|
|
189
|
+
return True
|
|
190
|
+
|
|
191
|
+
|
|
149
192
|
def _write_db(path: Path, db: BibDatabase):
|
|
150
193
|
writer = BibTexWriter()
|
|
151
194
|
writer.indent = " "
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Local cache of successful publication matches.
|
|
2
|
+
|
|
3
|
+
Keyed by normalized title. Only *published* matches are stored — a paper that
|
|
4
|
+
is published stays published, while a preprint may get published tomorrow, so
|
|
5
|
+
negative/preprint results are never cached. Re-running `fix`/`upgrade` or
|
|
6
|
+
re-adding known papers therefore costs zero API calls.
|
|
7
|
+
|
|
8
|
+
Disable with --no-cache or BIBCITE_NO_CACHE=1. Lives at
|
|
9
|
+
$XDG_CACHE_HOME/bibcite/published.json (~/.cache/bibcite/published.json).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
DISABLED = os.environ.get("BIBCITE_NO_CACHE", "") == "1"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _path() -> Path:
|
|
21
|
+
root = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
|
|
22
|
+
return Path(root).expanduser() / "bibcite" / "published.json"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _load() -> dict:
|
|
26
|
+
try:
|
|
27
|
+
return json.loads(_path().read_text())
|
|
28
|
+
except Exception:
|
|
29
|
+
return {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get(key: str) -> dict | None:
|
|
33
|
+
if DISABLED or not key:
|
|
34
|
+
return None
|
|
35
|
+
return _load().get(key)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def put(key: str, value: dict):
|
|
39
|
+
if DISABLED or not key:
|
|
40
|
+
return
|
|
41
|
+
try:
|
|
42
|
+
data = _load()
|
|
43
|
+
data[key] = value
|
|
44
|
+
p = _path()
|
|
45
|
+
p.parent.mkdir(parents=True, exist_ok=True)
|
|
46
|
+
p.write_text(json.dumps(data, ensure_ascii=False))
|
|
47
|
+
except Exception as e: # cache must never break resolution
|
|
48
|
+
print(f"[cache] write failed: {e}", file=sys.stderr)
|
|
@@ -11,12 +11,25 @@ import sys
|
|
|
11
11
|
import time
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
|
|
14
|
-
from . import bibfile
|
|
14
|
+
from . import bibfile, cache
|
|
15
15
|
from .normalize import first_author_last_name, norm_title
|
|
16
|
-
from .resolve import
|
|
16
|
+
from .resolve import (
|
|
17
|
+
NotFound,
|
|
18
|
+
Resolved,
|
|
19
|
+
SourcesUnavailable,
|
|
20
|
+
guess_entry_type,
|
|
21
|
+
resolve,
|
|
22
|
+
)
|
|
17
23
|
from .sources import find_published
|
|
18
24
|
from .venues import canonicalize
|
|
19
25
|
|
|
26
|
+
# Exit codes (part of the agent-facing contract):
|
|
27
|
+
# 0 success
|
|
28
|
+
# 2 the paper could not be resolved — ask for a stronger identifier
|
|
29
|
+
# 3 internal/network failure (sources down, unexpected error) — retry later
|
|
30
|
+
EXIT_NOT_FOUND = 2
|
|
31
|
+
EXIT_INTERNAL = 3
|
|
32
|
+
|
|
20
33
|
|
|
21
34
|
def _log(msg: str):
|
|
22
35
|
print(msg, file=sys.stderr)
|
|
@@ -42,21 +55,30 @@ def _emit(payload: dict, as_json: bool = True):
|
|
|
42
55
|
# get
|
|
43
56
|
# ---------------------------------------------------------------------------
|
|
44
57
|
|
|
45
|
-
def _resolve_or_none(query: str, require_published: bool):
|
|
58
|
+
def _resolve_or_none(query: str, require_published: bool) -> tuple[Resolved | None, int]:
|
|
59
|
+
"""(result, exit_code). Distinguishes 'not found' (2) from 'tool/source
|
|
60
|
+
failure' (3) so agents know whether to retry with a better identifier or
|
|
61
|
+
just retry later."""
|
|
46
62
|
try:
|
|
47
|
-
return resolve(query, require_published=require_published)
|
|
48
|
-
except (
|
|
63
|
+
return resolve(query, require_published=require_published), 0
|
|
64
|
+
except (NotFound, ValueError) as e:
|
|
49
65
|
_log(f"[bibcite] {e}")
|
|
66
|
+
return None, EXIT_NOT_FOUND
|
|
67
|
+
except SourcesUnavailable as e:
|
|
68
|
+
_log(f"[bibcite] sources unavailable: {e}")
|
|
69
|
+
return None, EXIT_INTERNAL
|
|
50
70
|
except Exception as e:
|
|
51
|
-
_log(f"[bibcite]
|
|
52
|
-
|
|
71
|
+
_log(f"[bibcite] internal error: {type(e).__name__}: {e}")
|
|
72
|
+
return None, EXIT_INTERNAL
|
|
53
73
|
|
|
54
74
|
|
|
55
75
|
def cmd_get(args) -> int:
|
|
56
76
|
query = " ".join(args.query)
|
|
57
|
-
|
|
77
|
+
if args.no_cache:
|
|
78
|
+
cache.DISABLED = True
|
|
79
|
+
res, code = _resolve_or_none(query, args.require_published)
|
|
58
80
|
if res is None:
|
|
59
|
-
return
|
|
81
|
+
return code
|
|
60
82
|
_emit(
|
|
61
83
|
{
|
|
62
84
|
"action": "resolved",
|
|
@@ -76,48 +98,86 @@ def cmd_get(args) -> int:
|
|
|
76
98
|
# add
|
|
77
99
|
# ---------------------------------------------------------------------------
|
|
78
100
|
|
|
101
|
+
def _resolve_user_bibtex(text: str) -> Resolved:
|
|
102
|
+
entry = bibfile.parse_bibtex_entry(text)
|
|
103
|
+
raw_venue = entry.get("booktitle", "") or entry.get("journal", "")
|
|
104
|
+
canonical = canonicalize(raw_venue, entry.get("year"))
|
|
105
|
+
if canonical:
|
|
106
|
+
entry.pop("booktitle", None)
|
|
107
|
+
entry.pop("journal", None)
|
|
108
|
+
entry["ENTRYTYPE"] = canonical.entry_type
|
|
109
|
+
entry[canonical.bib_field] = canonical.name
|
|
110
|
+
return Resolved(entry, "user-bibtex", canonical.name if canonical else raw_venue, True)
|
|
111
|
+
|
|
112
|
+
|
|
79
113
|
def cmd_add(args) -> int:
|
|
80
114
|
path = Path(args.file)
|
|
115
|
+
if args.no_cache:
|
|
116
|
+
cache.DISABLED = True
|
|
117
|
+
|
|
118
|
+
# Collect the queries for this invocation (single, --bibtex, or --from).
|
|
81
119
|
if args.bibtex:
|
|
82
120
|
text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
121
|
+
try:
|
|
122
|
+
resolutions = [("<bibtex>", _resolve_user_bibtex(text), 0)]
|
|
123
|
+
except ValueError as e:
|
|
124
|
+
_log(f"[bibcite] {e}")
|
|
125
|
+
return EXIT_NOT_FOUND
|
|
126
|
+
elif args.from_file:
|
|
127
|
+
lines = Path(args.from_file).read_text().splitlines()
|
|
128
|
+
queries = [q.strip() for q in lines if q.strip() and not q.strip().startswith("#")]
|
|
129
|
+
resolutions = []
|
|
130
|
+
for i, q in enumerate(queries):
|
|
131
|
+
if i:
|
|
132
|
+
time.sleep(1) # one process shares the rate-limit breaker; stay polite
|
|
133
|
+
_log(f"[bibcite] ({i + 1}/{len(queries)}) {q}")
|
|
134
|
+
res, code = _resolve_or_none(q, args.require_published)
|
|
135
|
+
resolutions.append((q, res, code))
|
|
92
136
|
else:
|
|
93
137
|
if not args.query:
|
|
94
|
-
_log("[bibcite] provide a query (arXiv id / DOI / title) or --bibtex")
|
|
95
|
-
return
|
|
138
|
+
_log("[bibcite] provide a query (arXiv id / DOI / title), --bibtex, or --from")
|
|
139
|
+
return EXIT_NOT_FOUND
|
|
96
140
|
query = " ".join(args.query)
|
|
97
|
-
res = _resolve_or_none(query, args.require_published)
|
|
141
|
+
res, code = _resolve_or_none(query, args.require_published)
|
|
142
|
+
if res is None:
|
|
143
|
+
return code
|
|
144
|
+
resolutions = [(query, res, 0)]
|
|
145
|
+
|
|
146
|
+
# Write all entries first, tidy once, then read back the final keys.
|
|
147
|
+
results = []
|
|
148
|
+
wrote = False
|
|
149
|
+
for query, res, code in resolutions:
|
|
98
150
|
if res is None:
|
|
99
|
-
|
|
151
|
+
results.append({"query": query, "action": "failed", "exit_code": code})
|
|
152
|
+
continue
|
|
153
|
+
action, key = bibfile.upsert_entry(path, res.entry, replace=args.replace)
|
|
154
|
+
wrote = wrote or action != "exists"
|
|
155
|
+
results.append(
|
|
156
|
+
{
|
|
157
|
+
"query": query,
|
|
158
|
+
"action": action,
|
|
159
|
+
"key": key,
|
|
160
|
+
"title": res.entry.get("title", ""),
|
|
161
|
+
"venue": res.venue or "arXiv (preprint)",
|
|
162
|
+
"published": res.published,
|
|
163
|
+
"source": res.source,
|
|
164
|
+
}
|
|
165
|
+
)
|
|
100
166
|
|
|
101
|
-
action, key = bibfile.upsert_entry(path, res.entry)
|
|
102
167
|
tidied = False
|
|
103
|
-
if
|
|
168
|
+
if wrote and not args.no_tidy:
|
|
104
169
|
tidied = bibfile.run_tidy(path)
|
|
105
170
|
if tidied:
|
|
106
|
-
|
|
171
|
+
for r in results:
|
|
172
|
+
if r.get("title"):
|
|
173
|
+
r["key"] = bibfile.key_after_tidy(path, r["title"], r["key"])
|
|
107
174
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
"published": res.published,
|
|
115
|
-
"source": res.source,
|
|
116
|
-
"file": str(path),
|
|
117
|
-
"tidied": tidied,
|
|
118
|
-
}
|
|
119
|
-
)
|
|
120
|
-
return 0
|
|
175
|
+
exit_code = max((r.get("exit_code", 0) for r in results), default=0)
|
|
176
|
+
if len(results) == 1 and not args.from_file:
|
|
177
|
+
_emit({**results[0], "file": str(path), "tidied": tidied})
|
|
178
|
+
else:
|
|
179
|
+
_emit({"file": str(path), "tidied": tidied, "results": results})
|
|
180
|
+
return exit_code
|
|
121
181
|
|
|
122
182
|
|
|
123
183
|
# ---------------------------------------------------------------------------
|
|
@@ -139,6 +199,10 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
|
|
|
139
199
|
for entry in db.entries:
|
|
140
200
|
if not bibfile.is_preprint(entry):
|
|
141
201
|
continue
|
|
202
|
+
if entry.get("pubstate", "").strip("{}") == "preprint":
|
|
203
|
+
# User-confirmed preprint-only (e.g. never-to-be-published arXiv
|
|
204
|
+
# reports): muted from upgrade and check.
|
|
205
|
+
continue
|
|
142
206
|
title = entry.get("title", "").replace("{", "").replace("}", "")
|
|
143
207
|
if not title:
|
|
144
208
|
continue
|
|
@@ -150,9 +214,16 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
|
|
|
150
214
|
hint = (
|
|
151
215
|
first_author_last_name(entry["author"]) if entry.get("author") else ""
|
|
152
216
|
)
|
|
153
|
-
match = find_published(title, entry.get("year", ""), aid, hint)
|
|
217
|
+
match, status = find_published(title, entry.get("year", ""), aid, hint)
|
|
154
218
|
if not match:
|
|
155
|
-
|
|
219
|
+
# "no_published_version" is a trustworthy miss; "sources_unavailable"
|
|
220
|
+
# means the sources were down — do not conclude anything.
|
|
221
|
+
reason = (
|
|
222
|
+
"sources_unavailable" if status == "unavailable" else "no_published_version"
|
|
223
|
+
)
|
|
224
|
+
report.append(
|
|
225
|
+
{"key": entry["ID"], "title": title, "matched": False, "reason": reason}
|
|
226
|
+
)
|
|
156
227
|
continue
|
|
157
228
|
canonical = canonicalize(match.venue, match.year or entry.get("year"))
|
|
158
229
|
venue_name = canonical.name if canonical else match.venue
|
|
@@ -192,13 +263,15 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
|
|
|
192
263
|
matched = sum(1 for r in report if r["matched"])
|
|
193
264
|
for r in report:
|
|
194
265
|
mark = "✓" if r["matched"] else "✗"
|
|
195
|
-
_log(f"{mark} {r['key']}: {r.get('venue', 'no match')}")
|
|
266
|
+
_log(f"{mark} {r['key']}: {r.get('venue') or r.get('reason', 'no match')}")
|
|
196
267
|
_log(f"[bibcite] {matched} matched, {changed} upgraded{' (dry-run)' if dry_run else ''}")
|
|
197
268
|
return {"upgraded": changed, "matched": matched, "entries": report}
|
|
198
269
|
|
|
199
270
|
|
|
200
271
|
def cmd_upgrade(args) -> int:
|
|
201
272
|
path = Path(args.file)
|
|
273
|
+
if args.no_cache:
|
|
274
|
+
cache.DISABLED = True
|
|
202
275
|
result = _upgrade_entries(path, args.dry_run)
|
|
203
276
|
if result["upgraded"] and not args.no_tidy:
|
|
204
277
|
bibfile.run_tidy(path)
|
|
@@ -230,8 +303,12 @@ def _check_problems(path: Path) -> tuple[int, list] | None:
|
|
|
230
303
|
for f in ("author", "title", "year"):
|
|
231
304
|
if not entry.get(f):
|
|
232
305
|
problems.append({"key": key, "issue": f"missing {f}"})
|
|
233
|
-
if bibfile.is_preprint(entry):
|
|
234
|
-
problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`)"})
|
|
306
|
+
if bibfile.is_preprint(entry) and entry.get("pubstate", "").strip("{}") != "preprint":
|
|
307
|
+
problems.append({"key": key, "issue": "arXiv preprint (try `bibcite upgrade`, or set pubstate = {preprint} to mute)"})
|
|
308
|
+
author = entry.get("author", "")
|
|
309
|
+
letters = "".join(c for c in author if c.isalpha())
|
|
310
|
+
if letters and letters.isupper():
|
|
311
|
+
problems.append({"key": key, "issue": "author names are ALL CAPS"})
|
|
235
312
|
for p in problems:
|
|
236
313
|
_log(f"{p['key']}: {p['issue']}")
|
|
237
314
|
_log(f"[bibcite] {len(db.entries)} entries, {len(problems)} issues")
|
|
@@ -248,9 +325,30 @@ def cmd_check(args) -> int:
|
|
|
248
325
|
return 0
|
|
249
326
|
|
|
250
327
|
|
|
328
|
+
def cmd_remove(args) -> int:
|
|
329
|
+
"""Delete an entry by citation key — the sanctioned way to drop a bad
|
|
330
|
+
entry without hand-editing the file."""
|
|
331
|
+
path = Path(args.file)
|
|
332
|
+
removed = bibfile.remove_entry(path, args.key)
|
|
333
|
+
tidied = False
|
|
334
|
+
if removed and not args.no_tidy:
|
|
335
|
+
tidied = bibfile.run_tidy(path)
|
|
336
|
+
_emit(
|
|
337
|
+
{
|
|
338
|
+
"action": "removed" if removed else "not_found",
|
|
339
|
+
"key": args.key,
|
|
340
|
+
"file": str(path),
|
|
341
|
+
"tidied": tidied,
|
|
342
|
+
}
|
|
343
|
+
)
|
|
344
|
+
return 0 if removed else EXIT_NOT_FOUND
|
|
345
|
+
|
|
346
|
+
|
|
251
347
|
def cmd_fix(args) -> int:
|
|
252
348
|
"""One-shot cleanup: upgrade preprints, always tidy, then re-lint."""
|
|
253
349
|
path = Path(args.file)
|
|
350
|
+
if args.no_cache:
|
|
351
|
+
cache.DISABLED = True
|
|
254
352
|
if not path.exists():
|
|
255
353
|
_log(f"[bibcite] {path} does not exist")
|
|
256
354
|
return 1
|
|
@@ -282,20 +380,31 @@ def main(argv=None) -> int:
|
|
|
282
380
|
g.add_argument("query", nargs="+", help="arXiv id / arXiv URL / DOI / title")
|
|
283
381
|
g.add_argument("--json", action="store_true", help="print a JSON object instead of BibTeX")
|
|
284
382
|
g.add_argument("--require-published", action="store_true", help="fail instead of falling back to an arXiv entry")
|
|
383
|
+
g.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
|
|
285
384
|
g.set_defaults(fn=cmd_get)
|
|
286
385
|
|
|
287
386
|
a = sub.add_parser("add", help="resolve and write into a .bib file, then run bibtex-tidy (prints JSON)")
|
|
288
387
|
a.add_argument("file", help="target .bib file (created if missing)")
|
|
289
388
|
a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
|
|
290
389
|
a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
|
|
390
|
+
a.add_argument("--from", dest="from_file", metavar="FILE", help="batch mode: one query per line (shares rate-limit state, tidies once)")
|
|
391
|
+
a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key)")
|
|
291
392
|
a.add_argument("--no-tidy", action="store_true")
|
|
393
|
+
a.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
|
|
292
394
|
a.add_argument("--require-published", action="store_true")
|
|
293
395
|
a.set_defaults(fn=cmd_add)
|
|
294
396
|
|
|
397
|
+
rm = sub.add_parser("remove", help="delete an entry by citation key (prints JSON)")
|
|
398
|
+
rm.add_argument("file")
|
|
399
|
+
rm.add_argument("key", help="citation key of the entry to remove")
|
|
400
|
+
rm.add_argument("--no-tidy", action="store_true")
|
|
401
|
+
rm.set_defaults(fn=cmd_remove)
|
|
402
|
+
|
|
295
403
|
u = sub.add_parser("upgrade", help="match all arXiv entries in a file to their published versions (prints JSON)")
|
|
296
404
|
u.add_argument("file")
|
|
297
405
|
u.add_argument("--dry-run", action="store_true")
|
|
298
406
|
u.add_argument("--no-tidy", action="store_true")
|
|
407
|
+
u.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
|
|
299
408
|
u.set_defaults(fn=cmd_upgrade)
|
|
300
409
|
|
|
301
410
|
t = sub.add_parser("tidy", help="run bibtex-tidy with the canonical flags")
|
|
@@ -311,6 +420,7 @@ def main(argv=None) -> int:
|
|
|
311
420
|
help="one-shot cleanup: upgrade preprints to published versions, tidy, then lint (prints JSON)",
|
|
312
421
|
)
|
|
313
422
|
f.add_argument("file")
|
|
423
|
+
f.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
|
|
314
424
|
f.set_defaults(fn=cmd_fix)
|
|
315
425
|
|
|
316
426
|
args = p.parse_args(argv)
|
|
@@ -76,6 +76,28 @@ def first_author_last_name(author_field: str) -> str:
|
|
|
76
76
|
return mini_hash(last) or "anon"
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def fix_author_caps(author_field: str) -> str:
|
|
80
|
+
"""Normalize ALL-CAPS author names (old CrossRef records store e.g.
|
|
81
|
+
"EPPS, T. W. and PULLEY, LAWRENCE B."). A word is re-cased only when it
|
|
82
|
+
is fully uppercase and longer than 2 letters, so initials ("T.", "W.")
|
|
83
|
+
and legitimately-capitalized short names survive."""
|
|
84
|
+
|
|
85
|
+
def fix_word(w: str) -> str:
|
|
86
|
+
core = re.sub(r"[^A-Za-z]", "", w)
|
|
87
|
+
if len(core) > 2 and core.isupper():
|
|
88
|
+
return w.capitalize()
|
|
89
|
+
return w
|
|
90
|
+
|
|
91
|
+
def fix_name(name: str) -> str:
|
|
92
|
+
letters = re.sub(r"[^A-Za-z]", "", name)
|
|
93
|
+
if not letters.isupper():
|
|
94
|
+
return name # mixed case already — leave it alone
|
|
95
|
+
return " ".join(fix_word(w) for w in name.split())
|
|
96
|
+
|
|
97
|
+
names = re.split(r"\s+and\s+", author_field)
|
|
98
|
+
return " and ".join(fix_name(n) for n in names)
|
|
99
|
+
|
|
100
|
+
|
|
79
101
|
def make_key(author_field: str, year: str | int, title: str) -> str:
|
|
80
102
|
"""Deterministic citation key: <lastname><year><firstword>.
|
|
81
103
|
|
|
@@ -10,7 +10,17 @@ import sys
|
|
|
10
10
|
from dataclasses import dataclass
|
|
11
11
|
|
|
12
12
|
from .bibfile import NOISE_FIELDS, parse_bibtex_entry
|
|
13
|
-
from .normalize import clean_title, first_author_last_name, make_key
|
|
13
|
+
from .normalize import clean_title, first_author_last_name, fix_author_caps, make_key
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NotFound(Exception):
|
|
17
|
+
"""No source could resolve the query — asking for a better identifier is
|
|
18
|
+
the right next step."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SourcesUnavailable(Exception):
|
|
22
|
+
"""Resolution failed because sources were down/rate-limited, NOT because
|
|
23
|
+
the paper doesn't exist. Retrying later is the right next step."""
|
|
14
24
|
from .sources import (
|
|
15
25
|
ArxivMeta,
|
|
16
26
|
Match,
|
|
@@ -84,7 +94,10 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
|
|
|
84
94
|
if match.bibtex:
|
|
85
95
|
try:
|
|
86
96
|
entry = parse_bibtex_entry(match.bibtex)
|
|
87
|
-
except
|
|
97
|
+
except Exception as e:
|
|
98
|
+
# Bad source bibtex must degrade to field construction, never
|
|
99
|
+
# abort the resolution.
|
|
100
|
+
_log(f"[{match.source}] could not parse its BibTeX ({e}); building from fields")
|
|
88
101
|
entry = {}
|
|
89
102
|
if not entry:
|
|
90
103
|
authors = match.authors or (meta.authors if meta else [])
|
|
@@ -100,6 +113,8 @@ def _entry_from_match(match: Match, meta: ArxivMeta | None) -> dict:
|
|
|
100
113
|
entry.pop(f, None)
|
|
101
114
|
|
|
102
115
|
entry["title"] = clean_title(entry.get("title", ""))
|
|
116
|
+
if entry.get("author"):
|
|
117
|
+
entry["author"] = fix_author_caps(entry["author"])
|
|
103
118
|
if match.doi and not entry.get("doi"):
|
|
104
119
|
entry["doi"] = match.doi
|
|
105
120
|
|
|
@@ -132,8 +147,12 @@ def _finalize(entry: dict, meta: ArxivMeta | None) -> dict:
|
|
|
132
147
|
entry["archiveprefix"] = "arXiv"
|
|
133
148
|
if meta.primary_class:
|
|
134
149
|
entry["primaryclass"] = meta.primary_class
|
|
135
|
-
elif entry.get("doi")
|
|
136
|
-
entry
|
|
150
|
+
elif entry.get("doi"):
|
|
151
|
+
url = entry.get("url", "")
|
|
152
|
+
# Modernize legacy resolver links (http://dx.doi.org/...) and fill in
|
|
153
|
+
# a missing url from the DOI.
|
|
154
|
+
if not url or "dx.doi.org" in url:
|
|
155
|
+
entry["url"] = f"https://doi.org/{entry['doi']}"
|
|
137
156
|
author = entry.get("author", "") or "anonymous"
|
|
138
157
|
year = entry.get("year", "") or "XXXX"
|
|
139
158
|
entry["ID"] = make_key(author, year, entry.get("title", ""))
|
|
@@ -176,19 +195,23 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
|
|
|
176
195
|
if meta is not None:
|
|
177
196
|
break
|
|
178
197
|
if meta is None:
|
|
179
|
-
raise
|
|
198
|
+
raise SourcesUnavailable(
|
|
180
199
|
f"Could not fetch metadata for arXiv:{value} "
|
|
181
200
|
"(arXiv API, Semantic Scholar, and arxiv.org all unavailable)"
|
|
182
201
|
)
|
|
183
202
|
_log(f"[arxiv] {meta.title} ({meta.year})")
|
|
184
203
|
hint = first_author_last_name(meta.authors[0]) if meta.authors else ""
|
|
185
|
-
match = find_published(meta.title, meta.year, meta.arxiv_id, hint)
|
|
204
|
+
match, status = find_published(meta.title, meta.year, meta.arxiv_id, hint)
|
|
186
205
|
if match:
|
|
187
206
|
entry = _entry_from_match(match, meta)
|
|
188
207
|
venue = entry.pop("__venue", match.venue)
|
|
189
208
|
return Resolved(_finalize(entry, meta), match.source, venue, True)
|
|
190
209
|
if require_published:
|
|
191
|
-
|
|
210
|
+
if status == "unavailable":
|
|
211
|
+
raise SourcesUnavailable(
|
|
212
|
+
f"Could not check publication status for arXiv:{value} (sources down)"
|
|
213
|
+
)
|
|
214
|
+
raise NotFound(f"No published version found for arXiv:{value}")
|
|
192
215
|
_log("[bibcite] no published version found; using arXiv preprint entry")
|
|
193
216
|
entry = _arxiv_only_entry(meta)
|
|
194
217
|
return Resolved(_finalize(entry, meta), "arxiv", "", False)
|
|
@@ -196,7 +219,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
|
|
|
196
219
|
if kind == "doi":
|
|
197
220
|
match = crossref_by_doi(value)
|
|
198
221
|
if not match or not match.title:
|
|
199
|
-
raise
|
|
222
|
+
raise NotFound(f"DOI not found on CrossRef: {value}")
|
|
200
223
|
entry = _entry_from_match(match, None)
|
|
201
224
|
venue = entry.pop("__venue", match.venue)
|
|
202
225
|
return Resolved(_finalize(entry, None), match.source, venue, True)
|
|
@@ -212,7 +235,7 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
|
|
|
212
235
|
if meta:
|
|
213
236
|
_log(f"[openalex] metadata: arXiv {meta.arxiv_id or '?'} ({meta.year})")
|
|
214
237
|
hint = first_author_last_name(meta.authors[0]) if meta and meta.authors else ""
|
|
215
|
-
match = find_published(
|
|
238
|
+
match, status = find_published(
|
|
216
239
|
meta.title if meta else value,
|
|
217
240
|
meta.year if meta else "",
|
|
218
241
|
meta.arxiv_id if meta else "",
|
|
@@ -224,11 +247,15 @@ def resolve(query: str, require_published: bool = False) -> Resolved:
|
|
|
224
247
|
return Resolved(_finalize(entry, meta), match.source, venue, True)
|
|
225
248
|
if meta and meta.arxiv_id:
|
|
226
249
|
if require_published:
|
|
227
|
-
raise
|
|
250
|
+
raise NotFound(f"Only an arXiv preprint was found for: {value}")
|
|
228
251
|
_log("[bibcite] no published version found; using arXiv preprint entry")
|
|
229
252
|
entry = _arxiv_only_entry(meta)
|
|
230
253
|
return Resolved(_finalize(entry, meta), "arxiv", "", False)
|
|
231
|
-
|
|
254
|
+
if status == "unavailable":
|
|
255
|
+
raise SourcesUnavailable(
|
|
256
|
+
f"All sources were rate-limited or down while resolving: {value}"
|
|
257
|
+
)
|
|
258
|
+
raise NotFound(f"No match found anywhere for: {value}")
|
|
232
259
|
|
|
233
260
|
|
|
234
261
|
def _openalex_meta(title: str) -> ArxivMeta | None:
|
|
@@ -595,19 +595,36 @@ _DISABLED: dict[str, str] = {}
|
|
|
595
595
|
|
|
596
596
|
def find_published(
|
|
597
597
|
title: str, year: str = "", arxiv_id: str = "", author_hint: str = ""
|
|
598
|
-
) -> Match | None:
|
|
599
|
-
"""Try each source in order; first verified hit wins.
|
|
598
|
+
) -> tuple[Match | None, str]:
|
|
599
|
+
"""Try each source in order; first verified hit wins.
|
|
600
|
+
|
|
601
|
+
Returns (match, status). status distinguishes a trustworthy miss from an
|
|
602
|
+
outage: "found" | "not_found" (>=1 source answered cleanly with no hit) |
|
|
603
|
+
"unavailable" (every source was disabled or errored — do NOT conclude the
|
|
604
|
+
paper is unpublished).
|
|
605
|
+
"""
|
|
606
|
+
from . import cache
|
|
607
|
+
|
|
608
|
+
cache_key = norm_title(title)
|
|
609
|
+
cached = cache.get(cache_key)
|
|
610
|
+
if cached:
|
|
611
|
+
_log(f"[cache] hit: {cached.get('venue', '')} ({cached.get('source', '')})")
|
|
612
|
+
return Match(**cached), "found"
|
|
613
|
+
|
|
614
|
+
clean_misses = 0
|
|
600
615
|
for name, fn in CASCADE:
|
|
601
616
|
if name in _DISABLED:
|
|
602
617
|
continue
|
|
603
618
|
try:
|
|
604
619
|
m = fn(title, year, arxiv_id, author_hint)
|
|
605
620
|
if m:
|
|
606
|
-
|
|
621
|
+
cache.put(cache_key, m.__dict__)
|
|
622
|
+
return m, "found"
|
|
623
|
+
clean_misses += 1
|
|
607
624
|
_log(f"[{name}] no publication found")
|
|
608
625
|
except SourceUnavailable as e:
|
|
609
626
|
_DISABLED[name] = str(e)
|
|
610
627
|
_log(f"[{name}] disabled for the rest of this run: {e}")
|
|
611
628
|
except Exception as e: # network hiccup on one source must not kill the run
|
|
612
629
|
_log(f"[{name}] error: {type(e).__name__}: {e}")
|
|
613
|
-
return None
|
|
630
|
+
return None, ("not_found" if clean_misses else "unavailable")
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Regression tests for the first round of real-world bug reports."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from bibcite import cache
|
|
6
|
+
from bibcite.bibfile import parse_bibtex_entry, remove_entry, upsert_entry
|
|
7
|
+
from bibcite.normalize import fix_author_caps
|
|
8
|
+
|
|
9
|
+
# CrossRef's transform endpoint emits bare month macros (month=June) that are
|
|
10
|
+
# not in bibtexparser's common_strings — this used to KeyError('june').
|
|
11
|
+
CROSSREF_STYLE = (
|
|
12
|
+
" @article{Hyv_rinen_2000, title={Independent component analysis}, "
|
|
13
|
+
"volume={13}, url={http://dx.doi.org/10.1016/x}, DOI={10.1016/x}, "
|
|
14
|
+
"journal={Neural Networks}, author={Hyvärinen, A. and Oja, E.}, "
|
|
15
|
+
"year={2000}, month=June, pages={411–430} }"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_month_macro_full_name_parses():
|
|
20
|
+
entry = parse_bibtex_entry(CROSSREF_STYLE)
|
|
21
|
+
assert entry["title"] == "Independent component analysis"
|
|
22
|
+
assert "month" not in entry # month is a noise field, dropped
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_month_macro_abbrev_parses():
|
|
26
|
+
entry = parse_bibtex_entry("@article{x, title={T}, year={2000}, month=jun }")
|
|
27
|
+
assert entry["title"] == "T"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_unknown_macro_raises_value_error_not_keyerror():
|
|
31
|
+
import pytest
|
|
32
|
+
|
|
33
|
+
with pytest.raises(ValueError, match="BibTeX parse failed"):
|
|
34
|
+
parse_bibtex_entry("@article{x, title = somemacro }")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_fix_author_caps():
|
|
38
|
+
assert (
|
|
39
|
+
fix_author_caps("EPPS, T. W. and PULLEY, LAWRENCE B.")
|
|
40
|
+
== "Epps, T. W. and Pulley, Lawrence B."
|
|
41
|
+
)
|
|
42
|
+
# Mixed-case names are never touched.
|
|
43
|
+
assert fix_author_caps("McDonald, J. and van der Berg, A.") == (
|
|
44
|
+
"McDonald, J. and van der Berg, A."
|
|
45
|
+
)
|
|
46
|
+
assert fix_author_caps("Ashish Vaswani") == "Ashish Vaswani"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
PUB = {
|
|
50
|
+
"ENTRYTYPE": "inproceedings",
|
|
51
|
+
"ID": "k1",
|
|
52
|
+
"title": "Paper One",
|
|
53
|
+
"author": "A B",
|
|
54
|
+
"booktitle": "Some Conference (SC)",
|
|
55
|
+
"year": "2020",
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_remove_entry(tmp_path: Path):
|
|
60
|
+
bib = tmp_path / "r.bib"
|
|
61
|
+
upsert_entry(bib, dict(PUB))
|
|
62
|
+
assert remove_entry(bib, "k1") is True
|
|
63
|
+
assert remove_entry(bib, "k1") is False # already gone
|
|
64
|
+
assert "Paper One" not in bib.read_text()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_upsert_replace_keeps_key(tmp_path: Path):
|
|
68
|
+
bib = tmp_path / "r.bib"
|
|
69
|
+
upsert_entry(bib, dict(PUB))
|
|
70
|
+
newer = dict(PUB, ID="differentkey", author="Fixed Author")
|
|
71
|
+
action, key = upsert_entry(bib, newer, replace=True)
|
|
72
|
+
assert (action, key) == ("replaced", "k1")
|
|
73
|
+
assert "Fixed Author" in bib.read_text()
|
|
74
|
+
# Without --replace, a published duplicate stays untouched.
|
|
75
|
+
action, key = upsert_entry(bib, newer)
|
|
76
|
+
assert (action, key) == ("exists", "k1")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_cache_roundtrip(tmp_path, monkeypatch):
|
|
80
|
+
monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
|
|
81
|
+
monkeypatch.setattr(cache, "DISABLED", False)
|
|
82
|
+
assert cache.get("somekey") is None
|
|
83
|
+
cache.put("somekey", {"source": "dblp", "venue": "SC"})
|
|
84
|
+
assert cache.get("somekey")["venue"] == "SC"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_cache_disabled(tmp_path, monkeypatch):
|
|
88
|
+
monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
|
|
89
|
+
monkeypatch.setattr(cache, "DISABLED", True)
|
|
90
|
+
cache.put("k", {"venue": "X"})
|
|
91
|
+
assert cache.get("k") is None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|