bibcite-cli 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/PKG-INFO +1 -1
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/pyproject.toml +1 -1
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/__init__.py +1 -1
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/bibfile.py +33 -10
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/cli.py +92 -10
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/normalize.py +16 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/sources.py +83 -1
- bibcite_cli-0.4.0/tests/test_round2.py +70 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/uv.lock +1 -1
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/.gitignore +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/LICENSE +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/Readme.md +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/cache.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/data/strings.bib +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/resolve.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/src/bibcite/venues.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_bibfile.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_bugfixes.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_entry_types.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_normalize.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_strings_override.py +0 -0
- {bibcite_cli-0.3.0 → bibcite_cli-0.4.0}/tests/test_venues.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bibcite-cli
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Resolve papers (arXiv id / DOI / title) to canonical, normalized BibTeX for agents and humans
|
|
5
5
|
Project-URL: Repository, https://github.com/leo1oel/bibcite
|
|
6
6
|
License-Expression: MIT
|
|
@@ -20,14 +20,16 @@ from .normalize import norm_title
|
|
|
20
20
|
# \cite{} commands valid.
|
|
21
21
|
TIDY_ARGS = [
|
|
22
22
|
"--modify",
|
|
23
|
-
|
|
23
|
+
# volume/number/pages/doi are kept (bibliographic substance the user
|
|
24
|
+
# asked to retain); the omit list drops only true noise.
|
|
25
|
+
"--omit=publisher,timestamp,biburl,bibsource,abstract,month,series,editor,note,date,address",
|
|
24
26
|
"--curly",
|
|
25
27
|
"--blank-lines",
|
|
26
28
|
"--trailing-commas",
|
|
27
29
|
"--sort=-year",
|
|
28
30
|
"--duplicates=citation",
|
|
29
31
|
"--merge=first",
|
|
30
|
-
"--sort-fields=author,title,booktitle,journal,year,url,pdf",
|
|
32
|
+
"--sort-fields=author,title,booktitle,journal,volume,number,pages,year,doi,url,pdf",
|
|
31
33
|
"--strip-enclosing-braces",
|
|
32
34
|
"--tidy-comments",
|
|
33
35
|
]
|
|
@@ -143,33 +145,48 @@ def find_existing(db: BibDatabase, title: str, arxiv_id: str = "", doi: str = ""
|
|
|
143
145
|
return None
|
|
144
146
|
|
|
145
147
|
|
|
146
|
-
def upsert_entry(
|
|
148
|
+
def upsert_entry(
|
|
149
|
+
path: Path, entry: dict, replace: bool = False, replace_key: str = ""
|
|
150
|
+
) -> tuple[str, str]:
|
|
147
151
|
"""Insert or upgrade ``entry`` in ``path``.
|
|
148
152
|
|
|
149
153
|
Returns (action, key), action in "added" | "upgraded" | "exists" |
|
|
150
|
-
"replaced". With ``replace``, an existing
|
|
151
|
-
|
|
154
|
+
"replaced" | "no_match_to_replace". With ``replace``, an existing
|
|
155
|
+
matching entry is overwritten; ``replace_key`` targets a specific entry
|
|
156
|
+
by citation key (for when title drift defeats the automatic match). The
|
|
157
|
+
existing key is always kept so \\cite{} commands stay valid. A replace
|
|
158
|
+
that matches nothing is an ERROR, not a silent add — that is how
|
|
159
|
+
duplicate entries sneak into a file.
|
|
152
160
|
"""
|
|
153
161
|
db = load_bib_file(path)
|
|
154
162
|
if db is None: # unparseable file: append blindly
|
|
163
|
+
if replace or replace_key:
|
|
164
|
+
return "no_match_to_replace", replace_key or entry["ID"]
|
|
155
165
|
with path.open("a") as f:
|
|
156
166
|
f.write("\n" + entry_to_bibtex(entry))
|
|
157
167
|
return "added", entry["ID"]
|
|
158
168
|
|
|
159
|
-
|
|
160
|
-
db
|
|
161
|
-
|
|
169
|
+
if replace_key:
|
|
170
|
+
existing = next((e for e in db.entries if e.get("ID") == replace_key), None)
|
|
171
|
+
else:
|
|
172
|
+
existing = find_existing(
|
|
173
|
+
db, entry.get("title", ""), entry_arxiv_id(entry), entry.get("doi", "")
|
|
174
|
+
)
|
|
175
|
+
|
|
162
176
|
if existing is not None:
|
|
163
177
|
upgrade = is_preprint(existing) and not is_preprint(entry)
|
|
164
|
-
if replace or upgrade:
|
|
178
|
+
if replace or replace_key or upgrade:
|
|
165
179
|
key = existing["ID"]
|
|
166
180
|
existing.clear()
|
|
167
181
|
existing.update({k: str(v) for k, v in entry.items() if v})
|
|
168
182
|
existing["ID"] = key # keep the key the user may already \cite
|
|
169
183
|
_write_db(path, db)
|
|
170
|
-
return ("replaced" if replace else "upgraded"), key
|
|
184
|
+
return ("replaced" if (replace or replace_key) else "upgraded"), key
|
|
171
185
|
return "exists", existing["ID"]
|
|
172
186
|
|
|
187
|
+
if replace or replace_key:
|
|
188
|
+
return "no_match_to_replace", replace_key or entry["ID"]
|
|
189
|
+
|
|
173
190
|
db.entries.append({k: str(v) for k, v in entry.items() if v})
|
|
174
191
|
_write_db(path, db)
|
|
175
192
|
return "added", entry["ID"]
|
|
@@ -190,6 +207,12 @@ def remove_entry(path: Path, key: str) -> bool:
|
|
|
190
207
|
|
|
191
208
|
|
|
192
209
|
def _write_db(path: Path, db: BibDatabase):
|
|
210
|
+
# Never write our injected month macros back out as @string blocks (they
|
|
211
|
+
# exist only so parsing month=June doesn't crash); this also scrubs any
|
|
212
|
+
# that leaked into a file before this guard existed. User-defined
|
|
213
|
+
# @strings are untouched.
|
|
214
|
+
for k in MONTH_STRINGS:
|
|
215
|
+
db.strings.pop(k, None)
|
|
193
216
|
writer = BibTexWriter()
|
|
194
217
|
writer.indent = " "
|
|
195
218
|
writer.order_entries_by = None # preserve file order; tidy re-sorts anyway
|
|
@@ -12,7 +12,8 @@ import time
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
|
|
14
14
|
from . import bibfile, cache
|
|
15
|
-
from .normalize import first_author_last_name, norm_title
|
|
15
|
+
from .normalize import first_author_last_name, norm_title, titles_similar
|
|
16
|
+
from .resolve import classify
|
|
16
17
|
from .resolve import (
|
|
17
18
|
NotFound,
|
|
18
19
|
Resolved,
|
|
@@ -107,50 +108,107 @@ def _resolve_user_bibtex(text: str) -> Resolved:
|
|
|
107
108
|
entry.pop("journal", None)
|
|
108
109
|
entry["ENTRYTYPE"] = canonical.entry_type
|
|
109
110
|
entry[canonical.bib_field] = canonical.name
|
|
110
|
-
|
|
111
|
+
published = not bibfile.is_preprint(entry)
|
|
112
|
+
return Resolved(
|
|
113
|
+
entry,
|
|
114
|
+
"user-bibtex",
|
|
115
|
+
(canonical.name if canonical else raw_venue) if published else "",
|
|
116
|
+
published,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _local_exists(path: Path, query: str) -> str | None:
    """Look ``query`` up in the bib file at ``path`` without any network use.

    Returns the citation key of a matching entry only when that entry is
    not a preprint, so callers can skip online resolution for it (fast
    ``--from`` re-runs and repeated adds). Returns None when the file
    cannot be loaded or has no entries, when nothing matches, or when the
    match is a preprint -- preprints still resolve online because they may
    be upgradable.
    """
    db = bibfile.load_bib_file(path)
    if db is None or not db.entries:
        return None

    # classify() tells us which find_existing() argument the query feeds:
    # an arXiv id, a DOI, or (fallback) a title.
    kind, value = classify(query)
    if kind == "arxiv":
        title, identifiers = "", {"arxiv_id": value}
    elif kind == "doi":
        title, identifiers = "", {"doi": value}
    else:
        title, identifiers = value, {}

    hit = bibfile.find_existing(db, title, **identifiers)
    if hit is None or bibfile.is_preprint(hit):
        return None
    return hit["ID"]
|
|
111
137
|
|
|
112
138
|
|
|
113
139
|
def cmd_add(args) -> int:
|
|
114
140
|
path = Path(args.file)
|
|
115
141
|
if args.no_cache:
|
|
116
142
|
cache.DISABLED = True
|
|
143
|
+
targeting = args.replace or bool(args.key)
|
|
144
|
+
if args.key and args.from_file:
|
|
145
|
+
_log("[bibcite] --key targets one entry; it cannot be combined with --from")
|
|
146
|
+
return EXIT_NOT_FOUND
|
|
117
147
|
|
|
118
148
|
# Collect the queries for this invocation (single, --bibtex, or --from).
|
|
149
|
+
# Each item: (query, resolved_or_None, exit_code, local_exists_key).
|
|
150
|
+
items: list[tuple[str, Resolved | None, int, str]] = []
|
|
119
151
|
if args.bibtex:
|
|
120
152
|
text = sys.stdin.read() if args.bibtex == "-" else args.bibtex
|
|
121
153
|
try:
|
|
122
|
-
|
|
154
|
+
items.append(("<bibtex>", _resolve_user_bibtex(text), 0, ""))
|
|
123
155
|
except ValueError as e:
|
|
124
156
|
_log(f"[bibcite] {e}")
|
|
125
157
|
return EXIT_NOT_FOUND
|
|
126
158
|
elif args.from_file:
|
|
127
159
|
lines = Path(args.from_file).read_text().splitlines()
|
|
128
160
|
queries = [q.strip() for q in lines if q.strip() and not q.strip().startswith("#")]
|
|
129
|
-
|
|
161
|
+
resolved_any = False
|
|
130
162
|
for i, q in enumerate(queries):
|
|
131
|
-
if
|
|
163
|
+
local = None if targeting else _local_exists(path, q)
|
|
164
|
+
if local:
|
|
165
|
+
_log(f"[bibcite] ({i + 1}/{len(queries)}) {q} — already in file: {local}")
|
|
166
|
+
items.append((q, None, 0, local))
|
|
167
|
+
continue
|
|
168
|
+
if resolved_any:
|
|
132
169
|
time.sleep(1) # one process shares the rate-limit breaker; stay polite
|
|
170
|
+
resolved_any = True
|
|
133
171
|
_log(f"[bibcite] ({i + 1}/{len(queries)}) {q}")
|
|
134
172
|
res, code = _resolve_or_none(q, args.require_published)
|
|
135
|
-
|
|
173
|
+
items.append((q, res, code, ""))
|
|
136
174
|
else:
|
|
137
175
|
if not args.query:
|
|
138
176
|
_log("[bibcite] provide a query (arXiv id / DOI / title), --bibtex, or --from")
|
|
139
177
|
return EXIT_NOT_FOUND
|
|
140
178
|
query = " ".join(args.query)
|
|
179
|
+
local = None if targeting else _local_exists(path, query)
|
|
180
|
+
if local:
|
|
181
|
+
_log(f"[bibcite] already in file (matched locally, no network): {local}")
|
|
182
|
+
_emit({"action": "exists", "key": local, "file": str(path), "tidied": False})
|
|
183
|
+
return 0
|
|
141
184
|
res, code = _resolve_or_none(query, args.require_published)
|
|
142
185
|
if res is None:
|
|
143
186
|
return code
|
|
144
|
-
|
|
187
|
+
items.append((query, res, 0, ""))
|
|
145
188
|
|
|
146
189
|
# Write all entries first, tidy once, then read back the final keys.
|
|
147
190
|
results = []
|
|
148
191
|
wrote = False
|
|
149
|
-
for query, res, code in
|
|
192
|
+
for query, res, code, local_key in items:
|
|
193
|
+
if local_key:
|
|
194
|
+
results.append({"query": query, "action": "exists", "key": local_key})
|
|
195
|
+
continue
|
|
150
196
|
if res is None:
|
|
151
197
|
results.append({"query": query, "action": "failed", "exit_code": code})
|
|
152
198
|
continue
|
|
153
|
-
action, key = bibfile.upsert_entry(
|
|
199
|
+
action, key = bibfile.upsert_entry(
|
|
200
|
+
path, res.entry, replace=args.replace, replace_key=args.key or ""
|
|
201
|
+
)
|
|
202
|
+
if action == "no_match_to_replace":
|
|
203
|
+
# A replace that matches nothing must fail loudly, never silently
|
|
204
|
+
# add a duplicate entry.
|
|
205
|
+
_log(
|
|
206
|
+
f"[bibcite] no matching entry to replace for '{query}'"
|
|
207
|
+
+ (f" (key: {args.key})" if args.key else "")
|
|
208
|
+
+ " — nothing written. Use `bibcite add --key <existing-key>` to target one."
|
|
209
|
+
)
|
|
210
|
+
results.append({"query": query, "action": action, "exit_code": EXIT_NOT_FOUND})
|
|
211
|
+
continue
|
|
154
212
|
wrote = wrote or action != "exists"
|
|
155
213
|
results.append(
|
|
156
214
|
{
|
|
@@ -246,6 +304,10 @@ def _upgrade_entries(path: Path, dry_run: bool) -> dict:
|
|
|
246
304
|
entry["year"] = match.year
|
|
247
305
|
if match.doi and not entry.get("doi"):
|
|
248
306
|
entry["doi"] = match.doi
|
|
307
|
+
if match.title:
|
|
308
|
+
# Camera-ready titles drift from arXiv ones; the published
|
|
309
|
+
# title is the correct one to cite.
|
|
310
|
+
entry["title"] = match.title
|
|
249
311
|
changed += 1
|
|
250
312
|
report.append(
|
|
251
313
|
{
|
|
@@ -294,11 +356,30 @@ def _check_problems(path: Path) -> tuple[int, list] | None:
|
|
|
294
356
|
return None
|
|
295
357
|
problems = []
|
|
296
358
|
seen_titles: dict[str, str] = {}
|
|
359
|
+
by_author: dict[str, list[tuple[str, str]]] = {} # lastname -> [(key, title)]
|
|
297
360
|
for entry in db.entries:
|
|
298
361
|
key = entry.get("ID", "?")
|
|
299
362
|
nt = norm_title(entry.get("title", ""))
|
|
300
363
|
if nt and nt in seen_titles:
|
|
301
364
|
problems.append({"key": key, "issue": f"duplicate title of {seen_titles[nt]}"})
|
|
365
|
+
elif nt:
|
|
366
|
+
# Near-duplicates (title drift: same first author, similar title)
|
|
367
|
+
# slip past exact matching — exactly how a failed replace plus a
|
|
368
|
+
# re-add pollutes a file.
|
|
369
|
+
last = (
|
|
370
|
+
first_author_last_name(entry["author"]) if entry.get("author") else ""
|
|
371
|
+
)
|
|
372
|
+
for other_key, other_title in by_author.get(last, []):
|
|
373
|
+
if titles_similar(entry.get("title", ""), other_title):
|
|
374
|
+
problems.append(
|
|
375
|
+
{
|
|
376
|
+
"key": key,
|
|
377
|
+
"issue": f"near-duplicate of {other_key} (title drift?)",
|
|
378
|
+
}
|
|
379
|
+
)
|
|
380
|
+
break
|
|
381
|
+
if last:
|
|
382
|
+
by_author.setdefault(last, []).append((key, entry.get("title", "")))
|
|
302
383
|
seen_titles.setdefault(nt, key)
|
|
303
384
|
for f in ("author", "title", "year"):
|
|
304
385
|
if not entry.get(f):
|
|
@@ -388,7 +469,8 @@ def main(argv=None) -> int:
|
|
|
388
469
|
a.add_argument("query", nargs="*", help="arXiv id / arXiv URL / DOI / title")
|
|
389
470
|
a.add_argument("--bibtex", help="raw BibTeX entry to add instead of a query ('-' reads stdin)")
|
|
390
471
|
a.add_argument("--from", dest="from_file", metavar="FILE", help="batch mode: one query per line (shares rate-limit state, tidies once)")
|
|
391
|
-
a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key)")
|
|
472
|
+
a.add_argument("--replace", action="store_true", help="overwrite an existing matching entry (keeps its citation key); errors if nothing matches")
|
|
473
|
+
a.add_argument("--key", metavar="KEY", help="replace exactly the entry with this citation key (for title drift)")
|
|
392
474
|
a.add_argument("--no-tidy", action="store_true")
|
|
393
475
|
a.add_argument("--no-cache", action="store_true", help="bypass the local match cache")
|
|
394
476
|
a.add_argument("--require-published", action="store_true")
|
|
@@ -76,6 +76,22 @@ def first_author_last_name(author_field: str) -> str:
|
|
|
76
76
|
return mini_hash(last) or "anon"
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def sig_tokens(title: str) -> set[str]:
    """Return the set of significant tokens of ``title``.

    The title is ASCII-folded and lower-cased, split on every run of
    non-alphanumeric characters, and then stripped of tokens that are two
    characters or shorter or that appear in ``ENGLISH_STOPWORDS``.
    """
    lowered = fold_ascii(title).lower()
    significant: set[str] = set()
    for word in re.split(r"[^a-z0-9]+", lowered):
        # Short fragments (including the empty strings split() yields at
        # the edges) and stopwords carry no signal.
        if len(word) <= 2 or word in ENGLISH_STOPWORDS:
            continue
        significant.add(word)
    return significant
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def titles_similar(a: str, b: str, threshold: float = 0.7) -> bool:
    """Decide whether two titles are the same paper under title drift.

    Compares the significant-token sets of both titles with the Jaccard
    index (shared tokens / all distinct tokens) and reports True when it
    reaches ``threshold``. Meant to catch preprint -> camera-ready drift
    ("Information-Theoretic Perspective" vs "Information Theory
    Perspective") without matching genuinely different papers. A title
    with no significant tokens is never similar to anything.
    """
    left = sig_tokens(a)
    right = sig_tokens(b)
    if not (left and right):
        return False
    shared = len(left & right)
    combined = len(left | right)
    return shared / combined >= threshold
|
|
93
|
+
|
|
94
|
+
|
|
79
95
|
def fix_author_caps(author_field: str) -> str:
|
|
80
96
|
"""Normalize ALL-CAPS author names (old CrossRef records store e.g.
|
|
81
97
|
"EPPS, T. W. and PULLEY, LAWRENCE B."). A word is re-cased only when it
|
|
@@ -16,7 +16,7 @@ from dataclasses import dataclass, field
|
|
|
16
16
|
|
|
17
17
|
import httpx
|
|
18
18
|
|
|
19
|
-
from .normalize import clean_title, mini_hash, norm_title
|
|
19
|
+
from .normalize import clean_title, mini_hash, norm_title, sig_tokens, titles_similar
|
|
20
20
|
|
|
21
21
|
UA = "bibcite/0.1 (https://github.com/leonardo/bibcite; mailto:bibcite@gmail.com)"
|
|
22
22
|
BROWSER_UA = (
|
|
@@ -198,6 +198,73 @@ def try_dblp(title: str, author_hint: str = "") -> Match | None:
|
|
|
198
198
|
return None
|
|
199
199
|
|
|
200
200
|
|
|
201
|
+
def _dblp_hit_authors(info: dict) -> list[str]:
    """Extract the author names from one DBLP search hit's ``info`` dict.

    Reads ``info["authors"]["author"]``, which may be missing, a single
    author, or a list of authors. Each author is accepted either as a dict
    carrying the name under ``"text"`` or as a plain string (NOTE(review):
    the plain-string shape is handled defensively; confirm against the
    DBLP API responses actually seen). Items of any other type are
    skipped.

    Returns:
        Author names in hit order. A dict without a usable ``"text"``
        contributes an empty string, so the result is always safe to pass
        to ``" ".join``.
    """
    authors = (info.get("authors") or {}).get("author") or []
    # A lone author is not wrapped in a list; normalise so the loop below
    # sees one shape. Without the ``str`` case a single name would be
    # iterated character by character and then dropped entirely.
    if isinstance(authors, (dict, str)):
        authors = [authors]
    names: list[str] = []
    for author in authors:
        if isinstance(author, dict):
            names.append(str(author.get("text") or ""))
        elif isinstance(author, str):
            names.append(author)
    return names
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _year_as_int(value, default=None):
    """Return ``value`` converted to int, or ``default`` if it is missing or not an integer."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def try_dblp_fuzzy(title: str, author_hint: str, year: str = "") -> Match | None:
    """Title-drift fallback: camera-ready titles often differ from the arXiv
    ones ("Information-Theoretic" -> "Information Theory"), and DBLP's
    token-AND search then misses entirely. Query author + the most
    distinctive title tokens instead, and accept token-Jaccard-similar
    titles -- guarded by author and year so different papers can't sneak in.

    Args:
        title: Title to look for (typically the preprint title).
        author_hint: Author token added to the query and required to occur
            in the mini-hashed author list of an accepted hit. Without it
            no request is made.
        year: Optional reference year; hits more than two years away are
            rejected. A value that is not an integer disables the year
            guard instead of aborting the search.

    Returns:
        A ``Match`` with ``source="dblp-fuzzy"`` for the earliest
        acceptable hit, or None when nothing qualifies.

    Raises:
        SourceUnavailable: DBLP answered 429 (rate-limited).
        Other HTTP failures propagate from ``raise_for_status()``.
    """
    if not author_hint:
        return None
    # Longest tokens are the most distinctive. Ties are broken
    # alphabetically so the query (and anything keyed on it) is identical
    # from run to run; plain set order is not.
    tokens = sorted(sig_tokens(title), key=lambda t: (-len(t), t))[:3]
    if not tokens:
        return None
    q = " ".join([author_hint] + tokens)
    wanted_year = _year_as_int(year) if year else None
    with _client() as c:
        r = c.get(
            "https://dblp.org/search/publ/api",
            params={"q": q, "format": "json", "h": 100},
        )
        if r.status_code == 429:
            raise SourceUnavailable("DBLP rate-limited (429)")
        r.raise_for_status()
        hits = r.json().get("result", {}).get("hits", {}).get("hit", []) or []
        # Earliest hit first; hits with a missing or unparseable year sort
        # last rather than raising and killing the whole fallback.
        hits.sort(key=lambda h: _year_as_int(h.get("info", {}).get("year"), 9999))
        for hit in hits:
            info = hit.get("info", {})
            hit_title = clean_title(html.unescape(info.get("title", "")))
            # Unwrap a list-valued venue BEFORE filtering, so ["CoRR"] is
            # rejected as a preprint venue just like "CoRR".
            venue = info.get("venue")
            if isinstance(venue, list):
                venue = venue[0] if venue else ""
            if not venue or venue == "CoRR":
                continue
            if not titles_similar(hit_title, title):
                continue
            hit_year = _year_as_int(info.get("year")) if info.get("year") else None
            if wanted_year is not None and hit_year is not None:
                if abs(hit_year - wanted_year) > 2:
                    continue
            hit_authors = mini_hash(" ".join(_dblp_hit_authors(info)))
            if author_hint not in hit_authors:
                continue
            bibtex = ""
            if info.get("url"):
                # Best effort: a missing .bib export leaves bibtex empty.
                br = c.get(info["url"] + ".bib")
                if br.status_code == 200:
                    bibtex = br.text
            _log(
                f"[dblp-fuzzy] match with title drift: '{hit_title}' "
                f"@ {venue} {info.get('year', '')}"
            )
            return Match(
                source="dblp-fuzzy",
                venue=str(venue),
                title=hit_title,
                year=str(info.get("year", "")),
                doi=info.get("doi", ""),
                bibtex=bibtex,
                url=info.get("ee", "") or info.get("url", ""),
            )
    return None
|
|
266
|
+
|
|
267
|
+
|
|
201
268
|
# ---------------------------------------------------------------------------
|
|
202
269
|
# Semantic Scholar
|
|
203
270
|
# ---------------------------------------------------------------------------
|
|
@@ -627,4 +694,19 @@ def find_published(
|
|
|
627
694
|
_log(f"[{name}] disabled for the rest of this run: {e}")
|
|
628
695
|
except Exception as e: # network hiccup on one source must not kill the run
|
|
629
696
|
_log(f"[{name}] error: {type(e).__name__}: {e}")
|
|
697
|
+
|
|
698
|
+
# Exact-title search missed everywhere. Before concluding "no published
|
|
699
|
+
# version", try the title-drift fallback — camera-ready titles frequently
|
|
700
|
+
# differ from the arXiv ones, which is precisely the upgrade scenario.
|
|
701
|
+
if author_hint and "dblp" not in _DISABLED:
|
|
702
|
+
try:
|
|
703
|
+
m = try_dblp_fuzzy(title, author_hint, year)
|
|
704
|
+
if m:
|
|
705
|
+
cache.put(cache_key, m.__dict__)
|
|
706
|
+
return m, "found"
|
|
707
|
+
clean_misses += 1
|
|
708
|
+
except SourceUnavailable as e:
|
|
709
|
+
_DISABLED["dblp"] = str(e)
|
|
710
|
+
except Exception as e:
|
|
711
|
+
_log(f"[dblp-fuzzy] error: {type(e).__name__}: {e}")
|
|
630
712
|
return None, ("not_found" if clean_misses else "unavailable")
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Regression tests for the second round of field-use bug reports."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from bibcite.bibfile import MONTH_STRINGS, load_bib_file, upsert_entry, _write_db
|
|
6
|
+
from bibcite.normalize import titles_similar
|
|
7
|
+
|
|
8
|
+
ARXIV_TITLE = "An Information-Theoretic Perspective on Variance-Invariance-Covariance Regularization"
|
|
9
|
+
PUBLISHED_TITLE = "An Information Theory Perspective on Variance-Invariance-Covariance Regularization"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_titles_similar_catches_camera_ready_drift():
    """The arXiv and published titles of the same paper count as similar."""
    drift_detected = titles_similar(ARXIV_TITLE, PUBLISHED_TITLE)
    assert drift_detected
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_titles_similar_rejects_different_papers():
    """Unrelated titles, and an empty title, must not be reported as similar."""
    unrelated_pair = (
        "Attention Is All You Need",
        "An Image is Worth 16x16 Words: Transformers for Image Recognition",
    )
    assert not titles_similar(*unrelated_pair)
    # An empty title must never match a real one.
    assert not titles_similar("Deep Residual Learning", "")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
ENTRY = {
|
|
25
|
+
"ENTRYTYPE": "inproceedings",
|
|
26
|
+
"ID": "k1",
|
|
27
|
+
"title": "Paper One",
|
|
28
|
+
"author": "A B",
|
|
29
|
+
"booktitle": "Some Conference (SC)",
|
|
30
|
+
"year": "2020",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_replace_without_match_errors_instead_of_adding(tmp_path: Path):
    """``replace=True`` with no matching entry reports an error and writes nothing."""
    bib_path = tmp_path / "r.bib"
    upsert_entry(bib_path, dict(ENTRY))

    # Same author/venue, but a different key and an unrelated title.
    unrelated = {**ENTRY, "ID": "k2", "title": "A Totally Different Paper"}
    action, _ = upsert_entry(bib_path, unrelated, replace=True)

    assert action == "no_match_to_replace"
    written = bib_path.read_text()
    assert "Totally Different" not in written  # nothing was written
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_replace_key_targets_specific_entry(tmp_path: Path):
    """``replace_key`` overwrites exactly the named entry and keeps its key."""
    bib_path = tmp_path / "r.bib"
    upsert_entry(bib_path, dict(ENTRY))
    drifted = {**ENTRY, "ID": "whatever", "title": "Paper One Revised Title"}

    # Targeting the existing key replaces it, and the incoming ID is ignored.
    outcome = upsert_entry(bib_path, drifted, replace_key="k1")
    assert outcome == ("replaced", "k1")
    assert "Paper One Revised Title" in bib_path.read_text()

    # Targeting a key that is not in the file is an error.
    action, _ = upsert_entry(bib_path, drifted, replace_key="nonexistent")
    assert action == "no_match_to_replace"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_month_strings_never_written_to_file(tmp_path: Path):
    """Month ``@string`` macros already present in a file are scrubbed on write."""
    bib_path = tmp_path / "m.bib"
    # Simulate a file polluted by the old bug: @string month macros present.
    polluted = (
        '@string{january = {January}}\n'
        '@article{x, title = {T}, author = {A B}, year = {2000}, month = january }\n'
    )
    bib_path.write_text(polluted)

    # Round-trip: load, then write straight back.
    _write_db(bib_path, load_bib_file(bib_path))

    rewritten = bib_path.read_text()
    assert "@string" not in rewritten  # scrubbed on write
    assert "title" in rewritten
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_month_strings_cover_all_months():
    """``MONTH_STRINGS`` contains both full and three-letter month names."""
    sampled = ("january", "may", "june", "december", "jan", "jun", "dec")
    missing = [name for name in sampled if name not in MONTH_STRINGS]
    assert not missing
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|