citefinder 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citefinder-0.3.0/.claude/settings.local.json +69 -0
- citefinder-0.3.0/.claude/skills/use-citefinder/SKILL.md +185 -0
- citefinder-0.3.0/.github/dependabot.yml +10 -0
- citefinder-0.3.0/.github/pull_request_template.md +5 -0
- citefinder-0.3.0/.github/workflows/publish.yml +31 -0
- citefinder-0.3.0/.github/workflows/test.yml +26 -0
- citefinder-0.3.0/.gitignore +9 -0
- citefinder-0.3.0/.pre-commit-config.yaml +16 -0
- citefinder-0.3.0/.python-version +1 -0
- citefinder-0.3.0/CHANGELOG.md +20 -0
- citefinder-0.3.0/CLAUDE.md +28 -0
- citefinder-0.3.0/LICENSE +22 -0
- citefinder-0.3.0/PKG-INFO +197 -0
- citefinder-0.3.0/README.md +184 -0
- citefinder-0.3.0/TODO.md +3 -0
- citefinder-0.3.0/citefinder/__init__.py +13 -0
- citefinder-0.3.0/citefinder/_base.py +85 -0
- citefinder-0.3.0/citefinder/cache.py +52 -0
- citefinder-0.3.0/citefinder/cli.py +132 -0
- citefinder-0.3.0/citefinder/client.py +53 -0
- citefinder-0.3.0/citefinder/openalex.py +150 -0
- citefinder-0.3.0/citefinder/py.typed +0 -0
- citefinder-0.3.0/docs/README.md +13 -0
- citefinder-0.3.0/docs/guides/.gitkeep +0 -0
- citefinder-0.3.0/docs/plans/.gitkeep +0 -0
- citefinder-0.3.0/docs/plans/000-add-openalex-client.md +100 -0
- citefinder-0.3.0/pyproject.toml +54 -0
- citefinder-0.3.0/tests/__init__.py +0 -0
- citefinder-0.3.0/tests/conftest.py +19 -0
- citefinder-0.3.0/tests/test_cache.py +44 -0
- citefinder-0.3.0/tests/test_client.py +153 -0
- citefinder-0.3.0/tests/test_openalex.py +252 -0
- citefinder-0.3.0/uv.lock +699 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(date:*)",
|
|
5
|
+
"Bash(diff:*)",
|
|
6
|
+
"Bash(du:*)",
|
|
7
|
+
"Bash(file:*)",
|
|
8
|
+
"Bash(find:*)",
|
|
9
|
+
"Bash(gh api:*)",
|
|
10
|
+
"Bash(gh issue:*)",
|
|
11
|
+
"Bash(gh pr create:*)",
|
|
12
|
+
"Bash(gh pr diff:*)",
|
|
13
|
+
"Bash(gh pr list:*)",
|
|
14
|
+
"Bash(gh pr merge:*)",
|
|
15
|
+
"Bash(gh pr view:*)",
|
|
16
|
+
"Bash(gh repo:*)",
|
|
17
|
+
"Bash(git add:*)",
|
|
18
|
+
"Bash(git branch:*)",
|
|
19
|
+
"Bash(git checkout:*)",
|
|
20
|
+
"Bash(git commit:*)",
|
|
21
|
+
"Bash(git config:*)",
|
|
22
|
+
"Bash(git diff:*)",
|
|
23
|
+
"Bash(git fetch:*)",
|
|
24
|
+
"Bash(git log:*)",
|
|
25
|
+
"Bash(git merge:*)",
|
|
26
|
+
"Bash(git mv:*)",
|
|
27
|
+
"Bash(git pull:*)",
|
|
28
|
+
"Bash(git remote:*)",
|
|
29
|
+
"Bash(git show:*)",
|
|
30
|
+
"Bash(git stash:*)",
|
|
31
|
+
"Bash(git status:*)",
|
|
32
|
+
"Bash(git switch:*)",
|
|
33
|
+
"Bash(git tag:*)",
|
|
34
|
+
"Bash(grep:*)",
|
|
35
|
+
"Bash(head:*)",
|
|
36
|
+
"Bash(jq:*)",
|
|
37
|
+
"Bash(ls:*)",
|
|
38
|
+
"Bash(sqlite3:*)",
|
|
39
|
+
"Bash(stanza:*)",
|
|
40
|
+
"Bash(test:*)",
|
|
41
|
+
"Bash(tree:*)",
|
|
42
|
+
"Bash(uv add:*)",
|
|
43
|
+
"Bash(uv build:*)",
|
|
44
|
+
"Bash(uv lock:*)",
|
|
45
|
+
"Bash(uv pip:*)",
|
|
46
|
+
"Bash(uv remove:*)",
|
|
47
|
+
"Bash(uv run python:*)",
|
|
48
|
+
"Bash(uv run:*)",
|
|
49
|
+
"Bash(uv sync:*)",
|
|
50
|
+
"Bash(wc:*)",
|
|
51
|
+
"Bash(which:*)",
|
|
52
|
+
"Bash(xxd:*)",
|
|
53
|
+
"Edit(.claude/**)"
|
|
54
|
+
],
|
|
55
|
+
"deny": [
|
|
56
|
+
"Bash(git clean:*)",
|
|
57
|
+
"Bash(git push --force:*)",
|
|
58
|
+
"Bash(git reset --hard:*)",
|
|
59
|
+
"Bash(rm -rf:*)"
|
|
60
|
+
],
|
|
61
|
+
"ask": [
|
|
62
|
+
"Bash(git checkout .:*)",
|
|
63
|
+
"Bash(git push:*)",
|
|
64
|
+
"Bash(git rebase:*)",
|
|
65
|
+
"Bash(git restore:*)",
|
|
66
|
+
"Bash(rm:*)"
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: use-citefinder
|
|
3
|
+
description: Look up DOIs, search Crossref or OpenAlex, and resolve book chapters with `citefinder` — a small Crossref + OpenAlex client with a JSONL cache that survives sessions and remembers 404s. Use this whenever the user wants to verify a DOI, find a paper by author + title, check whether a citation is real, resolve a chapter DOI, look up an arXiv/preprint DOI Crossref doesn't index, or generate canonical metadata for a reference list — even when they don't say "Crossref" or "DOI" explicitly. Phrases like "is this paper real?", "find the published version", "look up this citation", "the subagent gave me these papers — verify them", or "what's the DOI for X?" should trigger it.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Use citefinder
|
|
7
|
+
|
|
8
|
+
`citefinder` (https://github.com/gitronald/citefinder) is a small Python library + CLI for Crossref **and OpenAlex** lookups, with a JSONL-backed cache. Use it instead of raw `curl https://api.crossref.org/...` because:
|
|
9
|
+
|
|
10
|
+
- The cache survives sessions, so re-running verification is cheap.
|
|
11
|
+
- 404s are cached, so known-missing DOIs don't get re-queried.
|
|
12
|
+
- The cache is JSONL (one record per line) — `grep`-able, diffable, and crash-safe.
|
|
13
|
+
- It exposes both a Python API (for batch work, scripts, notebooks) and a CLI (for ad-hoc lookups).
|
|
14
|
+
|
|
15
|
+
## When to use this skill
|
|
16
|
+
|
|
17
|
+
- Verifying that a DOI resolves to the paper the user expects (the most common need).
|
|
18
|
+
- Finding the canonical / published DOI from an arXiv ID, SSRN URL, preprint title, or an `(Author Year)` inline citation.
|
|
19
|
+
- Resolving a book chapter DOI when you only have the book's DOI and a chapter number.
|
|
20
|
+
- Sanity-checking a list of references produced by a research subagent or extracted from a PDF.
|
|
21
|
+
- Building or enriching a bibliography (`.bib`, CSV) from an outline.
|
|
22
|
+
|
|
23
|
+
If the user describes a multi-step Zotero/bibliography workflow, also load the `resolve-zotero-references` skill — it composes citefinder with Zotero matching and a verification loop.
|
|
24
|
+
|
|
25
|
+
## Install / availability check
|
|
26
|
+
|
|
27
|
+
If citefinder isn't already a dependency:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv add citefinder # or: uv add git+https://github.com/gitronald/citefinder
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Confirm it's wired:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
uv run citefinder --help
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Three core operations
|
|
40
|
+
|
|
41
|
+
### 1. Verify a single DOI
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from citefinder import CrossrefClient
|
|
45
|
+
|
|
46
|
+
client = CrossrefClient(cache_path="~/.cache/citefinder/crossref.jsonl")
|
|
47
|
+
work = client.lookup_doi("10.1126/science.aap9559")
|
|
48
|
+
if work is None:
|
|
49
|
+
# 404 — the DOI doesn't resolve. May be fabricated, mistyped, or too new for Crossref's index.
|
|
50
|
+
...
|
|
51
|
+
else:
|
|
52
|
+
print(work["title"][0])
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
CLI:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
citefinder crossref doi 10.1126/science.aap9559
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Always compare the returned title to the title you expected.** This is the single most important habit. Subagents and PDF extractors regularly produce DOIs that are *off by a few characters* in the suffix (e.g., `psrm.2025.14` vs `psrm.2025.10063`) — those wrong suffixes often resolve to a real-but-different paper in the same journal. The DOI lookup itself returns 200; only a title comparison catches it.
|
|
62
|
+
|
|
63
|
+
### 2. Search bibliographically
|
|
64
|
+
|
|
65
|
+
When you don't have a DOI (or the DOI you have is suspect), search by free-form text:
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
hits = client.search_bibliographic(
|
|
69
|
+
f"{first_author_last_name} {distinctive_title_words}",
|
|
70
|
+
rows=3,
|
|
71
|
+
)
|
|
72
|
+
for hit in hits:
|
|
73
|
+
print(hit["DOI"], "-", hit["title"][0])
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
CLI:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
citefinder crossref search "Wolfowicz hate speech meta-analysis" --rows 3
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Tips for good queries:
|
|
83
|
+
|
|
84
|
+
- First author's last name plus 2–4 distinctive title words is usually enough.
|
|
85
|
+
- Avoid generic words ("study", "analysis", "the") — they dilute the relevance score.
|
|
86
|
+
- For preprints, both an SSRN/arXiv DOI and a published DOI may come back. Prefer the published one unless the user wants the preprint.
|
|
87
|
+
|
|
88
|
+
### 3. Look up a book chapter
|
|
89
|
+
|
|
90
|
+
Many edited volumes follow the convention `{book_doi}.{NNN}` for chapter DOIs (e.g., `10.1017/9781108890960.005` for chapter 5).
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
chapter = client.lookup_book_chapter("10.1017/9781108890960", 5)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
CLI:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
citefinder crossref chapter 10.1017/9781108890960 5
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
`lookup_book_chapter` zero-pads numeric chapters to 3 digits. Pass a string instead (`client.lookup_book_chapter(book_doi, "ch1a")`) for publishers using a different format.
|
|
103
|
+
|
|
104
|
+
## Key behaviors to know
|
|
105
|
+
|
|
106
|
+
- **Cache path:** defaults to `~/.cache/citefinder/crossref.jsonl`. Use a project-local path (e.g., `data/crossref-cache.jsonl`) when you want results committed alongside an outline so collaborators don't re-query.
|
|
107
|
+
- **Latest value wins on replay.** Re-querying after a fix transparently overwrites — no manual cache invalidation needed.
|
|
108
|
+
- **`None` is a real cache value.** A cached `None` means "Crossref returned 404 for this DOI" — citefinder uses it to avoid re-hitting known-missing DOIs. If you suspect Crossref has now indexed a paper it didn't before, delete that line from the JSONL or use a fresh cache path.
|
|
109
|
+
- **`lookup_doi` returns the `message` payload directly,** not the full Crossref envelope. So you access `work["title"][0]`, not `work["message"]["title"][0]`.
|
|
110
|
+
- **`title` is a list, not a string.** Crossref returns titles as arrays. Use `work["title"][0]`.
|
|
111
|
+
- **`search_bibliographic` returns the items list,** which may be empty. Always handle the empty case.
|
|
112
|
+
|
|
113
|
+
## OpenAlex fallback for arXiv / preprint / thin-metadata DOIs
|
|
114
|
+
|
|
115
|
+
Crossref doesn't index arXiv DOIs (`10.48550/arXiv.*`) and many repository deposits — those return 404 from `lookup_doi`. Crossref also frequently has thin metadata (missing abstract, abbreviated title, no affiliations) on records that exist. Use OpenAlex as the second source in those cases:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from citefinder import CrossrefClient, OpenAlexClient, is_arxiv_doi
|
|
119
|
+
|
|
120
|
+
crossref = CrossrefClient(cache_path="~/.cache/citefinder/crossref.jsonl")
|
|
121
|
+
openalex = OpenAlexClient(
|
|
122
|
+
cache_path="~/.cache/citefinder/openalex.jsonl",
|
|
123
|
+
mailto="you@example.com", # opts into OpenAlex's polite pool — faster, higher daily quota
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
doi = "10.48550/arXiv.2410.21554"
|
|
127
|
+
if is_arxiv_doi(doi):
|
|
128
|
+
work = openalex.lookup_doi(doi) # arXiv DOIs go straight to OpenAlex
|
|
129
|
+
else:
|
|
130
|
+
work = crossref.lookup_doi(doi) or openalex.lookup_doi(doi) # Crossref-first, OpenAlex fallback
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
CLI:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
citefinder doi 10.48550/arXiv.2410.21554
|
|
137
|
+
citefinder search "fact-checking large language models"
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
OpenAlex's schema differs from Crossref — different keys for the same data:
|
|
141
|
+
|
|
142
|
+
| Crossref | OpenAlex |
|
|
143
|
+
|---|---|
|
|
144
|
+
| `work["title"][0]` (+ `subtitle[0]`) | `work["display_name"]` |
|
|
145
|
+
| `work["author"][0]["family"]` | `work["authorships"][0]["author"]["display_name"]` |
|
|
146
|
+
| `work["container-title"][0]` | `work["primary_location"]["source"]["display_name"]` |
|
|
147
|
+
| `work["published-print"]["date-parts"][0][0]` (fall back to `published-online` / `issued` / `created` when absent) | `work["publication_year"]` |
|
|
148
|
+
|
|
149
|
+
OpenAlex stores abstracts as an `abstract_inverted_index` (`{word: [positions]}`), not a string. Use the helper:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from citefinder import reconstruct_abstract
|
|
153
|
+
abstract = reconstruct_abstract(work) # returns plain string or None
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### OpenAlex API key (optional, for higher rate limits)
|
|
157
|
+
|
|
158
|
+
`OpenAlexClient` reads the API key in this order: explicit `api_key=...` arg → `OPENALEX_API_KEY` env var → (CLI only) `.env` in CWD or any parent. The key is sent as `Authorization: Bearer ...`, never in the URL or cache key.
|
|
159
|
+
|
|
160
|
+
For ad-hoc lookups, no key is needed — common-pool requests work fine. If the user has a key set in their env or `.env`, the CLI picks it up automatically:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# .env in the project (gitignored)
|
|
164
|
+
OPENALEX_API_KEY=oa_pk_...
|
|
165
|
+
|
|
166
|
+
citefinder doi 10.48550/arXiv.2410.21554 # uses key from .env automatically
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
For programmatic library use, `.env` is *not* auto-loaded — pass `api_key=...` explicitly or set the env var before constructing the client.
|
|
170
|
+
|
|
171
|
+
### Picking `mailto`
|
|
172
|
+
|
|
173
|
+
Use a project alias (e.g. the `authors` email in `pyproject.toml`) or omit entirely. Don't drop the user's personal email into `mailto` without asking — it's an outbound identifier, and a project/noreply address is the right default.
|
|
174
|
+
|
|
175
|
+
## When citefinder isn't enough
|
|
176
|
+
|
|
177
|
+
For **generating formatted BibTeX strings** from a DOI or query, use [`fetchbib`](https://github.com/mr-devs/fetchbib) (`fbib`) instead — it handles doi.org content negotiation, arXiv routing, and BibTeX-flavored config (protect titles, exclude ISSN, etc.). citefinder returns raw JSON for verification; fetchbib emits paste-ready BibTeX for citation lists.
|
|
178
|
+
|
|
179
|
+
Drop down to raw HTTP (`requests.get("https://api.crossref.org/...")`) only if you need:
|
|
180
|
+
|
|
181
|
+
- Crossref or OpenAlex endpoints citefinder doesn't wrap (Crossref `/funders`, `/journals`, `/types`; OpenAlex `/authors`, `/institutions`, `/sources`).
|
|
182
|
+
- A one-off query you specifically don't want cached.
|
|
183
|
+
- Streaming through large result sets via `cursor` pagination.
|
|
184
|
+
|
|
185
|
+
For everything else, prefer citefinder so the cache stays the single source of truth across sessions.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v6
|
|
12
|
+
- uses: astral-sh/setup-uv@v8.1.0
|
|
13
|
+
- run: uv build
|
|
14
|
+
- uses: actions/upload-artifact@v7
|
|
15
|
+
with:
|
|
16
|
+
name: dist
|
|
17
|
+
path: dist/
|
|
18
|
+
|
|
19
|
+
publish:
|
|
20
|
+
needs: build
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
environment: pypi
|
|
23
|
+
permissions:
|
|
24
|
+
id-token: write
|
|
25
|
+
contents: read
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/download-artifact@v8
|
|
28
|
+
with:
|
|
29
|
+
name: dist
|
|
30
|
+
path: dist/
|
|
31
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [dev, main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [dev, main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
strategy:
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.11", "3.12", "3.13", "3.14"]
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v6
|
|
20
|
+
- uses: astral-sh/setup-uv@v8.1.0
|
|
21
|
+
- run: uv python install ${{ matrix.python-version }}
|
|
22
|
+
- run: uv sync --all-groups --python ${{ matrix.python-version }}
|
|
23
|
+
- run: uv run ruff check .
|
|
24
|
+
- run: uv run ruff format --check .
|
|
25
|
+
- run: uv run pyrefly check
|
|
26
|
+
- run: uv run pytest --cov --cov-report=term-missing
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.15.5
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff-format
|
|
6
|
+
- id: ruff
|
|
7
|
+
args: [--fix]
|
|
8
|
+
- repo: local
|
|
9
|
+
hooks:
|
|
10
|
+
- id: pyrefly-check
|
|
11
|
+
name: pyrefly check
|
|
12
|
+
entry: uv run pyrefly check
|
|
13
|
+
language: system
|
|
14
|
+
types_or: [python, pyi]
|
|
15
|
+
pass_filenames: false
|
|
16
|
+
require_serial: true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.14
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
### Deprecated
|
|
15
|
+
|
|
16
|
+
### Removed
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
### Security
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Claude Settings
|
|
2
|
+
|
|
3
|
+
This file provides guidance to [Claude Code](https://claude.ai/code).
|
|
4
|
+
|
|
5
|
+
## Package Structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
citefinder/
|
|
9
|
+
├── cli.py # Typer CLI entry point
|
|
10
|
+
└── __init__.py
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Development
|
|
14
|
+
|
|
15
|
+
- Install: `uv sync --all-groups`
|
|
16
|
+
- Tests: `uv run pytest`
|
|
17
|
+
- Linting: pre-commit hooks run ruff format + lint on commit
|
|
18
|
+
- Type checking: pre-commit hooks run pyrefly on commit
|
|
19
|
+
- CI: GitHub Actions runs lint + type check + test matrix (Python 3.11–3.14) on push/PR to dev/main
|
|
20
|
+
|
|
21
|
+
## Release Automation
|
|
22
|
+
|
|
23
|
+
Use [stanza](https://github.com/gitronald/stanza) for release workflows:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
stanza release [patch|minor|major|prerelease]
|
|
27
|
+
stanza init
|
|
28
|
+
```
|
citefinder-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ronald E. Robertson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: citefinder
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Project-URL: repository, https://github.com/gitronald/citefinder
|
|
5
|
+
Author-email: gitronald <gitronald@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
10
|
+
Requires-Dist: requests>=2.33.1
|
|
11
|
+
Requires-Dist: typer>=0.25.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# citefinder
|
|
15
|
+
|
|
16
|
+
OpenAlex (default) + Crossref reference lookups with local JSONL caching.
|
|
17
|
+
|
|
18
|
+
A small Python library + CLI for verifying academic references against the
|
|
19
|
+
OpenAlex and Crossref APIs. Every lookup is appended to an append-only JSONL
|
|
20
|
+
log so repeated queries (across verification passes or sessions) are served
|
|
21
|
+
from the cache. Negative results (404s) are cached too, so known-missing DOIs
|
|
22
|
+
aren't re-hit.
|
|
23
|
+
|
|
24
|
+
OpenAlex is the default source: it merges Crossref + Unpaywall + ORCID + ROR
|
|
25
|
+
+ repository sources, so it covers what Crossref alone is missing — arXiv
|
|
26
|
+
DOIs (`10.48550/arXiv.*`), other preprints, repository deposits — and
|
|
27
|
+
frequently has richer metadata (abstracts, full author lists, affiliations)
|
|
28
|
+
for records that exist in both. Crossref is still available via the
|
|
29
|
+
`crossref` subcommand for its own workflows (book-chapter lookup, the
|
|
30
|
+
canonical published-deposit metadata).
|
|
31
|
+
|
|
32
|
+
### OpenAlex API key (optional)
|
|
33
|
+
|
|
34
|
+
OpenAlex works without authentication, but a free API key gives you higher
|
|
35
|
+
limits and tier-specific endpoints.
|
|
36
|
+
|
|
37
|
+
- Docs: https://developers.openalex.org/
|
|
38
|
+
- Sign up / generate a key: https://openalex.org/login?redirect=/settings/api-key
|
|
39
|
+
|
|
40
|
+
The key is read in this order:
|
|
41
|
+
|
|
42
|
+
1. `api_key=...` argument to `OpenAlexClient(...)` (or `--api-key` on the CLI).
|
|
43
|
+
2. `OPENALEX_API_KEY` environment variable.
|
|
44
|
+
3. A `.env` file in the current working directory or any parent (loaded by
|
|
45
|
+
the CLI; library users can opt in via `from dotenv import load_dotenv`).
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# .env
|
|
49
|
+
OPENALEX_API_KEY=oa_pk_...
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The key is sent as `Authorization: Bearer ...`, never as a URL parameter, so
|
|
53
|
+
it doesn't land in cache keys, logs, or referer headers.
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
uv add citefinder
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Or for development:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
git clone https://github.com/gitronald/citefinder
|
|
66
|
+
cd citefinder
|
|
67
|
+
uv sync
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Library usage
|
|
71
|
+
|
|
72
|
+
### OpenAlex (default)
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from citefinder import OpenAlexClient, is_arxiv_doi, reconstruct_abstract
|
|
76
|
+
|
|
77
|
+
openalex = OpenAlexClient(
|
|
78
|
+
cache_path="~/.cache/citefinder/openalex.jsonl",
|
|
79
|
+
mailto="you@example.com", # opts into OpenAlex's polite pool — faster, higher quota
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Single DOI (works for arXiv DOIs that Crossref doesn't index)
|
|
83
|
+
work = openalex.lookup_doi("10.48550/arXiv.2410.21554")
|
|
84
|
+
|
|
85
|
+
# Title-only search — tuned for citation verification. Handles OpenAlex's
|
|
86
|
+
# curly-apostrophe quirk and strips filter-reserved punctuation that would
|
|
87
|
+
# 400 the request, so straight ASCII inputs match curly-quoted indexed titles.
|
|
88
|
+
hits = openalex.search_title("Backstabber's Knife Collection", rows=3)
|
|
89
|
+
|
|
90
|
+
# Free-text search across titles + abstracts (noisier; prefer search_title
|
|
91
|
+
# for citation lookup)
|
|
92
|
+
hits = openalex.search("fact-checking large language models", rows=3)
|
|
93
|
+
|
|
94
|
+
# OpenAlex stores abstracts as an inverted index — reconstruct to plain text
|
|
95
|
+
abstract = reconstruct_abstract(work) if work else None
|
|
96
|
+
|
|
97
|
+
# Helper for routing logic
|
|
98
|
+
assert is_arxiv_doi("10.48550/arXiv.2410.21554")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
The `mailto` argument is optional but recommended: it puts requests into
|
|
102
|
+
OpenAlex's [polite pool](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool)
|
|
103
|
+
for faster responses. The cache key strips `mailto` so changing it doesn't
|
|
104
|
+
invalidate prior entries.
|
|
105
|
+
|
|
106
|
+
### Crossref
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from citefinder import CrossrefClient
|
|
110
|
+
|
|
111
|
+
client = CrossrefClient(
|
|
112
|
+
cache_path="~/.cache/citefinder/crossref.jsonl",
|
|
113
|
+
mailto="you@example.com", # opts into Crossref's polite pool — faster, higher quota
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# Single DOI
|
|
117
|
+
work = client.lookup_doi("10.1126/science.aap9559")
|
|
118
|
+
print(work["title"][0])
|
|
119
|
+
|
|
120
|
+
# Bibliographic search (author + title + year)
|
|
121
|
+
hits = client.search_bibliographic("Wolfowicz hate speech meta-analysis", rows=3)
|
|
122
|
+
|
|
123
|
+
# Book chapter via {book_doi}.{NNN} pattern
|
|
124
|
+
chapter = client.lookup_book_chapter("10.1017/9781108890960", 5)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Crossref and OpenAlex both honor `mailto` for their polite pools; the cache
|
|
128
|
+
key strips it on either side, so rotating the email doesn't invalidate prior
|
|
129
|
+
entries.
|
|
130
|
+
|
|
131
|
+
OpenAlex's schema differs from Crossref. Quick map:
|
|
132
|
+
|
|
133
|
+
| Field | Crossref | OpenAlex |
|
|
134
|
+
|---|---|---|
|
|
135
|
+
| Title | `work["title"][0]` (+ optional `subtitle[0]`) | `work["display_name"]` |
|
|
136
|
+
| First author | `work["author"][0]["family"]` (surname only) | `work["authorships"][0]["author"]["display_name"]` (**full name** — parse for surname) |
|
|
137
|
+
| Container | `work["container-title"][0]` (+ `short-container-title`) | `work["primary_location"]["source"]["display_name"]` (+ `host_venue` on older records) |
|
|
138
|
+
| Year | `published-print` / `published-online` / `issued` / `created` → `["date-parts"][0][0]` | `work["publication_year"]` (int) |
|
|
139
|
+
|
|
140
|
+
## CLI usage
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# OpenAlex (default)
|
|
144
|
+
citefinder doi 10.48550/arXiv.2410.21554 --mailto you@example.com
|
|
145
|
+
citefinder search "Backstabber's Knife Collection" --rows 3
|
|
146
|
+
|
|
147
|
+
# Crossref
|
|
148
|
+
citefinder crossref doi 10.1126/science.aap9559 --mailto you@example.com
|
|
149
|
+
citefinder crossref search "Wolfowicz hate speech meta-analysis" --rows 3
|
|
150
|
+
citefinder crossref chapter 10.1017/9781108890960 5
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### CLI arguments
|
|
154
|
+
|
|
155
|
+
- `--cache PATH` — JSONL cache path. Defaults to
|
|
156
|
+
`~/.cache/citefinder/openalex.jsonl` for top-level commands and
|
|
157
|
+
`~/.cache/citefinder/crossref.jsonl` for `crossref` subcommands. Separate
|
|
158
|
+
files so sources don't mix; override per command if you want
|
|
159
|
+
per-project caches (e.g., `--cache ./data/refs.jsonl`).
|
|
160
|
+
- `--rows N` *(search only)* — Number of results to return. Default `3`.
|
|
161
|
+
- `--mailto EMAIL` — Opts the request into the source's polite pool (both
|
|
162
|
+
OpenAlex and Crossref honor it): faster responses and a higher quota.
|
|
163
|
+
Sent as a `?mailto=…` query param; stripped from the cache key, so
|
|
164
|
+
rotating the email doesn't invalidate prior entries.
|
|
165
|
+
- `--api-key KEY` *(OpenAlex only)* — OpenAlex API key for higher
|
|
166
|
+
rate limits and tier-specific endpoints. Also read from `OPENALEX_API_KEY`
|
|
167
|
+
in the env or a `.env` file (loaded from cwd or any parent). Sent as
|
|
168
|
+
`Authorization: Bearer <key>` so it never lands in cache keys, URL logs,
|
|
169
|
+
or referer headers.
|
|
170
|
+
|
|
171
|
+
## Why JSONL?
|
|
172
|
+
|
|
173
|
+
The cache is an append-only log: every lookup is one JSON object per line.
|
|
174
|
+
Benefits:
|
|
175
|
+
|
|
176
|
+
- **Auditable**: `cat`/`grep` to see every query that ever ran.
|
|
177
|
+
- **Diffable**: plays nicely with git if you want to commit a project's cache.
|
|
178
|
+
- **Crash-safe**: an interrupted write loses at most the last line.
|
|
179
|
+
- **Recoverable**: rebuild the in-memory dict by replaying the log.
|
|
180
|
+
|
|
181
|
+
Latest value wins on replay, so overwriting an entry is just another appended line — nothing is edited in place.
|
|
182
|
+
|
|
183
|
+
**SQLite alternative.** A SQLite-backed cache is another reasonable
|
|
184
|
+
implementation — it would trade the audit log and `grep`-ability for faster
|
|
185
|
+
random access on very large caches (millions of entries) and concurrent
|
|
186
|
+
writers. The current scale of citefinder use (per-project bibs, tens of
|
|
187
|
+
thousands of entries at most) doesn't need it, and replaying a JSONL on
|
|
188
|
+
startup is fast enough that the simplicity wins. If a future workload pushes
|
|
189
|
+
past those limits, swapping the storage layer is a single class — `JsonlCache`
|
|
190
|
+
in `citefinder/cache.py` — behind the same `get` / `put` / `__contains__`
|
|
191
|
+
interface.
|
|
192
|
+
|
|
193
|
+
## Tests
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
uv run pytest
|
|
197
|
+
```
|