quelle 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. quelle-0.1.0/.env.example +29 -0
  2. quelle-0.1.0/.github/workflows/release.yml +25 -0
  3. quelle-0.1.0/.gitignore +28 -0
  4. quelle-0.1.0/CLAUDE.md +95 -0
  5. quelle-0.1.0/LICENSE +20 -0
  6. quelle-0.1.0/Makefile +29 -0
  7. quelle-0.1.0/PKG-INFO +199 -0
  8. quelle-0.1.0/README.md +172 -0
  9. quelle-0.1.0/pyproject.toml +65 -0
  10. quelle-0.1.0/quelle/__init__.py +0 -0
  11. quelle-0.1.0/quelle/cli/__init__.py +0 -0
  12. quelle-0.1.0/quelle/cli/config.py +175 -0
  13. quelle-0.1.0/quelle/cli/main.py +241 -0
  14. quelle-0.1.0/quelle/cli/output.py +123 -0
  15. quelle-0.1.0/quelle/migrate.py +105 -0
  16. quelle-0.1.0/quelle/models/__init__.py +0 -0
  17. quelle-0.1.0/quelle/models/publication.py +130 -0
  18. quelle-0.1.0/quelle/paths.py +100 -0
  19. quelle-0.1.0/quelle/repositories/__init__.py +0 -0
  20. quelle-0.1.0/quelle/repositories/cache.py +203 -0
  21. quelle-0.1.0/quelle/repositories/errors.py +38 -0
  22. quelle-0.1.0/quelle/repositories/http_client.py +48 -0
  23. quelle-0.1.0/quelle/repositories/pdf_downloader.py +89 -0
  24. quelle-0.1.0/quelle/repositories/sources/__init__.py +0 -0
  25. quelle-0.1.0/quelle/repositories/sources/arxiv.py +181 -0
  26. quelle-0.1.0/quelle/repositories/sources/crossref.py +126 -0
  27. quelle-0.1.0/quelle/repositories/sources/openalex.py +155 -0
  28. quelle-0.1.0/quelle/repositories/sources/semantic_scholar.py +109 -0
  29. quelle-0.1.0/quelle/repositories/sources/unpaywall.py +43 -0
  30. quelle-0.1.0/quelle/services/__init__.py +0 -0
  31. quelle-0.1.0/quelle/services/pdf_resolver.py +84 -0
  32. quelle-0.1.0/quelle/services/resolver.py +151 -0
  33. quelle-0.1.0/quelle/settings.py +67 -0
  34. quelle-0.1.0/skills/quelle/SKILL.md +118 -0
  35. quelle-0.1.0/tests/__init__.py +0 -0
  36. quelle-0.1.0/tests/conftest.py +55 -0
  37. quelle-0.1.0/tests/test_arxiv_mapper.py +118 -0
  38. quelle-0.1.0/tests/test_cache.py +126 -0
  39. quelle-0.1.0/tests/test_cli_config.py +124 -0
  40. quelle-0.1.0/tests/test_cli_smoke.py +43 -0
  41. quelle-0.1.0/tests/test_crossref_mapper.py +113 -0
  42. quelle-0.1.0/tests/test_merge_logic.py +99 -0
  43. quelle-0.1.0/tests/test_migrate.py +154 -0
  44. quelle-0.1.0/tests/test_openalex_mapper.py +93 -0
  45. quelle-0.1.0/tests/test_paths.py +135 -0
  46. quelle-0.1.0/tests/test_pdf_downloader.py +84 -0
  47. quelle-0.1.0/tests/test_pdf_resolver.py +102 -0
  48. quelle-0.1.0/tests/test_resolver.py +58 -0
  49. quelle-0.1.0/tests/test_resolver_cache.py +81 -0
  50. quelle-0.1.0/tests/test_semantic_scholar_mapper.py +77 -0
  51. quelle-0.1.0/tests/test_unpaywall.py +37 -0
  52. quelle-0.1.0/uv.lock +433 -0
@@ -0,0 +1,29 @@
1
+ # quelle configuration.
2
+ # Copy to `.env` and fill in. All values are optional — defaults are sensible
3
+ # enough that `uv run quelle fetch 10.xxx/yyy` works without any config.
4
+ #
5
+ # Filesystem layout (config / data / cache) is resolved automatically via
6
+ # platformdirs. Override any of the three with QUELLE_CONFIG_DIR,
7
+ # QUELLE_DATA_DIR, QUELLE_CACHE_DIR — useful for tests or custom deployments.
8
+
9
+ # Your email. Goes into the User-Agent and enrolls you in the polite pool
10
+ # for both Crossref and OpenAlex — recommended for any production use.
11
+ QUELLE_CONTACT_EMAIL=alice@example.com
12
+
13
+ # Free OpenAlex key (openalex.org/settings/api). Unauthenticated calls work
14
+ # but have a lower daily quota. Get one if you expect to make many lookups.
15
+ #OPENALEX_API_KEY=
16
+
17
+ # Free Semantic Scholar key (api.semanticscholar.org/getting-started).
18
+ # Unauthenticated calls also work; the key only raises the rate limit.
19
+ #SEMANTIC_SCHOLAR_API_KEY=
20
+
21
+ # Unpaywall requires an email as an API parameter. Defaults to
22
+ # QUELLE_CONTACT_EMAIL when unset.
23
+ #UNPAYWALL_EMAIL=alice@example.com
24
+
25
+ # HTTP timeout per request, in seconds.
26
+ #QUELLE_HTTP_TIMEOUT=30
27
+
28
+ # Maximum PDF size to download, in megabytes. Downloads are aborted above this.
29
+ #QUELLE_MAX_PDF_MB=100
@@ -0,0 +1,25 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+
8
+ jobs:
9
+ build-and-publish:
10
+ name: Build and publish to PyPI
11
+ runs-on: ubuntu-latest
12
+ permissions:
13
+ id-token: write # required for OIDC trusted publishing
14
+ contents: read # required so actions/checkout@v4 can read the repo
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up uv
19
+ uses: astral-sh/setup-uv@v4
20
+
21
+ - name: Build
22
+ run: uv build
23
+
24
+ - name: Publish to PyPI
25
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,28 @@
1
+ # Secrets and local config
2
+ .env
3
+ .env.local
4
+
5
+ # Local dev state (dev-mode cache DB + downloaded PDFs under the repo root).
6
+ # Installed users' data lives in platformdirs dirs, not here.
7
+ .dev-state/
8
+ .publications-state/
9
+
10
+ # Python build artefacts
11
+ __pycache__/
12
+ *.py[cod]
13
+ *.egg-info/
14
+ .venv/
15
+ build/
16
+ dist/
17
+
18
+ # Test / coverage artefacts
19
+ .pytest_cache/
20
+ .coverage
21
+ htmlcov/
22
+ .ruff_cache/
23
+ .mypy_cache/
24
+
25
+ # Editor / OS
26
+ .DS_Store
27
+ .idea/
28
+ .vscode/
quelle-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,95 @@
1
+ # CLAUDE.md — quelle
2
+
3
+ Local CLI that fetches academic publication metadata and PDFs from open sources (OpenAlex, Crossref, Semantic Scholar, arXiv, Unpaywall) and returns them as normalised JSON. Designed as a composable building block — the tool has no hardcoded opinion about where results end up; downstream consumers (skills, scripts, reference managers) decide that.
4
+
5
+ The name is German for *source*: in academic citations, "Quelle:" prefixes a bibliographic reference, and fetching from open sources is what this tool does. The package distributes on PyPI under `quelle`; the command-line binary is also `quelle`.
6
+
7
+ ## Project type
8
+
9
+ - **Not deployed.** Per-laptop tool, distributed via PyPI (`pip install quelle`) or `uv tool install quelle`.
10
+ - **No daemon / server.** Every invocation is a short-lived CLI process.
11
+ - **Multiple upstreams, no single source of truth.** This tool queries 5+ different open APIs and normalises their responses. Cache is a convenience, not a mirror.
12
+
13
+ ## Stack
14
+
15
+ - Python 3.12+, `uv`-managed
16
+ - Typer (CLI) + httpx (sync HTTP) + stdlib `sqlite3` (cache) + rich + environs + platformdirs + pytest + pytest-httpx
17
+ - No GUI. No ORM. No async.
18
+
19
+ ## Architecture
20
+
21
+ Strict layers — imports only go from higher layers to lower ones (CLI → services → repositories → models), never the other way.
22
+
23
+ ```
24
+ quelle/
25
+ models/ <- Publication, Author (pure dataclasses, no I/O)
26
+ repositories/ <- http_client, errors, sources/{openalex, crossref, ...}
27
+ services/ <- resolver (orchestrates which source to hit first)
28
+ cli/ <- Typer app + config sub-app + rich/JSON output helpers
29
+ paths.py <- platformdirs resolution (config / data / cache)
30
+ migrate.py <- One-shot migration from the legacy PublicationManager layout
31
+ settings.py <- environs config (uses paths.resolve internally)
32
+ ```
33
+
34
+ Layer rules:
35
+
36
+ - **Models** import nothing from this project.
37
+ - **Repositories** import from models. Each source (`sources/openalex.py` etc.) is a self-contained module that returns a `Publication`.
38
+ - **Services** import from models + repositories. The `resolver` decides which source to call based on the shape of the query (DOI vs arXiv id vs free text).
39
+ - **CLI** is the wiring layer: Typer command → load Settings → build httpx client → call resolver → render via `quelle/cli/output.py`.
40
+
41
+ ## Paths
42
+
43
+ `quelle/paths.py` resolves three locations via [`platformdirs`](https://platformdirs.readthedocs.io/), following each OS's conventions:
44
+
45
+ | Role | Linux (XDG) | macOS | Windows |
46
+ |---|---|---|---|
47
+ | Config (`.env`) | `~/.config/quelle/` | `~/Library/Application Support/quelle/` | `%APPDATA%\quelle\` |
48
+ | Data (`pdfs/`) | `~/.local/share/quelle/` | `~/Library/Application Support/quelle/` | `%LOCALAPPDATA%\quelle\` |
49
+ | Cache (`cache.sqlite`) | `~/.cache/quelle/` | `~/Library/Caches/quelle/` | `%LOCALAPPDATA%\quelle\Cache\` |
50
+
51
+ Override any of the three at runtime via `QUELLE_CONFIG_DIR`, `QUELLE_DATA_DIR`, `QUELLE_CACHE_DIR`. Env-var overrides always win over both dev-mode detection and platformdirs defaults — this is how tests isolate filesystem state via `tmp_path`.
52
+
53
+ ### Dev mode vs installed mode
54
+
55
+ `paths.resolve()` walks up from `__file__` looking for `pyproject.toml`:
56
+
57
+ - **Dev** (running from a source checkout, e.g. `uv run quelle …`): config dir = repo root (so `.env` at the repo root is still loaded automatically), data dir = `.dev-state/`, cache dir = `.dev-state/cache/`. `.dev-state/` is gitignored.
58
+ - **Installed** (`__file__` inside `site-packages/` or a `uv tools/` venv): config / data / cache come from `platformdirs`.
59
+
60
+ The detection is a heuristic (`_looks_like_installed_location` in `paths.py`) — site-packages and uv tools venvs are recognised and force installed mode regardless of any nearby `pyproject.toml`.
61
+
62
+ ## Sources
63
+
64
+ Implemented in priority order (see `quelle/repositories/sources/`). Each module exports:
65
+
66
+ - `search_by_title(client, settings, title) -> Publication`
67
+ - `fetch_by_doi(client, settings, doi) -> Publication` (where applicable)
68
+ - `_to_publication(raw) -> Publication` — private mapper, unit-tested without network
69
+
70
+ Sources never decide the resolution order themselves — the `services/resolver.py` orchestrator does.
71
+
72
+ ## Rate-limit discipline
73
+
74
+ - **Crossref polite pool**: every request must carry `mailto=…` (either as a query param or a `User-Agent: …(mailto:…)` suffix). `build_client` in `quelle/repositories/http_client.py` bakes the User-Agent.
75
+ - **arXiv**: max 1 request / 3 seconds for metadata queries. Static PDF fetches from `arxiv.org/pdf/...` are unbounded.
76
+ - **Unpaywall**: 100 ms delay between requests, 100k / day quota.
77
+ - **OpenAlex**: $1/day quota when authenticated, lower when unauthenticated. Don't batch-fetch without caching.
78
+
79
+ ## Commands
80
+
81
+ ```bash
82
+ make dev-install # install all deps incl. dev
83
+ make test # pytest
84
+ make lint # ruff check + format --check
85
+ make format # ruff check --fix + format
86
+ uv tool install . # install `quelle` globally from this checkout (there is no Makefile target for this)
87
+ ```
88
+
89
+ ## Workflow
90
+
91
+ 1. After any code change: `make format` — enforced by ruff.
92
+ 2. Before committing: `make lint && make test`.
93
+ 3. When adding a new CLI command: add a smoke test in `tests/test_cli_smoke.py` (or `test_cli_config.py` for `config`-subapp commands).
94
+ 4. When adding a new source: add a mapper unit test that feeds a recorded fixture JSON into `_to_publication` — no network required.
95
+ 5. When touching the path-resolution layer or the migration: tests in `tests/test_paths.py` and `tests/test_migrate.py` must stay green. Both use `monkeypatch` to isolate filesystem state; never hit the real user's home dir.
quelle-0.1.0/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Alice Voland
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE DEALINGS IN THE SOFTWARE.
quelle-0.1.0/Makefile ADDED
@@ -0,0 +1,29 @@
1
+ PYTHONPATH := $(shell pwd)
2
+
3
+ help: ## Show this help
4
+ @grep -E '^[a-zA-Z_-]+:.*## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*## "}; {printf "%-16s %s\n", $$1, $$2}'
5
+
6
+ install: ## Install dependencies into a uv-managed venv
7
+ uv sync
8
+
9
+ dev-install: ## Install dev dependencies too
10
+ uv sync --all-groups
11
+
12
+ run: ## Run the quelle CLI with no arguments (to pass arguments, call `uv run quelle ARGS` directly)
13
+ uv run quelle
14
+
15
+ test: ## Run pytest
16
+ uv run pytest; RET=$$?; if [ $$RET -eq 5 ]; then exit 0; else exit $$RET; fi
17
+
18
+ coverage: ## Run pytest with line-coverage report
19
+ uv run pytest --cov=quelle --cov-report=term-missing --cov-report=html
20
+
21
+ lint: ## Ruff lint + format check
22
+ uv run ruff check .
23
+ uv run ruff format --check .
24
+
25
+ format: ## Ruff auto-fix + format
26
+ uv run ruff check --fix .
27
+ uv run ruff format .
28
+
29
+ .PHONY: help install dev-install run test coverage lint format
quelle-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: quelle
3
+ Version: 0.1.0
4
+ Summary: Fetch academic publication metadata and PDFs from open sources (OpenAlex, Crossref, Semantic Scholar, arXiv, Unpaywall) and return normalised JSON
5
+ Project-URL: Homepage, https://github.com/vcoeur/quelle
6
+ Project-URL: Repository, https://github.com/vcoeur/quelle
7
+ Project-URL: Issues, https://github.com/vcoeur/quelle/issues
8
+ Author: Alice Voland
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Text Processing :: Markup
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: environs>=14.6.0
22
+ Requires-Dist: httpx>=0.28
23
+ Requires-Dist: platformdirs>=4
24
+ Requires-Dist: rich>=13.9
25
+ Requires-Dist: typer>=0.16
26
+ Description-Content-Type: text/markdown
27
+
28
+ # quelle
29
+
30
+ `quelle` is a local CLI that fetches academic publication metadata and PDFs from open academic sources (OpenAlex, Crossref, Semantic Scholar, arXiv, Unpaywall) and returns them as normalised JSON. Designed as a composable building block — feed the output into any note-taking system, reference manager, or research workflow.
31
+
32
+ The name is German for *source* — in academic German, "Quelle:" is the word that introduces a bibliographic reference, and fetching from open sources is exactly what the tool does.
33
+
34
+ ## What it does
35
+
36
+ Given a publication identifier or a free-text title, `quelle fetch` returns a normalised JSON blob with title, authors, year, venue, DOI, abstract, citation count and (optionally) a downloaded local PDF. It walks a fallback chain of free open sources:
37
+
38
+ | Source | Role | Rate limit |
39
+ |---|---|---|
40
+ | [OpenAlex](https://docs.openalex.org/) | Primary metadata + OA PDF URL | $1/day on free key |
41
+ | [Crossref](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | DOI-authoritative fallback (abstract, journal block) | polite pool (no hard cap) |
42
+ | [Semantic Scholar](https://api.semanticscholar.org/) | Citation graph + metadata fallback | 5000 / 5 min unauth |
43
+ | [arXiv](https://info.arxiv.org/help/api/) | Preprint metadata + direct PDFs | 1 req / 3s (enforced) |
44
+ | [Unpaywall](https://unpaywall.org/products/api) | DOI → OA PDF lookup | 100k / day |
45
+
46
+ Google Scholar URLs are **not supported**: Scholar has no public API and its Terms of Service prohibit automated access. If you only have a Scholar link, open the page, copy the paper title, and feed that to `quelle fetch` as a free-text query — OpenAlex and Crossref cover almost every paper with a DOI.
47
+
48
+ ## Stack
49
+
50
+ Python 3.12+, `uv`-managed. Typer (CLI) + httpx (sync HTTP) + stdlib `sqlite3` (cache) + rich + environs + platformdirs + pytest + pytest-httpx. No GUI, no ORM, no async.
51
+
52
+ ## Installation
53
+
54
+ Install from PyPI:
55
+
56
+ ```bash
57
+ pip install quelle
58
+ ```
59
+
60
+ Run the one-time bootstrap to create the config, data, and cache directories and seed a default `.env`:
61
+
62
+ ```bash
63
+ quelle init
64
+ quelle config edit # opens the .env in your $EDITOR
65
+ ```
66
+
67
+ ### Development from a source checkout
68
+
69
+ ```bash
70
+ git clone https://github.com/vcoeur/quelle.git
71
+ cd quelle
72
+ make dev-install # uv sync --all-groups
73
+ make test # pytest
74
+ make lint # ruff check + format --check
75
+ make format # ruff check --fix + format
76
+ uv run quelle --help # run the CLI straight from the repo
77
+ ```
78
+
79
+ When run from the repo, `quelle` picks up the `.env` at the repo root and stores dev cache / PDFs under a repo-local `.dev-state/` instead of polluting your installed user data.
80
+
81
+ ## Configuration
82
+
83
+ `quelle` follows each OS's standard "config dir + data dir + cache dir" layout via [`platformdirs`](https://platformdirs.readthedocs.io/):
84
+
85
+ | Role | Linux (XDG) | macOS | Windows |
86
+ |---|---|---|---|
87
+ | Config (`.env`) | `~/.config/quelle/` | `~/Library/Application Support/quelle/` | `%APPDATA%\quelle\` |
88
+ | Data (downloaded PDFs) | `~/.local/share/quelle/` | `~/Library/Application Support/quelle/` | `%LOCALAPPDATA%\quelle\` |
89
+ | Cache (sqlite index) | `~/.cache/quelle/` | `~/Library/Caches/quelle/` | `%LOCALAPPDATA%\quelle\Cache\` |
90
+
91
+ Any of the three can be overridden via env vars — useful for tests, Docker, or custom deployments:
92
+
93
+ ```bash
94
+ export QUELLE_CONFIG_DIR=/etc/quelle
95
+ export QUELLE_DATA_DIR=/srv/quelle/data
96
+ export QUELLE_CACHE_DIR=/var/cache/quelle
97
+ ```
98
+
99
+ Inspect the resolved paths at any time:
100
+
101
+ ```bash
102
+ quelle config path # plain output, one path per line
103
+ quelle config path --json # JSON, scriptable
104
+ quelle config show # all resolved values; API keys are shown redacted
105
+ ```
106
+
107
+ The only variable worth setting by default is `QUELLE_CONTACT_EMAIL` — it goes into the `User-Agent` header and enrolls you in the Crossref / OpenAlex polite pool. See [`.env.example`](.env.example) for the full list.
108
+
109
+ **Dev mode**: when you run `quelle` from a source checkout (`uv run quelle …` inside the repo), the `.env` at the repo root is still picked up — the same ergonomics as before — but downloaded PDFs and the cache go into a repo-local `.dev-state/` directory so your installed user data stays clean.
110
+
111
+ **Migration from PublicationManager**: if you upgraded from the old `publication-manager` package, the first run of `quelle` automatically moves your `~/.config/publications/.env` and `~/.publications/.publications-state/` into the new locations. No data loss, no manual steps.
112
+
113
+ ## Usage
114
+
115
+ ```bash
116
+ # Resolve by DOI (uses OpenAlex + Crossref enrichment by default).
117
+ quelle fetch 10.1109/83.902291
118
+
119
+ # Resolve by arXiv id, with PDF download into the data dir.
120
+ quelle fetch 1706.03762 --download-pdf
121
+
122
+ # Resolve by free-text title.
123
+ quelle fetch "The Perceptron: A Probabilistic Model" --json
124
+
125
+ # Bypass the local cache and force network.
126
+ quelle fetch 10.xxxx/yyyy --no-cache
127
+
128
+ # Inspect the cache.
129
+ quelle cache stats
130
+ quelle cache list --limit 20
131
+ quelle cache show 10.1109/83.902291
132
+ quelle cache clear --yes
133
+ ```
134
+
135
+ ## Claude Code skill
136
+
137
+ A minimal example [`SKILL.md`](skills/quelle/SKILL.md) ships in `skills/quelle/` — drop it into `~/.claude/skills/quelle/` (or `<project>/.claude/skills/quelle/`) to use the CLI from a Claude Code session. It's deliberately thin: resolve the paper, print the metadata, stop. Adapt the last step for your own downstream workflow (Zettelkasten import, BibTeX append, etc.).
138
+
139
+ ## Layout
140
+
141
+ ```
142
+ quelle/
143
+ models/ <- Publication, Author (pure dataclasses)
144
+ repositories/
145
+ cache.py <- SQLite cache keyed by DOI / arXiv / title
146
+ errors.py <- Error hierarchy -> exit codes 1/2/3/4
147
+ http_client.py <- httpx + polite User-Agent
148
+ pdf_downloader.py <- Streaming PDF download with content-type + size checks
149
+ sources/ <- One module per source: openalex, crossref, semantic_scholar, arxiv, unpaywall
150
+ services/
151
+ resolver.py <- Source orchestration + enrichment chain + cache lookup
152
+ pdf_resolver.py <- Lazy PDF fallback chain
153
+ cli/
154
+ main.py <- Typer app (fetch, cache, version, init)
155
+ config.py <- `config show` / `path` / `edit` + bootstrap
156
+ output.py <- JSON vs rich TTY rendering
157
+ paths.py <- platformdirs resolution (config / data / cache)
158
+ migrate.py <- One-shot migration from the legacy PublicationManager layout
159
+ settings.py <- environs-layered config
160
+ tests/
161
+ ```
162
+
163
+ Layer rules: imports only go downward. Models import nothing from this project. Repositories import models. Services import models + repositories. CLI is the wiring layer.
164
+
165
+ ## Status
166
+
167
+ v0.1 — all five open-API sources wired up with a merge-logic enrichment chain, a SQLite cache keyed by DOI / arXiv id / OpenAlex id / title (second query for the same paper is offline), and a PDF download chain with OpenAlex → arXiv → Unpaywall fallback plus content-type and size validation.
168
+
169
+ ## Usage and terms
170
+
171
+ This tool is intended for **personal and academic research use**. It queries free, public APIs on your behalf. **You are responsible for complying with each upstream's terms of service** — the MIT licence on this repo covers the *code* of this tool, not the data you fetch through it.
172
+
173
+ **Not supported use cases**:
174
+
175
+ - **Bulk scraping** / batch ingestion of many records. Most upstreams publish free database snapshots; use those instead of hammering the live API.
176
+ - **Rehosting downloaded PDFs** on a public server. The `--download-pdf` flag writes to a local cache on your machine — that is fine. Re-serving arXiv PDFs, publisher PDFs, or full text from your own infrastructure is not (see arXiv and Semantic Scholar rows below).
177
+ - **Commercial repackaging** of the JSON output as a paid product. Individual commercial use of the metadata is generally allowed by the underlying licences, but Semantic Scholar in particular requires attribution and some S2 records are `CC BY-NC`.
178
+
179
+ **Per-source summary**:
180
+
181
+ | Source | Data licence | Rate limit | Attribution | Notes |
182
+ |---|---|---|---|---|
183
+ | [OpenAlex](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication) | CC0 — *"OpenAlex data is and will remain available at no cost"* | ~100k / day on the polite pool; single-entity lookups unlimited | not required | Provide an email via `QUELLE_CONTACT_EMAIL` for the polite pool, or set `OPENALEX_API_KEY` for the new key-based tier (OpenAlex announced in January 2026 that key authentication is replacing the mailto polite pool; the tool supports both). |
184
+ | [Crossref REST](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | CC0 for metadata — *"almost none of the metadata is subject to copyright, and you may use it for any purpose"*. Some abstracts may remain copyrighted. | No hard cap; the polite pool is requested via your `mailto=` / User-Agent | not required, but recommended | Commercial users who need SLAs should subscribe to Metadata Plus directly with Crossref. |
185
+ | [arXiv API](https://info.arxiv.org/help/api/tou.html) | Metadata CC0. PDFs retain their authors' / arXiv's licence. | **1 request / 3 seconds** (the tool enforces this globally via a module-level lock) | Do not claim arXiv endorses your project. | **You may not store and re-serve arXiv e-prints (PDFs, source files, other content) from your own servers unless you have the copyright holder's permission.** Downloading for local personal reading is explicitly allowed. |
186
+ | [Semantic Scholar](https://www.semanticscholar.org/product/api/license) | S2 data may be `CC BY-NC` or `ODC-BY` depending on the record. The API itself is provided *"AS IS, WITH ALL FAULTS, AND AS AVAILABLE"* with no warranty. | Public endpoints need no auth; higher throughput requires a free key from Ai2. | **Required** — *"Licensee will include an attribution to 'Semantic Scholar'"*, and publications must cite *The Semantic Scholar Open Data Platform*. | You may not *"repackage, sell, rent, lease, lend, distribute, or sublicense the API"*. This tool is a personal client, not a proxy. |
187
+ | [Unpaywall](https://unpaywall.org/products/api) | CC0 data | 100k requests / day | not required | The email parameter is **mandatory** — Unpaywall uses it to contact you if something goes wrong. Don't fake it. For bulk workloads, download the free data snapshot instead of hammering the API. |
188
+
189
+ **Google Scholar is not supported.** Google Scholar has no official API, and Google's Terms of Service prohibit automated access. Passing a Scholar URL to `quelle fetch` returns a `UserError` asking you to copy the paper title manually and retry — OpenAlex and Crossref together cover almost every paper with a DOI, so the workaround is usually one extra copy-paste.
190
+
191
+ **No warranty**: see the MIT [`LICENSE`](LICENSE) — this tool is provided as-is, with no guarantee that its JSON output is correct, complete, or current. Verify critical metadata against the canonical upstream before relying on it.
192
+
193
+ ## Licence
194
+
195
+ MIT — see [`LICENSE`](LICENSE).
196
+
197
+ ## Questions or feedback
198
+
199
+ This is a personal tool — I'm happy to hear from you, but there is no formal support. The best way to reach me is the contact form on [vcoeur.com](https://vcoeur.com).
quelle-0.1.0/README.md ADDED
@@ -0,0 +1,172 @@
1
+ # quelle
2
+
3
+ `quelle` is a local CLI that fetches academic publication metadata and PDFs from open academic sources (OpenAlex, Crossref, Semantic Scholar, arXiv, Unpaywall) and returns them as normalised JSON. Designed as a composable building block — feed the output into any note-taking system, reference manager, or research workflow.
4
+
5
+ The name is German for *source* — in academic German, "Quelle:" is the word that introduces a bibliographic reference, and fetching from open sources is exactly what the tool does.
6
+
7
+ ## What it does
8
+
9
+ Given a publication identifier or a free-text title, `quelle fetch` returns a normalised JSON blob with title, authors, year, venue, DOI, abstract, citation count and (optionally) a downloaded local PDF. It walks a fallback chain of free open sources:
10
+
11
+ | Source | Role | Rate limit |
12
+ |---|---|---|
13
+ | [OpenAlex](https://docs.openalex.org/) | Primary metadata + OA PDF URL | $1/day of free usage with a free API key |
14
+ | [Crossref](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | DOI-authoritative fallback (abstract, journal block) | polite pool (no hard cap) |
15
+ | [Semantic Scholar](https://api.semanticscholar.org/) | Citation graph + metadata fallback | 5000 requests / 5 min unauthenticated |
16
+ | [arXiv](https://info.arxiv.org/help/api/) | Preprint metadata + direct PDFs | 1 req / 3s (enforced) |
17
+ | [Unpaywall](https://unpaywall.org/products/api) | DOI → OA PDF lookup | 100k / day |
18
+
19
+ Google Scholar URLs are **not supported**: Scholar has no public API and its Terms of Service prohibit automated access. If you only have a Scholar link, open the page, copy the paper title, and feed that to `quelle fetch` as a free-text query — OpenAlex and Crossref cover almost every paper with a DOI.
20
+
21
+ ## Stack
22
+
23
+ Python 3.12+, `uv`-managed. Typer (CLI) + httpx (sync HTTP) + stdlib `sqlite3` (cache) + rich + environs + platformdirs + pytest + pytest-httpx. No GUI, no ORM, no async.
24
+
25
+ ## Installation
26
+
27
+ Install from PyPI:
28
+
29
+ ```bash
30
+ pip install quelle
31
+ ```
32
+
33
+ Run the one-time bootstrap to create the config, data, and cache directories and seed a default `.env`:
34
+
35
+ ```bash
36
+ quelle init
37
+ quelle config edit # opens the .env in your $EDITOR
38
+ ```
39
+
40
+ ### Development from a source checkout
41
+
42
+ ```bash
43
+ git clone https://github.com/vcoeur/quelle.git
44
+ cd quelle
45
+ make dev-install # uv sync --all-groups
46
+ make test # pytest
47
+ make lint # ruff check + format --check
48
+ make format # ruff check --fix + format
49
+ uv run quelle --help # run the CLI straight from the repo
50
+ ```
51
+
52
+ When run from the repo, `quelle` picks up the `.env` at the repo root and stores dev cache / PDFs under a repo-local `.dev-state/` instead of polluting your installed user data.
53
+
54
+ ## Configuration
55
+
56
+ `quelle` follows each OS's standard "config dir + data dir + cache dir" layout via [`platformdirs`](https://platformdirs.readthedocs.io/):
57
+
58
+ | Role | Linux (XDG) | macOS | Windows |
59
+ |---|---|---|---|
60
+ | Config (`.env`) | `~/.config/quelle/` | `~/Library/Application Support/quelle/` | `%APPDATA%\quelle\` |
61
+ | Data (downloaded PDFs) | `~/.local/share/quelle/` | `~/Library/Application Support/quelle/` | `%LOCALAPPDATA%\quelle\` |
62
+ | Cache (sqlite index) | `~/.cache/quelle/` | `~/Library/Caches/quelle/` | `%LOCALAPPDATA%\quelle\Cache\` |
63
+
64
+ Any of the three can be overridden via env vars — useful for tests, Docker, or custom deployments:
65
+
66
+ ```bash
67
+ export QUELLE_CONFIG_DIR=/etc/quelle
68
+ export QUELLE_DATA_DIR=/srv/quelle/data
69
+ export QUELLE_CACHE_DIR=/var/cache/quelle
70
+ ```
71
+
72
+ Inspect the resolved paths at any time:
73
+
74
+ ```bash
75
+ quelle config path # plain output, one path per line
76
+ quelle config path --json # JSON, scriptable
77
+ quelle config show # all values including API keys (redacted)
78
+ ```
79
+
80
+ The only variable worth setting by default is `QUELLE_CONTACT_EMAIL` — it goes into the `User-Agent` header and enrolls you in the Crossref / OpenAlex polite pool. See [`.env.example`](.env.example) for the full list.
81
+
82
+ **Dev mode**: when you run `quelle` from a source checkout (`uv run quelle …` inside the repo), the `.env` at the repo root is still picked up — the same ergonomics as before — but downloaded PDFs and the cache go into a repo-local `.dev-state/` directory so your installed user data stays clean.
83
+
84
+ **Migration from PublicationManager**: if you upgraded from the old `publication-manager` package, the first run of `quelle` automatically moves your `~/.config/publications/.env` and `~/.publications/.publications-state/` into the new locations. No data loss, no manual steps.
85
+
86
+ ## Usage
87
+
88
+ ```bash
89
+ # Resolve by DOI (uses OpenAlex + Crossref enrichment by default).
90
+ quelle fetch 10.1109/83.902291
91
+
92
+ # Resolve by arXiv id, with PDF download into the data dir.
93
+ quelle fetch 1706.03762 --download-pdf
94
+
95
+ # Resolve by free-text title.
96
+ quelle fetch "The Perceptron: A Probabilistic Model" --json
97
+
98
+ # Bypass the local cache and force network.
99
+ quelle fetch 10.xxxx/yyyy --no-cache
100
+
101
+ # Inspect the cache.
102
+ quelle cache stats
103
+ quelle cache list --limit 20
104
+ quelle cache show 10.1109/83.902291
105
+ quelle cache clear --yes
106
+ ```
107
+
108
+ ## Claude Code skill
109
+
110
+ A minimal example [`SKILL.md`](skills/quelle/SKILL.md) ships in `skills/quelle/` — drop it into `~/.claude/skills/quelle/` (or `<project>/.claude/skills/quelle/`) to use the CLI from a Claude Code session. It's deliberately thin: resolve the paper, print the metadata, stop. Adapt the last step for your own downstream workflow (Zettelkasten import, BibTeX append, etc.).
111
+
112
+ ## Layout
113
+
114
+ ```
115
+ quelle/
116
+ models/ <- Publication, Author (pure dataclasses)
117
+ repositories/
118
+ cache.py <- SQLite cache keyed by DOI / arXiv id / OpenAlex id / title
119
+ errors.py <- Error hierarchy -> exit codes 1/2/3/4
120
+ http_client.py <- httpx + polite User-Agent
121
+ pdf_downloader.py <- Streaming PDF download with content-type + size checks
122
+ sources/ <- One module per source: openalex, crossref, semantic_scholar, arxiv, unpaywall
123
+ services/
124
+ resolver.py <- Source orchestration + enrichment chain + cache lookup
125
+ pdf_resolver.py <- Lazy PDF fallback chain
126
+ cli/
127
+ main.py <- Typer app (fetch, cache, version, init)
128
+ config.py <- `config show` / `path` / `edit` + bootstrap
129
+ output.py <- JSON vs rich TTY rendering
130
+ paths.py <- platformdirs resolution (config / data / cache)
131
+ migrate.py <- One-shot migration from the legacy PublicationManager layout
132
+ settings.py <- environs-layered config
133
+ tests/
134
+ ```
135
+
136
+ Layer rules: imports only go downward. Models import nothing from this project. Repositories import models. Services import models + repositories. CLI is the wiring layer.
137
+
138
+ ## Status
139
+
140
+ v0.1 — all five open-API sources are wired up with a merge-logic enrichment chain, a SQLite cache keyed by DOI / arXiv id / OpenAlex id / title (a second query for the same paper is served offline from the cache), and a PDF download chain with OpenAlex → arXiv → Unpaywall fallback plus content-type and size validation.
141
+
142
+ ## Usage and terms
143
+
144
+ This tool is intended for **personal and academic research use**. It queries free, public APIs on your behalf. **You are responsible for complying with each upstream's terms of service** — the MIT licence on this repo covers the *code* of this tool, not the data you fetch through it.
145
+
146
+ **Not supported use cases**:
147
+
148
+ - **Bulk scraping** / batch ingestion of many records. Most upstreams publish free database snapshots; use those instead of hammering the live API.
149
+ - **Rehosting downloaded PDFs** on a public server. The `--download-pdf` flag writes to a local cache on your machine — that is fine. Re-serving arXiv PDFs, publisher PDFs, or full text from your own infrastructure is not (see arXiv and Semantic Scholar rows below).
150
+ - **Commercial repackaging** of the JSON output as a paid product. Individual commercial use of the metadata is generally allowed by the underlying licences, but Semantic Scholar in particular requires attribution and some S2 records are `CC BY-NC`.
151
+
152
+ **Per-source summary**:
153
+
154
+ | Source | Data licence | Rate limit | Attribution | Notes |
155
+ |---|---|---|---|---|
156
+ | [OpenAlex](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication) | CC0 — *"OpenAlex data is and will remain available at no cost"* | ~100k / day on the polite pool; single-entity lookups unlimited | not required | Provide an email via `QUELLE_CONTACT_EMAIL` for the polite pool, or set `OPENALEX_API_KEY` for the new key-based tier (OpenAlex announced in January 2026 that key authentication is replacing the mailto polite pool; the tool supports both). |
157
+ | [Crossref REST](https://www.crossref.org/documentation/retrieve-metadata/rest-api/) | CC0 for metadata — *"almost none of the metadata is subject to copyright, and you may use it for any purpose"*. Some abstracts may remain copyrighted. | No hard cap; the polite pool is requested via your `mailto=` / User-Agent | not required, but recommended | Commercial users who need SLAs should subscribe to Metadata Plus directly with Crossref. |
158
+ | [arXiv API](https://info.arxiv.org/help/api/tou.html) | Metadata CC0. PDFs retain their authors' / arXiv's licence. | **1 request / 3 seconds** (the tool enforces this globally via a module-level lock) | Do not claim arXiv endorses your project. | **You may not store and re-serve arXiv e-prints (PDFs, source files, other content) from your own servers unless you have the copyright holder's permission.** Downloading for local personal reading is explicitly allowed. |
159
+ | [Semantic Scholar](https://www.semanticscholar.org/product/api/license) | S2 data may be `CC BY-NC` or `ODC-BY` depending on the record. The API itself is provided *"AS IS, WITH ALL FAULTS, AND AS AVAILABLE"* with no warranty. | Public endpoints need no auth; higher throughput requires a free key from Ai2. | **Required** — *"Licensee will include an attribution to 'Semantic Scholar'"*, and publications must cite *The Semantic Scholar Open Data Platform*. | You may not *"repackage, sell, rent, lease, lend, distribute, or sublicense the API"*. This tool is a personal client, not a proxy. |
160
+ | [Unpaywall](https://unpaywall.org/products/api) | CC0 data | 100k requests / day | not required | The email parameter is **mandatory** — Unpaywall uses it to contact you if something goes wrong. Don't fake it. For bulk workloads, download the free data snapshot instead of hammering the API. |
161
+
162
+ **Google Scholar is not supported.** Google Scholar has no official API, and Google's Terms of Service prohibit automated access. Passing a Scholar URL to `quelle fetch` returns a `UserError` asking you to copy the paper title manually and retry — OpenAlex and Crossref together cover almost every paper with a DOI, so the workaround is usually one extra copy-paste.
163
+
164
+ **No warranty**: see the MIT [`LICENSE`](LICENSE) — this tool is provided as-is, with no guarantee that its JSON output is correct, complete, or current. Verify critical metadata against the canonical upstream before relying on it.
165
+
166
+ ## Licence
167
+
168
+ MIT — see [`LICENSE`](LICENSE).
169
+
170
+ ## Questions or feedback
171
+
172
+ This is a personal tool — I'm happy to hear from you, but there is no formal support. The best way to reach me is the contact form on [vcoeur.com](https://vcoeur.com).