argus-quarry 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. argus_quarry-0.1.0/.env.example +22 -0
  2. argus_quarry-0.1.0/.github/workflows/ci.yml +51 -0
  3. argus_quarry-0.1.0/.github/workflows/release.yml +105 -0
  4. argus_quarry-0.1.0/.gitignore +23 -0
  5. argus_quarry-0.1.0/DESIGN.md +380 -0
  6. argus_quarry-0.1.0/Dockerfile +25 -0
  7. argus_quarry-0.1.0/LICENSE +21 -0
  8. argus_quarry-0.1.0/Makefile +47 -0
  9. argus_quarry-0.1.0/PKG-INFO +233 -0
  10. argus_quarry-0.1.0/README.md +186 -0
  11. argus_quarry-0.1.0/pyproject.toml +79 -0
  12. argus_quarry-0.1.0/src/argus_quarry/__init__.py +43 -0
  13. argus_quarry-0.1.0/src/argus_quarry/_version.py +24 -0
  14. argus_quarry-0.1.0/src/argus_quarry/cli.py +260 -0
  15. argus_quarry-0.1.0/src/argus_quarry/config.py +104 -0
  16. argus_quarry-0.1.0/src/argus_quarry/downloaders/__init__.py +29 -0
  17. argus_quarry-0.1.0/src/argus_quarry/downloaders/base.py +39 -0
  18. argus_quarry-0.1.0/src/argus_quarry/downloaders/commons.py +115 -0
  19. argus_quarry-0.1.0/src/argus_quarry/export.py +82 -0
  20. argus_quarry-0.1.0/src/argus_quarry/ingest.py +257 -0
  21. argus_quarry-0.1.0/src/argus_quarry/models.py +151 -0
  22. argus_quarry-0.1.0/src/argus_quarry/net.py +164 -0
  23. argus_quarry-0.1.0/src/argus_quarry/people.py +56 -0
  24. argus_quarry-0.1.0/src/argus_quarry/py.typed +0 -0
  25. argus_quarry-0.1.0/src/argus_quarry/seeds/people.yaml +43 -0
  26. argus_quarry-0.1.0/src/argus_quarry/store.py +254 -0
  27. argus_quarry-0.1.0/tests/conftest.py +75 -0
  28. argus_quarry-0.1.0/tests/test_cli.py +20 -0
  29. argus_quarry-0.1.0/tests/test_commons.py +60 -0
  30. argus_quarry-0.1.0/tests/test_export.py +42 -0
  31. argus_quarry-0.1.0/tests/test_ingest.py +90 -0
  32. argus_quarry-0.1.0/tests/test_models.py +40 -0
  33. argus_quarry-0.1.0/tests/test_people.py +24 -0
  34. argus_quarry-0.1.0/tests/test_store.py +52 -0
@@ -0,0 +1,22 @@
1
+ # ── Acquisition state ────────────────────────────────────────────────
2
+ # QUARRY_HOME is a sibling side-car dir (raw pool + DB + cache + logs). It
3
+ # lives OUTSIDE the published image tree so a curator scan never sees it.
4
+ QUARRY_HOME=./quarry
5
+
6
+ # Total raw-pool ceiling in GB. 0 (or unset) = unlimited. When the next file
7
+ # would exceed this, the run stops cleanly and marks the rest resumable.
8
+ QUARRY_MAX_GB=40
9
+
10
+ # ── Per-source politeness ────────────────────────────────────────────
11
+ # Wikimedia Commons (and LoC) ask for a descriptive User-Agent with contact.
12
+ COMMONS_USER_AGENT=argus-quarry/0.1 (contact@example.com)
13
+
14
+ # ── Suite integration (compose) ──────────────────────────────────────
15
+ # Published, curator-ready view. Quarry `export` builds a Person_Name/ tree
16
+ # here (symlinks by default) pointing back into QUARRY_HOME/images.
17
+ DATASET_DIR=./data
18
+
19
+ # NOTE: the published tree symlinks into QUARRY_HOME/images. For those links
20
+ # to resolve inside the curator/lens containers, they must ALSO mount
21
+ # QUARRY_HOME read-only (e.g. ${QUARRY_HOME}:/data/quarry:ro), or run quarry
22
+ # `export --copy` so the published files are real copies.
@@ -0,0 +1,51 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ci-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ lint:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - uses: astral-sh/setup-uv@v6
19
+ with:
20
+ enable-cache: true
21
+ - run: uvx ruff@0.15.20 check src/ tests/
22
+ - run: uvx ruff@0.15.20 format --check src/ tests/
23
+
24
+ test:
25
+ runs-on: ubuntu-latest
26
+ strategy:
27
+ matrix:
28
+ python-version: ["3.11", "3.12"]
29
+ steps:
30
+ - uses: actions/checkout@v4
31
+ - uses: astral-sh/setup-uv@v6
32
+ with:
33
+ enable-cache: true
34
+ - run: uv venv --python ${{ matrix.python-version }}
35
+ - run: uv pip install -e ".[dev,cli]"
36
+ - run: uv run --no-sync pytest --tb=short -q
37
+
38
+ build:
39
+ runs-on: ubuntu-latest
40
+ needs: [lint, test]
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: astral-sh/setup-uv@v6
44
+ with:
45
+ enable-cache: true
46
+ - run: uv build
47
+ - uses: actions/upload-artifact@v4
48
+ with:
49
+ name: dist
50
+ path: dist/
51
+ retention-days: 14
@@ -0,0 +1,105 @@
1
+ name: Release to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ permissions:
8
+ contents: read
9
+ id-token: write
10
+
11
+ jobs:
12
+ build:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ with:
17
+ fetch-depth: 0
18
+ - uses: astral-sh/setup-uv@v6
19
+ with:
20
+ enable-cache: true
21
+ - run: uv build
22
+ - uses: actions/upload-artifact@v4
23
+ with:
24
+ name: dist
25
+ path: dist/
26
+
27
+ test-install:
28
+ runs-on: ubuntu-latest
29
+ needs: build
30
+ strategy:
31
+ matrix:
32
+ python-version: ["3.11", "3.12"]
33
+ steps:
34
+ - uses: actions/download-artifact@v4
35
+ with:
36
+ name: dist
37
+ path: dist/
38
+ - uses: astral-sh/setup-uv@v6
39
+ with:
40
+ enable-cache: true
41
+ - name: Install wheel and smoke-test import
42
+ run: |
43
+ uv venv --python ${{ matrix.python-version }}
44
+ uv pip install dist/*.whl
45
+ uv run --no-sync python -c "from argus_quarry import PortraitRecord, __version__; print(f'argus-quarry {__version__} OK')"
46
+
47
+ publish-testpypi:
48
+ runs-on: ubuntu-latest
49
+ needs: test-install
50
+ environment: testpypi
51
+ steps:
52
+ - uses: actions/download-artifact@v4
53
+ with:
54
+ name: dist
55
+ path: dist/
56
+ - uses: pypa/gh-action-pypi-publish@release/v1
57
+ with:
58
+ repository-url: https://test.pypi.org/legacy/
59
+
60
+ publish-pypi:
61
+ runs-on: ubuntu-latest
62
+ needs: publish-testpypi
63
+ environment: pypi
64
+ steps:
65
+ - uses: actions/download-artifact@v4
66
+ with:
67
+ name: dist
68
+ path: dist/
69
+ - uses: pypa/gh-action-pypi-publish@release/v1
70
+
71
+ publish-image:
72
+ runs-on: ubuntu-latest
73
+ needs: test-install
74
+ permissions:
75
+ contents: read
76
+ packages: write
77
+ steps:
78
+ - uses: actions/checkout@v4
79
+ - uses: docker/setup-buildx-action@v3
80
+ - uses: docker/login-action@v3
81
+ with:
82
+ registry: ghcr.io
83
+ username: ${{ github.actor }}
84
+ password: ${{ secrets.GITHUB_TOKEN }}
85
+ - id: meta
86
+ uses: docker/metadata-action@v5
87
+ with:
88
+ images: ghcr.io/${{ github.repository_owner }}/argus-quarry
89
+ tags: |
90
+ type=semver,pattern={{version}}
91
+ type=semver,pattern={{major}}.{{minor}}
92
+ type=raw,value=latest
93
+ # The base image has no git, so pass the version in via build-arg
94
+ # (VERSION, exported as SETUPTOOLS_SCM_PRETEND_VERSION by the Dockerfile) rather than relying on .git history.
95
+ - uses: docker/build-push-action@v6
96
+ with:
97
+ context: .
98
+ file: ./Dockerfile
99
+ push: true
100
+ build-args: |
101
+ VERSION=${{ steps.meta.outputs.version }}
102
+ tags: ${{ steps.meta.outputs.tags }}
103
+ labels: ${{ steps.meta.outputs.labels }}
104
+ cache-from: type=gha
105
+ cache-to: type=gha,mode=max
@@ -0,0 +1,23 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ .eggs/
8
+ *.egg
9
+ .env
10
+ .ruff_cache/
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+
14
+ # Generated by hatch-vcs at build time (version derived from git tags).
15
+ src/argus_quarry/_version.py
16
+
17
+ # uv lockfile — the suite uses the imperative `uv pip install` model, no lock.
18
+ uv.lock
19
+
20
+ # Local acquisition state (raw pool + provenance DB + cache/logs) and the
21
+ # default published view (DATASET_DIR = ./data) produced by `export`.
22
+ quarry/
23
+ data/
@@ -0,0 +1,380 @@
1
+ # argus-quarry — Design Doc (Draft)
2
+
3
+ > Status: **proposal / plan only** — no code yet.
4
+ > Owner: smk762 · Suite: Argus · Sibling of `argus-lens`, `argus-curator`, `argus-vision-demo`.
5
+
6
+ The *quarry* is where the suite digs up raw material. `argus-quarry` acquires
7
+ public-domain / CC0 portrait images from upstream archives and lands them —
8
+ **with full provenance and licensing** — into a folder the rest of the Argus
9
+ suite already knows how to consume (`DATASET_DIR` → `/data/images`).
10
+
11
+ It is deliberately **lean**: an *acquisition + provenance* tool, nothing more.
12
+ Everything downstream (quality scoring, near-dup, faces, embeddings, selection,
13
+ captioning, viewing) is already owned by `argus-curator` and `argus-lens`, and
14
+ `argus-quarry` must not re-implement it.
15
+
16
+ ---
17
+
18
+ ## 1. Where it fits in the suite
19
+
20
+ ```
21
+ argus-quarry (NEW) argus-curator (:8101) argus-lens (:8100) imogen / kohya
22
+ ─ download ─┐ ─ scan + score ─┐ ─ caption ─┐ ─ train ─
23
+ ─ verify ─┤ images + ─ near-dup ─┤ manifest ─ buckets ─┤ dataset ─ LoRA ─
24
+ ─ provenance┤ provenance ─ face-cluster ─┤ ─ (ident/ ─┴──────────► ───────►
25
+ ─ SHA256 ─┴───────────► ─ select+export ─┴──────────► wardrobe)
26
+ /data/images (DATASET_DIR) ───────────────────────────────►
27
+ ```
28
+
29
+ `argus-quarry` sits **upstream** of everything. It is a *producer* of
30
+ `DATASET_DIR`; curator and lens are the *consumers*. The only integration
31
+ surface is the shared images folder plus a new `gallery` compose profile —
32
+ the exact loosely-coupled pattern the suite already uses.
33
+
34
+ ---
35
+
36
+ ## 2. Scope
37
+
38
+ ### In scope (the genuinely new capability)
39
+
40
+ - **Source downloader modules** — one per archive, behind a common contract.
41
+ - **Provenance & licensing capture** — never lose source URL, licence, or
42
+ attribution. This is the reason the tool exists ("provenance-first").
43
+ - **Resumable, rate-limited, retrying downloads** with integrity verification.
44
+ - **Exact dedup at ingest** — SHA256 only. Skip bytes we already have.
45
+ - **Provenance database** — SQLite: `people` + `photographs`.
46
+ - **A thin folder layout** that lands cleanly as `DATASET_DIR`.
47
+
48
+ ### Out of scope (delegated — do NOT rebuild)
49
+
50
+ | Concern | Owned by | Why not here |
51
+ |---|---|---|
52
+ | Near-duplicate (pHash) detection | `argus-curator` | Curator already keeps the best representative and reports the rest. |
53
+ | Quality metrics (sharpness/blur/contrast/entropy/jpeg…) | `argus-curator` | Curator's scoring is *training-suitability* aware; a second stack would diverge. |
54
+ | Face detection / clustering / bounding boxes | `argus-curator` | InsightFace clustering already lives there. |
55
+ | CLIP / face embeddings | `argus-curator` (`gpu`/`faces`) | Same. |
56
+ | Quality/identity search & ranking | `argus-curator` manifest + CSV | Provenance search stays here; *quality* search is curator's. |
57
+ | Captioning | `argus-lens` | — |
58
+ | Rich gallery UI | `argus-vision-demo` frontend | Avoid a second UI (see §9). |
59
+
60
+ **Net effect vs. the original brief:** the `quality` table and all CV
61
+ (quality/faces/embeddings) sections are dropped from quarry. The DB shrinks to
62
+ `people` + `photographs`. `phash` is optional metadata only (recorded if cheap,
63
+ never the basis of quarry's dedup — SHA256 is). Search is provenance/licence
64
+ oriented, not quality/ranking oriented.
65
+
66
+ ---
67
+
68
+ ## 3. Repo layout (mirrors argus-curator conventions)
69
+
70
+ ```
71
+ argus-quarry/
72
+ ├── pyproject.toml # hatchling, src layout, optional-dependency extras
73
+ ├── Makefile # help/install/dev/lint/fmt/test/build/smoke
74
+ ├── Dockerfile
75
+ ├── README.md
76
+ ├── DESIGN.md # this file
77
+ ├── LICENSE # MIT
78
+ ├── src/argus_quarry/
79
+ │ ├── __init__.py # exports + __version__
80
+ │ ├── py.typed
81
+ │ ├── models.py # PortraitRecord, Person, Photograph (pydantic)
82
+ │ ├── store.py # SQLite provenance DB (sqlite3 stdlib, WAL)
83
+ │ ├── ingest.py # download → verify → SHA256 dedup → land → record
84
+ │ ├── net.py # httpx client: rate limit, retry/backoff, resume
85
+ │ ├── config.py # QuarryConfig: per-source settings, resolution + total-GB caps
86
+ │ ├── people.py # load seed list; optional Wikidata SPARQL harvester
87
+ │ ├── cli.py # typer app: run / fetch / export / list / stats / verify / people
88
+ │ ├── seeds/
89
+ │ │ └── people.yaml # curated deterministic seed (name, wikidata_id, aliases)
90
+ │ ├── downloaders/
91
+ │ │ ├── __init__.py # registry (name -> Downloader)
92
+ │ │ ├── base.py # Downloader protocol / ABC -> yields PortraitRecord
93
+ │ │ ├── commons.py # Wikimedia Commons (Phase 1)
94
+ │ │ ├── loc.py # Library of Congress (Phase 2)
95
+ │ │ ├── smithsonian.py # Smithsonian Open Access (Phase 2)
96
+ │ │ ├── rijksmuseum.py # Rijksmuseum Open Data (Phase 2)
97
+ │ │ ├── lac.py # Library & Archives Canada (Phase 2, Karsh)
98
+ │ │ ├── europeana.py # Europeana (Phase 3, rights-messy)
99
+ │ │ └── flickr.py # Flickr Commons (optional) (Phase 3)
100
+ │ └── server/ # OPTIONAL, deferred — see §9
101
+ └── tests/
102
+ ```
103
+
104
+ Package name `argus_quarry`, distribution `argus-quarry`, CLI entrypoint
105
+ `argus-quarry` (Typer), structlog for logging, pydantic v2 for models —
106
+ identical toolchain to curator so the suite stays consistent.
107
+
108
+ ---
109
+
110
+ ## 4. The common contract: `PortraitRecord`
111
+
112
+ Every downloader is source-independent because it yields the same object. The
113
+ rest of the pipeline never learns which archive a file came from.
114
+
115
+ ```python
116
+ class PortraitRecord(BaseModel):
117
+ # identity / subject
118
+ person_name: str # canonical folder name, e.g. "Albert_Einstein"
119
+ wikidata_id: str | None = None
120
+ birth_year: int | None = None
121
+ death_year: int | None = None
122
+ occupation: str | None = None
123
+
124
+ # the asset
125
+ title: str | None = None
126
+ photographer: str | None = None
127
+ year: int | None = None
128
+ remote_url: str # full-resolution source URL
129
+
130
+ # provenance / licence (NEVER optional in spirit — this is the point)
131
+ source: str # "commons" | "loc" | ...
132
+ source_url: str # human-facing landing page
133
+ licence: str # "PD" | "CC0" | "PD-US" | ...
134
+ attribution: str | None = None # required credit line if any
135
+ ```
136
+
137
+ `Downloader.harvest(query) -> Iterator[PortraitRecord]` streams candidates;
138
+ `ingest.py` turns each into bytes on disk + a DB row (idempotently).
139
+
140
+ ### The people seed (Q2, resolved: hybrid)
141
+
142
+ The subject list is decoupled from the downloaders. `people.py` supplies the
143
+ names/`wikidata_id`s each downloader harvests around, from two interchangeable
144
+ sources:
145
+
146
+ - **Curated seed (default, deterministic):** `seeds/people.yaml` — a small,
147
+ hand-maintained list (name, `wikidata_id`, aliases, optional birth/death).
148
+ This is what dev/QA runs against so results are reproducible and licence-safe.
149
+ - **Wikidata SPARQL harvester (optional, `--from-wikidata`):** query "humans
150
+ with a Commons portrait" (+ filters like occupation / death-year for PD
151
+ likelihood) to scale toward the 5–7k target. Cached under `QUARRY_HOME/cache`.
152
+
153
+ Both resolve to the same `Person` shape, so downloaders never care which was
154
+ used. Seed ships in Phase 1; SPARQL harvester lands in Phase 2.
155
+
156
+ ---
157
+
158
+ ## 5. Data model (SQLite)
159
+
160
+ Two tables. Provenance-first; no CV columns.
161
+
162
+ **people**
163
+ `id · name · wikidata_id · birth_year · death_year · occupation`
164
+
165
+ **photographs**
166
+ `id · person_id (fk) · title · photographer · year · source · source_url ·
167
+ licence · attribution · width · height · file_size · filename · sha256 (unique) ·
168
+ phash (nullable, informational) · remote_url · status · downloaded_at`
169
+
170
+ - `sha256` is `UNIQUE` → exact-dup ingest is a no-op (idempotent reruns).
171
+ - `status` tracks resumability: `pending | downloading | complete | failed`.
172
+ - `phash` recorded opportunistically (cheap with Pillow+ImageHash) but **never**
173
+ drives dedup here — that's curator's job.
174
+ - SQLite in WAL mode; single writer, safe concurrent readers.
175
+
176
+ Deliberately **no `quality` table** (dropped from the brief — see §2).
177
+
178
+ ---
179
+
180
+ ## 6. Folder structure produced
181
+
182
+ Two-stage layout: quarry fetches into a **raw pool** it fully owns, then
183
+ `export` publishes a clean, curator-ready tree into `DATASET_DIR`.
184
+
185
+ ```
186
+ $QUARRY_HOME/ # sibling ./quarry — side-car state, NEVER scanned
187
+ ├── images/ # the RAW POOL — every byte quarry has landed
188
+ │ ├── Albert_Einstein/
189
+ │ │ ├── einstein_1921_commons_<sha8>.jpg
190
+ │ │ └── ...
191
+ │ └── ...
192
+ ├── metadata/portraits.sqlite
193
+ ├── cache/ # HTTP cache / partial downloads (resume)
194
+ ├── logs/
195
+ └── thumbnails/ # OPTIONAL; curator makes its own previews
196
+
197
+ $DATASET_DIR/ # == /data/images — PUBLISHED view (via `export`)
198
+ ├── Albert_Einstein/ # symlinks (default) or copies into the pool
199
+ │ └── einstein_1921_commons_<sha8>.jpg -> $QUARRY_HOME/images/...
200
+ ├── Winston_Churchill/
201
+ └── ...
202
+ ```
203
+
204
+ Key decisions (Q4 + Q5, resolved):
205
+
206
+ - **`QUARRY_HOME` is a sibling `./quarry` dir**, fully outside the image tree, so
207
+ the DB/cache/logs a curator scan would choke on are never in view.
208
+ - **Images land in the raw pool first**, then `argus-quarry export` builds the
209
+ `Person_Name/` tree in `DATASET_DIR` — **symlink by default** (cheap, no
210
+ duplication), `--copy` when a mount can't cross the boundary. This keeps
211
+ quarry's provenance-complete pool separate from the curated view: you can
212
+ re-publish a subset (e.g. only `licence = CC0`) without re-downloading, and a
213
+ curator scan only ever sees clean images.
214
+
215
+ ---
216
+
217
+ ## 7. Downloader requirements
218
+
219
+ Each source module must:
220
+
221
+ - **Resume** interrupted downloads (partial-file + `status` in DB).
222
+ - **Skip** anything already `complete` (by `remote_url` / expected `sha256`).
223
+ - **Respect rate limits** (per-source token bucket in `net.py`; polite `User-Agent`).
224
+ - **Retry** transient network errors with exponential backoff + jitter.
225
+ - **Verify integrity** (content-length, decodes as an image via Pillow).
226
+ - **Record licence + attribution** — a record with no licence is quarantined,
227
+ not landed.
228
+ - **Prefer high resolution within a configurable cap** (Q3): request the largest
229
+ rendition the API offers, but downscale/skip past a per-file ceiling
230
+ (`QuarryConfig.max_megapixels` default ~12 MP, `max_file_bytes` default a few
231
+ MB), overridable per run. The **full-resolution `remote_url` is always kept in
232
+ the DB**, so a capped image can be re-fetched at original size on demand
233
+ without losing provenance. Keeps the archive inside the 20–40 GB budget by
234
+ default while never throwing away the ability to go bigger.
235
+ - **Respect a total-archive budget** (`QuarryConfig.max_total_gb`, env
236
+ `QUARRY_MAX_GB`, default e.g. `40`; `0`/unset = unlimited): before each write,
237
+ check the current raw-pool size (`QUARRY_HOME/images`, tracked incrementally
238
+ from `photographs.file_size` so it's O(1), not a directory walk). When the
239
+ next file would exceed the ceiling, **stop the run cleanly** — mark remaining
240
+ candidates `pending` (resumable later if the cap is raised), log a
241
+ `budget_reached` event, and exit non-error. This bounds disk use predictably
242
+ for dev/QA regardless of how many sources/people are queued.
243
+ - **Log all failures** (structlog → `logs/`), never crash the whole run.
244
+
245
+ Everything is **idempotent**: rerunning `fetch` resumes/repairs, never duplicates.
246
+
247
+ ---
248
+
249
+ ## 8. Suite integration (compose)
250
+
251
+ Add a `gallery` profile to the demo's `compose.yaml`. It's a run-to-completion
252
+ job (not a long-lived server). Quarry fetches into its own pool
253
+ (`$QUARRY_HOME/images`) and then publishes into `DATASET_DIR` — both mounts are
254
+ present so a single `up` can fetch-then-export:
255
+
256
+ ```yaml
257
+ argus-quarry:
258
+ profiles: ["gallery"]
259
+ build:
260
+ context: ../argus-quarry
261
+ image: argus-quarry:latest
262
+ environment:
263
+ - QUARRY_HOME=/data/quarry
264
+ - QUARRY_MAX_GB=${QUARRY_MAX_GB:-40} # total raw-pool ceiling; 0 = unlimited
265
+ - COMMONS_USER_AGENT=${COMMONS_USER_AGENT:-argus-quarry/0.1 (contact@example.com)}
266
+ volumes:
267
+ - ${QUARRY_HOME:-./quarry}:/data/quarry # raw pool + db/cache/logs
268
+ - ${DATASET_DIR:-./data}:/data/images # published (curator-ready) view
269
+ # fetch into the pool, then publish a symlinked tree into DATASET_DIR
270
+ command: ["run", "--source", "commons", "--limit", "500", "--export", "--licence", "CC0,PD"]
271
+ restart: "no"
272
+ ```
273
+
274
+ > Symlinks only resolve inside a container if their target path is mounted there too; since
275
+ > the published tree points back into `/data/quarry/images`, the curator/lens
276
+ > containers must also mount `QUARRY_HOME` **or** quarry should publish with
277
+ > `--copy`. Simplest for the suite: curator/lens add the same
278
+ > `${QUARRY_HOME}:/data/quarry:ro` read-only mount. Documented in `.env.example`.
279
+
280
+ Usage stays true to the suite's profile idiom:
281
+
282
+ ```bash
283
+ docker compose --profile gallery up --build # fetch -> pool -> publish DATASET_DIR
284
+ docker compose --profile curator up --build # then curate the published view
285
+ ```
286
+
287
+ New `.env` knobs (documented in `.env.example`): `QUARRY_HOME` (default
288
+ `./quarry`), `QUARRY_MAX_GB` (total raw-pool ceiling, default `40`, `0` =
289
+ unlimited), per-source API keys / contact `User-Agent` strings (Commons and LoC
290
+ want a real UA; Rijksmuseum and Europeana need API keys).
291
+
292
+ ---
293
+
294
+ ## 9. On the "local viewer"
295
+
296
+ The brief asks for a Flask/FastAPI viewer. That overlaps with the existing
297
+ Next.js frontend, so:
298
+
299
+ - **Phase 1–2:** no standalone UI. `argus-quarry stats` / `list` on the CLI are
300
+ enough to inspect provenance.
301
+ - **Later (optional):** a tiny read-only FastAPI `server/` exposing provenance
302
+ queries (by person / photographer / year / source / licence), which the demo
303
+ frontend could surface as a `/gallery` route — consistent with how `/curate`
304
+ already talks to curator. Not built until there's demand.
305
+
306
+ This keeps us to one real UI (the demo) instead of maintaining a second.
307
+
308
+ ---
309
+
310
+ ## 10. Licensing / feasibility notes (for the 5–7k, 20–40 GB target)
311
+
312
+ - **Reliable PD/CC0 with real APIs:** Wikimedia Commons, Library of Congress,
313
+ Smithsonian Open Access, Rijksmuseum Open Data. Start here.
314
+ - **Messier rights:** Europeana and Flickr Commons mix licences per-item — the
315
+ downloader must read per-record rights and quarantine anything not clearly
316
+ PD/CC0.
317
+ - **LAC / Karsh:** many Karsh works are *not* PD (photographer d. 2002); treat
318
+ as a curated allow-list, not a bulk scrape.
319
+ - For dev/QA this dataset is plenty; do **not** advertise uniformly clean
320
+ licences across every source — enforce licence checks per record instead.
321
+
322
+ ---
323
+
324
+ ## 11. Design principles (unchanged from the brief, enforced by the above)
325
+
326
+ Modular · source-independent (`PortraitRecord`) · idempotent (SHA256 + `status`)
327
+ · extensible (downloader registry) · reproducible · **provenance-first** (a
328
+ record with no licence never lands) · optimised as the *input* to the suite's
329
+ existing CV/search/curation stages rather than duplicating them.
330
+
331
+ ---
332
+
333
+ ## 12. Phased delivery
334
+
335
+ **Phase 1 — walking skeleton**
336
+ - `pyproject.toml` (extras: `cli`, `server`, `dev`), `Makefile`, `Dockerfile`.
337
+ - `models.PortraitRecord`, `store` (SQLite `people`+`photographs`, WAL, migrations).
338
+ - `net` (rate limit + retry + resume), `ingest` (download→verify→cap→SHA256→pool→record).
339
+ - `people.py` + `seeds/people.yaml` curated seed loader.
340
+ - `downloaders/commons.py` (Wikimedia Commons).
341
+ - `export` (symlink/`--copy` published tree, with `--licence` filter).
342
+ - Typer CLI: `run` (fetch+export), `fetch`, `export`, `list`, `stats`, `verify`.
343
+ - `gallery` compose profile wired into `argus-vision-demo/compose.yaml`.
344
+
345
+ **Phase 2 — breadth**
346
+ - `loc`, `smithsonian`, `rijksmuseum`, curated `lac` (Karsh allow-list) downloaders.
347
+ - Wikidata SPARQL people harvester (`people --from-wikidata`); incremental update mode.
348
+ - Opportunistic `phash` metadata (informational only).
349
+
350
+ **Phase 3 — polish (optional)**
351
+ - `europeana`, `flickr` with strict per-record rights filtering.
352
+ - Read-only provenance FastAPI + `/gallery` route in the demo frontend.
353
+
354
+ ---
355
+
356
+ ## 13. Resolved decisions
357
+
358
+ 1. **Name** — `argus-quarry` (acquisition connotation, no clash with the
359
+ frontend's "viewing"). ✅
360
+ 2. **People list** — **hybrid**: curated `seeds/people.yaml` for deterministic
361
+ dev/QA (Phase 1), plus an optional Wikidata SPARQL harvester to scale toward
362
+ 5–7k (Phase 2). See §4. ✅
363
+ 3. **Resolution** — **configurable per-file cap** (~12 MP / few MB default),
364
+ full-res `remote_url` retained for on-demand re-fetch; overridable per run.
365
+ Keeps the 20–40 GB budget without discarding fidelity. See §7. ✅
366
+ 4. **`QUARRY_HOME`** — sibling **`./quarry`** dir, fully outside the image tree.
367
+ See §6. ✅
368
+ 5. **Landing** — **raw pool + export**: fetch into `QUARRY_HOME/images`, then
369
+ `export` publishes a `Person_Name/` tree into `DATASET_DIR` (symlink default,
370
+ `--copy` fallback), with an optional `--licence` filter. See §6/§8. ✅
371
+
372
+ ### Follow-ups that surfaced while resolving
373
+
374
+ - **Symlink cross-mount:** for the published symlink tree to resolve inside
375
+ curator/lens containers, they must also mount `QUARRY_HOME` read-only, else
376
+ quarry must publish with `--copy`. Needs a one-line `.env.example` + compose note
377
+ when wiring the `gallery` profile (already flagged in §8).
378
+ - **Wikidata → PD likelihood:** SPARQL harvester should pre-filter on death-year
379
+ / country to reduce quarantines, but licence is still enforced per-record at
380
+ ingest (never trust the query alone).
@@ -0,0 +1,25 @@
1
+ FROM python:3.12-slim
2
+
3
+ # System libs for Pillow image decoding.
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ libjpeg62-turbo libopenjp2-7 zlib1g \
6
+ && rm -rf /var/lib/apt/lists/*
7
+
8
+ WORKDIR /app
9
+
10
+ # The base image has no `git`, so hatch-vcs can't derive the version from
11
+ # history — hand it in via the VERSION build arg (the release tag, sans "v").
12
+ # Defaults to 0.0.0 for local `docker compose` builds.
13
+ ARG VERSION=0.0.0
14
+ ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
15
+
16
+ COPY . /app
17
+
18
+ RUN --mount=type=cache,target=/root/.cache/pip \
19
+ pip install --upgrade pip && \
20
+ pip install ".[cli,phash]"
21
+
22
+ # Run-to-completion acquisition job (not a long-lived server). The compose
23
+ # `gallery` profile overrides this command with concrete source/limit flags.
24
+ ENTRYPOINT ["argus-quarry"]
25
+ CMD ["--help"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 dragonhound
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,47 @@
1
+ .DEFAULT_GOAL := help
2
+ DIST := dist
3
+ UV := uv
4
+
5
+ .PHONY: help install dev lint fmt test build clean smoke check
6
+
7
+ # List every `target: ## description` line; awk alone reads the makefiles, so no "file:" prefix leaks into the target column when several are parsed.
8
+ help: ## Show this help
9
+ @awk 'BEGIN {FS = ":.*## "} /^[a-zA-Z_-]+:.*## / {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
10
+
11
+ install: ## Create .venv + editable install (core deps only)
12
+ $(UV) venv
13
+ $(UV) pip install -e .
14
+
15
+ dev: ## Create .venv + editable install with dev + cli extras
16
+ $(UV) venv
17
+ $(UV) pip install -e ".[dev,cli]"
18
+
19
+ lint: ## Run ruff linter
20
+ $(UV) run --no-sync ruff check src/ tests/
21
+
22
+ fmt: ## Auto-format with ruff
23
+ $(UV) run --no-sync ruff format src/ tests/
24
+ $(UV) run --no-sync ruff check --fix src/ tests/
25
+
26
+ test: ## Run pytest
27
+ $(UV) run --no-sync pytest --tb=short -q
28
+
29
+ build: clean ## Build sdist + wheel into dist/
30
+ $(UV) build
31
+ @echo ""
32
+ @ls -lh $(DIST)/
33
+
34
+ clean: ## Remove build artifacts
35
+ rm -rf $(DIST) build src/*.egg-info src/argus_quarry/*.egg-info
36
+
37
+ # Runs as ONE shell command: `set -e` aborts on the first failing step and the EXIT trap always removes the throwaway venv directory.
38
+ smoke: build ## Build wheel, install in throwaway venv, smoke-test import
39
+ set -e; tmp="$$(mktemp -d)"; trap 'rm -rf "$$tmp"' EXIT; \
40
+ $(UV) venv "$$tmp/venv"; \
41
+ $(UV) pip install --python "$$tmp/venv" $(DIST)/*.whl; \
42
+ "$$tmp/venv/bin/python" -c \
43
+ "from argus_quarry import PortraitRecord, __version__; print(f'argus-quarry {__version__} OK')"
44
+
45
+ check: lint test build ## Full local CI: lint + test + build
46
+ @echo ""
47
+ @echo "All checks passed."