argus-quarry 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argus_quarry-0.1.0/.env.example +22 -0
- argus_quarry-0.1.0/.github/workflows/ci.yml +51 -0
- argus_quarry-0.1.0/.github/workflows/release.yml +105 -0
- argus_quarry-0.1.0/.gitignore +23 -0
- argus_quarry-0.1.0/DESIGN.md +380 -0
- argus_quarry-0.1.0/Dockerfile +25 -0
- argus_quarry-0.1.0/LICENSE +21 -0
- argus_quarry-0.1.0/Makefile +47 -0
- argus_quarry-0.1.0/PKG-INFO +233 -0
- argus_quarry-0.1.0/README.md +186 -0
- argus_quarry-0.1.0/pyproject.toml +79 -0
- argus_quarry-0.1.0/src/argus_quarry/__init__.py +43 -0
- argus_quarry-0.1.0/src/argus_quarry/_version.py +24 -0
- argus_quarry-0.1.0/src/argus_quarry/cli.py +260 -0
- argus_quarry-0.1.0/src/argus_quarry/config.py +104 -0
- argus_quarry-0.1.0/src/argus_quarry/downloaders/__init__.py +29 -0
- argus_quarry-0.1.0/src/argus_quarry/downloaders/base.py +39 -0
- argus_quarry-0.1.0/src/argus_quarry/downloaders/commons.py +115 -0
- argus_quarry-0.1.0/src/argus_quarry/export.py +82 -0
- argus_quarry-0.1.0/src/argus_quarry/ingest.py +257 -0
- argus_quarry-0.1.0/src/argus_quarry/models.py +151 -0
- argus_quarry-0.1.0/src/argus_quarry/net.py +164 -0
- argus_quarry-0.1.0/src/argus_quarry/people.py +56 -0
- argus_quarry-0.1.0/src/argus_quarry/py.typed +0 -0
- argus_quarry-0.1.0/src/argus_quarry/seeds/people.yaml +43 -0
- argus_quarry-0.1.0/src/argus_quarry/store.py +254 -0
- argus_quarry-0.1.0/tests/conftest.py +75 -0
- argus_quarry-0.1.0/tests/test_cli.py +20 -0
- argus_quarry-0.1.0/tests/test_commons.py +60 -0
- argus_quarry-0.1.0/tests/test_export.py +42 -0
- argus_quarry-0.1.0/tests/test_ingest.py +90 -0
- argus_quarry-0.1.0/tests/test_models.py +40 -0
- argus_quarry-0.1.0/tests/test_people.py +24 -0
- argus_quarry-0.1.0/tests/test_store.py +52 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# ── Acquisition state ────────────────────────────────────────────────
|
|
2
|
+
# QUARRY_HOME is a sibling side-car dir (raw pool + DB + cache + logs). It
|
|
3
|
+
# lives OUTSIDE the published image tree so a curator scan never sees it.
|
|
4
|
+
QUARRY_HOME=./quarry
|
|
5
|
+
|
|
6
|
+
# Total raw-pool ceiling in GB. 0 (or unset) = unlimited. When the next file
|
|
7
|
+
# would exceed this, the run stops cleanly and marks the rest resumable.
|
|
8
|
+
QUARRY_MAX_GB=40
|
|
9
|
+
|
|
10
|
+
# ── Per-source politeness ────────────────────────────────────────────
|
|
11
|
+
# Wikimedia Commons (and LoC) ask for a descriptive User-Agent with contact.
|
|
12
|
+
COMMONS_USER_AGENT=argus-quarry/0.1 (contact@example.com)
|
|
13
|
+
|
|
14
|
+
# ── Suite integration (compose) ──────────────────────────────────────
|
|
15
|
+
# Published, curator-ready view. Quarry `export` builds a Person_Name/ tree
|
|
16
|
+
# here (symlinks by default) pointing back into QUARRY_HOME/images.
|
|
17
|
+
DATASET_DIR=./data
|
|
18
|
+
|
|
19
|
+
# NOTE: the published tree symlinks into QUARRY_HOME/images. For those links
|
|
20
|
+
# to resolve inside the curator/lens containers, they must ALSO mount
|
|
21
|
+
# QUARRY_HOME read-only (e.g. ${QUARRY_HOME}:/data/quarry:ro), or run quarry
|
|
22
|
+
# `export --copy` so the published files are real copies.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ci-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
lint:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: astral-sh/setup-uv@v6
|
|
19
|
+
with:
|
|
20
|
+
enable-cache: true
|
|
21
|
+
- run: uvx ruff@0.15.20 check src/ tests/
|
|
22
|
+
- run: uvx ruff@0.15.20 format --check src/ tests/
|
|
23
|
+
|
|
24
|
+
test:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
strategy:
|
|
27
|
+
matrix:
|
|
28
|
+
python-version: ["3.11", "3.12"]
|
|
29
|
+
steps:
|
|
30
|
+
- uses: actions/checkout@v4
|
|
31
|
+
- uses: astral-sh/setup-uv@v6
|
|
32
|
+
with:
|
|
33
|
+
enable-cache: true
|
|
34
|
+
- run: uv venv --python ${{ matrix.python-version }}
|
|
35
|
+
- run: uv pip install -e ".[dev,cli]"
|
|
36
|
+
- run: uv run --no-sync pytest --tb=short -q
|
|
37
|
+
|
|
38
|
+
build:
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
needs: [lint, test]
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/checkout@v4
|
|
43
|
+
- uses: astral-sh/setup-uv@v6
|
|
44
|
+
with:
|
|
45
|
+
enable-cache: true
|
|
46
|
+
- run: uv build
|
|
47
|
+
- uses: actions/upload-artifact@v4
|
|
48
|
+
with:
|
|
49
|
+
name: dist
|
|
50
|
+
path: dist/
|
|
51
|
+
retention-days: 14
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
name: Release to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
with:
|
|
17
|
+
fetch-depth: 0
|
|
18
|
+
- uses: astral-sh/setup-uv@v6
|
|
19
|
+
with:
|
|
20
|
+
enable-cache: true
|
|
21
|
+
- run: uv build
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: dist
|
|
25
|
+
path: dist/
|
|
26
|
+
|
|
27
|
+
test-install:
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
needs: build
|
|
30
|
+
strategy:
|
|
31
|
+
matrix:
|
|
32
|
+
python-version: ["3.11", "3.12"]
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
- uses: astral-sh/setup-uv@v6
|
|
39
|
+
with:
|
|
40
|
+
enable-cache: true
|
|
41
|
+
- name: Install wheel and smoke-test import
|
|
42
|
+
run: |
|
|
43
|
+
uv venv --python ${{ matrix.python-version }}
|
|
44
|
+
uv pip install dist/*.whl
|
|
45
|
+
uv run --no-sync python -c "from argus_quarry import PortraitRecord, __version__; print(f'argus-quarry {__version__} OK')"
|
|
46
|
+
|
|
47
|
+
publish-testpypi:
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
needs: test-install
|
|
50
|
+
environment: testpypi
|
|
51
|
+
steps:
|
|
52
|
+
- uses: actions/download-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: dist
|
|
55
|
+
path: dist/
|
|
56
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
57
|
+
with:
|
|
58
|
+
repository-url: https://test.pypi.org/legacy/
|
|
59
|
+
|
|
60
|
+
publish-pypi:
|
|
61
|
+
runs-on: ubuntu-latest
|
|
62
|
+
needs: publish-testpypi
|
|
63
|
+
environment: pypi
|
|
64
|
+
steps:
|
|
65
|
+
- uses: actions/download-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: dist
|
|
68
|
+
path: dist/
|
|
69
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
70
|
+
|
|
71
|
+
publish-image:
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
needs: test-install
|
|
74
|
+
permissions:
|
|
75
|
+
contents: read
|
|
76
|
+
packages: write
|
|
77
|
+
steps:
|
|
78
|
+
- uses: actions/checkout@v4
|
|
79
|
+
- uses: docker/setup-buildx-action@v3
|
|
80
|
+
- uses: docker/login-action@v3
|
|
81
|
+
with:
|
|
82
|
+
registry: ghcr.io
|
|
83
|
+
username: ${{ github.actor }}
|
|
84
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
85
|
+
- id: meta
|
|
86
|
+
uses: docker/metadata-action@v5
|
|
87
|
+
with:
|
|
88
|
+
images: ghcr.io/${{ github.repository_owner }}/argus-quarry
|
|
89
|
+
tags: |
|
|
90
|
+
type=semver,pattern={{version}}
|
|
91
|
+
type=semver,pattern={{major}}.{{minor}}
|
|
92
|
+
type=raw,value=latest
|
|
93
|
+
# The base image has no git, so pass the version in via build-arg
|
|
94
|
+
# (SETUPTOOLS_SCM_PRETEND_VERSION) rather than relying on .git history.
|
|
95
|
+
- uses: docker/build-push-action@v6
|
|
96
|
+
with:
|
|
97
|
+
context: .
|
|
98
|
+
file: ./Dockerfile
|
|
99
|
+
push: true
|
|
100
|
+
build-args: |
|
|
101
|
+
VERSION=${{ steps.meta.outputs.version }}
|
|
102
|
+
tags: ${{ steps.meta.outputs.tags }}
|
|
103
|
+
labels: ${{ steps.meta.outputs.labels }}
|
|
104
|
+
cache-from: type=gha
|
|
105
|
+
cache-to: type=gha,mode=max
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
dist/
|
|
5
|
+
build/
|
|
6
|
+
.venv/
|
|
7
|
+
.eggs/
|
|
8
|
+
*.egg
|
|
9
|
+
.env
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.pytest_cache/
|
|
12
|
+
.mypy_cache/
|
|
13
|
+
|
|
14
|
+
# Generated by hatch-vcs at build time (version derived from git tags).
|
|
15
|
+
src/argus_quarry/_version.py
|
|
16
|
+
|
|
17
|
+
# uv lockfile — the suite uses the imperative `uv pip install` model, no lock.
|
|
18
|
+
uv.lock
|
|
19
|
+
|
|
20
|
+
# Local acquisition state (raw pool + provenance DB + cache/logs) and the
|
|
21
|
+
# default published view (DATASET_DIR = ./data) produced by `export`.
|
|
22
|
+
quarry/
|
|
23
|
+
data/
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# argus-quarry — Design Doc (Draft)
|
|
2
|
+
|
|
3
|
+
> Status: **design doc, written as a proposal** — it predates the code; the Phase 1 package modules listed in §12 ship in this 0.1.0 release.
|
|
4
|
+
> Owner: smk762 · Suite: Argus · Sibling of `argus-lens`, `argus-curator`, `argus-vision-demo`.
|
|
5
|
+
|
|
6
|
+
The *quarry* is where the suite digs up raw material. `argus-quarry` acquires
|
|
7
|
+
public-domain / CC0 portrait images from upstream archives and lands them —
|
|
8
|
+
**with full provenance and licensing** — into a folder the rest of the Argus
|
|
9
|
+
suite already knows how to consume (`DATASET_DIR` → `/data/images`).
|
|
10
|
+
|
|
11
|
+
It is deliberately **lean**: an *acquisition + provenance* tool, nothing more.
|
|
12
|
+
Everything downstream (quality scoring, near-dup, faces, embeddings, selection,
|
|
13
|
+
captioning, viewing) is already owned by `argus-curator` and `argus-lens`, and
|
|
14
|
+
`argus-quarry` must not re-implement it.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## 1. Where it fits in the suite
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
argus-quarry (NEW) argus-curator (:8101) argus-lens (:8100) imogen / kohya
|
|
22
|
+
─ download ─┐ ─ scan + score ─┐ ─ caption ─┐ ─ train ─
|
|
23
|
+
─ verify ─┤ images + ─ near-dup ─┤ manifest ─ buckets ─┤ dataset ─ LoRA ─
|
|
24
|
+
─ provenance┤ provenance ─ face-cluster ─┤ ─ (ident/ ─┴──────────► ───────►
|
|
25
|
+
─ SHA256 ─┴───────────► ─ select+export ─┴──────────► wardrobe)
|
|
26
|
+
/data/images (DATASET_DIR) ───────────────────────────────►
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`argus-quarry` sits **upstream** of everything. It is a *producer* of
|
|
30
|
+
`DATASET_DIR`; curator and lens are the *consumers*. The only integration
|
|
31
|
+
surface is the shared images folder plus a new `gallery` compose profile —
|
|
32
|
+
the exact loosely-coupled pattern the suite already uses.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 2. Scope
|
|
37
|
+
|
|
38
|
+
### In scope (the genuinely new capability)
|
|
39
|
+
|
|
40
|
+
- **Source downloader modules** — one per archive, behind a common contract.
|
|
41
|
+
- **Provenance & licensing capture** — never lose source URL, licence, or
|
|
42
|
+
attribution. This is the reason the tool exists ("provenance-first").
|
|
43
|
+
- **Resumable, rate-limited, retrying downloads** with integrity verification.
|
|
44
|
+
- **Exact dedup at ingest** — SHA256 only. Skip bytes we already have.
|
|
45
|
+
- **Provenance database** — SQLite: `people` + `photographs`.
|
|
46
|
+
- **A thin folder layout** that lands cleanly as `DATASET_DIR`.
|
|
47
|
+
|
|
48
|
+
### Out of scope (delegated — do NOT rebuild)
|
|
49
|
+
|
|
50
|
+
| Concern | Owned by | Why not here |
|
|
51
|
+
|---|---|---|
|
|
52
|
+
| Near-duplicate (pHash) detection | `argus-curator` | Curator already keeps the best representative and reports the rest. |
|
|
53
|
+
| Quality metrics (sharpness/blur/contrast/entropy/jpeg…) | `argus-curator` | Curator's scoring is *training-suitability* aware; a second stack would diverge. |
|
|
54
|
+
| Face detection / clustering / bounding boxes | `argus-curator` | InsightFace clustering already lives there. |
|
|
55
|
+
| CLIP / face embeddings | `argus-curator` (`gpu`/`faces`) | Same. |
|
|
56
|
+
| Quality/identity search & ranking | `argus-curator` manifest + CSV | Provenance search stays here; *quality* search is curator's. |
|
|
57
|
+
| Captioning | `argus-lens` | — |
|
|
58
|
+
| Rich gallery UI | `argus-vision-demo` frontend | Avoid a second UI (see §9). |
|
|
59
|
+
|
|
60
|
+
**Net effect vs. the original brief:** the `quality` table and all CV
|
|
61
|
+
(quality/faces/embeddings) sections are dropped from quarry. The DB shrinks to
|
|
62
|
+
`people` + `photographs`. `phash` is optional metadata only (recorded if cheap,
|
|
63
|
+
never the basis of quarry's dedup — SHA256 is). Search is provenance/licence
|
|
64
|
+
oriented, not quality/ranking oriented.
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 3. Repo layout (mirrors argus-curator conventions)
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
argus-quarry/
|
|
72
|
+
├── pyproject.toml # hatchling, src layout, optional-dependency extras
|
|
73
|
+
├── Makefile # help/install/dev/lint/fmt/test/build/clean/smoke/check
|
|
74
|
+
├── Dockerfile
|
|
75
|
+
├── README.md
|
|
76
|
+
├── DESIGN.md # this file
|
|
77
|
+
├── LICENSE # MIT
|
|
78
|
+
├── src/argus_quarry/
|
|
79
|
+
│ ├── __init__.py # exports + __version__
|
|
80
|
+
│ ├── py.typed
|
|
81
|
+
│ ├── models.py # PortraitRecord, Person, Photograph (pydantic)
|
|
82
|
+
│ ├── store.py # SQLite provenance DB (sqlite3 stdlib, WAL)
|
|
83
|
+
│ ├── ingest.py # download → verify → SHA256 dedup → land → record
|
|
84
|
+
│ ├── net.py # httpx client: rate limit, retry/backoff, resume
|
|
85
|
+
│ ├── config.py # QuarryConfig: per-source settings, resolution + total-GB caps
|
|
86
|
+
│ ├── people.py # load seed list; optional Wikidata SPARQL harvester
|
|
87
|
+
│ ├── cli.py # typer app: run / fetch / export / list / stats / verify / people
|
|
88
|
+
│ ├── seeds/
|
|
89
|
+
│ │ └── people.yaml # curated deterministic seed (name, wikidata_id, aliases)
|
|
90
|
+
│ ├── downloaders/
|
|
91
|
+
│ │ ├── __init__.py # registry (name -> Downloader)
|
|
92
|
+
│ │ ├── base.py # Downloader protocol / ABC -> yields PortraitRecord
|
|
93
|
+
│ │ ├── commons.py # Wikimedia Commons (Phase 1)
|
|
94
|
+
│ │ ├── loc.py # Library of Congress (Phase 2)
|
|
95
|
+
│ │ ├── smithsonian.py # Smithsonian Open Access (Phase 2)
|
|
96
|
+
│ │ ├── rijksmuseum.py # Rijksmuseum Open Data (Phase 2)
|
|
97
|
+
│ │ ├── lac.py # Library & Archives Canada (Phase 2, Karsh)
|
|
98
|
+
│ │ ├── europeana.py # Europeana (Phase 3, rights-messy)
|
|
99
|
+
│ │ └── flickr.py # Flickr Commons (optional) (Phase 3)
|
|
100
|
+
│ └── server/ # OPTIONAL, deferred — see §9
|
|
101
|
+
└── tests/
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Package name `argus_quarry`, distribution `argus-quarry`, CLI entrypoint
|
|
105
|
+
`argus-quarry` (Typer), structlog for logging, pydantic v2 for models —
|
|
106
|
+
identical toolchain to curator so the suite stays consistent.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 4. The common contract: `PortraitRecord`
|
|
111
|
+
|
|
112
|
+
Every downloader is source-independent because it yields the same object. The
|
|
113
|
+
rest of the pipeline never learns which archive a file came from.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
class PortraitRecord(BaseModel):
|
|
117
|
+
# identity / subject
|
|
118
|
+
person_name: str # canonical folder name, e.g. "Albert_Einstein"
|
|
119
|
+
wikidata_id: str | None = None
|
|
120
|
+
birth_year: int | None = None
|
|
121
|
+
death_year: int | None = None
|
|
122
|
+
occupation: str | None = None
|
|
123
|
+
|
|
124
|
+
# the asset
|
|
125
|
+
title: str | None = None
|
|
126
|
+
photographer: str | None = None
|
|
127
|
+
year: int | None = None
|
|
128
|
+
remote_url: str # full-resolution source URL
|
|
129
|
+
|
|
130
|
+
# provenance / licence (NEVER optional in spirit — this is the point)
|
|
131
|
+
source: str # "commons" | "loc" | ...
|
|
132
|
+
source_url: str # human-facing landing page
|
|
133
|
+
licence: str # "PD" | "CC0" | "PD-US" | ...
|
|
134
|
+
attribution: str | None = None # required credit line if any
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
`Downloader.harvest(query) -> Iterator[PortraitRecord]` streams candidates;
|
|
138
|
+
`ingest.py` turns each into bytes on disk + a DB row (idempotently).
|
|
139
|
+
|
|
140
|
+
### The people seed (Q2, resolved: hybrid)
|
|
141
|
+
|
|
142
|
+
The subject list is decoupled from the downloaders. `people.py` supplies the
|
|
143
|
+
names/`wikidata_id`s each downloader harvests around, from two interchangeable
|
|
144
|
+
sources:
|
|
145
|
+
|
|
146
|
+
- **Curated seed (default, deterministic):** `seeds/people.yaml` — a small,
|
|
147
|
+
hand-maintained list (name, `wikidata_id`, aliases, optional birth/death).
|
|
148
|
+
This is what dev/QA runs against so results are reproducible and licence-safe.
|
|
149
|
+
- **Wikidata SPARQL harvester (optional, `--from-wikidata`):** query "humans
|
|
150
|
+
with a Commons portrait" (+ filters like occupation / death-year for PD
|
|
151
|
+
likelihood) to scale toward the 5–7k target. Cached under `QUARRY_HOME/cache`.
|
|
152
|
+
|
|
153
|
+
Both resolve to the same `Person` shape, so downloaders never care which was
|
|
154
|
+
used. Seed ships in Phase 1; SPARQL harvester lands in Phase 2.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## 5. Data model (SQLite)
|
|
159
|
+
|
|
160
|
+
Two tables. Provenance-first; no CV columns.
|
|
161
|
+
|
|
162
|
+
**people**
|
|
163
|
+
`id · name · wikidata_id · birth_year · death_year · occupation`
|
|
164
|
+
|
|
165
|
+
**photographs**
|
|
166
|
+
`id · person_id (fk) · title · photographer · year · source · source_url ·
|
|
167
|
+
licence · attribution · width · height · file_size · filename · sha256 (unique) ·
|
|
168
|
+
phash (nullable, informational) · remote_url · status · downloaded_at`
|
|
169
|
+
|
|
170
|
+
- `sha256` is `UNIQUE` → exact-dup ingest is a no-op (idempotent reruns).
|
|
171
|
+
- `status` tracks resumability: `pending | downloading | complete | failed`.
|
|
172
|
+
- `phash` recorded opportunistically (cheap with Pillow+ImageHash) but **never**
|
|
173
|
+
drives dedup here — that's curator's job.
|
|
174
|
+
- SQLite in WAL mode; single writer, safe concurrent readers.
|
|
175
|
+
|
|
176
|
+
Deliberately **no `quality` table** (dropped from the brief — see §2).
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## 6. Folder structure produced
|
|
181
|
+
|
|
182
|
+
Two-stage layout: quarry fetches into a **raw pool** it fully owns, then
|
|
183
|
+
`export` publishes a clean, curator-ready tree into `DATASET_DIR`.
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
$QUARRY_HOME/ # sibling ./quarry — side-car state, NEVER scanned
|
|
187
|
+
├── images/ # the RAW POOL — every byte quarry has landed
|
|
188
|
+
│ ├── Albert_Einstein/
|
|
189
|
+
│ │ ├── einstein_1921_commons_<sha8>.jpg
|
|
190
|
+
│ │ └── ...
|
|
191
|
+
│ └── ...
|
|
192
|
+
├── metadata/portraits.sqlite
|
|
193
|
+
├── cache/ # HTTP cache / partial downloads (resume)
|
|
194
|
+
├── logs/
|
|
195
|
+
└── thumbnails/ # OPTIONAL; curator makes its own previews
|
|
196
|
+
|
|
197
|
+
$DATASET_DIR/ # == /data/images — PUBLISHED view (via `export`)
|
|
198
|
+
├── Albert_Einstein/ # symlinks (default) or copies into the pool
|
|
199
|
+
│ └── einstein_1921_commons_<sha8>.jpg -> $QUARRY_HOME/images/...
|
|
200
|
+
├── Winston_Churchill/
|
|
201
|
+
└── ...
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Key decisions (Q4 + Q5, resolved):
|
|
205
|
+
|
|
206
|
+
- **`QUARRY_HOME` is a sibling `./quarry` dir**, fully outside the image tree, so
|
|
207
|
+
the DB/cache/logs a curator scan would choke on are never in view.
|
|
208
|
+
- **Images land in the raw pool first**, then `argus-quarry export` builds the
|
|
209
|
+
`Person_Name/` tree in `DATASET_DIR` — **symlink by default** (cheap, no
|
|
210
|
+
duplication), `--copy` when a mount can't cross the boundary. This keeps
|
|
211
|
+
quarry's provenance-complete pool separate from the curated view: you can
|
|
212
|
+
re-publish a subset (e.g. only `licence = CC0`) without re-downloading, and a
|
|
213
|
+
curator scan only ever sees clean images.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## 7. Downloader requirements
|
|
218
|
+
|
|
219
|
+
Each source module must:
|
|
220
|
+
|
|
221
|
+
- **Resume** interrupted downloads (partial-file + `status` in DB).
|
|
222
|
+
- **Skip** anything already `complete` (by `remote_url` / expected `sha256`).
|
|
223
|
+
- **Respect rate limits** (per-source token bucket in `net.py`; polite `User-Agent`).
|
|
224
|
+
- **Retry** transient network errors with exponential backoff + jitter.
|
|
225
|
+
- **Verify integrity** (content-length, decodes as an image via Pillow).
|
|
226
|
+
- **Record licence + attribution** — a record with no licence is quarantined,
|
|
227
|
+
not landed.
|
|
228
|
+
- **Prefer high resolution within a configurable cap** (Q3): request the largest
|
|
229
|
+
rendition the API offers, but downscale/skip past a per-file ceiling
|
|
230
|
+
(`QuarryConfig.max_megapixels` default ~12 MP, `max_file_bytes` default a few
|
|
231
|
+
MB), overridable per run. The **full-resolution `remote_url` is always kept in
|
|
232
|
+
the DB**, so a capped image can be re-fetched at original size on demand
|
|
233
|
+
without losing provenance. Keeps the archive inside the 20–40 GB budget by
|
|
234
|
+
default while never throwing away the ability to go bigger.
|
|
235
|
+
- **Respect a total-archive budget** (`QuarryConfig.max_total_gb`, env
|
|
236
|
+
`QUARRY_MAX_GB`, default e.g. `40`; `0`/unset = unlimited): before each write,
|
|
237
|
+
check the current raw-pool size (`QUARRY_HOME/images`, tracked incrementally
|
|
238
|
+
from `photographs.file_size` so it's O(1), not a directory walk). When the
|
|
239
|
+
next file would exceed the ceiling, **stop the run cleanly** — mark remaining
|
|
240
|
+
candidates `pending` (resumable later if the cap is raised), log a
|
|
241
|
+
`budget_reached` event, and exit non-error. This bounds disk use predictably
|
|
242
|
+
for dev/QA regardless of how many sources/people are queued.
|
|
243
|
+
- **Log all failures** (structlog → `logs/`), never crash the whole run.
|
|
244
|
+
|
|
245
|
+
Everything is **idempotent**: rerunning `fetch` resumes/repairs, never duplicates.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## 8. Suite integration (compose)
|
|
250
|
+
|
|
251
|
+
Add a `gallery` profile to the demo's `compose.yaml`. It's a run-to-completion
|
|
252
|
+
job (not a long-lived server). Quarry fetches into its own pool
|
|
253
|
+
(`$QUARRY_HOME/images`) and then publishes into `DATASET_DIR` — both mounts are
|
|
254
|
+
present so a single `up` can fetch-then-export:
|
|
255
|
+
|
|
256
|
+
```yaml
|
|
257
|
+
argus-quarry:
|
|
258
|
+
profiles: ["gallery"]
|
|
259
|
+
build:
|
|
260
|
+
context: ../argus-quarry
|
|
261
|
+
image: argus-quarry:latest
|
|
262
|
+
environment:
|
|
263
|
+
- QUARRY_HOME=/data/quarry
|
|
264
|
+
- QUARRY_MAX_GB=${QUARRY_MAX_GB:-40} # total raw-pool ceiling; 0 = unlimited
|
|
265
|
+
- COMMONS_USER_AGENT=${COMMONS_USER_AGENT:-argus-quarry/0.1 (contact@example.com)}
|
|
266
|
+
volumes:
|
|
267
|
+
- ${QUARRY_HOME:-./quarry}:/data/quarry # raw pool + db/cache/logs
|
|
268
|
+
- ${DATASET_DIR:-./data}:/data/images # published (curator-ready) view
|
|
269
|
+
# fetch into the pool, then publish a symlinked tree into DATASET_DIR
|
|
270
|
+
command: ["run", "--source", "commons", "--limit", "500", "--export", "--licence", "CC0,PD"]
|
|
271
|
+
restart: "no"
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
> Symlinks only resolve inside a container if the directory they point into is mounted there too (at the same path); since
|
|
275
|
+
> the published tree points back into `/data/quarry/images`, the curator/lens
|
|
276
|
+
> containers must also mount `QUARRY_HOME` **or** quarry should publish with
|
|
277
|
+
> `--copy`. Simplest for the suite: curator/lens add the same
|
|
278
|
+
> `${QUARRY_HOME}:/data/quarry` read-only mount. Documented in `.env.example`.
|
|
279
|
+
|
|
280
|
+
Usage stays true to the suite's profile idiom:
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
docker compose --profile gallery up --build # fetch -> pool -> publish DATASET_DIR
|
|
284
|
+
docker compose --profile curator up --build # then curate the published view
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
New `.env` knobs (documented in `.env.example`): `QUARRY_HOME` (default
|
|
288
|
+
`./quarry`), `QUARRY_MAX_GB` (total raw-pool ceiling, default `40`, `0` =
|
|
289
|
+
unlimited), per-source API keys / contact `User-Agent` strings (Commons and LoC
|
|
290
|
+
want a real UA; Rijksmuseum and Europeana need API keys).
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## 9. On the "local viewer"
|
|
295
|
+
|
|
296
|
+
The brief asks for a Flask/FastAPI viewer. That overlaps with the existing
|
|
297
|
+
Next.js frontend, so:
|
|
298
|
+
|
|
299
|
+
- **Phase 1–2:** no standalone UI. `argus-quarry stats` / `list` on the CLI is
|
|
300
|
+
enough to inspect provenance.
|
|
301
|
+
- **Later (optional):** a tiny read-only FastAPI `server/` exposing provenance
|
|
302
|
+
queries (by person / photographer / year / source / licence), which the demo
|
|
303
|
+
frontend could surface as a `/gallery` route — consistent with how `/curate`
|
|
304
|
+
already talks to curator. Not built until there's demand.
|
|
305
|
+
|
|
306
|
+
This keeps us to one real UI (the demo) instead of maintaining a second.
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
## 10. Licensing / feasibility notes (for the 5–7k, 20–40 GB target)
|
|
311
|
+
|
|
312
|
+
- **Reliable PD/CC0 with real APIs:** Wikimedia Commons, Library of Congress,
|
|
313
|
+
Smithsonian Open Access, Rijksmuseum Open Data. Start here.
|
|
314
|
+
- **Messier rights:** Europeana and Flickr Commons mix licences per-item — the
|
|
315
|
+
downloader must read per-record rights and quarantine anything not clearly
|
|
316
|
+
PD/CC0.
|
|
317
|
+
- **LAC / Karsh:** many Karsh works are *not* PD (photographer d. 2002); treat
|
|
318
|
+
as a curated allow-list, not a bulk scrape.
|
|
319
|
+
- For dev/QA this dataset is plenty; do **not** advertise uniformly clean
|
|
320
|
+
licences across every source — enforce it per-record instead.
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## 11. Design principles (unchanged from the brief, enforced by the above)
|
|
325
|
+
|
|
326
|
+
Modular · source-independent (`PortraitRecord`) · idempotent (SHA256 + `status`)
|
|
327
|
+
· extensible (downloader registry) · reproducible · **provenance-first** (a
|
|
328
|
+
record with no licence never lands) · optimised as the *input* to the suite's
|
|
329
|
+
existing CV/search/curation stages rather than duplicating them.
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## 12. Phased delivery
|
|
334
|
+
|
|
335
|
+
**Phase 1 — walking skeleton**
|
|
336
|
+
- `pyproject.toml` (extras: `cli`, `server`, `dev`), `Makefile`, `Dockerfile`.
|
|
337
|
+
- `models.PortraitRecord`, `store` (SQLite `people`+`photographs`, WAL, migrations).
|
|
338
|
+
- `net` (rate limit + retry + resume), `ingest` (download→verify→cap→SHA256→pool→record).
|
|
339
|
+
- `people.py` + `seeds/people.yaml` curated seed loader.
|
|
340
|
+
- `downloaders/commons.py` (Wikimedia Commons).
|
|
341
|
+
- `export` (symlink/`--copy` published tree, with `--licence` filter).
|
|
342
|
+
- Typer CLI: `run` (fetch+export), `fetch`, `export`, `list`, `stats`, `verify`.
|
|
343
|
+
- `gallery` compose profile wired into `argus-vision-demo/compose.yaml`.
|
|
344
|
+
|
|
345
|
+
**Phase 2 — breadth**
|
|
346
|
+
- `loc`, `smithsonian`, `rijksmuseum`, curated `lac` (Karsh allow-list) downloaders.
|
|
347
|
+
- Wikidata SPARQL people harvester (`people --from-wikidata`); incremental update mode.
|
|
348
|
+
- Opportunistic `phash` metadata (informational only).
|
|
349
|
+
|
|
350
|
+
**Phase 3 — polish (optional)**
|
|
351
|
+
- `europeana`, `flickr` with strict per-record rights filtering.
|
|
352
|
+
- Read-only provenance FastAPI + `/gallery` route in the demo frontend.
|
|
353
|
+
|
|
354
|
+
---
|
|
355
|
+
|
|
356
|
+
## 13. Resolved decisions
|
|
357
|
+
|
|
358
|
+
1. **Name** — `argus-quarry` (acquisition connotation, no clash with the
|
|
359
|
+
frontend's "viewing"). ✅
|
|
360
|
+
2. **People list** — **hybrid**: curated `seeds/people.yaml` for deterministic
|
|
361
|
+
dev/QA (Phase 1), plus an optional Wikidata SPARQL harvester to scale toward
|
|
362
|
+
5–7k (Phase 2). See §4. ✅
|
|
363
|
+
3. **Resolution** — **configurable per-file cap** (~12 MP / few MB default),
|
|
364
|
+
full-res `remote_url` retained for on-demand re-fetch; overridable per run.
|
|
365
|
+
Keeps the 20–40 GB budget without discarding fidelity. See §7. ✅
|
|
366
|
+
4. **`QUARRY_HOME`** — sibling **`./quarry`** dir, fully outside the image tree.
|
|
367
|
+
See §6. ✅
|
|
368
|
+
5. **Landing** — **raw pool + export**: fetch into `QUARRY_HOME/images`, then
|
|
369
|
+
`export` publishes a `Person_Name/` tree into `DATASET_DIR` (symlink default,
|
|
370
|
+
`--copy` fallback), with an optional `--licence` filter. See §6/§8. ✅
|
|
371
|
+
|
|
372
|
+
### Follow-ups that surfaced while resolving
|
|
373
|
+
|
|
374
|
+
- **Symlink cross-mount:** for the published symlink tree to resolve inside
|
|
375
|
+
curator/lens containers, they must also mount `QUARRY_HOME` read-only, else
|
|
376
|
+
quarry publishes with `--copy`. Needs a one-line `.env.example` + compose note
|
|
377
|
+
when wiring the `gallery` profile (already flagged in §8).
|
|
378
|
+
- **Wikidata → PD likelihood:** SPARQL harvester should pre-filter on death-year
|
|
379
|
+
/ country to reduce quarantines, but licence is still enforced per-record at
|
|
380
|
+
ingest (never trust the query alone).
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1
# The directive above pins the BuildKit Dockerfile frontend; the
# `RUN --mount=type=cache` instruction further down is frontend syntax and is
# not understood by the legacy (non-BuildKit) builder.
FROM python:3.12-slim

# System libs for Pillow image decoding.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libjpeg62-turbo libopenjp2-7 zlib1g \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# The base image has no `git`, so hatch-vcs can't derive the version from
# history — hand it in via the VERSION build arg (the release tag, sans "v").
# Defaults to 0.0.0 for local `docker compose` builds.
ARG VERSION=0.0.0
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}

COPY . /app

# Install the package with its CLI and phash extras; pip's download cache is
# kept in a BuildKit cache mount so it is reused across builds without ending
# up in an image layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip && \
    pip install ".[cli,phash]"

# Run-to-completion acquisition job (not a long-lived server). The compose
# `gallery` profile overrides this command with concrete source/limit flags.
ENTRYPOINT ["argus-quarry"]
CMD ["--help"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 dragonhound
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Bare `make` runs the `help` target.
.DEFAULT_GOAL := help

# Tool and output-directory names; both can be overridden on the command
# line, e.g. `make UV=/opt/bin/uv`.
UV := uv
DIST := dist

# Every name listed here is a command, not a file this Makefile produces.
.PHONY: help install dev lint fmt test
.PHONY: build clean smoke check
|
|
6
|
+
|
|
7
|
+
# Self-documenting help: print every rule header of the form
# `name: [prereqs] ## description` found in the parsed makefile(s), with the
# target name in cyan. A single awk call does both the filtering and the
# formatting; the pattern uses plain greedy `.*` because `.*?` has no defined
# meaning in POSIX EREs (grep -E / awk) and only worked by accident.
help: ## Show this help
	@awk 'BEGIN {FS = ":.*## "} /^[a-zA-Z_-]+:.*## / {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
|
10
|
+
|
|
11
|
+
# Bootstrap a virtualenv with uv, then install the package in editable mode
# without any optional extras.
install: ## Create .venv + editable install (core deps only)
	$(UV) venv && $(UV) pip install -e .
|
|
14
|
+
|
|
15
|
+
# Same bootstrap as a plain install, but pulls in the `dev` and `cli` extras
# used by the lint and test targets.
dev: ## Create .venv + editable install with dev + cli extras
	$(UV) venv && $(UV) pip install -e ".[dev,cli]"
|
|
18
|
+
|
|
19
|
+
# Static checks over the package and its tests. `--no-sync` makes uv use the
# environment as it is instead of re-resolving it first.
lint: ## Run ruff linter
	$(UV) run --no-sync ruff check src/ tests/
|
|
21
|
+
|
|
22
|
+
# Rewrites files in place: first the formatter, then ruff's auto-fixable
# lint rules.
fmt: ## Auto-format with ruff
	$(UV) run --no-sync ruff format src/ tests/
	$(UV) run --no-sync ruff check --fix src/ tests/
|
|
25
|
+
|
|
26
|
+
# Run the test suite quietly with short tracebacks, using the existing
# environment (`--no-sync`).
test: ## Run pytest
	$(UV) run --no-sync pytest --tb=short -q
|
|
28
|
+
|
|
29
|
+
# Always starts from a clean tree (prerequisite `clean`), builds the
# distributions with uv, then lists what landed in $(DIST).
build: clean ## Build sdist + wheel into dist/
	$(UV) build
	@echo "" && ls -lh $(DIST)/
|
|
33
|
+
|
|
34
|
+
# Delete the distribution directory, the `build` directory and any
# egg-info metadata left under src/.
clean: ## Remove build artifacts
	rm -rf \
		$(DIST) \
		build \
		src/*.egg-info \
		src/argus_quarry/*.egg-info
|
|
36
|
+
|
|
37
|
+
# Install the freshly built wheel into a throwaway virtualenv and check that
# the package imports and reports its version.
#
# The whole recipe is one shell invocation so that:
#   * the temp dir is created when the recipe runs, in the shell, instead of
#     through $(eval $(shell ...)) while make expands the recipe;
#   * `set -e` aborts on the first failing step;
#   * the EXIT trap removes the temp dir on success and on failure (the
#     previous trailing `rm -rf` line was skipped whenever a step failed).
smoke: build ## Build wheel, install in throwaway venv, smoke-test import
	set -e; \
	tmpvenv="$$(mktemp -d)"; \
	trap 'rm -rf "$$tmpvenv"' EXIT; \
	$(UV) venv "$$tmpvenv/venv"; \
	$(UV) pip install --python "$$tmpvenv/venv" $(DIST)/*.whl; \
	"$$tmpvenv/venv/bin/python" -c \
		"from argus_quarry import PortraitRecord, __version__; print(f'argus-quarry {__version__} OK')"
|
|
44
|
+
|
|
45
|
+
# Aggregate target: succeeds only if lint, test and build all succeed, then
# prints a blank line followed by a confirmation message.
check: lint test build ## Full local CI: lint + test + build
	@printf '\n%s\n' "All checks passed."
|