argus-curator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. argus_curator-0.1.0/.env.example +16 -0
  2. argus_curator-0.1.0/.github/workflows/ci.yml +18 -0
  3. argus_curator-0.1.0/.github/workflows/release.yml +105 -0
  4. argus_curator-0.1.0/.gitignore +15 -0
  5. argus_curator-0.1.0/CHANGELOG.md +39 -0
  6. argus_curator-0.1.0/Dockerfile +23 -0
  7. argus_curator-0.1.0/LICENSE +21 -0
  8. argus_curator-0.1.0/Makefile +50 -0
  9. argus_curator-0.1.0/PKG-INFO +272 -0
  10. argus_curator-0.1.0/README.md +208 -0
  11. argus_curator-0.1.0/docker-compose.yaml +26 -0
  12. argus_curator-0.1.0/pyproject.toml +97 -0
  13. argus_curator-0.1.0/schema/curator-wire.schema.json +757 -0
  14. argus_curator-0.1.0/src/argus_curator/__init__.py +50 -0
  15. argus_curator-0.1.0/src/argus_curator/_version.py +24 -0
  16. argus_curator-0.1.0/src/argus_curator/cli.py +225 -0
  17. argus_curator-0.1.0/src/argus_curator/export.py +223 -0
  18. argus_curator-0.1.0/src/argus_curator/faces.py +302 -0
  19. argus_curator-0.1.0/src/argus_curator/models.py +262 -0
  20. argus_curator-0.1.0/src/argus_curator/py.typed +0 -0
  21. argus_curator-0.1.0/src/argus_curator/scanner.py +455 -0
  22. argus_curator-0.1.0/src/argus_curator/selection.py +102 -0
  23. argus_curator-0.1.0/src/argus_curator/server/__init__.py +5 -0
  24. argus_curator-0.1.0/src/argus_curator/server/app.py +360 -0
  25. argus_curator-0.1.0/src/argus_curator/store.py +73 -0
  26. argus_curator-0.1.0/tests/conftest.py +56 -0
  27. argus_curator-0.1.0/tests/test_export.py +79 -0
  28. argus_curator-0.1.0/tests/test_pose.py +95 -0
  29. argus_curator-0.1.0/tests/test_scanner.py +62 -0
  30. argus_curator-0.1.0/tests/test_schema.py +17 -0
  31. argus_curator-0.1.0/tests/test_server.py +173 -0
@@ -0,0 +1,16 @@
# ── Server ───────────────────────────────────────────────────────────
# Host port for the argus-curator FastAPI server (peer to argus-lens :8100).
CURATOR_PORT=8101

# Default mount root that GET /thumb serves from when no scan_id is supplied.
# Mirrors NEXT_PUBLIC_CURATOR_SOURCE_PATH in argus-vision-demo.
CURATOR_SOURCE_PATH=/data/images

# ── State ────────────────────────────────────────────────────────────
# On-disk scan cache (keyed by scan_id) used for pagination + export-by-id.
# Defaults to ~/.cache/argus_curator/scans if unset, so it ships commented out:
# uncomment and point it at an absolute path only to relocate the cache.
# CURATOR_CACHE_DIR=/path/to/argus_curator/scans

# ── Handoff (optional) ───────────────────────────────────────────────
# argus-lens caption endpoint for the one-click curate->caption run.
ARGUS_LENS_URL=http://localhost:8100
@@ -0,0 +1,18 @@
---
name: CI

# Triggered by pushes to main and by pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Runs are grouped per git ref; starting a new run cancels the one in flight.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

jobs:
  ci:
    # All steps live in the reusable workflow from smk762/argus-ci (pinned to v1).
    uses: smk762/argus-ci/.github/workflows/python-ci.yml@v1
    with:
      # Extras passed to the reusable workflow for the test environment.
      test-extras: 'dev,server,cli'
      # Wire-schema staleness check; presumably executed by the reusable
      # workflow once the tests have finished (see its `post-test` input).
      post-test: 'uv run --no-sync argus-curator schema --check'
@@ -0,0 +1,105 @@
name: Release to PyPI

# Runs for every pushed tag that starts with "v".
on:
  push:
    tags: ["v*"]

# Least-privilege default for every job. Jobs that need more (OIDC for the
# PyPI uploads, package write for GHCR) declare it themselves below, so the
# build and smoke-test jobs can no longer mint OIDC tokens.
permissions:
  contents: read

jobs:
  # Build the sdist + wheel once and share them with the later jobs.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history (including tags): the package version is derived
          # from git tags at build time.
          fetch-depth: 0
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - run: uv build
      - uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/

  # Install the built wheel into a clean venv per Python version and import it.
  test-install:
    runs-on: ubuntu-latest
    needs: build
    strategy:
      matrix:
        python-version: ["3.11", "3.12"]
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install wheel and smoke-test import
        run: |
          uv venv --python ${{ matrix.python-version }}
          uv pip install dist/*.whl
          uv run --no-sync python -c "from argus_curator import scan_folder, __version__; print(f'argus-curator {__version__} OK')"

  # Upload to TestPyPI first; the real PyPI upload only runs if this succeeds.
  publish-testpypi:
    runs-on: ubuntu-latest
    needs: test-install
    environment: testpypi
    permissions:
      # No API token is passed to the publish action, so the upload relies on
      # an OIDC identity token; grant it to this job only.
      id-token: write
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/

  publish-pypi:
    runs-on: ubuntu-latest
    needs: publish-testpypi
    environment: pypi
    permissions:
      # Same as publish-testpypi: OIDC token for the upload, nothing else.
      id-token: write
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - uses: pypa/gh-action-pypi-publish@release/v1

  # Build and push the container image to GHCR. Depends only on the
  # smoke-test job, so it runs independently of the PyPI uploads.
  publish-image:
    runs-on: ubuntu-latest
    needs: test-install
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - id: meta
        uses: docker/metadata-action@v5
        with:
          images: ghcr.io/${{ github.repository_owner }}/argus-curator
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest
      # The base image has no git, so pass the version in via build-arg
      # (SETUPTOOLS_SCM_PRETEND_VERSION) rather than relying on .git history.
      - uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          push: true
          build-args: |
            VERSION=${{ steps.meta.outputs.version }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
@@ -0,0 +1,15 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ .eggs/
8
+ *.egg
9
+ .env
10
+ .ruff_cache/
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+
14
+ # Generated by hatch-vcs at build time (version derived from git tags).
15
+ src/argus_curator/_version.py
@@ -0,0 +1,39 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-07-02
9
+
10
+ Initial release — the curation stage of the Argus suite
11
+ ([argus-quarry](https://github.com/smk762/argus-quarry) →
12
+ **argus-curator** →
13
+ [argus-lens](https://github.com/smk762/argus-lens)).
14
+
15
+ ### Added
16
+
17
+ - Training-suitability scanning: hard filters (min short side, aspect ratio,
18
+ blur), target-aware composite scoring, and per-image reject reasons.
19
+ - Near-duplicate detection via pHash clustering — keeps the highest-scoring
20
+ representative and reports (never silently drops) the rest.
21
+ - Identity-aware face clustering (InsightFace, `[faces]` / `[gpu]` extras) with
22
+ head-pose (yaw) capture and pose-balanced subset selection.
23
+ - Structure-preserving export (copy / symlink / move) with score threshold,
24
+ diversity cap, face-cluster and pose filters, plus a per-image CSV report.
25
+ - Versioned JSONL handoff manifest carrying the shared `TargetProfile`, with an
26
+ optional `caption_url` POST for a one-click curate→caption run against
27
+ argus-lens.
28
+ - Wire-contract JSON Schema (`schema/curator-wire.schema.json`) published for
29
+ consumer codegen, with a CI staleness check (`argus-curator schema --check`).
30
+ - FastAPI micro-server (`[server]` extra, :8101): `/health`, `/detectors`,
31
+ `/folders`, `/scan/folder`, `/scan/{scan_id}` (paginated), `/thumb`,
32
+ `/export`, and SSE streaming variants `/scan/folder/stream` and
33
+ `/export/stream`.
34
+ - Typer CLI (`[cli]` extra): `scan`, `serve`, `detectors`, `schema`.
35
+ - On-disk scan cache keyed by `scan_id` (`CURATOR_CACHE_DIR`) powering
36
+ pagination and export-by-id.
37
+ - Docker image (GHCR) and `docker compose` deployment.
38
+
39
+ [0.1.0]: https://github.com/smk762/argus-curator/releases/tag/v0.1.0
@@ -0,0 +1,23 @@
FROM python:3.12-slim

# Native shared libraries required by Pillow / OpenCV / onnxruntime when
# decoding images.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# There is no `git` in the base image, which means hatch-vcs/setuptools-scm
# cannot work out the version from repository history. The VERSION build arg
# supplies it instead (the release tag without its leading "v"); local
# `docker compose` builds fall back to 0.0.0.
ARG VERSION=0.0.0
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}

# WORKDIR is /app, so the build context lands in /app.
COPY . .

# pip's download cache is kept in a build cache mount rather than in a layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip \
    && pip install ".[server,cli,faces]"

EXPOSE 8101
CMD ["argus-curator", "serve", "--port", "8101", "--cors"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 dragonhound
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,50 @@
.DEFAULT_GOAL := help
DIST := dist
UV := uv

.PHONY: help install dev lint fmt test build clean smoke check schema

# Self-documenting help: lists every target that carries a trailing "## text".
help: ## Show this help
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
		awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}'

install: ## Create .venv + editable install (core deps only)
	$(UV) venv
	$(UV) pip install -e .

dev: ## Create .venv + editable install with dev + cli + server extras
	$(UV) venv
	$(UV) pip install -e ".[dev,cli,server]"

lint: ## Run ruff linter
	$(UV) run --no-sync ruff check src/ tests/

fmt: ## Auto-format with ruff
	$(UV) run --no-sync ruff format src/ tests/
	$(UV) run --no-sync ruff check --fix src/ tests/

test: ## Run pytest
	$(UV) run --no-sync pytest --tb=short -q

schema: ## Regenerate the committed wire-contract JSON Schema
	$(UV) run --no-sync argus-curator schema

build: clean ## Build sdist + wheel into dist/
	$(UV) build
	@echo ""
	@ls -lh $(DIST)/

clean: ## Remove build artifacts
	rm -rf $(DIST) build src/*.egg-info src/argus_curator/*.egg-info

# The whole recipe runs in ONE shell so that:
#   - the temp dir is created when the recipe executes, not via $(eval)/$(shell);
#   - `set -e` stops at the first failing step;
#   - the EXIT trap removes the temp dir on success AND on failure (a separate
#     trailing `rm -rf` line is never reached when an earlier line fails).
smoke: build ## Build wheel, install in throwaway venv, smoke-test import
	set -e; \
	tmpvenv="$$(mktemp -d)"; \
	trap 'rm -rf "$$tmpvenv"' EXIT; \
	$(UV) venv "$$tmpvenv/venv"; \
	$(UV) pip install --python "$$tmpvenv/venv" $(DIST)/*.whl; \
	"$$tmpvenv/venv/bin/python" -c \
		"from argus_curator import scan_folder, __version__; print(f'argus-curator {__version__} OK')"

check: lint test build ## Full local CI: lint + test + build
	@echo ""
	@echo "All checks passed."
@@ -0,0 +1,272 @@
1
+ Metadata-Version: 2.4
2
+ Name: argus-curator
3
+ Version: 0.1.0
4
+ Summary: LoRA-native dataset curation: training-suitability scoring, near-duplicate dedup, and identity-aware face clustering
5
+ Project-URL: Homepage, https://github.com/smk762/argus-curator
6
+ Project-URL: Repository, https://github.com/smk762/argus-curator
7
+ Project-URL: Issues, https://github.com/smk762/argus-curator/issues
8
+ Project-URL: Changelog, https://github.com/smk762/argus-curator/blob/main/CHANGELOG.md
9
+ Author: smk762
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: curation,dataset,deduplication,diffusion,face-clustering,flux,image-quality,insightface,lora,phash,sdxl,stable-diffusion,training
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.11
24
+ Requires-Dist: imagehash>=4.3
25
+ Requires-Dist: numpy>=1.24
26
+ Requires-Dist: pillow>=10.0
27
+ Requires-Dist: pydantic>=2.6
28
+ Requires-Dist: structlog>=24.0
29
+ Provides-Extra: all
30
+ Requires-Dist: fastapi>=0.110; extra == 'all'
31
+ Requires-Dist: httpx>=0.27; extra == 'all'
32
+ Requires-Dist: insightface>=0.7.3; extra == 'all'
33
+ Requires-Dist: onnxruntime>=1.16; extra == 'all'
34
+ Requires-Dist: opencv-python-headless>=4.9; extra == 'all'
35
+ Requires-Dist: python-multipart>=0.0.7; extra == 'all'
36
+ Requires-Dist: rich>=13.0; extra == 'all'
37
+ Requires-Dist: scikit-learn>=1.3; extra == 'all'
38
+ Requires-Dist: typer>=0.12; extra == 'all'
39
+ Requires-Dist: uvicorn>=0.27; extra == 'all'
40
+ Provides-Extra: cli
41
+ Requires-Dist: rich>=13.0; extra == 'cli'
42
+ Requires-Dist: typer>=0.12; extra == 'cli'
43
+ Provides-Extra: dev
44
+ Requires-Dist: httpx>=0.27; extra == 'dev'
45
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
46
+ Requires-Dist: pytest>=8.0; extra == 'dev'
47
+ Requires-Dist: ruff>=0.5; extra == 'dev'
48
+ Provides-Extra: faces
49
+ Requires-Dist: insightface>=0.7.3; extra == 'faces'
50
+ Requires-Dist: onnxruntime>=1.16; extra == 'faces'
51
+ Requires-Dist: opencv-python-headless>=4.9; extra == 'faces'
52
+ Requires-Dist: scikit-learn>=1.3; extra == 'faces'
53
+ Provides-Extra: gpu
54
+ Requires-Dist: insightface>=0.7.3; extra == 'gpu'
55
+ Requires-Dist: onnxruntime-gpu>=1.16; extra == 'gpu'
56
+ Requires-Dist: opencv-python-headless>=4.9; extra == 'gpu'
57
+ Requires-Dist: scikit-learn>=1.3; extra == 'gpu'
58
+ Provides-Extra: server
59
+ Requires-Dist: fastapi>=0.110; extra == 'server'
60
+ Requires-Dist: httpx>=0.27; extra == 'server'
61
+ Requires-Dist: python-multipart>=0.0.7; extra == 'server'
62
+ Requires-Dist: uvicorn>=0.27; extra == 'server'
63
+ Description-Content-Type: text/markdown
64
+
65
+ # Argus Curator
66
+
67
+ [![PyPI](https://img.shields.io/pypi/v/argus-curator)](https://pypi.org/project/argus-curator/)
68
+ [![Python](https://img.shields.io/pypi/pyversions/argus-curator)](https://pypi.org/project/argus-curator/)
69
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/smk762/argus-curator/blob/main/LICENSE)
70
+ [![CI](https://github.com/smk762/argus-curator/actions/workflows/ci.yml/badge.svg)](https://github.com/smk762/argus-curator/actions/workflows/ci.yml)
71
+
72
+ The LoRA data-prep front-end — curate by quality and by face, then caption with [argus-lens](https://github.com/smk762/argus-lens).
73
+
74
+ `argus-curator` is the **curation stage** of the Argus suite. Upstream,
75
+ [argus-quarry](https://github.com/smk762/argus-quarry) acquires provenance-clean
76
+ images; curator decides *which images, of whom, at what quality* belong in a
77
+ LoRA training set; argus-lens then decides *what's in each image*. Curator and
78
+ lens share one `TargetProfile`, so a manifest written here is captioned
79
+ downstream with no remapping.
80
+
81
+ ```
82
+ argus-quarry argus-curator (:8101) argus-lens (:8100) imogen / kohya
83
+ ─ download ─┐ ─ scan + score ─┐ ─ caption ─┐ ─ train ─
84
+ ─ provenance ─┤ images ─ face-cluster ─┤ manifest ─ buckets ─┤ dataset ─ LoRA ─
85
+ ─ verify ─────┴─────────► ─ select ───────┴────────► (identity/ ─┴────────► ───────►
86
+ ─ export ───────► wardrobe/…)
87
+ ```
88
+
89
+ ## Why not FiftyOne / fastdup / Immich?
90
+
91
+ | | Generic CV curation (FiftyOne, fastdup) | Consumer galleries (Immich, PhotoPrism) | **Argus Curator** |
92
+ |---|---|---|---|
93
+ | Quality scoring | Generic | None | **Training-suitability** (target-aware sharpness/res/artifact + face-count fit) |
94
+ | Dedup | pHash | Basic | pHash, keeps best representative, **reports** the rest |
95
+ | Faces | Plugin | Clustering for browsing | Clustering **for dataset filtering** (export by identity) |
96
+ | Dataset export | Manual | None | Structure-preserving copy/symlink/move + **manifest** |
97
+ | Captioner handoff | None | None | One shared `TargetProfile` → argus-lens |
98
+
99
+ It's LoRA-native, identity-aware, and caption-integrated — the reason the suite exists.
100
+
101
+ ## The shared TargetProfile (the moat)
102
+
103
+ Both services speak the same taxonomy. This single schema is the contract:
104
+
105
+ ```python
106
+ from argus_curator import TargetProfile
107
+
108
+ TargetProfile(
109
+ target_style="photo", # "photo" | "anime"
110
+ target_backend="sdxl", # "sdxl" | "flux-dev-1" | ...
111
+ checkpoint=None,
112
+ target_category="identity", # identity | wardrobe | pose_composition | setting
113
+ )
114
+ ```
115
+
116
+ The curator uses it to weight scoring and label exports; argus-lens inherits it verbatim.
117
+
118
+ ## Installation
119
+
120
+ ```bash
121
+ pip install argus-curator # engine only (Pillow, numpy, ImageHash, pydantic, structlog)
122
+ pip install "argus-curator[server,cli]" # FastAPI server + CLI
123
+ pip install "argus-curator[faces]" # + InsightFace identity clustering (CPU onnxruntime)
124
+ pip install "argus-curator[gpu]" # + onnxruntime-gpu for GPU face detection
125
+ pip install "argus-curator[all]" # everything
126
+ ```
127
+
128
+ System libraries for the face stack (Ubuntu/Debian):
129
+
130
+ ```bash
131
+ sudo apt install -y libgl1 libglib2.0-0 libgomp1
132
+ ```
133
+
134
+ ## Usage
135
+
136
+ ### Python
137
+
138
+ ```python
139
+ from argus_curator import scan_folder, TargetProfile, ScanConfig, FaceConfig
140
+
141
+ summary = scan_folder(
142
+ "/data/images",
143
+ profile=TargetProfile(target_category="identity"),
144
+ cfg=ScanConfig(min_short_side=512, blur_threshold=100.0, cluster_distance=10),
145
+ faces_cfg=FaceConfig(enabled=True, model="buffalo_l", cluster_eps=0.5),
146
+ )
147
+
148
+ print(summary.passed, "passed,", summary.duplicates, "near-dupes")
149
+ for fc in summary.face_clusters:
150
+ print(fc.cluster_id, fc.size, "faces, rep:", fc.representative_rel_path)
151
+ ```
152
+
153
+ ### CLI
154
+
155
+ ```bash
156
+ # Report only — see the score distribution before committing to a threshold
157
+ argus-curator scan /data/images --csv report.csv
158
+
159
+ # Identity curation with face clustering, copy 0.65+ keepers (structure preserved)
160
+ argus-curator scan /data/images \
161
+ --target-category identity --faces --device cuda \
162
+ --min-score 0.65 --copy-to /data/curated
163
+
164
+ # Export only specific identities, capped to a diverse 200
165
+ argus-curator scan /data/images --faces \
166
+ --face-clusters face_1,face_2 --max-keep 200 --copy-to /data/curated
167
+
168
+ # Pick a pose-balanced subset (head-on + 3/4 only, drop side profiles)
169
+ argus-curator scan /data/images --faces \
170
+ --pose frontal,three_quarter --copy-to /data/curated
171
+
172
+ # Which detectors are available?
173
+ argus-curator detectors
174
+ ```
175
+
176
+ ### HTTP server (:8101, peer to argus-lens)
177
+
178
+ ```bash
179
+ pip install "argus-curator[server,cli,faces]"
180
+ argus-curator serve --cors --port 8101 --source-root /data/images
181
+ ```
182
+
183
+ | Route | Description |
184
+ |---|---|
185
+ | `GET /health` | Liveness |
186
+ | `GET /detectors` | `{ torch, cuda, clip, insightface, onnxruntime }` |
187
+ | `GET /folders?path=<rel>` | Browse Docker-mounted folders under the source root (for the UI picker) |
188
+ | `POST /scan/folder` | Scan + score + dedup + face-cluster → `ScanSummary` |
189
+ | `GET /scan/{scan_id}` | Cached summary, paginated via `?offset=&limit=` |
190
+ | `GET /thumb?path=<rel>&scan_id=<id>` | `image/webp` thumbnail from the mount |
191
+ | `POST /export` | Structure-preserving transfer + manifest → `ExportResult` |
192
+ | `POST /scan/folder/stream` | Same as `/scan/folder`, streaming live progress over SSE |
193
+ | `POST /export/stream` | Same as `/export`, streaming per-file transfer progress over SSE |
194
+
195
+ The `*/stream` variants emit `event: progress` frames (`{phase, done, total}`)
196
+ while the work runs, then a single `event: complete` frame carrying the same
197
+ payload the non-streaming endpoint returns (or `event: error`).
198
+
199
+ `POST /scan/folder` body:
200
+
201
+ ```jsonc
202
+ {
203
+ "folder": "/data/images",
204
+ "target_profile": { "target_style": "photo", "target_category": "identity" },
205
+ "config": { "min_short_side": 512, "max_aspect_ratio": 3.0, "blur_threshold": 100.0,
206
+ "cluster_distance": 10, "max_workers": 4 },
207
+ "faces": { "enabled": true, "model": "buffalo_l", "min_det_score": 0.5, "cluster_eps": 0.5 }
208
+ }
209
+ ```
210
+
211
+ `POST /export` body:
212
+
213
+ ```jsonc
214
+ {
215
+ "scan_id": "...", // or inline "selection": ["rel_path", ...]
216
+ "dest": "/data/out",
217
+ "mode": "copy", // "copy" | "symlink" | "move"
218
+ "preserve_structure": true,
219
+ "min_score": 0.6, "include_rejected": false, "keep_similar": false,
220
+ "face_clusters": ["face_2"], // optional: export only these identities
221
+ "face_poses": ["frontal", "three_quarter"], // optional: export only these head poses
222
+ "write_manifest": true,
223
+ "caption_url": null // optional: POST manifest to argus-lens
224
+ }
225
+ ```
226
+
227
+ ### Docker
228
+
229
+ ```bash
230
+ docker compose up --build
231
+ ```
232
+
233
+ The compose setup bind-mounts a dataset into `/data/images`, an output dir into `/data/out`, and
234
+ persists the scan cache + InsightFace model downloads across rebuilds.
235
+
236
+ ## Handoff to argus-lens
237
+
238
+ Export writes a JSONL manifest (one row per selected image):
239
+
240
+ ```jsonc
241
+ { "rel_path": "...", "abs_path": "...", "target_profile": { ... },
242
+ "primary_face_cluster": "face_2", "primary_face_pose": "three_quarter",
243
+ "score": 0.87, "similar_group": 3 }
244
+ ```
245
+
246
+ argus-lens batch-captions this manifest — categories are already shared, so no
247
+ remapping. Set `caption_url` on the export request to POST it straight to lens
248
+ for a one-click curate→caption run.
249
+
250
+ ## How scoring works (per image)
251
+
252
+ 1. **Hard filters** — min short side, max aspect ratio, blur (Laplacian-edge variance floor).
253
+ 2. **Composite score** — target-aware weighted blend of sharpness / resolution / artifact, plus a small composition bonus that depends on `target_category` (e.g. identity rewards a single centred face; setting rewards wide framing).
254
+ 3. **Face-count fit** — with `[faces]`, identity targets penalise 0 or 2+ faces; other categories are progressively more tolerant.
255
+ 4. **Near-duplicate dedup** — pHash clustering keeps the highest-scoring representative and *reports* the rest (never silently dropped).
256
+ 5. **Selection (at export)** — score threshold + optional diversity cap (`max_keep` / `diversity_weight`) + optional face-cluster and head-pose filters (`face_clusters` / `face_poses`). Every excluded image carries a `keep_reason`.
257
+
258
+ ## State
259
+
260
+ Scans are cached on disk (keyed by `scan_id`, default `~/.cache/argus_curator/scans`,
261
+ override with `CURATOR_CACHE_DIR`). This is what makes paginated `GET /scan/{id}`
262
+ and export-by-id work without recomputing.
263
+
264
+ ## Related projects
265
+
266
+ - [**argus-quarry**](https://github.com/smk762/argus-quarry) — provenance-first acquisition of public-domain / CC0 portraits (the suite's input stage).
267
+ - [**argus-lens**](https://github.com/smk762/argus-lens) — intent-aware, multi-model captioning (consumes the manifest this package exports).
268
+ - [**argus-vision-demo**](https://github.com/smk762/argus-vision-demo) — the suite's Next.js web UI (its `/curate` view drives this server).
269
+
270
+ ## License
271
+
272
+ MIT — matches the rest of the Argus suite.