argus-curator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argus_curator-0.1.0/.env.example +16 -0
- argus_curator-0.1.0/.github/workflows/ci.yml +18 -0
- argus_curator-0.1.0/.github/workflows/release.yml +105 -0
- argus_curator-0.1.0/.gitignore +15 -0
- argus_curator-0.1.0/CHANGELOG.md +39 -0
- argus_curator-0.1.0/Dockerfile +23 -0
- argus_curator-0.1.0/LICENSE +21 -0
- argus_curator-0.1.0/Makefile +50 -0
- argus_curator-0.1.0/PKG-INFO +272 -0
- argus_curator-0.1.0/README.md +208 -0
- argus_curator-0.1.0/docker-compose.yaml +26 -0
- argus_curator-0.1.0/pyproject.toml +97 -0
- argus_curator-0.1.0/schema/curator-wire.schema.json +757 -0
- argus_curator-0.1.0/src/argus_curator/__init__.py +50 -0
- argus_curator-0.1.0/src/argus_curator/_version.py +24 -0
- argus_curator-0.1.0/src/argus_curator/cli.py +225 -0
- argus_curator-0.1.0/src/argus_curator/export.py +223 -0
- argus_curator-0.1.0/src/argus_curator/faces.py +302 -0
- argus_curator-0.1.0/src/argus_curator/models.py +262 -0
- argus_curator-0.1.0/src/argus_curator/py.typed +0 -0
- argus_curator-0.1.0/src/argus_curator/scanner.py +455 -0
- argus_curator-0.1.0/src/argus_curator/selection.py +102 -0
- argus_curator-0.1.0/src/argus_curator/server/__init__.py +5 -0
- argus_curator-0.1.0/src/argus_curator/server/app.py +360 -0
- argus_curator-0.1.0/src/argus_curator/store.py +73 -0
- argus_curator-0.1.0/tests/conftest.py +56 -0
- argus_curator-0.1.0/tests/test_export.py +79 -0
- argus_curator-0.1.0/tests/test_pose.py +95 -0
- argus_curator-0.1.0/tests/test_scanner.py +62 -0
- argus_curator-0.1.0/tests/test_schema.py +17 -0
- argus_curator-0.1.0/tests/test_server.py +173 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# ── Server ───────────────────────────────────────────────────────────
|
|
2
|
+
# Host port for the argus-curator FastAPI server (peer to argus-lens :8100).
|
|
3
|
+
CURATOR_PORT=8101
|
|
4
|
+
|
|
5
|
+
# Default mount root that GET /thumb serves from when no scan_id is supplied.
|
|
6
|
+
# Mirrors NEXT_PUBLIC_CURATOR_SOURCE_PATH in argus-vision-demo.
|
|
7
|
+
CURATOR_SOURCE_PATH=/data/images
|
|
8
|
+
|
|
9
|
+
# ── State ────────────────────────────────────────────────────────────
|
|
10
|
+
# On-disk scan cache (keyed by scan_id) used for pagination + export-by-id.
|
|
11
|
+
# Defaults to ~/.cache/argus_curator/scans if unset. The value below is only an example — replace it with a path that exists on your machine.
|
|
12
|
+
CURATOR_CACHE_DIR=/home/smk/.cache/argus_curator/scans
|
|
13
|
+
|
|
14
|
+
# ── Handoff (optional) ───────────────────────────────────────────────
|
|
15
|
+
# argus-lens caption endpoint for the one-click curate->caption run.
|
|
16
|
+
ARGUS_LENS_URL=http://localhost:8100
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ci-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
ci:
|
|
15
|
+
uses: smk762/argus-ci/.github/workflows/python-ci.yml@v1
|
|
16
|
+
with:
|
|
17
|
+
test-extras: "dev,server,cli"
|
|
18
|
+
post-test: "uv run --no-sync argus-curator schema --check"
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
name: Release to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
with:
|
|
17
|
+
fetch-depth: 0
|
|
18
|
+
- uses: astral-sh/setup-uv@v6
|
|
19
|
+
with:
|
|
20
|
+
enable-cache: true
|
|
21
|
+
- run: uv build
|
|
22
|
+
- uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: dist
|
|
25
|
+
path: dist/
|
|
26
|
+
|
|
27
|
+
test-install:
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
needs: build
|
|
30
|
+
strategy:
|
|
31
|
+
matrix:
|
|
32
|
+
python-version: ["3.11", "3.12"]
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
- uses: astral-sh/setup-uv@v6
|
|
39
|
+
with:
|
|
40
|
+
enable-cache: true
|
|
41
|
+
- name: Install wheel and smoke-test import
|
|
42
|
+
run: |
|
|
43
|
+
uv venv --python ${{ matrix.python-version }}
|
|
44
|
+
uv pip install dist/*.whl
|
|
45
|
+
uv run --no-sync python -c "from argus_curator import scan_folder, __version__; print(f'argus-curator {__version__} OK')"
|
|
46
|
+
|
|
47
|
+
publish-testpypi:
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
needs: test-install
|
|
50
|
+
environment: testpypi
|
|
51
|
+
steps:
|
|
52
|
+
- uses: actions/download-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: dist
|
|
55
|
+
path: dist/
|
|
56
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
57
|
+
with:
|
|
58
|
+
repository-url: https://test.pypi.org/legacy/
|
|
59
|
+
|
|
60
|
+
publish-pypi:
|
|
61
|
+
runs-on: ubuntu-latest
|
|
62
|
+
needs: publish-testpypi
|
|
63
|
+
environment: pypi
|
|
64
|
+
steps:
|
|
65
|
+
- uses: actions/download-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: dist
|
|
68
|
+
path: dist/
|
|
69
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
70
|
+
|
|
71
|
+
publish-image:
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
needs: test-install
|
|
74
|
+
permissions:
|
|
75
|
+
contents: read
|
|
76
|
+
packages: write
|
|
77
|
+
steps:
|
|
78
|
+
- uses: actions/checkout@v4
|
|
79
|
+
- uses: docker/setup-buildx-action@v3
|
|
80
|
+
- uses: docker/login-action@v3
|
|
81
|
+
with:
|
|
82
|
+
registry: ghcr.io
|
|
83
|
+
username: ${{ github.actor }}
|
|
84
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
85
|
+
- id: meta
|
|
86
|
+
uses: docker/metadata-action@v5
|
|
87
|
+
with:
|
|
88
|
+
images: ghcr.io/${{ github.repository_owner }}/argus-curator
|
|
89
|
+
tags: |
|
|
90
|
+
type=semver,pattern={{version}}
|
|
91
|
+
type=semver,pattern={{major}}.{{minor}}
|
|
92
|
+
type=raw,value=latest
|
|
93
|
+
# The base image has no git, so pass the version in via build-arg
|
|
94
|
+
# (VERSION, which the Dockerfile exports as SETUPTOOLS_SCM_PRETEND_VERSION) rather than relying on .git history.
|
|
95
|
+
- uses: docker/build-push-action@v6
|
|
96
|
+
with:
|
|
97
|
+
context: .
|
|
98
|
+
file: ./Dockerfile
|
|
99
|
+
push: true
|
|
100
|
+
build-args: |
|
|
101
|
+
VERSION=${{ steps.meta.outputs.version }}
|
|
102
|
+
tags: ${{ steps.meta.outputs.tags }}
|
|
103
|
+
labels: ${{ steps.meta.outputs.labels }}
|
|
104
|
+
cache-from: type=gha
|
|
105
|
+
cache-to: type=gha,mode=max
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-07-02
|
|
9
|
+
|
|
10
|
+
Initial release — the curation stage of the Argus suite
|
|
11
|
+
([argus-quarry](https://github.com/smk762/argus-quarry) →
|
|
12
|
+
**argus-curator** →
|
|
13
|
+
[argus-lens](https://github.com/smk762/argus-lens)).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- Training-suitability scanning: hard filters (min short side, aspect ratio,
|
|
18
|
+
blur), target-aware composite scoring, and per-image reject reasons.
|
|
19
|
+
- Near-duplicate detection via pHash clustering — keeps the highest-scoring
|
|
20
|
+
representative and reports (never silently drops) the rest.
|
|
21
|
+
- Identity-aware face clustering (InsightFace, `[faces]` / `[gpu]` extras) with
|
|
22
|
+
head-pose (yaw) capture and pose-balanced subset selection.
|
|
23
|
+
- Structure-preserving export (copy / symlink / move) with score threshold,
|
|
24
|
+
diversity cap, face-cluster and pose filters, plus a per-image CSV report.
|
|
25
|
+
- Versioned JSONL handoff manifest carrying the shared `TargetProfile`, with an
|
|
26
|
+
optional `caption_url` POST for a one-click curate→caption run against
|
|
27
|
+
argus-lens.
|
|
28
|
+
- Wire-contract JSON Schema (`schema/curator-wire.schema.json`) published for
|
|
29
|
+
consumer codegen, with a CI staleness check (`argus-curator schema --check`).
|
|
30
|
+
- FastAPI micro-server (`[server]` extra, :8101): `/health`, `/detectors`,
|
|
31
|
+
`/folders`, `/scan/folder`, `/scan/{scan_id}` (paginated), `/thumb`,
|
|
32
|
+
`/export`, and SSE streaming variants `/scan/folder/stream` and
|
|
33
|
+
`/export/stream`.
|
|
34
|
+
- Typer CLI (`[cli]` extra): `scan`, `serve`, `detectors`, `schema`.
|
|
35
|
+
- On-disk scan cache keyed by `scan_id` (`CURATOR_CACHE_DIR`) powering
|
|
36
|
+
pagination and export-by-id.
|
|
37
|
+
- Docker image (GHCR) and `docker compose` deployment.
|
|
38
|
+
|
|
39
|
+
[0.1.0]: https://github.com/smk762/argus-curator/releases/tag/v0.1.0
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
# System libs for Pillow / OpenCV / onnxruntime image decoding.
|
|
4
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
5
|
+
libgl1 libglib2.0-0 libgomp1 \
|
|
6
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
7
|
+
|
|
8
|
+
WORKDIR /app
|
|
9
|
+
|
|
10
|
+
# The base image has no `git`, so hatch-vcs/setuptools-scm can't derive the
|
|
11
|
+
# version from history — hand it in via the VERSION build arg (the release tag,
|
|
12
|
+
# sans "v"). Defaults to 0.0.0 for local `docker compose` builds.
|
|
13
|
+
ARG VERSION=0.0.0
|
|
14
|
+
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
|
|
15
|
+
|
|
16
|
+
COPY . /app
|
|
17
|
+
|
|
18
|
+
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
19
|
+
pip install --upgrade pip && \
|
|
20
|
+
pip install ".[server,cli,faces]"
|
|
21
|
+
|
|
22
|
+
EXPOSE 8101
|
|
23
|
+
CMD ["argus-curator", "serve", "--port", "8101", "--cors"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 dragonhound
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
.DEFAULT_GOAL := help
|
|
2
|
+
DIST := dist
|
|
3
|
+
UV := uv
|
|
4
|
+
|
|
5
|
+
.PHONY: help install dev lint fmt test build clean smoke check schema
|
|
6
|
+
|
|
7
|
+
help: ## Show this help
|
|
8
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
|
|
9
|
+
awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}'
|
|
10
|
+
|
|
11
|
+
install: ## Create .venv + editable install (core deps only)
|
|
12
|
+
$(UV) venv
|
|
13
|
+
$(UV) pip install -e .
|
|
14
|
+
|
|
15
|
+
dev: ## Create .venv + editable install with dev + cli + server extras
|
|
16
|
+
$(UV) venv
|
|
17
|
+
$(UV) pip install -e ".[dev,cli,server]"
|
|
18
|
+
|
|
19
|
+
lint: ## Run ruff linter
|
|
20
|
+
$(UV) run --no-sync ruff check src/ tests/
|
|
21
|
+
|
|
22
|
+
fmt: ## Auto-format with ruff
|
|
23
|
+
$(UV) run --no-sync ruff format src/ tests/
|
|
24
|
+
$(UV) run --no-sync ruff check --fix src/ tests/
|
|
25
|
+
|
|
26
|
+
test: ## Run pytest
|
|
27
|
+
$(UV) run --no-sync pytest --tb=short -q
|
|
28
|
+
|
|
29
|
+
schema: ## Regenerate the committed wire-contract JSON Schema
|
|
30
|
+
$(UV) run --no-sync argus-curator schema
|
|
31
|
+
|
|
32
|
+
build: clean ## Build sdist + wheel into dist/
|
|
33
|
+
$(UV) build
|
|
34
|
+
@echo ""
|
|
35
|
+
@ls -lh $(DIST)/
|
|
36
|
+
|
|
37
|
+
clean: ## Remove build artifacts
|
|
38
|
+
rm -rf $(DIST) build src/*.egg-info src/argus_curator/*.egg-info
|
|
39
|
+
|
|
40
|
+
smoke: build ## Build wheel, install in throwaway venv, smoke-test import
|
|
41
|
+
$(eval TMPVENV := $(shell mktemp -d))
|
|
42
|
+
$(UV) venv $(TMPVENV)/venv
|
|
43
|
+
$(UV) pip install --python $(TMPVENV)/venv $(DIST)/*.whl
|
|
44
|
+
$(TMPVENV)/venv/bin/python -c \
|
|
45
|
+
"from argus_curator import scan_folder, __version__; print(f'argus-curator {__version__} OK')"
|
|
46
|
+
rm -rf $(TMPVENV)
|
|
47
|
+
|
|
48
|
+
check: lint test build ## Full local CI: lint + test + build
|
|
49
|
+
@echo ""
|
|
50
|
+
@echo "All checks passed."
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: argus-curator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LoRA-native dataset curation: training-suitability scoring, near-duplicate dedup, and identity-aware face clustering
|
|
5
|
+
Project-URL: Homepage, https://github.com/smk762/argus-curator
|
|
6
|
+
Project-URL: Repository, https://github.com/smk762/argus-curator
|
|
7
|
+
Project-URL: Issues, https://github.com/smk762/argus-curator/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/smk762/argus-curator/blob/main/CHANGELOG.md
|
|
9
|
+
Author: smk762
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: curation,dataset,deduplication,diffusion,face-clustering,flux,image-quality,insightface,lora,phash,sdxl,stable-diffusion,training
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Requires-Dist: imagehash>=4.3
|
|
25
|
+
Requires-Dist: numpy>=1.24
|
|
26
|
+
Requires-Dist: pillow>=10.0
|
|
27
|
+
Requires-Dist: pydantic>=2.6
|
|
28
|
+
Requires-Dist: structlog>=24.0
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: fastapi>=0.110; extra == 'all'
|
|
31
|
+
Requires-Dist: httpx>=0.27; extra == 'all'
|
|
32
|
+
Requires-Dist: insightface>=0.7.3; extra == 'all'
|
|
33
|
+
Requires-Dist: onnxruntime>=1.16; extra == 'all'
|
|
34
|
+
Requires-Dist: opencv-python-headless>=4.9; extra == 'all'
|
|
35
|
+
Requires-Dist: python-multipart>=0.0.7; extra == 'all'
|
|
36
|
+
Requires-Dist: rich>=13.0; extra == 'all'
|
|
37
|
+
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
38
|
+
Requires-Dist: typer>=0.12; extra == 'all'
|
|
39
|
+
Requires-Dist: uvicorn>=0.27; extra == 'all'
|
|
40
|
+
Provides-Extra: cli
|
|
41
|
+
Requires-Dist: rich>=13.0; extra == 'cli'
|
|
42
|
+
Requires-Dist: typer>=0.12; extra == 'cli'
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
46
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
47
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
48
|
+
Provides-Extra: faces
|
|
49
|
+
Requires-Dist: insightface>=0.7.3; extra == 'faces'
|
|
50
|
+
Requires-Dist: onnxruntime>=1.16; extra == 'faces'
|
|
51
|
+
Requires-Dist: opencv-python-headless>=4.9; extra == 'faces'
|
|
52
|
+
Requires-Dist: scikit-learn>=1.3; extra == 'faces'
|
|
53
|
+
Provides-Extra: gpu
|
|
54
|
+
Requires-Dist: insightface>=0.7.3; extra == 'gpu'
|
|
55
|
+
Requires-Dist: onnxruntime-gpu>=1.16; extra == 'gpu'
|
|
56
|
+
Requires-Dist: opencv-python-headless>=4.9; extra == 'gpu'
|
|
57
|
+
Requires-Dist: scikit-learn>=1.3; extra == 'gpu'
|
|
58
|
+
Provides-Extra: server
|
|
59
|
+
Requires-Dist: fastapi>=0.110; extra == 'server'
|
|
60
|
+
Requires-Dist: httpx>=0.27; extra == 'server'
|
|
61
|
+
Requires-Dist: python-multipart>=0.0.7; extra == 'server'
|
|
62
|
+
Requires-Dist: uvicorn>=0.27; extra == 'server'
|
|
63
|
+
Description-Content-Type: text/markdown
|
|
64
|
+
|
|
65
|
+
# Argus Curator
|
|
66
|
+
|
|
67
|
+
[PyPI version](https://pypi.org/project/argus-curator/)
|
|
68
|
+
[Python versions](https://pypi.org/project/argus-curator/)
|
|
69
|
+
[License: MIT](LICENSE)
|
|
70
|
+
[CI status](https://github.com/smk762/argus-curator/actions/workflows/ci.yml)
|
|
71
|
+
|
|
72
|
+
The LoRA data-prep front-end — curate by quality and by face, then caption with [argus-lens](https://github.com/smk762/argus-lens).
|
|
73
|
+
|
|
74
|
+
`argus-curator` is the **curation stage** of the Argus suite. Upstream,
|
|
75
|
+
[argus-quarry](https://github.com/smk762/argus-quarry) acquires provenance-clean
|
|
76
|
+
images; curator decides *which images, of whom, at what quality* belong in a
|
|
77
|
+
LoRA training set; argus-lens then decides *what's in each image*. Curator and
|
|
78
|
+
lens share one `TargetProfile`, so a manifest written here is captioned
|
|
79
|
+
downstream with no remapping.
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
argus-quarry argus-curator (:8101) argus-lens (:8100) imogen / kohya
|
|
83
|
+
─ download ─┐ ─ scan + score ─┐ ─ caption ─┐ ─ train ─
|
|
84
|
+
─ provenance ─┤ images ─ face-cluster ─┤ manifest ─ buckets ─┤ dataset ─ LoRA ─
|
|
85
|
+
─ verify ─────┴─────────► ─ select ───────┴────────► (identity/ ─┴────────► ───────►
|
|
86
|
+
─ export ───────► wardrobe/…)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Why not FiftyOne / fastdup / Immich?
|
|
90
|
+
|
|
91
|
+
| | Generic CV curation (FiftyOne, fastdup) | Consumer galleries (Immich, PhotoPrism) | **Argus Curator** |
|
|
92
|
+
|---|---|---|---|
|
|
93
|
+
| Quality scoring | Generic | None | **Training-suitability** (target-aware sharpness/res/artifact + face-count fit) |
|
|
94
|
+
| Dedup | pHash | Basic | pHash, keeps best representative, **reports** the rest |
|
|
95
|
+
| Faces | Plugin | Clustering for browsing | Clustering **for dataset filtering** (export by identity) |
|
|
96
|
+
| Dataset export | Manual | None | Structure-preserving copy/symlink/move + **manifest** |
|
|
97
|
+
| Captioner handoff | None | None | One shared `TargetProfile` → argus-lens |
|
|
98
|
+
|
|
99
|
+
It's LoRA-native, identity-aware, and caption-integrated — the reason the suite exists.
|
|
100
|
+
|
|
101
|
+
## The shared TargetProfile (the moat)
|
|
102
|
+
|
|
103
|
+
Both services speak the same taxonomy. This single schema is the contract:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from argus_curator import TargetProfile
|
|
107
|
+
|
|
108
|
+
TargetProfile(
|
|
109
|
+
target_style="photo", # "photo" | "anime"
|
|
110
|
+
target_backend="sdxl", # "sdxl" | "flux-dev-1" | ...
|
|
111
|
+
checkpoint=None,
|
|
112
|
+
target_category="identity", # identity | wardrobe | pose_composition | setting
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
The curator uses it to weight scoring and label exports; argus-lens inherits it verbatim.
|
|
117
|
+
|
|
118
|
+
## Installation
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pip install argus-curator # engine only (Pillow, numpy, ImageHash, pydantic, structlog)
|
|
122
|
+
pip install "argus-curator[server,cli]" # FastAPI server + CLI
|
|
123
|
+
pip install "argus-curator[faces]" # + InsightFace identity clustering (CPU onnxruntime)
|
|
124
|
+
pip install "argus-curator[gpu]" # + onnxruntime-gpu for GPU face detection
|
|
125
|
+
pip install "argus-curator[all]" # server + CLI + faces (CPU onnxruntime; use [gpu] for onnxruntime-gpu)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
System libraries for the face stack (Ubuntu/Debian):
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
sudo apt install -y libgl1 libglib2.0-0 libgomp1
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Usage
|
|
135
|
+
|
|
136
|
+
### Python
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from argus_curator import scan_folder, TargetProfile, ScanConfig, FaceConfig
|
|
140
|
+
|
|
141
|
+
summary = scan_folder(
|
|
142
|
+
"/data/images",
|
|
143
|
+
profile=TargetProfile(target_category="identity"),
|
|
144
|
+
cfg=ScanConfig(min_short_side=512, blur_threshold=100.0, cluster_distance=10),
|
|
145
|
+
faces_cfg=FaceConfig(enabled=True, model="buffalo_l", cluster_eps=0.5),
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
print(summary.passed, "passed,", summary.duplicates, "near-dupes")
|
|
149
|
+
for fc in summary.face_clusters:
|
|
150
|
+
print(fc.cluster_id, fc.size, "faces, rep:", fc.representative_rel_path)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### CLI
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
# Report only — see the score distribution before committing to a threshold
|
|
157
|
+
argus-curator scan /data/images --csv report.csv
|
|
158
|
+
|
|
159
|
+
# Identity curation with face clustering, copy 0.65+ keepers (structure preserved)
|
|
160
|
+
argus-curator scan /data/images \
|
|
161
|
+
--target-category identity --faces --device cuda \
|
|
162
|
+
--min-score 0.65 --copy-to /data/curated
|
|
163
|
+
|
|
164
|
+
# Export only specific identities, capped to a diverse 200
|
|
165
|
+
argus-curator scan /data/images --faces \
|
|
166
|
+
--face-clusters face_1,face_2 --max-keep 200 --copy-to /data/curated
|
|
167
|
+
|
|
168
|
+
# Pick a pose-balanced subset (head-on + 3/4 only, drop side profiles)
|
|
169
|
+
argus-curator scan /data/images --faces \
|
|
170
|
+
--pose frontal,three_quarter --copy-to /data/curated
|
|
171
|
+
|
|
172
|
+
# Which detectors are available?
|
|
173
|
+
argus-curator detectors
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### HTTP server (:8101, peer to argus-lens)
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
pip install "argus-curator[server,cli,faces]"
|
|
180
|
+
argus-curator serve --cors --port 8101 --source-root /data/images
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
| Route | Description |
|
|
184
|
+
|---|---|
|
|
185
|
+
| `GET /health` | Liveness |
|
|
186
|
+
| `GET /detectors` | `{ torch, cuda, clip, insightface, onnxruntime }` |
|
|
187
|
+
| `GET /folders?path=<rel>` | Browse Docker-mounted folders under the source root (for the UI picker) |
|
|
188
|
+
| `POST /scan/folder` | Scan + score + dedup + face-cluster → `ScanSummary` |
|
|
189
|
+
| `GET /scan/{scan_id}` | Cached summary, paginated via `?offset=&limit=` |
|
|
190
|
+
| `GET /thumb?path=<rel>&scan_id=<id>` | `image/webp` thumbnail from the mount |
|
|
191
|
+
| `POST /export` | Structure-preserving transfer + manifest → `ExportResult` |
|
|
192
|
+
| `POST /scan/folder/stream` | Same as `/scan/folder`, streaming live progress over SSE |
|
|
193
|
+
| `POST /export/stream` | Same as `/export`, streaming per-file transfer progress over SSE |
|
|
194
|
+
|
|
195
|
+
The `*/stream` variants emit `event: progress` frames (`{phase, done, total}`)
|
|
196
|
+
while the work runs, then a single `event: complete` frame carrying the same
|
|
197
|
+
payload the non-streaming endpoint returns (or `event: error`).
|
|
198
|
+
|
|
199
|
+
`POST /scan/folder` body:
|
|
200
|
+
|
|
201
|
+
```jsonc
|
|
202
|
+
{
|
|
203
|
+
"folder": "/data/images",
|
|
204
|
+
"target_profile": { "target_style": "photo", "target_category": "identity" },
|
|
205
|
+
"config": { "min_short_side": 512, "max_aspect_ratio": 3.0, "blur_threshold": 100.0,
|
|
206
|
+
"cluster_distance": 10, "max_workers": 4 },
|
|
207
|
+
"faces": { "enabled": true, "model": "buffalo_l", "min_det_score": 0.5, "cluster_eps": 0.5 }
|
|
208
|
+
}
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
`POST /export` body:
|
|
212
|
+
|
|
213
|
+
```jsonc
|
|
214
|
+
{
|
|
215
|
+
"scan_id": "...", // or inline "selection": ["rel_path", ...]
|
|
216
|
+
"dest": "/data/out",
|
|
217
|
+
"mode": "copy", // "copy" | "symlink" | "move"
|
|
218
|
+
"preserve_structure": true,
|
|
219
|
+
"min_score": 0.6, "include_rejected": false, "keep_similar": false,
|
|
220
|
+
"face_clusters": ["face_2"], // optional: export only these identities
|
|
221
|
+
"face_poses": ["frontal", "three_quarter"], // optional: export only these head poses
|
|
222
|
+
"write_manifest": true,
|
|
223
|
+
"caption_url": null // optional: POST manifest to argus-lens
|
|
224
|
+
}
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Docker
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
docker compose up --build
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Bind-mounts a dataset into `/data/images`, an output dir into `/data/out`, and
|
|
234
|
+
persists the scan cache + InsightFace model downloads across rebuilds.
|
|
235
|
+
|
|
236
|
+
## Handoff to argus-lens
|
|
237
|
+
|
|
238
|
+
Export writes a JSONL manifest (one row per selected image):
|
|
239
|
+
|
|
240
|
+
```jsonc
|
|
241
|
+
{ "rel_path": "...", "abs_path": "...", "target_profile": { ... },
|
|
242
|
+
"primary_face_cluster": "face_2", "primary_face_pose": "three_quarter",
|
|
243
|
+
"score": 0.87, "similar_group": 3 }
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
argus-lens batch-captions this manifest — categories are already shared, so no
|
|
247
|
+
remapping. Set `caption_url` on the export request to POST it straight to lens
|
|
248
|
+
for a one-click curate→caption run.
|
|
249
|
+
|
|
250
|
+
## How scoring works (per image)
|
|
251
|
+
|
|
252
|
+
1. **Hard filters** — min short side, max aspect ratio, blur (Laplacian-edge variance floor).
|
|
253
|
+
2. **Composite score** — target-aware weighted blend of sharpness / resolution / artifact, plus a small composition bonus that depends on `target_category` (e.g. identity rewards a single centred face; setting rewards wide framing).
|
|
254
|
+
3. **Face-count fit** — with `[faces]`, identity targets penalise 0 or 2+ faces; other categories are progressively more tolerant.
|
|
255
|
+
4. **Near-duplicate dedup** — pHash clustering keeps the highest-scoring representative and *reports* the rest (never silently dropped).
|
|
256
|
+
5. **Selection (at export)** — score threshold + optional diversity cap (`max_keep` / `diversity_weight`) + optional face-cluster and head-pose filters (`face_clusters` / `face_poses`). Every excluded image carries a `keep_reason`.
|
|
257
|
+
|
|
258
|
+
## State
|
|
259
|
+
|
|
260
|
+
Scans are cached on disk (keyed by `scan_id`, default `~/.cache/argus_curator/scans`,
|
|
261
|
+
override with `CURATOR_CACHE_DIR`). This is what makes paginated `GET /scan/{id}`
|
|
262
|
+
and export-by-id work without recomputing.
|
|
263
|
+
|
|
264
|
+
## Related projects
|
|
265
|
+
|
|
266
|
+
- [**argus-quarry**](https://github.com/smk762/argus-quarry) — provenance-first acquisition of public-domain / CC0 portraits (the suite's input stage).
|
|
267
|
+
- [**argus-lens**](https://github.com/smk762/argus-lens) — intent-aware, multi-model captioning (consumes the manifest this package exports).
|
|
268
|
+
- [**argus-vision-demo**](https://github.com/smk762/argus-vision-demo) — the suite's Next.js web UI (its `/curate` view drives this server).
|
|
269
|
+
|
|
270
|
+
## License
|
|
271
|
+
|
|
272
|
+
MIT — matches the rest of the Argus suite.
|