krunic-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krunic-0.1.0/.gitignore +207 -0
- krunic-0.1.0/.python-version +1 -0
- krunic-0.1.0/PKG-INFO +196 -0
- krunic-0.1.0/README.md +175 -0
- krunic-0.1.0/krunic/__init__.py +0 -0
- krunic-0.1.0/krunic/krunic.py +244 -0
- krunic-0.1.0/krunic/requirements.txt +10 -0
- krunic-0.1.0/krunic/tunic.py +1308 -0
- krunic-0.1.0/krunic/tunic_plotter.py +92 -0
- krunic-0.1.0/pyproject.toml +34 -0
- krunic-0.1.0/requirements.txt +517 -0
- krunic-0.1.0/sky-tunic.yaml +46 -0
- krunic-0.1.0/sky-tunic_T4:4.yaml +50 -0
- krunic-0.1.0/sky-tunic_twonodes.yaml +58 -0
- krunic-0.1.0/spot-tunic.yaml +59 -0
- krunic-0.1.0/uv.lock +3897 -0
krunic-0.1.0/.gitignore
ADDED
@@ -0,0 +1,207 @@

```gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
```
krunic-0.1.0/.python-version
ADDED

@@ -0,0 +1 @@

```
3.12
```
krunic-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,196 @@

```
Metadata-Version: 2.4
Name: krunic
Version: 0.1.0
Summary: Hyperparameter search for image classifiers using Ray Tune + SkyPilot
Requires-Python: >=3.12
Requires-Dist: argparse>=1.4.0
Requires-Dist: gdown>=5.2.1
Requires-Dist: h5py>=3.16.0
Requires-Dist: matplotlib>=3.10.8
Requires-Dist: medmnist>=3.0.2
Requires-Dist: numpy>=2.4.3
Requires-Dist: optuna>=4.7.0
Requires-Dist: pillow>=12.1.1
Requires-Dist: ray[train,tune]>=2.54.0
Requires-Dist: scikit-learn>=1.8.0
Requires-Dist: skypilot[aws]>=0.11.2
Requires-Dist: timm>=1.0.25
Requires-Dist: torch==2.8.0
Requires-Dist: webdataset>=1.0.2
Description-Content-Type: text/markdown
```

# krunic

Automated hyperparameter search for image classifiers — from dataset to tuned model with one command.

Built on [Ray Tune](https://docs.ray.io/en/latest/tune/index.html), [Optuna](https://optuna.org/), [timm](https://github.com/huggingface/pytorch-image-models), and [SkyPilot](https://skypilot.readthedocs.io/).

## Install

```bash
pipx install krunic
```

This installs three commands: `tunic` (local training), `krunic` (cloud launcher), and `tunic-plotter` (results visualizer).

## Quick start

**Local:**
```bash
tunic --data /path/to/dataset --model resnet50 --n_trials 30 --epochs 30 --output results.json
```

**Cloud (AWS):**
```bash
krunic \
  --cluster my-cluster \
  --workdir ~/github/krunic \
  --s3-path my-dataset \
  --model resnet50 \
  --accelerator T4:4 \
  --num-nodes 4 \
  --n-trials 48 \
  --n-epochs 50 \
  --prefix run1
```

**Train final model from tuning results:**
```bash
tunic --final results.json --data /path/to/dataset --epochs 50 --amp
```

**Plot results:**
```bash
tunic-plotter results.json
```

## Results on standard benchmarks

| Dataset | Model | Val AUROC | Test AUROC | Notes |
|---|---|---|---|---|
| PCam (PatchCamelyon) | ResNet18 | 0.96 | 0.97 | SOTA is 0.96 |
| TinyImageNet | ViT-Small | 0.87 (acc) | — | SOTA ~0.90 |
| ChestMNIST | ResNet18 | 0.76 | 0.75 | 14-class multi-label |
| TissueMNIST | ResNet18 | — | 0.94 | |

All runs use generic off-the-shelf models with no domain-specific modifications.

## Search space

| Parameter | Range |
|---|---|
| Optimizer | AdamW, SGD |
| Learning rate | 1e-5 – 1e-1 (log) |
| Weight decay | 1e-6 – 1e-1 (log) |
| Label smoothing | 0 – 0.3 |
| Dropout rate | 0 – 0.5 |
| RandAugment magnitude | 1 – 15 |
| RandAugment num ops | 1 – 4 |
| Mixup alpha | 0 – 0.5 |
| CutMix alpha | 0 – 1.0 |

Override any part with a YAML file via `--search-space`.
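A minimal sketch of such an override, assuming a YAML schema with per-parameter bounds; the key names here (`optimizer`, `lr`, `low`, `high`, `log`) are illustrative assumptions, not tunic's documented schema, so check `tunic --help` for the actual format:

```bash
# Hypothetical override file: pin the optimizer and narrow the learning-rate range.
# Key names are assumptions; only the --search-space flag itself is documented above.
cat > search_space.yaml <<'EOF'
optimizer: [AdamW]
lr:
  low: 1.0e-4
  high: 1.0e-2
  log: true
EOF

tunic --data /path/to/dataset --model resnet50 --search-space search_space.yaml
```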
## tunic — local hyperparameter search

```
tunic --data PATH --model MODEL [options]
```

| Flag | Default | Description |
|---|---|---|
| `--data` | required | Dataset root (ImageFolder or WebDataset) |
| `--model` | required | Any timm model name |
| `--n_trials` | 80 | Number of Optuna trials |
| `--epochs` | 30 | Training epochs per trial (also used for `--final`) |
| `--tune-metric` | `val_auroc` | Metric for trial selection and pruning |
| `--training_fraction` | 1.0 | Fraction of training data (val always uses 1.0) |
| `--batch-size` | 32 | Batch size per trial |
| `--amp` | — | Enable automatic mixed precision |
| `--ray-address` | local | Ray cluster address |
| `--ray-storage` | local | Ray Tune storage path (local or S3 URI) |
| `--resume` | — | Warm-start from a previous experiment directory |
| `--final` | — | Skip tuning; train final model from results JSON |
| `--combine` | — | Train final model on train+val combined |
| `--final-model` | `tunic_final.pt` | Output path for final model weights |
| `--final-stats` | — | Output path for final model stats (JSON) |
| `--device` | `auto` | `auto`, `cuda`, `mps`, or `cpu` |
| `--smoke-test` | — | Quick end-to-end test with synthetic data |
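As a worked example of how these flags compose (the values are illustrative, but every flag shown is documented in the table above):

```bash
# Tune on half the training data with AMP enabled, optimizing accuracy
# instead of the default AUROC metric.
tunic --data /path/to/dataset --model resnet18 \
      --n_trials 40 --epochs 20 --training_fraction 0.5 \
      --tune-metric val_acc --batch-size 64 --amp
```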
## krunic — cloud launcher

krunic generates a SkyPilot YAML and launches the job. The dataset is S3-mounted (or copied); results are uploaded to S3 when the job completes.

**Prerequisites:** SkyPilot configured with AWS credentials; dataset in S3.

`--workdir` defaults to the installed package directory (contains `tunic.py` and `requirements.txt`). Override it only if you are developing from a local source checkout and want to test unpublished changes.

```
krunic --cluster NAME --workdir DIR --s3-path PATH --model MODEL [options]
```

| Flag | Default | Description |
|---|---|---|
| `--cluster` | required | SkyPilot cluster name |
| `--workdir` | package dir | Local directory synced to the cluster |
| `--s3-path` | required | Dataset path within the S3 bucket |
| `--model` | required | Any timm model name |
| `--accelerator` | `T4:4` | GPU spec (e.g. `T4:4`, `A10G:1`, `A100:8`) |
| `--num-nodes` | 1 | Number of cluster nodes |
| `--n-trials` | 30 | Number of Optuna trials |
| `--n-epochs` | 30 | Training epochs per trial |
| `--batch-size` | 32 | Batch size per trial |
| `--training-fraction` | 1.0 | Fraction of training data per trial |
| `--tune-metric` | `val_auroc` | Metric for trial selection and pruning |
| `--bucket` | `image.data` | S3 bucket name |
| `--prefix` | `tunic` | Prefix for output files and S3 paths |
| `--spot` | — | Use spot instances (with retry-until-up) |
| `--copy` | — | Copy data from S3 to local disk instead of mounting |
| `--idle-minutes` | 60 | Auto-stop cluster after N idle minutes |
| `--no-autostop` | — | Disable auto-stop |
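For instance, a cost-conscious variant of the quick-start run on spot instances, with the dataset copied to local disk (values are illustrative; all flags are from the table above):

```bash
# --spot retries until capacity is found; --copy trades longer startup
# for faster epochs than an S3 mount.
krunic --cluster spot-run --s3-path my-dataset --model resnet50 \
       --accelerator T4:4 --num-nodes 2 --spot --copy --prefix spot1
```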
Results are uploaded to `s3://<bucket>/ray-results/<prefix>/<prefix>_results.json`.
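So for the quick-start run above (`--prefix run1`, default bucket `image.data`), fetching and plotting the results locally looks like this, assuming a configured AWS CLI:

```bash
# Path follows the documented pattern s3://<bucket>/ray-results/<prefix>/<prefix>_results.json
aws s3 cp s3://image.data/ray-results/run1/run1_results.json .
tunic-plotter run1_results.json
```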
## tunic-plotter — visualize results

```bash
tunic-plotter results.json                    # plots val_auroc and val_acc
tunic-plotter results.json --metric val_acc   # single metric
tunic-plotter results.json --trial_sort       # keep original trial order, show running best
```

Saves PNG files alongside the results JSON.

## Dataset format

tunic auto-detects the dataset format:

- **ImageFolder** — standard `split/class/image.ext` layout (see the sketch after this list)
- **WebDataset** — sharded TAR files; detected when `wds/dataset_info.json` exists
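A minimal ImageFolder tree that matches this layout; the split names `train`/`val` are an assumption, since only the `split/class/image.ext` shape is documented above:

```
dataset/
├── train/
│   ├── cat/img001.jpg
│   └── dog/img002.jpg
└── val/
    ├── cat/img101.jpg
    └── dog/img102.jpg
```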
## Scaling

The number of concurrent trials equals the total GPU count: `--num-nodes 4 --accelerator T4:4` gives 4 × 4 = 16 concurrent trials.

Optuna's TPE sampler needs roughly 20 trials before it outperforms random search; 32–64 trials is a practical range for most problems.
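Putting the two rules together, a hypothetical sizing for a smaller cluster (cluster name and values are illustrative):

```bash
# 2 nodes x T4:4 = 8 GPUs = 8 concurrent trials, so 48 trials finish in 6 waves.
krunic --cluster sizing-demo --s3-path my-dataset --model resnet18 \
       --accelerator T4:4 --num-nodes 2 --n-trials 48
```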
## Output format

```json
{
  "model": "resnet18",
  "best_val_auroc": 0.963,
  "best_val_acc": 0.891,
  "best_params": {
    "optimizer": "AdamW",
    "lr": 0.0028,
    "weight_decay": 3.6e-06,
    "label_smoothing": 0.058,
    "drop_rate": 0.183
  },
  "n_trials": 48,
  "completed_trials": 48,
  "epochs": 50,
  "all_trials": [...]
}
```
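Since the output is plain JSON, standard tooling works on it; for example, pulling out the winning configuration with `jq` (assumed installed), using the keys shown in the schema above:

```bash
jq '.best_params' results.json                        # winning hyperparameters
jq '.best_val_auroc, .completed_trials' results.json  # headline numbers
```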
krunic-0.1.0/README.md
ADDED
@@ -0,0 +1,175 @@

*(Identical to the long description embedded in PKG-INFO above, from `# krunic` onward.)*
krunic-0.1.0/krunic/__init__.py
File without changes