brr-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. brr_cli-0.1.0/.brr/aws/cluster.yaml +50 -0
  2. brr_cli-0.1.0/.brr/aws/dev.yaml +36 -0
  3. brr_cli-0.1.0/.brr/aws/setup.sh +14 -0
  4. brr_cli-0.1.0/.brr/config.env +6 -0
  5. brr_cli-0.1.0/.github/workflows/publish.yml +17 -0
  6. brr_cli-0.1.0/.gitignore +7 -0
  7. brr_cli-0.1.0/CLAUDE.md +89 -0
  8. brr_cli-0.1.0/LICENSE +21 -0
  9. brr_cli-0.1.0/PKG-INFO +287 -0
  10. brr_cli-0.1.0/README.md +258 -0
  11. brr_cli-0.1.0/brr/__init__.py +0 -0
  12. brr_cli-0.1.0/brr/aws/__init__.py +0 -0
  13. brr_cli-0.1.0/brr/aws/configure.py +448 -0
  14. brr_cli-0.1.0/brr/aws/iam-policy.json +77 -0
  15. brr_cli-0.1.0/brr/aws/nodes.py +255 -0
  16. brr_cli-0.1.0/brr/aws/templates/__init__.py +0 -0
  17. brr_cli-0.1.0/brr/aws/templates/cpu-l4.yaml +50 -0
  18. brr_cli-0.1.0/brr/aws/templates/cpu.yaml +50 -0
  19. brr_cli-0.1.0/brr/aws/templates/h100.yaml +35 -0
  20. brr_cli-0.1.0/brr/aws/templates/l4.yaml +36 -0
  21. brr_cli-0.1.0/brr/cli.py +106 -0
  22. brr_cli-0.1.0/brr/cluster.py +949 -0
  23. brr_cli-0.1.0/brr/commands/__init__.py +0 -0
  24. brr_cli-0.1.0/brr/commands/bake.py +657 -0
  25. brr_cli-0.1.0/brr/commands/config.py +121 -0
  26. brr_cli-0.1.0/brr/commands/configure.py +285 -0
  27. brr_cli-0.1.0/brr/commands/init.py +182 -0
  28. brr_cli-0.1.0/brr/commands/nuke.py +497 -0
  29. brr_cli-0.1.0/brr/data/__init__.py +0 -0
  30. brr_cli-0.1.0/brr/data/idle-shutdown.sh +133 -0
  31. brr_cli-0.1.0/brr/data/setup.sh +450 -0
  32. brr_cli-0.1.0/brr/nebius/__init__.py +0 -0
  33. brr_cli-0.1.0/brr/nebius/configure.py +334 -0
  34. brr_cli-0.1.0/brr/nebius/node_provider.py +464 -0
  35. brr_cli-0.1.0/brr/nebius/nodes.py +282 -0
  36. brr_cli-0.1.0/brr/nebius/templates/__init__.py +0 -0
  37. brr_cli-0.1.0/brr/nebius/templates/cpu-h100.yaml +46 -0
  38. brr_cli-0.1.0/brr/nebius/templates/cpu.yaml +45 -0
  39. brr_cli-0.1.0/brr/nebius/templates/h100.yaml +35 -0
  40. brr_cli-0.1.0/brr/state.py +230 -0
  41. brr_cli-0.1.0/brr/templates.py +560 -0
  42. brr_cli-0.1.0/brr/utils.py +8 -0
  43. brr_cli-0.1.0/pyproject.toml +45 -0
  44. brr_cli-0.1.0/tests/e2e/autoscale_test.py +102 -0
  45. brr_cli-0.1.0/uv.lock +2003 -0
@@ -0,0 +1,50 @@
1
+ cluster_name: brr-cluster
2
+ max_workers: 4
3
+
4
+ provider:
5
+ type: aws
6
+ region: us-east-1
7
+ availability_zone: us-east-1a, us-east-1b, us-east-1c, us-east-1d
8
+ cache_stopped_nodes: True
9
+
10
+ auth:
11
+ ssh_user: ubuntu
12
+ ssh_private_key: {{AWS_SSH_KEY}}
13
+
14
+ head_node_type: ray.head.default
15
+
16
+ available_node_types:
17
+ ray.head.default:
18
+ resources: {}
19
+ node_config:
20
+ InstanceType: t3.2xlarge
21
+ ImageId: {{AMI_UBUNTU}}
22
+ KeyName: {{AWS_KEY_NAME}}
23
+ SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
24
+ BlockDeviceMappings:
25
+ - DeviceName: /dev/sda1
26
+ Ebs:
27
+ VolumeSize: 100
28
+ VolumeType: gp3
29
+ ray.worker.default:
30
+ min_workers: 0
31
+ max_workers: 4
32
+ resources: {}
33
+ node_config:
34
+ InstanceType: g6.4xlarge
35
+ ImageId: {{AMI_DL}}
36
+ KeyName: {{AWS_KEY_NAME}}
37
+ SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
38
+ BlockDeviceMappings:
39
+ - DeviceName: /dev/sda1
40
+ Ebs:
41
+ VolumeSize: 200
42
+ VolumeType: gp3
43
+
44
+ head_start_ray_commands:
45
+ - source $HOME/.venv/bin/activate && ray stop
46
+ - source $HOME/.venv/bin/activate && ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
47
+
48
+ worker_start_ray_commands:
49
+ - source $HOME/.venv/bin/activate && ray stop
50
+ - source $HOME/.venv/bin/activate && ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,36 @@
1
+ cluster_name: brr-dev
2
+ max_workers: 0
3
+
4
+ provider:
5
+ type: aws
6
+ region: us-east-1
7
+ availability_zone: us-east-1a, us-east-1b, us-east-1c, us-east-1d
8
+ cache_stopped_nodes: True # If not present, the default is True.
9
+
10
+ auth:
11
+ ssh_user: ubuntu
12
+ ssh_private_key: {{AWS_SSH_KEY}}
13
+
14
+ head_node_type: ray.head.default
15
+
16
+ available_node_types:
17
+ ray.head.default:
18
+ resources: {}
19
+ node_config:
20
+ InstanceType: gr6.4xlarge # L4 GPU https://aws.amazon.com/ec2/instance-types/g6/
21
+ ImageId: {{AMI_DL}}
22
+ KeyName: {{AWS_KEY_NAME}}
23
+ SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
24
+ BlockDeviceMappings:
25
+ - DeviceName: /dev/sda1
26
+ Ebs:
27
+ VolumeSize: 200
28
+ VolumeType: gp3
29
+
30
+ head_start_ray_commands:
31
+ - source $HOME/.venv/bin/activate && ray stop
32
+ - source $HOME/.venv/bin/activate && ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
33
+
34
+ worker_start_ray_commands:
35
+ - source $HOME/.venv/bin/activate && ray stop
36
+ - source $HOME/.venv/bin/activate && ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,14 @@
1
+ #!/bin/bash
2
+ # Project setup — runs after global setup on every node boot.
3
+ set -Eeuo pipefail
4
+
5
+ # Sync project dependencies (uses locked versions from uv.lock)
6
+ if [ -d "$HOME/code/brr" ]; then
7
+ cd "$HOME/code/brr"
8
+ # Pre-fetch the Python version required by the project so uv sync doesn't hang.
9
+ uv python install
10
+ uv sync --group brr
11
+ fi
12
+
13
+ # Add extra project-specific dependencies below:
14
+ # uv pip install torch
@@ -0,0 +1,6 @@
1
+ # Project config — overrides ~/.brr/config.env for this project.
2
+ # Uncomment and edit values as needed.
3
+
4
+ # IDLE_SHUTDOWN_TIMEOUT_MIN="20"
5
+ # DOTFILES_REPO="https://github.com/user/dotfiles"
6
+ # PYTHON_VERSION="3.11"
@@ -0,0 +1,17 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: release
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: astral-sh/setup-uv@v5
16
+ - run: uv build
17
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,7 @@
1
+ .claude
2
+ .venv/
3
+ __pycache__/
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .DS_Store
@@ -0,0 +1,89 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Development Setup
6
+
7
+ ```bash
8
+ # Install dependencies (uses uv + hatchling)
9
+ uv sync
10
+
11
+ # Install CLI in editable mode
12
+ uv tool install --editable .
13
+
14
+ # Run the CLI
15
+ brr --help
16
+ ```
17
+
18
+ There are no tests, linters, or CI pipelines configured.
19
+
20
+ ## Architecture
21
+
22
+ brr is a CLI for managing GPU/CPU compute clusters across AWS and Nebius. It uses Click for commands, Rich for terminal output, InquirerPy for interactive prompts, PyYAML for config templating, and Ray for cluster orchestration.
23
+
24
+ ### Command Flow
25
+
26
+ All cluster commands (`up`, `down`, `attach`, `list`, `clean`, `vscode`) live in `brr/cluster.py` and follow this pattern:
27
+
28
+ 1. **Provider parsing** — `state.py:parse_provider()` splits `provider:name` syntax (e.g. `nebius:h100`). Default provider is `aws`.
29
+ 2. **Config loading** — `state.py:read_merged_config()` layers: `CONFIG_DEFAULTS` → `~/.brr/config.env` → `.brr/config.env` (project).
30
+ 3. **Template resolution** — `templates.py:resolve_template()` finds a YAML: project templates (`.brr/{provider}/{name}.yaml`) take precedence when inside a project; explicit `provider:name` prefix bypasses project and uses built-in (`brr/{provider}/templates/{name}.yaml`).
31
+ 4. **Rendering** — `{{VAR}}` placeholders replaced with config values; `???` marks required fields that must be overridden.
32
+ 5. **Overrides** — CLI args like `instance_type=t3.xlarge` applied via alias system (`_brr` YAML section), `GLOBAL_ARGS` mapping, or raw dot-notation paths.
33
+ 6. **Staging** — `prepare_staging()` writes setup scripts and config to `~/.brr/staging/{name}/`, then `inject_brr_infra()` adds file_mounts and setup_commands to the Ray YAML.
34
+ 7. **Execution** — `ray up`/`ray down` called via subprocess.
35
+ 8. **SSH config sync** — `nodes.py:update_ssh_config()` writes `brr-{cluster}` host entries to `~/.ssh/config`.
36
+
37
+ ### Project System
38
+
39
+ Projects are repos with a `.brr/` directory (created by `brr init`):
40
+
41
+ ```
42
+ .brr/
43
+ aws/dev.yaml # Project template (standard Ray YAML)
44
+ aws/cluster.yaml
45
+ aws/setup.sh # Runs after global setup on every node
46
+ config.env # Overrides ~/.brr/config.env
47
+ ```
48
+
49
+ Key behaviors:
50
+ - `state.py:find_project_root()` walks up from CWD looking for `.brr/` with YAML files (skips `~/.brr`).
51
+ - `resolve_project_provider()` infers provider from project: single provider → automatic; multiple → requires `DEFAULT_PROVIDER` or explicit prefix.
52
+ - Setup layering: global `~/.brr/setup.sh` runs first, then project `.brr/{provider}/setup.sh`.
53
+ - uv-managed projects: `templates.py:rewrite_ray_commands_for_uv()` replaces venv activation with `uv run --group brr` in Ray start commands.
54
+
55
+ ### Key Modules
56
+
57
+ - **`brr/cli.py`** — Click command group, version from `importlib.metadata`.
58
+ - **`brr/cluster.py`** — Cluster lifecycle. Uses `_find_ray()` to locate the Ray binary and `_run_ray()` to exec it.
59
+ - **`brr/state.py`** — Config parsing (`read_config`/`write_config`), state dirs, project discovery, provider checks.
60
+ - **`brr/templates.py`** — Template resolution, rendering, override system, staging, baked image substitution.
61
+ - **`brr/commands/init.py`** — `brr init` scaffolds `.brr/{provider}/` with templates + setup.sh. Maps project template names to built-in ones (`_TEMPLATE_MAP`).
62
+ - **`brr/commands/configure.py`** — Interactive wizard: cloud provider, AI tools, general settings. Uses InquirerPy for menus.
63
+ - **`brr/commands/bake.py`** — Pre-bakes global setup into AMIs/images. Strips secrets (`_BAKE_STRIP_KEYS`) before baking. Tracks staleness via setup.sh hash.
64
+ - **`brr/commands/nuke.py`** — Destructive teardown. Multi-region parallel cleanup with ThreadPoolExecutor (AWS) or async SDK (Nebius).
65
+ - **`brr/data/setup.sh`** — Node bootstrap: mounts, AWS CLI, GitHub SSH keys, AI tools, Python venv, Ray, idle shutdown daemon.
66
+ - **`brr/data/idle-shutdown.sh`** — Systemd daemon monitoring CPU/GPU/SSH activity.
67
+
68
+ #### AWS
69
+
70
+ - **`brr/aws/configure.py`** — Creates key pairs, security groups, EFS, stores secrets in Secrets Manager.
71
+ - **`brr/aws/nodes.py`** — EC2 queries (`query_ray_clusters`), SSH config management.
72
+ - **`brr/aws/templates/`** — Ray YAML templates: `cpu.yaml`, `l4.yaml`, `h100.yaml`, `cpu-l4.yaml`.
73
+
74
+ #### Nebius
75
+
76
+ - **`brr/nebius/configure.py`** — Project selection, subnet, SSH keys, shared filesystem, GitHub SSH.
77
+ - **`brr/nebius/nodes.py`** — Instance queries (`query_clusters`, `query_head_ip`), SSH config management.
78
+ - **`brr/nebius/node_provider.py`** — Custom Ray NodeProvider for autoscaling. Stop-instead-of-delete for cached nodes. Restarts stopped instances before creating new ones.
79
+ - **`brr/nebius/templates/`** — Ray YAML templates: `cpu.yaml`, `h100.yaml`, `cpu-h100.yaml`.
80
+
81
+ ### Known Pitfalls
82
+
83
+ - `textwrap.dedent` with f-strings breaks when interpolated values have different indentation. Build shell scripts as concatenated string parts instead.
84
+ - `config.env` keys can contain digits (e.g. `EC2_SSH_SECRET`), so the parsing regex must be `[A-Z0-9_]+`.
85
+ - Nebius `recovery_policy` is immutable after instance creation — must use `InstanceRecoveryPolicy.FAIL` to prevent auto-restart after idle shutdown.
86
+ - Nebius instance state 8 is ERROR (not DELETED) — `_TERMINAL_STATES` must include it.
87
+ - External providers (Nebius) need explicit `resources: {CPU: N}` in Ray YAML templates — Ray can't auto-detect.
88
+ - Unresolved `{{VAR}}` placeholders in provider config must be guarded (e.g. `"{{" in value` check in node_provider).
89
+ - InquirerPy `Choice` class doesn't have a `disabled` parameter. Use dict syntax `{"value": ..., "name": ..., "disabled": "reason"}` for disabled items.
brr_cli-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jon Carter
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
brr_cli-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,287 @@
1
+ Metadata-Version: 2.4
2
+ Name: brr-cli
3
+ Version: 0.1.0
4
+ Summary: Research infrastructure management tooling.
5
+ Project-URL: Homepage, https://github.com/joncarter1/brr
6
+ Project-URL: Repository, https://github.com/joncarter1/brr
7
+ Project-URL: Issues, https://github.com/joncarter1/brr/issues
8
+ Author: Jon Carter
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: aws,cloud,cluster,gpu,infrastructure,ray
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: System :: Clustering
17
+ Requires-Python: >=3.11
18
+ Requires-Dist: click
19
+ Requires-Dist: inquirerpy
20
+ Requires-Dist: pyyaml
21
+ Requires-Dist: rich
22
+ Provides-Extra: aws
23
+ Requires-Dist: boto3; extra == 'aws'
24
+ Requires-Dist: ray[default]; extra == 'aws'
25
+ Provides-Extra: nebius
26
+ Requires-Dist: nebius; extra == 'nebius'
27
+ Requires-Dist: ray[default]; extra == 'nebius'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # ❄️ brr ❄️
31
+
32
+ Opinionated research infrastructure tooling. Launch clusters, get SSH access, start building.
33
+
34
+ ## Features
35
+ - **Shared filesystem** — All nodes share `~/code/` via EFS (AWS) or virtiofs (Nebius).
36
+ - **Coding tools** — Install Claude Code, Codex, or Gemini. Connect with e.g. `brr attach dev claude`
37
+ - **Autoscaling** — Ray-based cluster scaling with cached instances.
38
+ - **Project-based workflows** — Per-repo cluster configs and project-specific dependencies.
39
+ - **Auto-shutdown** — Monitors CPU, GPU, and SSH activity. Shuts down idle instances to save costs.
40
+ - **Dotfiles integration** — Take your dev environment (vim, tmux, shell config) to every cluster node via GNU Stow.
41
+
42
+ ## Prerequisites
43
+
44
+ - Python 3.11+
45
+ - [uv](https://docs.astral.sh/uv/) (for installation)
46
+
47
+ ## Quick Start
48
+
49
+ ```sh
50
+ # Install (AWS only)
51
+ uv tool install "brr-cli[aws]"
52
+
53
+ # Install (both providers)
54
+ # uv tool install "brr-cli[aws,nebius]"
55
+
56
+ # Configure (interactive wizard)
57
+ brr configure # or: brr configure nebius
58
+
59
+ # Launch an H100
60
+ brr up aws:h100
61
+
62
+ # brr up nebius:h100
63
+
64
+ # Connect
65
+ brr attach aws:h100 # SSH
66
+ brr attach aws:h100 claude # Claude Code on the cluster
67
+ brr vscode aws:h100 # VS Code remote
68
+ ```
69
+
70
+ Built-in templates use `provider:name` syntax (e.g. `aws:h100`). Inside a [project](#projects), short names like `brr up dev` work automatically.
71
+
72
+ Supported clouds: [AWS](#aws-setup) · [Nebius](#nebius-setup)
73
+
74
+ ## Projects
75
+
76
+ For per-repo cluster configs, initialize a project:
77
+
78
+ ```sh
79
+ cd my-research-repo/
80
+ brr init
81
+ ```
82
+
83
+ This creates:
84
+
85
+ ```
86
+ .brr/
87
+ aws/
88
+ dev.yaml # Single GPU for development
89
+ cluster.yaml # CPU head + GPU workers
90
+ setup.sh # Project-specific dependencies
91
+ config.env # Project config (overrides global)
92
+ ```
93
+
94
+ Templates are standard Ray YAML — edit them or add your own. Inside a project, use short names:
95
+
96
+ ```sh
97
+ brr up # launches DEFAULT_TEMPLATE (set in .brr/config.env)
98
+ brr up dev # launches .brr/aws/dev.yaml
99
+ brr up cluster # launches .brr/aws/cluster.yaml
100
+ brr attach dev # SSH into dev cluster
101
+ brr down dev # tear down
102
+ ```
103
+
104
+ If your project uses `uv`, `brr init` automatically adds `brr-cli` and `ray` to a `brr` dependency group. The cluster uses your project-locked versions — no manual setup needed.
105
+
106
+ Project config (`.brr/config.env`) overrides global settings (`~/.brr/config.env`). Use it for project-specific settings like idle timeouts or dotfiles.
107
+
108
+ ## Templates
109
+
110
+ ### Built-in templates
111
+
112
+ | Template | Instance | GPU | Workers |
113
+ | :--- | :--- | :--- | :--- |
114
+ | `aws:cpu` | t3.2xlarge | — | 0-2 |
115
+ | `aws:l4` | gr6.4xlarge | 1x L4 | — |
116
+ | `aws:h100` | p5.4xlarge | 1x H100 | — |
117
+ | `aws:cpu-l4` | t3.2xlarge + g6.4xlarge | 1x L4 | 0-4 |
118
+ | `nebius:cpu` | 8vcpu-32gb | — | 0-2 |
119
+ | `nebius:h100` | 1gpu-16vcpu-200gb | 1x H100 | — |
120
+ | `nebius:cpu-h100` | 8vcpu-32gb + 8gpu-128vcpu-1600gb | 8x H100 | 0-4 |
121
+
122
+ ### Overrides
123
+
124
+ Override template values inline:
125
+
126
+ ```sh
127
+ brr up aws:cpu instance_type=t3.xlarge max_workers=4
128
+ brr up aws:h100 spot=true
129
+ brr up dev region=us-west-2
130
+ ```
131
+
132
+ Preview the rendered config without launching:
133
+
134
+ ```sh
135
+ brr up dev --dry-run
136
+ ```
137
+
138
+ See available overrides for a template:
139
+
140
+ ```sh
141
+ brr templates show dev
142
+ ```
143
+
144
+ ### Multi-provider
145
+
146
+ Use the provider prefix for built-in templates:
147
+
148
+ ```sh
149
+ brr up aws:h100
150
+ brr up nebius:h100
151
+ brr attach nebius:h100
152
+ brr down nebius:h100
153
+ ```
154
+
155
+ Both providers can run simultaneously.
156
+
157
+ ## Customization
158
+
159
+ ### Node setup
160
+
161
+ `~/.brr/setup.sh` runs on every node boot. It installs packages, mounts shared storage, sets up Python/Ray, GitHub SSH keys, AI coding tools, dotfiles, and the idle shutdown daemon.
162
+
163
+ Edit it to customize:
164
+ ```sh
165
+ vim ~/.brr/setup.sh
166
+ ```
167
+
168
+ Project-specific dependencies go in `.brr/{provider}/setup.sh` (created by `brr init`), which runs after the global setup.
169
+
170
+ ### AI coding tools
171
+
172
+ Install AI coding assistants on every cluster node:
173
+
174
+ ```sh
175
+ brr configure tools # select Claude Code, Codex, and/or Gemini CLI
176
+ ```
177
+
178
+ Then connect and start coding:
179
+
180
+ ```sh
181
+ brr attach dev claude
182
+ ```
183
+
184
+ ### Dotfiles
185
+
186
+ Set a dotfiles repo to sync your dev environment to every node:
187
+
188
+ ```sh
189
+ brr config set DOTFILES_REPO "https://github.com/user/dotfiles"
190
+ ```
191
+
192
+ The repo is cloned to `~/dotfiles` and installed via `install.sh` (if present) or GNU Stow.
193
+
194
+ ### Image baking
195
+
196
+ Bake the global setup into AMIs/images for fast boot:
197
+
198
+ ```sh
199
+ brr bake aws # bake both CPU + GPU AMIs
200
+ brr bake status # check if baked images are up to date
201
+ ```
202
+
203
+ After baking, clusters boot from the pre-built image. Only project-specific deps need to be installed. `brr up` warns when `setup.sh` has changed since the last bake.
204
+
205
+ ### Idle shutdown
206
+
207
+ A systemd daemon monitors CPU, GPU, and SSH activity. When all signals are idle for the configured timeout, the instance shuts down.
208
+
209
+ Configure in `~/.brr/config.env`:
210
+
211
+ ```
212
+ IDLE_SHUTDOWN_ENABLED="true"
213
+ IDLE_SHUTDOWN_TIMEOUT_MIN="30"
214
+ IDLE_SHUTDOWN_CPU_THRESHOLD="10"
215
+ IDLE_SHUTDOWN_GRACE_MIN="15"
216
+ ```
217
+
218
+ The grace period prevents shutdown during initial setup. Monitor on a node with `journalctl -u idle-shutdown -f`.
219
+
220
+ ## Commands
221
+
222
+ | Command | Description |
223
+ | :--- | :--- |
224
+ | `brr up TEMPLATE [OVERRIDES...]` | Launch or update a cluster (`aws:h100`, `dev`, or `path.yaml`) |
225
+ | `brr up TEMPLATE --dry-run` | Preview rendered config without launching |
226
+ | `brr down TEMPLATE` | Stop a cluster (instances preserved for fast restart) |
227
+ | `brr down TEMPLATE --delete` | Terminate all instances and remove staging files |
228
+ | `brr attach TEMPLATE [COMMAND]` | SSH into head node, optionally run a command (e.g. `claude`) |
229
+ | `brr list [--all]` | List clusters (project-scoped by default, `--all` for everything) |
230
+ | `brr clean [TEMPLATE]` | Terminate stopped (cached) instances |
231
+ | `brr vscode TEMPLATE` | Open VS Code on a running cluster |
232
+ | `brr templates list` | List built-in templates |
233
+ | `brr templates show TEMPLATE` | Show template config and overrides |
234
+ | `brr init` | Initialize a project (interactive provider selection) |
235
+ | `brr configure [cloud\|tools\|general]` | Interactive setup (cloud provider, AI tools, settings) |
236
+ | `brr config [list\|get\|set\|path]` | View and manage configuration |
237
+ | `brr bake [aws\|nebius]` | Bake setup into cloud images |
238
+ | `brr bake status` | Check if baked images are up to date |
239
+ | `brr completion [bash\|zsh\|fish]` | Shell completion (`--install` to add to shell rc) |
240
+ | `brr nuke [aws\|nebius]` | Tear down all cloud resources |
241
+
242
+ ## Cloud Setup
243
+
244
+ ### AWS Setup
245
+
246
+ 1. Attach the [IAM policy](brr/aws/iam-policy.json) to your IAM user
247
+ 2. Install the [AWS CLI](https://aws.amazon.com/cli/) and run `aws configure`
248
+ 3. *(Optional)* For GitHub SSH access on clusters, authenticate the [GitHub CLI](https://cli.github.com/):
249
+ ```sh
250
+ gh auth login
251
+ gh auth refresh -h github.com -s admin:public_key
252
+ ```
253
+ 4. Run the setup wizard:
254
+ ```sh
255
+ brr configure aws
256
+ ```
257
+
258
+ ### Nebius Setup
259
+
260
+ 1. Install the [Nebius CLI](https://docs.nebius.com/cli/install) and run `nebius init`
261
+ 2. Create a service account with editor permissions:
262
+ ```sh
263
+ TENANT_ID="<your-tenant-id>" # from console.nebius.com → Administration
264
+
265
+ SA_ID=$(nebius iam service-account create \
266
+ --name brr-cluster --format json | jq -r '.metadata.id')
267
+
268
+ EDITORS_GROUP_ID=$(nebius iam group get-by-name \
269
+ --name editors --parent-id $TENANT_ID --format json | jq -r '.metadata.id')
270
+
271
+ nebius iam group-membership create \
272
+ --parent-id $EDITORS_GROUP_ID --member-id $SA_ID
273
+ ```
274
+ 3. Generate credentials:
275
+ ```sh
276
+ mkdir -p ~/.nebius
277
+ nebius iam auth-public-key generate \
278
+ --service-account-id $SA_ID --output ~/.nebius/credentials.json
279
+ ```
280
+ 4. Run the setup wizard:
281
+ ```sh
282
+ brr configure nebius
283
+ ```
284
+
285
+ ## Acknowledgments
286
+
287
+ This project started as a fork of [aws_wiz](https://github.com/besarthoxhaj/aws_wiz) by [Bes](https://github.com/besarthoxhaj) and has been inspired by discussions with colleagues from the [Encode: AI for Science Fellowship](https://encode.pillar.vc/).