brr-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brr_cli-0.1.0/.brr/aws/cluster.yaml +50 -0
- brr_cli-0.1.0/.brr/aws/dev.yaml +36 -0
- brr_cli-0.1.0/.brr/aws/setup.sh +14 -0
- brr_cli-0.1.0/.brr/config.env +6 -0
- brr_cli-0.1.0/.github/workflows/publish.yml +17 -0
- brr_cli-0.1.0/.gitignore +7 -0
- brr_cli-0.1.0/CLAUDE.md +89 -0
- brr_cli-0.1.0/LICENSE +21 -0
- brr_cli-0.1.0/PKG-INFO +287 -0
- brr_cli-0.1.0/README.md +258 -0
- brr_cli-0.1.0/brr/__init__.py +0 -0
- brr_cli-0.1.0/brr/aws/__init__.py +0 -0
- brr_cli-0.1.0/brr/aws/configure.py +448 -0
- brr_cli-0.1.0/brr/aws/iam-policy.json +77 -0
- brr_cli-0.1.0/brr/aws/nodes.py +255 -0
- brr_cli-0.1.0/brr/aws/templates/__init__.py +0 -0
- brr_cli-0.1.0/brr/aws/templates/cpu-l4.yaml +50 -0
- brr_cli-0.1.0/brr/aws/templates/cpu.yaml +50 -0
- brr_cli-0.1.0/brr/aws/templates/h100.yaml +35 -0
- brr_cli-0.1.0/brr/aws/templates/l4.yaml +36 -0
- brr_cli-0.1.0/brr/cli.py +106 -0
- brr_cli-0.1.0/brr/cluster.py +949 -0
- brr_cli-0.1.0/brr/commands/__init__.py +0 -0
- brr_cli-0.1.0/brr/commands/bake.py +657 -0
- brr_cli-0.1.0/brr/commands/config.py +121 -0
- brr_cli-0.1.0/brr/commands/configure.py +285 -0
- brr_cli-0.1.0/brr/commands/init.py +182 -0
- brr_cli-0.1.0/brr/commands/nuke.py +497 -0
- brr_cli-0.1.0/brr/data/__init__.py +0 -0
- brr_cli-0.1.0/brr/data/idle-shutdown.sh +133 -0
- brr_cli-0.1.0/brr/data/setup.sh +450 -0
- brr_cli-0.1.0/brr/nebius/__init__.py +0 -0
- brr_cli-0.1.0/brr/nebius/configure.py +334 -0
- brr_cli-0.1.0/brr/nebius/node_provider.py +464 -0
- brr_cli-0.1.0/brr/nebius/nodes.py +282 -0
- brr_cli-0.1.0/brr/nebius/templates/__init__.py +0 -0
- brr_cli-0.1.0/brr/nebius/templates/cpu-h100.yaml +46 -0
- brr_cli-0.1.0/brr/nebius/templates/cpu.yaml +45 -0
- brr_cli-0.1.0/brr/nebius/templates/h100.yaml +35 -0
- brr_cli-0.1.0/brr/state.py +230 -0
- brr_cli-0.1.0/brr/templates.py +560 -0
- brr_cli-0.1.0/brr/utils.py +8 -0
- brr_cli-0.1.0/pyproject.toml +45 -0
- brr_cli-0.1.0/tests/e2e/autoscale_test.py +102 -0
- brr_cli-0.1.0/uv.lock +2003 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
cluster_name: brr-cluster
|
|
2
|
+
max_workers: 4
|
|
3
|
+
|
|
4
|
+
provider:
|
|
5
|
+
type: aws
|
|
6
|
+
region: us-east-1
|
|
7
|
+
availability_zone: us-east-1a, us-east-1b, us-east-1c, us-east-1d
|
|
8
|
+
cache_stopped_nodes: True
|
|
9
|
+
|
|
10
|
+
auth:
|
|
11
|
+
ssh_user: ubuntu
|
|
12
|
+
ssh_private_key: {{AWS_SSH_KEY}}
|
|
13
|
+
|
|
14
|
+
head_node_type: ray.head.default
|
|
15
|
+
|
|
16
|
+
available_node_types:
|
|
17
|
+
ray.head.default:
|
|
18
|
+
resources: {}
|
|
19
|
+
node_config:
|
|
20
|
+
InstanceType: t3.2xlarge
|
|
21
|
+
ImageId: {{AMI_UBUNTU}}
|
|
22
|
+
KeyName: {{AWS_KEY_NAME}}
|
|
23
|
+
SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
|
|
24
|
+
BlockDeviceMappings:
|
|
25
|
+
- DeviceName: /dev/sda1
|
|
26
|
+
Ebs:
|
|
27
|
+
VolumeSize: 100
|
|
28
|
+
VolumeType: gp3
|
|
29
|
+
ray.worker.default:
|
|
30
|
+
min_workers: 0
|
|
31
|
+
max_workers: 4
|
|
32
|
+
resources: {}
|
|
33
|
+
node_config:
|
|
34
|
+
InstanceType: g6.4xlarge
|
|
35
|
+
ImageId: {{AMI_DL}}
|
|
36
|
+
KeyName: {{AWS_KEY_NAME}}
|
|
37
|
+
SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
|
|
38
|
+
BlockDeviceMappings:
|
|
39
|
+
- DeviceName: /dev/sda1
|
|
40
|
+
Ebs:
|
|
41
|
+
VolumeSize: 200
|
|
42
|
+
VolumeType: gp3
|
|
43
|
+
|
|
44
|
+
head_start_ray_commands:
|
|
45
|
+
- source $HOME/.venv/bin/activate && ray stop
|
|
46
|
+
- source $HOME/.venv/bin/activate && ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
|
|
47
|
+
|
|
48
|
+
worker_start_ray_commands:
|
|
49
|
+
- source $HOME/.venv/bin/activate && ray stop
|
|
50
|
+
- source $HOME/.venv/bin/activate && ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
cluster_name: brr-dev
|
|
2
|
+
max_workers: 0
|
|
3
|
+
|
|
4
|
+
provider:
|
|
5
|
+
type: aws
|
|
6
|
+
region: us-east-1
|
|
7
|
+
availability_zone: us-east-1a, us-east-1b, us-east-1c, us-east-1d
|
|
8
|
+
cache_stopped_nodes: True # If not present, the default is True.
|
|
9
|
+
|
|
10
|
+
auth:
|
|
11
|
+
ssh_user: ubuntu
|
|
12
|
+
ssh_private_key: {{AWS_SSH_KEY}}
|
|
13
|
+
|
|
14
|
+
head_node_type: ray.head.default
|
|
15
|
+
|
|
16
|
+
available_node_types:
|
|
17
|
+
ray.head.default:
|
|
18
|
+
resources: {}
|
|
19
|
+
node_config:
|
|
20
|
+
InstanceType: gr6.4xlarge # L4 GPU https://aws.amazon.com/ec2/instance-types/g6/
|
|
21
|
+
ImageId: {{AMI_DL}}
|
|
22
|
+
KeyName: {{AWS_KEY_NAME}}
|
|
23
|
+
SecurityGroupIds: ["{{AWS_SECURITY_GROUP}}"]
|
|
24
|
+
BlockDeviceMappings:
|
|
25
|
+
- DeviceName: /dev/sda1
|
|
26
|
+
Ebs:
|
|
27
|
+
VolumeSize: 200
|
|
28
|
+
VolumeType: gp3
|
|
29
|
+
|
|
30
|
+
head_start_ray_commands:
|
|
31
|
+
- source $HOME/.venv/bin/activate && ray stop
|
|
32
|
+
- source $HOME/.venv/bin/activate && ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
|
|
33
|
+
|
|
34
|
+
worker_start_ray_commands:
|
|
35
|
+
- source $HOME/.venv/bin/activate && ray stop
|
|
36
|
+
- source $HOME/.venv/bin/activate && ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Project setup — runs after global setup on every node boot.
|
|
3
|
+
set -Eeuo pipefail
|
|
4
|
+
|
|
5
|
+
# Sync project dependencies (uses locked versions from uv.lock)
|
|
6
|
+
if [ -d "$HOME/code/brr" ]; then
|
|
7
|
+
cd "$HOME/code/brr"
|
|
8
|
+
# Pre-fetch the Python version required by the project so uv sync doesn't hang.
|
|
9
|
+
uv python install
|
|
10
|
+
uv sync --group brr
|
|
11
|
+
fi
|
|
12
|
+
|
|
13
|
+
# Add extra project-specific dependencies below:
|
|
14
|
+
# uv pip install torch
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: release
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: astral-sh/setup-uv@v5
|
|
16
|
+
- run: uv build
|
|
17
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
brr_cli-0.1.0/.gitignore
ADDED
brr_cli-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install dependencies (uses uv + hatchling)
|
|
9
|
+
uv sync
|
|
10
|
+
|
|
11
|
+
# Install CLI in editable mode
|
|
12
|
+
uv tool install --editable .
|
|
13
|
+
|
|
14
|
+
# Run the CLI
|
|
15
|
+
brr --help
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
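There are no linters or unit tests configured. The only test is an end-to-end script (`tests/e2e/autoscale_test.py`), and the only CI workflow is `.github/workflows/publish.yml`, which builds the package and publishes it to PyPI when a release is published.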
|
|
19
|
+
|
|
20
|
+
## Architecture
|
|
21
|
+
|
|
22
|
+
brr is a CLI for managing GPU/CPU compute clusters across AWS and Nebius. It uses Click for commands, Rich for terminal output, InquirerPy for interactive prompts, PyYAML for config templating, and Ray for cluster orchestration.
|
|
23
|
+
|
|
24
|
+
### Command Flow
|
|
25
|
+
|
|
26
|
+
All cluster commands (`up`, `down`, `attach`, `list`, `clean`, `vscode`) live in `brr/cluster.py` and follow this pattern:
|
|
27
|
+
|
|
28
|
+
1. **Provider parsing** — `state.py:parse_provider()` splits `provider:name` syntax (e.g. `nebius:h100`). Default provider is `aws`.
|
|
29
|
+
2. **Config loading** — `state.py:read_merged_config()` layers: `CONFIG_DEFAULTS` → `~/.brr/config.env` → `.brr/config.env` (project).
|
|
30
|
+
3. **Template resolution** — `templates.py:resolve_template()` finds a YAML: project templates (`.brr/{provider}/{name}.yaml`) take precedence when inside a project; explicit `provider:name` prefix bypasses project and uses built-in (`brr/{provider}/templates/{name}.yaml`).
|
|
31
|
+
4. **Rendering** — `{{VAR}}` placeholders replaced with config values; `???` marks required fields that must be overridden.
|
|
32
|
+
5. **Overrides** — CLI args like `instance_type=t3.xlarge` applied via alias system (`_brr` YAML section), `GLOBAL_ARGS` mapping, or raw dot-notation paths.
|
|
33
|
+
6. **Staging** — `prepare_staging()` writes setup scripts and config to `~/.brr/staging/{name}/`, then `inject_brr_infra()` adds file_mounts and setup_commands to the Ray YAML.
|
|
34
|
+
7. **Execution** — `ray up`/`ray down` called via subprocess.
|
|
35
|
+
8. **SSH config sync** — `nodes.py:update_ssh_config()` writes `brr-{cluster}` host entries to `~/.ssh/config`.
|
|
36
|
+
|
|
37
|
+
### Project System
|
|
38
|
+
|
|
39
|
+
Projects are repos with a `.brr/` directory (created by `brr init`):
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
.brr/
|
|
43
|
+
aws/dev.yaml # Project template (standard Ray YAML)
|
|
44
|
+
aws/cluster.yaml
|
|
45
|
+
aws/setup.sh # Runs after global setup on every node
|
|
46
|
+
config.env # Overrides ~/.brr/config.env
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Key behaviors:
|
|
50
|
+
- `state.py:find_project_root()` walks up from CWD looking for `.brr/` with YAML files (skips `~/.brr`).
|
|
51
|
+
- `resolve_project_provider()` infers provider from project: single provider → automatic; multiple → requires `DEFAULT_PROVIDER` or explicit prefix.
|
|
52
|
+
- Setup layering: global `~/.brr/setup.sh` runs first, then project `.brr/{provider}/setup.sh`.
|
|
53
|
+
- uv-managed projects: `templates.py:rewrite_ray_commands_for_uv()` replaces venv activation with `uv run --group brr` in Ray start commands.
|
|
54
|
+
|
|
55
|
+
### Key Modules
|
|
56
|
+
|
|
57
|
+
- **`brr/cli.py`** — Click command group, version from `importlib.metadata`.
|
|
58
|
+
- **`brr/cluster.py`** — Cluster lifecycle. Uses `_find_ray()` to locate the Ray binary and `_run_ray()` to exec it.
|
|
59
|
+
- **`brr/state.py`** — Config parsing (`read_config`/`write_config`), state dirs, project discovery, provider checks.
|
|
60
|
+
- **`brr/templates.py`** — Template resolution, rendering, override system, staging, baked image substitution.
|
|
61
|
+
- **`brr/commands/init.py`** — `brr init` scaffolds `.brr/{provider}/` with templates + setup.sh. Maps project template names to built-in ones (`_TEMPLATE_MAP`).
|
|
62
|
+
- **`brr/commands/configure.py`** — Interactive wizard: cloud provider, AI tools, general settings. Uses InquirerPy for menus.
|
|
63
|
+
- **`brr/commands/bake.py`** — Pre-bakes global setup into AMIs/images. Strips secrets (`_BAKE_STRIP_KEYS`) before baking. Tracks staleness via setup.sh hash.
|
|
64
|
+
- **`brr/commands/nuke.py`** — Destructive teardown. Multi-region parallel cleanup with ThreadPoolExecutor (AWS) or async SDK (Nebius).
|
|
65
|
+
- **`brr/data/setup.sh`** — Node bootstrap: mounts, AWS CLI, GitHub SSH keys, AI tools, Python venv, Ray, idle shutdown daemon.
|
|
66
|
+
- **`brr/data/idle-shutdown.sh`** — Systemd daemon monitoring CPU/GPU/SSH activity.
|
|
67
|
+
|
|
68
|
+
#### AWS
|
|
69
|
+
|
|
70
|
+
- **`brr/aws/configure.py`** — Creates key pairs, security groups, EFS, stores secrets in Secrets Manager.
|
|
71
|
+
- **`brr/aws/nodes.py`** — EC2 queries (`query_ray_clusters`), SSH config management.
|
|
72
|
+
- **`brr/aws/templates/`** — Ray YAML templates: `cpu.yaml`, `l4.yaml`, `h100.yaml`, `cpu-l4.yaml`.
|
|
73
|
+
|
|
74
|
+
#### Nebius
|
|
75
|
+
|
|
76
|
+
- **`brr/nebius/configure.py`** — Project selection, subnet, SSH keys, shared filesystem, GitHub SSH.
|
|
77
|
+
- **`brr/nebius/nodes.py`** — Instance queries (`query_clusters`, `query_head_ip`), SSH config management.
|
|
78
|
+
- **`brr/nebius/node_provider.py`** — Custom Ray NodeProvider for autoscaling. Stop-instead-of-delete for cached nodes. Restarts stopped instances before creating new ones.
|
|
79
|
+
- **`brr/nebius/templates/`** — Ray YAML templates: `cpu.yaml`, `h100.yaml`, `cpu-h100.yaml`.
|
|
80
|
+
|
|
81
|
+
### Known Pitfalls
|
|
82
|
+
|
|
83
|
+
- `textwrap.dedent` with f-strings breaks when interpolated values have different indentation. Build shell scripts as concatenated string parts instead.
|
|
84
|
+
- `config.env` keys can contain digits (e.g. `EC2_SSH_SECRET`), so the parsing regex must be `[A-Z0-9_]+`.
|
|
85
|
+
- Nebius `recovery_policy` is immutable after instance creation — must use `InstanceRecoveryPolicy.FAIL` to prevent auto-restart after idle shutdown.
|
|
86
|
+
- Nebius instance state 8 is ERROR (not DELETED) — `_TERMINAL_STATES` must include it.
|
|
87
|
+
- External providers (Nebius) need explicit `resources: {CPU: N}` in Ray YAML templates — Ray can't auto-detect.
|
|
88
|
+
- Unresolved `{{VAR}}` placeholders in provider config must be guarded (e.g. `"{{" in value` check in node_provider).
|
|
89
|
+
- InquirerPy `Choice` class doesn't have a `disabled` parameter. Use dict syntax `{"value": ..., "name": ..., "disabled": "reason"}` for disabled items.
|
brr_cli-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Jon Carter
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
brr_cli-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: brr-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Research infrastructure management tooling.
|
|
5
|
+
Project-URL: Homepage, https://github.com/joncarter1/brr
|
|
6
|
+
Project-URL: Repository, https://github.com/joncarter1/brr
|
|
7
|
+
Project-URL: Issues, https://github.com/joncarter1/brr/issues
|
|
8
|
+
Author: Jon Carter
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: aws,cloud,cluster,gpu,infrastructure,ray
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: System :: Clustering
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Requires-Dist: click
|
|
19
|
+
Requires-Dist: inquirerpy
|
|
20
|
+
Requires-Dist: pyyaml
|
|
21
|
+
Requires-Dist: rich
|
|
22
|
+
Provides-Extra: aws
|
|
23
|
+
Requires-Dist: boto3; extra == 'aws'
|
|
24
|
+
Requires-Dist: ray[default]; extra == 'aws'
|
|
25
|
+
Provides-Extra: nebius
|
|
26
|
+
Requires-Dist: nebius; extra == 'nebius'
|
|
27
|
+
Requires-Dist: ray[default]; extra == 'nebius'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# ❄️ brr ❄️
|
|
31
|
+
|
|
32
|
+
Opinionated research infrastructure tooling. Launch clusters, get SSH access, start building.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
- **Shared filesystem** — All nodes share `~/code/` via EFS (AWS) or virtiofs (Nebius).
|
|
36
|
+
- **Coding tools** — Install Claude Code, Codex, or Gemini. Connect with, e.g., `brr attach dev claude`.
|
|
37
|
+
- **Autoscaling** — Ray-based cluster scaling with cached instances.
|
|
38
|
+
- **Project-based workflows** — Per-repo cluster configs and project-specific dependencies.
|
|
39
|
+
- **Auto-shutdown** — Monitors CPU, GPU, and SSH activity. Shuts down idle instances to save costs.
|
|
40
|
+
- **Dotfiles integration** — Take your dev environment (vim, tmux, shell config) to every cluster node via GNU Stow.
|
|
41
|
+
|
|
42
|
+
## Prerequisites
|
|
43
|
+
|
|
44
|
+
- Python 3.11+
|
|
45
|
+
- [uv](https://docs.astral.sh/uv/) (for installation)
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```sh
|
|
50
|
+
# Install (AWS only)
|
|
51
|
+
uv tool install "brr-cli[aws]"
|
|
52
|
+
|
|
53
|
+
# Install (both providers)
|
|
54
|
+
# uv tool install "brr-cli[aws,nebius]"
|
|
55
|
+
|
|
56
|
+
# Configure (interactive wizard)
|
|
57
|
+
brr configure # or: brr configure nebius
|
|
58
|
+
|
|
59
|
+
# Launch an H100
|
|
60
|
+
brr up aws:h100
|
|
61
|
+
|
|
62
|
+
# brr up nebius:h100
|
|
63
|
+
|
|
64
|
+
# Connect
|
|
65
|
+
brr attach aws:h100 # SSH
|
|
66
|
+
brr attach aws:h100 claude # Claude Code on the cluster
|
|
67
|
+
brr vscode aws:h100 # VS Code remote
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Built-in templates use `provider:name` syntax (e.g. `aws:h100`). Inside a [project](#projects), short names like `brr up dev` work automatically.
|
|
71
|
+
|
|
72
|
+
Supported clouds: [AWS](#aws-setup) · [Nebius](#nebius-setup)
|
|
73
|
+
|
|
74
|
+
## Projects
|
|
75
|
+
|
|
76
|
+
For per-repo cluster configs, initialize a project:
|
|
77
|
+
|
|
78
|
+
```sh
|
|
79
|
+
cd my-research-repo/
|
|
80
|
+
brr init
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
This creates:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
.brr/
|
|
87
|
+
aws/
|
|
88
|
+
dev.yaml # Single GPU for development
|
|
89
|
+
cluster.yaml # CPU head + GPU workers
|
|
90
|
+
setup.sh # Project-specific dependencies
|
|
91
|
+
config.env # Project config (overrides global)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Templates are standard Ray YAML — edit them or add your own. Inside a project, use short names:
|
|
95
|
+
|
|
96
|
+
```sh
|
|
97
|
+
brr up # launches DEFAULT_TEMPLATE (set in .brr/config.env)
|
|
98
|
+
brr up dev # launches .brr/aws/dev.yaml
|
|
99
|
+
brr up cluster # launches .brr/aws/cluster.yaml
|
|
100
|
+
brr attach dev # SSH into dev cluster
|
|
101
|
+
brr down dev # tear down
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
If your project uses `uv`, `brr init` automatically adds `brr-cli` and `ray` to a `brr` dependency group. The cluster uses your project-locked versions — no manual setup needed.
|
|
105
|
+
|
|
106
|
+
Project config (`.brr/config.env`) overrides global settings (`~/.brr/config.env`). Use it for project-specific settings like idle timeouts or dotfiles.
|
|
107
|
+
|
|
108
|
+
## Templates
|
|
109
|
+
|
|
110
|
+
### Built-in templates
|
|
111
|
+
|
|
112
|
+
| Template | Instance | GPU | Workers |
|
|
113
|
+
| :--- | :--- | :--- | :--- |
|
|
114
|
+
| `aws:cpu` | t3.2xlarge | — | 0-2 |
|
|
115
|
+
| `aws:l4` | gr6.4xlarge | 1x L4 | — |
|
|
116
|
+
| `aws:h100` | p5.4xlarge | 8x H100 | — |
|
|
117
|
+
| `aws:cpu-l4` | t3.2xlarge + g6.4xlarge | 1x L4 | 0-4 |
|
|
118
|
+
| `nebius:cpu` | 8vcpu-32gb | — | 0-2 |
|
|
119
|
+
| `nebius:h100` | 1gpu-16vcpu-200gb | 1x H100 | — |
|
|
120
|
+
| `nebius:cpu-h100` | 8vcpu-32gb + 8gpu-128vcpu-1600gb | 8x H100 | 0-4 |
|
|
121
|
+
|
|
122
|
+
### Overrides
|
|
123
|
+
|
|
124
|
+
Override template values inline:
|
|
125
|
+
|
|
126
|
+
```sh
|
|
127
|
+
brr up aws:cpu instance_type=t3.xlarge max_workers=4
|
|
128
|
+
brr up aws:h100 spot=true
|
|
129
|
+
brr up dev region=us-west-2
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Preview the rendered config without launching:
|
|
133
|
+
|
|
134
|
+
```sh
|
|
135
|
+
brr up dev --dry-run
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
See available overrides for a template:
|
|
139
|
+
|
|
140
|
+
```sh
|
|
141
|
+
brr templates show dev
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Multi-provider
|
|
145
|
+
|
|
146
|
+
Use the provider prefix for built-in templates:
|
|
147
|
+
|
|
148
|
+
```sh
|
|
149
|
+
brr up aws:h100
|
|
150
|
+
brr up nebius:h100
|
|
151
|
+
brr attach nebius:h100
|
|
152
|
+
brr down nebius:h100
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Both providers can run simultaneously.
|
|
156
|
+
|
|
157
|
+
## Customization
|
|
158
|
+
|
|
159
|
+
### Node setup
|
|
160
|
+
|
|
161
|
+
`~/.brr/setup.sh` runs on every node boot. It installs packages, mounts shared storage, sets up Python/Ray, GitHub SSH keys, AI coding tools, dotfiles, and the idle shutdown daemon.
|
|
162
|
+
|
|
163
|
+
Edit it to customize:
|
|
164
|
+
```sh
|
|
165
|
+
vim ~/.brr/setup.sh
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Project-specific dependencies go in `.brr/{provider}/setup.sh` (created by `brr init`), which runs after the global setup.
|
|
169
|
+
|
|
170
|
+
### AI coding tools
|
|
171
|
+
|
|
172
|
+
Install AI coding assistants on every cluster node:
|
|
173
|
+
|
|
174
|
+
```sh
|
|
175
|
+
brr configure tools # select Claude Code, Codex, and/or Gemini CLI
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Then connect and start coding:
|
|
179
|
+
|
|
180
|
+
```sh
|
|
181
|
+
brr attach dev claude
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Dotfiles
|
|
185
|
+
|
|
186
|
+
Set a dotfiles repo to sync your dev environment to every node:
|
|
187
|
+
|
|
188
|
+
```sh
|
|
189
|
+
brr config set DOTFILES_REPO "https://github.com/user/dotfiles"
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The repo is cloned to `~/dotfiles` and installed via `install.sh` (if present) or GNU Stow.
|
|
193
|
+
|
|
194
|
+
### Image baking
|
|
195
|
+
|
|
196
|
+
Bake the global setup into AMIs/images for fast boot:
|
|
197
|
+
|
|
198
|
+
```sh
|
|
199
|
+
brr bake aws # bake both CPU + GPU AMIs
|
|
200
|
+
brr bake status # check if baked images are up to date
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
After baking, clusters boot from the pre-built image. Only project-specific deps need to be installed. `brr up` warns when `setup.sh` has changed since the last bake.
|
|
204
|
+
|
|
205
|
+
### Idle shutdown
|
|
206
|
+
|
|
207
|
+
A systemd daemon monitors CPU, GPU, and SSH activity. When all signals are idle for the configured timeout, the instance shuts down.
|
|
208
|
+
|
|
209
|
+
Configure in `~/.brr/config.env`:
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
IDLE_SHUTDOWN_ENABLED="true"
|
|
213
|
+
IDLE_SHUTDOWN_TIMEOUT_MIN="30"
|
|
214
|
+
IDLE_SHUTDOWN_CPU_THRESHOLD="10"
|
|
215
|
+
IDLE_SHUTDOWN_GRACE_MIN="15"
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
The grace period prevents shutdown during initial setup. Monitor on a node with `journalctl -u idle-shutdown -f`.
|
|
219
|
+
|
|
220
|
+
## Commands
|
|
221
|
+
|
|
222
|
+
| Command | Description |
|
|
223
|
+
| :--- | :--- |
|
|
224
|
+
| `brr up TEMPLATE [OVERRIDES...]` | Launch or update a cluster (`aws:h100`, `dev`, or `path.yaml`) |
|
|
225
|
+
| `brr up TEMPLATE --dry-run` | Preview rendered config without launching |
|
|
226
|
+
| `brr down TEMPLATE` | Stop a cluster (instances preserved for fast restart) |
|
|
227
|
+
| `brr down TEMPLATE --delete` | Terminate all instances and remove staging files |
|
|
228
|
+
| `brr attach TEMPLATE [COMMAND]` | SSH into head node, optionally run a command (e.g. `claude`) |
|
|
229
|
+
| `brr list [--all]` | List clusters (project-scoped by default, `--all` for everything) |
|
|
230
|
+
| `brr clean [TEMPLATE]` | Terminate stopped (cached) instances |
|
|
231
|
+
| `brr vscode TEMPLATE` | Open VS Code on a running cluster |
|
|
232
|
+
| `brr templates list` | List built-in templates |
|
|
233
|
+
| `brr templates show TEMPLATE` | Show template config and overrides |
|
|
234
|
+
| `brr init` | Initialize a project (interactive provider selection) |
|
|
235
|
+
| `brr configure [cloud\|tools\|general]` | Interactive setup (cloud provider, AI tools, settings) |
|
|
236
|
+
| `brr config [list\|get\|set\|path]` | View and manage configuration |
|
|
237
|
+
| `brr bake [aws\|nebius]` | Bake setup into cloud images |
|
|
238
|
+
| `brr bake status` | Check if baked images are up to date |
|
|
239
|
+
| `brr completion [bash\|zsh\|fish]` | Shell completion (`--install` to add to shell rc) |
|
|
240
|
+
| `brr nuke [aws\|nebius]` | Tear down all cloud resources |
|
|
241
|
+
|
|
242
|
+
## Cloud Setup
|
|
243
|
+
|
|
244
|
+
### AWS Setup
|
|
245
|
+
|
|
246
|
+
1. Attach the [IAM policy](brr/aws/iam-policy.json) to your IAM user
|
|
247
|
+
2. Install the [AWS CLI](https://aws.amazon.com/cli/) and run `aws configure`
|
|
248
|
+
3. *(Optional)* For GitHub SSH access on clusters, authenticate the [GitHub CLI](https://cli.github.com/):
|
|
249
|
+
```sh
|
|
250
|
+
gh auth login
|
|
251
|
+
gh auth refresh -h github.com -s admin:public_key
|
|
252
|
+
```
|
|
253
|
+
4. Run the setup wizard:
|
|
254
|
+
```sh
|
|
255
|
+
brr configure aws
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### Nebius Setup
|
|
259
|
+
|
|
260
|
+
1. Install the [Nebius CLI](https://docs.nebius.com/cli/install) and run `nebius init`
|
|
261
|
+
2. Create a service account with editor permissions:
|
|
262
|
+
```sh
|
|
263
|
+
TENANT_ID="<your-tenant-id>" # from console.nebius.com → Administration
|
|
264
|
+
|
|
265
|
+
SA_ID=$(nebius iam service-account create \
|
|
266
|
+
--name brr-cluster --format json | jq -r '.metadata.id')
|
|
267
|
+
|
|
268
|
+
EDITORS_GROUP_ID=$(nebius iam group get-by-name \
|
|
269
|
+
--name editors --parent-id $TENANT_ID --format json | jq -r '.metadata.id')
|
|
270
|
+
|
|
271
|
+
nebius iam group-membership create \
|
|
272
|
+
--parent-id $EDITORS_GROUP_ID --member-id $SA_ID
|
|
273
|
+
```
|
|
274
|
+
3. Generate credentials:
|
|
275
|
+
```sh
|
|
276
|
+
mkdir -p ~/.nebius
|
|
277
|
+
nebius iam auth-public-key generate \
|
|
278
|
+
--service-account-id $SA_ID --output ~/.nebius/credentials.json
|
|
279
|
+
```
|
|
280
|
+
4. Run the setup wizard:
|
|
281
|
+
```sh
|
|
282
|
+
brr configure nebius
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
## Acknowledgments
|
|
286
|
+
|
|
287
|
+
This project started as a fork of [aws_wiz](https://github.com/besarthoxhaj/aws_wiz) by [Bes](https://github.com/besarthoxhaj) and has been inspired by discussions with colleagues from the [Encode: AI for Science Fellowship](https://encode.pillar.vc/).
|