brr-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brr/__init__.py +0 -0
- brr/aws/__init__.py +0 -0
- brr/aws/configure.py +448 -0
- brr/aws/iam-policy.json +77 -0
- brr/aws/nodes.py +255 -0
- brr/aws/templates/__init__.py +0 -0
- brr/aws/templates/cpu-l4.yaml +50 -0
- brr/aws/templates/cpu.yaml +50 -0
- brr/aws/templates/h100.yaml +35 -0
- brr/aws/templates/l4.yaml +36 -0
- brr/cli.py +106 -0
- brr/cluster.py +949 -0
- brr/commands/__init__.py +0 -0
- brr/commands/bake.py +657 -0
- brr/commands/config.py +121 -0
- brr/commands/configure.py +285 -0
- brr/commands/init.py +182 -0
- brr/commands/nuke.py +497 -0
- brr/data/__init__.py +0 -0
- brr/data/idle-shutdown.sh +133 -0
- brr/data/setup.sh +450 -0
- brr/nebius/__init__.py +0 -0
- brr/nebius/configure.py +334 -0
- brr/nebius/node_provider.py +464 -0
- brr/nebius/nodes.py +282 -0
- brr/nebius/templates/__init__.py +0 -0
- brr/nebius/templates/cpu-h100.yaml +46 -0
- brr/nebius/templates/cpu.yaml +45 -0
- brr/nebius/templates/h100.yaml +35 -0
- brr/state.py +230 -0
- brr/templates.py +560 -0
- brr/utils.py +8 -0
- brr_cli-0.1.0.dist-info/METADATA +287 -0
- brr_cli-0.1.0.dist-info/RECORD +37 -0
- brr_cli-0.1.0.dist-info/WHEEL +4 -0
- brr_cli-0.1.0.dist-info/entry_points.txt +2 -0
- brr_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
brr/__init__.py
ADDED
|
File without changes
|
brr/aws/__init__.py
ADDED
|
File without changes
|
brr/aws/configure.py
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import stat
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
import time
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
|
|
14
|
+
from brr.state import ensure_state_dirs, read_config, write_config, CONFIG_PATH, KEYS_DIR
|
|
15
|
+
|
|
16
|
+
# Shared Rich console for all user-facing output in this module.
console = Console()

# Fallback values offered as prompt defaults when no saved config exists yet.
DEFAULTS = {
    "AWS_REGION": "us-east-1",
    # Base Ubuntu AMI for CPU instances — AMI IDs are region-specific,
    # presumably valid for us-east-1; TODO confirm for other regions.
    "AMI_UBUNTU": "ami-0360c520857e3138f",
    # Deep Learning AMI for GPU instances — same region caveat as above.
    "AMI_DL": "ami-0b594b8835777de74",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_or_create_key(ec2, region):
    """Find an existing EC2 key pair or create a new one.

    Prefers a local ``.pem`` file in KEYS_DIR whose name matches a key pair
    registered in AWS. Otherwise creates a fresh timestamped key pair, saves
    the private key locally with owner-read-only permissions, and returns it.

    Args:
        ec2: boto3 EC2 client.
        region: AWS region name (used only in the generated key name).

    Returns:
        Tuple of (key_name, path_to_private_key_file).
    """
    ensure_state_dirs()

    local_keys = [f for f in os.listdir(KEYS_DIR) if f.endswith(".pem")]
    if local_keys:
        aws_keys_resp = ec2.describe_key_pairs()
        aws_key_names = [k['KeyName'] for k in aws_keys_resp['KeyPairs']]

        for lk in local_keys:
            # Strip only the trailing ".pem" — str.replace would also mangle
            # a ".pem" occurring mid-name (e.g. "a.pemkey.pem" -> "akey").
            key_name = lk[:-len(".pem")]
            if key_name in aws_key_names:
                console.print(f"Using existing local key: [green]{lk}[/green]")
                return key_name, str(KEYS_DIR / lk)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    key_name = f"brr-{region}-{timestamp}"
    key_file = str(KEYS_DIR / f"{key_name}.pem")

    console.print(f"Generating new key pair: [bold cyan]{key_name}[/bold cyan]...")
    resp = ec2.create_key_pair(KeyName=key_name)

    # Create the file with mode 0600 from the start so the private key is
    # never world-readable (plain open() + later chmod leaves a window where
    # the key exists with the default umask permissions).
    fd = os.open(key_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        f.write(resp['KeyMaterial'])

    # Tighten to read-only once written (matches ssh's expectations).
    os.chmod(key_file, stat.S_IRUSR)

    return key_name, key_file
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_default_vpc(ec2):
    """Return the default VPC's ID, falling back to the first VPC found, else None."""
    # First try the account's designated default VPC, then any VPC at all.
    default_only = ec2.describe_vpcs(Filters=[{"Name": "isDefault", "Values": ["true"]}])["Vpcs"]
    if default_only:
        return default_only[0]["VpcId"]
    any_vpc = ec2.describe_vpcs()["Vpcs"]
    return any_vpc[0]["VpcId"] if any_vpc else None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_or_create_cluster_sg(ec2, vpc_id):
    """Create or find the brr-cluster security group with SSH + cluster mesh rules."""
    sg_name = "brr-cluster"

    # Reuse an existing group in this VPC if one is already present.
    try:
        found = ec2.describe_security_groups(
            Filters=[
                {"Name": "group-name", "Values": [sg_name]},
                {"Name": "vpc-id", "Values": [vpc_id]},
            ]
        )["SecurityGroups"]
        if found:
            sg_id = found[0]["GroupId"]
            console.print(f"Using existing security group: [green]{sg_name}[/green] ({sg_id})")
            return sg_id
    except Exception:
        pass

    try:
        console.print(f"Creating security group: [bold cyan]{sg_name}[/bold cyan]...")
        sg_id = ec2.create_security_group(
            GroupName=sg_name,
            Description="Ray cluster - SSH + cluster mesh",
            VpcId=vpc_id,
        )["GroupId"]

        # Two ingress rules: SSH from anywhere, and unrestricted traffic
        # between members of this same group (the cluster mesh).
        ssh_rule = {
            "IpProtocol": "tcp",
            "FromPort": 22,
            "ToPort": 22,
            "IpRanges": [{"CidrIp": "0.0.0.0/0"}],
        }
        mesh_rule = {
            "IpProtocol": "-1",
            "UserIdGroupPairs": [{"GroupId": sg_id}],
        }
        ec2.authorize_security_group_ingress(
            GroupId=sg_id,
            IpPermissions=[ssh_rule, mesh_rule],
        )

        console.print(f"Created security group: [green]{sg_id}[/green]")
        return sg_id
    except Exception as e:
        console.print(f"[red]Error creating security group: {e}[/red]")
        return None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _wait_for_efs(efs_client, fs_id, timeout=120):
|
|
116
|
+
"""Poll until EFS filesystem is available."""
|
|
117
|
+
for _ in range(timeout // 5):
|
|
118
|
+
resp = efs_client.describe_file_systems(FileSystemId=fs_id)
|
|
119
|
+
state = resp["FileSystems"][0]["LifeCycleState"]
|
|
120
|
+
if state == "available":
|
|
121
|
+
return
|
|
122
|
+
time.sleep(5)
|
|
123
|
+
raise TimeoutError(f"EFS {fs_id} did not become available within {timeout}s")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _ensure_mount_targets(efs_client, ec2, fs_id, vpc_id, sg_id):
    """Create mount targets in all AZs of the VPC that don't already have one."""
    covered_azs = {
        mt["AvailabilityZoneName"]
        for mt in efs_client.describe_mount_targets(FileSystemId=fs_id)["MountTargets"]
    }

    # Pick one subnet per availability zone (first subnet seen wins).
    subnet_for_az = {}
    vpc_subnets = ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [vpc_id]}])
    for subnet in vpc_subnets["Subnets"]:
        subnet_for_az.setdefault(subnet["AvailabilityZone"], subnet["SubnetId"])

    created = []
    for az, subnet_id in subnet_for_az.items():
        if az in covered_azs:
            continue
        try:
            efs_client.create_mount_target(
                FileSystemId=fs_id,
                SubnetId=subnet_id,
                SecurityGroups=[sg_id],
            )
            created.append(az)
        except efs_client.exceptions.MountTargetConflict:
            # Raced with another creator; the target exists, which is all we need.
            pass

    if not created:
        console.print("EFS mount targets already exist in all AZs")
        return

    console.print(f"Created EFS mount targets in: [green]{', '.join(created)}[/green]")
    # Poll (up to ~5 minutes) until every mount target reports 'available'.
    for _ in range(60):
        current = efs_client.describe_mount_targets(FileSystemId=fs_id)["MountTargets"]
        if all(mt["LifeCycleState"] == "available" for mt in current):
            return
        time.sleep(5)
    console.print("[yellow]Warning: some mount targets may still be initializing[/yellow]")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def get_or_create_efs(efs_client, ec2, vpc_id, sg_id):
    """Create or find the brr-shared EFS filesystem with mount targets."""
    # CreationToken doubles as an idempotency key and a lookup handle.
    matches = efs_client.describe_file_systems(CreationToken="brr-shared")["FileSystems"]
    if matches:
        fs_id = matches[0]["FileSystemId"]
        console.print(f"Using existing EFS: [green]{fs_id}[/green]")
        if matches[0]["LifeCycleState"] != "available":
            _wait_for_efs(efs_client, fs_id)
    else:
        console.print("Creating EFS filesystem: [bold cyan]brr-shared[/bold cyan]...")
        created = efs_client.create_file_system(
            CreationToken="brr-shared",
            PerformanceMode="generalPurpose",
            ThroughputMode="elastic",
            Encrypted=True,
            Tags=[{"Key": "Name", "Value": "brr-shared"}],
        )
        fs_id = created["FileSystemId"]
        console.print(f"Created EFS: [green]{fs_id}[/green]")
        _wait_for_efs(efs_client, fs_id)

    # A filesystem without mount targets is unreachable from instances.
    _ensure_mount_targets(efs_client, ec2, fs_id, vpc_id, sg_id)

    return fs_id
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def setup_github_ssh(region, key_path):
    """Upload EC2 SSH key to Secrets Manager and add public key to GitHub.

    Stores the private key under the 'brr-github-ssh-key' secret (created
    once; an existing secret is left untouched), then — best effort — derives
    the public key with ssh-keygen and registers it on GitHub via the gh CLI.
    A missing or unauthenticated gh CLI is not an error: the secret name is
    still returned so cluster nodes can fetch the key later.

    Args:
        region: AWS region for the Secrets Manager client.
        key_path: Path to the private key file (.pem).

    Returns:
        The secret name, or "" when Secrets Manager access was denied.
    """
    import boto3
    secret_name = "brr-github-ssh-key"
    sm = boto3.client("secretsmanager", region_name=region)

    try:
        with open(key_path) as f:
            private_key = f.read()
        sm.create_secret(Name=secret_name, SecretString=private_key)
        console.print(f"Uploaded SSH key to Secrets Manager: [green]{secret_name}[/green]")
    except sm.exceptions.ResourceExistsException:
        console.print(f"SSH key already in Secrets Manager: [green]{secret_name}[/green]")
    except sm.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "AccessDeniedException":
            # Plain string: no placeholders, so no f-prefix (was flagged F541).
            console.print("[red]Access denied for secretsmanager:CreateSecret[/red]")
            console.print("[yellow]Attach the SecretsManager IAM policy from the README to your IAM user,[/yellow]")
            console.print("[yellow]then re-run 'brr configure aws'.[/yellow]")
            return ""
        raise

    # gh CLI is optional: without it we keep the secret but skip GitHub setup.
    if not shutil.which("gh"):
        console.print("[yellow]gh CLI not found — skipping GitHub key setup[/yellow]")
        console.print("[yellow]Install gh and run 'brr configure aws' again to add the key to GitHub[/yellow]")
        return secret_name

    auth_check = subprocess.run(
        ["gh", "auth", "status"], capture_output=True, text=True
    )
    if auth_check.returncode != 0:
        console.print("[yellow]gh CLI not authenticated — skipping GitHub key setup[/yellow]")
        console.print("[yellow]Run 'gh auth login' and then 'brr configure aws' again[/yellow]")
        return secret_name

    # Skip registration if a key titled 'brr-aws' is already on the account.
    list_result = subprocess.run(
        ["gh", "ssh-key", "list"], capture_output=True, text=True
    )
    if list_result.returncode == 0:
        for line in list_result.stdout.splitlines():
            if "brr-aws" in line:
                console.print("SSH key already registered on GitHub: [green]brr-aws[/green]")
                return secret_name

    # Derive the public key from the private key (gh needs a .pub file).
    pubkey_result = subprocess.run(
        ["ssh-keygen", "-y", "-f", key_path], capture_output=True, text=True
    )
    if pubkey_result.returncode != 0:
        console.print(f"[red]Failed to derive public key: {pubkey_result.stderr.strip()}[/red]")
        return secret_name

    # gh takes a file path, so stage the public key in a temp file.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".pub", delete=False) as tmp:
        tmp.write(pubkey_result.stdout)
        tmp_path = tmp.name

    try:
        add_result = subprocess.run(
            ["gh", "ssh-key", "add", tmp_path, "--title", "brr-aws"],
            capture_output=True, text=True,
        )
        if add_result.returncode == 0:
            console.print("Added SSH key to GitHub: [green]brr-aws[/green]")
        else:
            console.print(f"[red]Failed to add key to GitHub: {add_result.stderr.strip()}[/red]")
    finally:
        os.unlink(tmp_path)

    # Grant the cluster role read access to brr-* secrets (prints a notice
    # instead of failing if the role doesn't exist yet).
    _attach_secretsmanager_policy(region)

    return secret_name
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _store_ec2_ssh_key(region, key_path):
    """Store the EC2 SSH private key in Secrets Manager so cluster nodes can fetch it."""
    import boto3

    secret_name = "brr-ec2-ssh-key"
    client = boto3.client("secretsmanager", region_name=region)

    with open(key_path) as key_file:
        key_material = key_file.read()

    try:
        # Create on first run; overwrite below if the secret already exists so
        # it always tracks the current local key.
        client.create_secret(Name=secret_name, SecretString=key_material)
        console.print(f"Stored EC2 SSH key in Secrets Manager: [green]{secret_name}[/green]")
    except client.exceptions.ResourceExistsException:
        client.put_secret_value(SecretId=secret_name, SecretString=key_material)
        console.print(f"Updated EC2 SSH key in Secrets Manager: [green]{secret_name}[/green]")
    except client.exceptions.ClientError as err:
        if err.response["Error"]["Code"] != "AccessDeniedException":
            raise
        console.print("[red]Access denied for secretsmanager:CreateSecret[/red]")
        console.print("[yellow]Attach the SecretsManager IAM policy from the README.[/yellow]")
        return ""

    return secret_name
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _attach_secretsmanager_policy(region):
    """Attach Secrets Manager read policy to ray-autoscaler-v1 role.

    Puts an inline policy on the Ray autoscaler instance role granting
    secretsmanager:GetSecretValue on brr-* secrets in *region*, so cluster
    nodes can fetch the stored SSH keys. Prints a notice instead of failing
    when the role doesn't exist yet.
    """
    import boto3
    iam = boto3.client("iam")
    try:
        iam.put_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyName="brr-secretsmanager-read",
            PolicyDocument=json.dumps({
                "Version": "2012-10-17",
                "Statement": [{
                    "Effect": "Allow",
                    "Action": "secretsmanager:GetSecretValue",
                    # Any account id (*), but only secrets prefixed brr-.
                    "Resource": f"arn:aws:secretsmanager:{region}:*:secret:brr-*"
                }]
            }),
        )
        # Plain string: no placeholders, so no f-prefix (was flagged F541).
        console.print("Added Secrets Manager permission to [green]ray-autoscaler-v1[/green] role")
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet (created on first cluster launch).[/yellow]")
        console.print("[yellow]Permission will be added automatically on next `brr up`.[/yellow]")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _attach_iam_passrole_policy():
    """Attach IAM PassRole/GetInstanceProfile policy to ray-autoscaler-v1.

    Ray's autoscaler needs these to launch new instances with the same
    instance profile. Added as an inline policy scoped to the autoscaler's
    own role/profile; prints a notice instead of failing when the role
    doesn't exist yet.
    """
    import boto3
    iam = boto3.client("iam")
    try:
        iam.put_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyName="brr-iam-passrole",
            PolicyDocument=json.dumps({
                "Version": "2012-10-17",
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Action": [
                            "iam:GetInstanceProfile",
                            "iam:PassRole",
                        ],
                        # Any account id (*), but only the autoscaler's own
                        # role and instance profile.
                        "Resource": [
                            "arn:aws:iam::*:instance-profile/ray-autoscaler-v1",
                            "arn:aws:iam::*:role/ray-autoscaler-v1",
                        ],
                    },
                ],
            }),
        )
        # Plain string: no placeholders, so no f-prefix (was flagged F541).
        console.print("Added IAM PassRole permission to [green]ray-autoscaler-v1[/green] role")
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet.[/yellow]")
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _attach_ssm_policy():
    """Attach SSM managed policy to ray-autoscaler-v1 for Session Manager access.

    Attaches the AWS-managed AmazonSSMManagedInstanceCore policy so instances
    with this role can be reached via SSM Session Manager. Prints a notice
    instead of failing when the role doesn't exist yet.
    """
    import boto3
    iam = boto3.client("iam")
    try:
        iam.attach_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyArn="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore",
        )
        # Plain string: no placeholders, so no f-prefix (was flagged F541).
        console.print("Added SSM Session Manager permission to [green]ray-autoscaler-v1[/green] role")
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet.[/yellow]")
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def configure_aws():
    """Interactive AWS configuration wizard.

    Prompts for region/AMIs/feature toggles, then provisions (or reuses)
    the VPC lookup, security group, SSH key pair, Secrets Manager entries,
    IAM role policies, and optionally EFS and GitHub SSH — finally merging
    the results into the brr config file at CONFIG_PATH.

    Raises:
        click.Abort: on missing AWS credentials, no usable VPC, or failure
            to create the security group.
    """
    ensure_state_dirs()

    # Existing config values are reused as prompt defaults below and as the
    # base for the merged config written at the end.
    existing = read_config()

    console.print(Panel("AWS configuration", title="brr configure", border_style="cyan"))

    region = click.prompt(
        "AWS region",
        default=existing.get("AWS_REGION", DEFAULTS["AWS_REGION"]),
    )
    ami_ubuntu = click.prompt(
        "Ubuntu AMI",
        default=existing.get("AMI_UBUNTU", DEFAULTS["AMI_UBUNTU"]),
    )
    ami_dl = click.prompt(
        "Deep Learning AMI",
        default=existing.get("AMI_DL", DEFAULTS["AMI_DL"]),
    )

    # Default to enabled if a previous run configured an EFS id.
    efs_enabled = click.confirm(
        "Enable shared EFS filesystem?",
        default=bool(existing.get("EFS_ID", "")),
    )

    console.print()

    # Imported lazily so prompts appear before any AWS SDK startup cost.
    import boto3
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError

    ec2 = boto3.client("ec2", region_name=region)

    # First AWS call — doubles as the credentials check.
    try:
        vpc_id = get_default_vpc(ec2)
    except (NoCredentialsError, PartialCredentialsError):
        console.print("[red]AWS credentials not found.[/red]")
        console.print("Run [bold]aws configure[/bold] to set up your credentials first.")
        console.print("See: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html")
        raise click.Abort()

    if not vpc_id:
        console.print("[red]No VPC found[/red]")
        raise click.Abort()

    sg_id = get_or_create_cluster_sg(ec2, vpc_id)
    if not sg_id:
        raise click.Abort()

    key_name, key_path = get_or_create_key(ec2, region)
    # Store the key so cluster nodes can SSH to each other; returns "" when
    # Secrets Manager access is denied (recorded as-is in the config).
    ec2_ssh_secret = _store_ec2_ssh_key(region, key_path)
    _attach_iam_passrole_policy()
    _attach_ssm_policy()

    efs_id = ""
    if efs_enabled:
        efs_client = boto3.client("efs", region_name=region)
        efs_id = get_or_create_efs(efs_client, ec2, vpc_id, sg_id)

    # Default to enabled if a previous run configured the GitHub secret.
    github_ssh_enabled = click.confirm(
        "Set up GitHub SSH access for clusters?",
        default=bool(existing.get("GITHUB_SSH_SECRET", "")),
    )
    github_ssh_secret = ""
    if github_ssh_enabled:
        github_ssh_secret = setup_github_ssh(region, key_path)

    updates = {
        "AWS_REGION": region,
        "AWS_SECURITY_GROUP": sg_id,
        "AWS_KEY_NAME": key_name,
        "AWS_SSH_KEY": key_path,
        "EFS_ID": efs_id,
        "AMI_UBUNTU": ami_ubuntu,
        "AMI_DL": ami_dl,
        "GITHUB_SSH_SECRET": github_ssh_secret,
        "EC2_SSH_SECRET": ec2_ssh_secret,
    }

    # Merge over existing config so unrelated keys (e.g. Nebius settings)
    # survive a re-run of this wizard.
    merged = dict(existing)
    merged.update(updates)
    write_config(merged)
    console.print(f"\nWrote [green]{CONFIG_PATH}[/green]")

    console.print()
    console.print("[bold green]Done![/bold green] Next steps:")
    console.print("  brr configure tools   # select AI coding tools")
    console.print("  brr configure general # instance settings")
    console.print("  brr up aws:cpu        # launch CPU cluster")
|
brr/aws/iam-policy.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
{
|
|
2
|
+
"Version": "2012-10-17",
|
|
3
|
+
"Statement": [
|
|
4
|
+
{
|
|
5
|
+
"Sid": "EC2",
|
|
6
|
+
"Effect": "Allow",
|
|
7
|
+
"Action": [
|
|
8
|
+
"ec2:Describe*",
|
|
9
|
+
"ec2:RunInstances",
|
|
10
|
+
"ec2:TerminateInstances",
|
|
11
|
+
"ec2:StartInstances",
|
|
12
|
+
"ec2:StopInstances",
|
|
13
|
+
"ec2:CreateSecurityGroup",
|
|
14
|
+
"ec2:DeleteSecurityGroup",
|
|
15
|
+
"ec2:AuthorizeSecurityGroupIngress",
|
|
16
|
+
"ec2:CreateKeyPair",
|
|
17
|
+
"ec2:DeleteKeyPair",
|
|
18
|
+
"ec2:CreateTags",
|
|
19
|
+
"ec2:CreateVpc",
|
|
20
|
+
"ec2:DeleteVpc",
|
|
21
|
+
"ec2:CreateSubnet",
|
|
22
|
+
"ec2:DeleteSubnet",
|
|
23
|
+
"ec2:CreateInternetGateway",
|
|
24
|
+
"ec2:AttachInternetGateway",
|
|
25
|
+
"ec2:DetachInternetGateway",
|
|
26
|
+
"ec2:DeleteInternetGateway",
|
|
27
|
+
"ec2:CreateRoute",
|
|
28
|
+
"ec2:DeleteRouteTable",
|
|
29
|
+
"ec2:DeleteNatGateway",
|
|
30
|
+
"ec2:DeleteVolume",
|
|
31
|
+
"ec2:DisassociateAddress",
|
|
32
|
+
"ec2:ReleaseAddress",
|
|
33
|
+
"ec2:ModifySubnetAttribute",
|
|
34
|
+
"ec2:ModifyInstanceAttribute"
|
|
35
|
+
],
|
|
36
|
+
"Resource": "*"
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"Sid": "EFS",
|
|
40
|
+
"Effect": "Allow",
|
|
41
|
+
"Action": [
|
|
42
|
+
"elasticfilesystem:CreateFileSystem",
|
|
43
|
+
"elasticfilesystem:DescribeFileSystems",
|
|
44
|
+
"elasticfilesystem:CreateMountTarget",
|
|
45
|
+
"elasticfilesystem:DescribeMountTargets"
|
|
46
|
+
],
|
|
47
|
+
"Resource": "*"
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"Sid": "IAM",
|
|
51
|
+
"Effect": "Allow",
|
|
52
|
+
"Action": [
|
|
53
|
+
"iam:CreateRole",
|
|
54
|
+
"iam:GetRole",
|
|
55
|
+
"iam:PutRolePolicy",
|
|
56
|
+
"iam:DeleteRolePolicy",
|
|
57
|
+
"iam:AttachRolePolicy",
|
|
58
|
+
"iam:DetachRolePolicy",
|
|
59
|
+
"iam:CreateInstanceProfile",
|
|
60
|
+
"iam:AddRoleToInstanceProfile",
|
|
61
|
+
"iam:PassRole"
|
|
62
|
+
],
|
|
63
|
+
"Resource": "*"
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"Sid": "SecretsManager",
|
|
67
|
+
"Effect": "Allow",
|
|
68
|
+
"Action": [
|
|
69
|
+
"secretsmanager:CreateSecret",
|
|
70
|
+
"secretsmanager:PutSecretValue",
|
|
71
|
+
"secretsmanager:DeleteSecret",
|
|
72
|
+
"secretsmanager:GetSecretValue"
|
|
73
|
+
],
|
|
74
|
+
"Resource": "*"
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
}
|