brr-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
brr/__init__.py ADDED
File without changes
brr/aws/__init__.py ADDED
File without changes
brr/aws/configure.py ADDED
@@ -0,0 +1,448 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import stat
5
+ import subprocess
6
+ import tempfile
7
+ import time
8
+ from datetime import datetime
9
+
10
+ import click
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+
14
+ from brr.state import ensure_state_dirs, read_config, write_config, CONFIG_PATH, KEYS_DIR
15
+
16
# Module-wide Rich console used for all user-facing output below.
console = Console()

# Fallback values offered as prompt defaults by configure_aws() when the
# saved config has no entry for the corresponding key.
DEFAULTS = {
    "AWS_REGION": "us-east-1",
    # NOTE(review): these AMI IDs presumably belong to the default region
    # above — confirm they are valid before using another region.
    "AMI_UBUNTU": "ami-0360c520857e3138f",
    "AMI_DL": "ami-0b594b8835777de74",
}
23
+
24
+
25
def get_or_create_key(ec2, region):
    """Find a usable EC2 key pair or create a new one.

    A local ``<name>.pem`` file in ``KEYS_DIR`` is reused when a key pair
    with the same name is returned by ``ec2.describe_key_pairs()``.
    Otherwise a new key pair named ``brr-<region>-<timestamp>`` is created
    and its private key material is written to ``KEYS_DIR``.

    Args:
        ec2: boto3 EC2 client.
        region: Region name; used only to build the new key pair's name.

    Returns:
        Tuple ``(key_name, key_file_path)``; the path is a string.

    Raises:
        FileExistsError: If the target ``.pem`` file already exists.
    """
    ensure_state_dirs()

    suffix = ".pem"
    # Sorted so the same key is selected on every run when several match.
    local_keys = sorted(f for f in os.listdir(KEYS_DIR) if f.endswith(suffix))
    if local_keys:
        aws_keys_resp = ec2.describe_key_pairs()
        aws_key_names = {k["KeyName"] for k in aws_keys_resp["KeyPairs"]}

        for lk in local_keys:
            # Strip only the trailing extension; str.replace() would also
            # remove ".pem" occurring in the middle of the file name.
            key_name = lk[:-len(suffix)]
            if key_name in aws_key_names:
                console.print(f"Using existing local key: [green]{lk}[/green]")
                return key_name, str(KEYS_DIR / lk)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    key_name = f"brr-{region}-{timestamp}"
    key_file = str(KEYS_DIR / f"{key_name}.pem")

    console.print(f"Generating new key pair: [bold cyan]{key_name}[/bold cyan]...")
    resp = ec2.create_key_pair(KeyName=key_name)

    # Create the file with owner-only permissions from the start so the
    # private key is never readable by other users, not even briefly.
    fd = os.open(
        key_file,
        os.O_WRONLY | os.O_CREAT | os.O_EXCL,
        stat.S_IRUSR | stat.S_IWUSR,
    )
    with os.fdopen(fd, "w") as f:
        f.write(resp["KeyMaterial"])

    # Drop write permission once the key is on disk (final mode 0400).
    os.chmod(key_file, stat.S_IRUSR)

    return key_name, key_file
53
+
54
+
55
def get_default_vpc(ec2):
    """Return the ID of the default VPC, else of the first VPC listed.

    Args:
        ec2: boto3 EC2 client (anything exposing ``describe_vpcs``).

    Returns:
        The VPC ID string, or ``None`` when no VPC is returned at all.
    """
    default_only = [{"Name": "isDefault", "Values": ["true"]}]
    candidates = ec2.describe_vpcs(Filters=default_only)["Vpcs"]
    if not candidates:
        # No default VPC: fall back to an unfiltered listing.
        candidates = ec2.describe_vpcs()["Vpcs"]
    return candidates[0]["VpcId"] if candidates else None
64
+
65
+
66
def get_or_create_cluster_sg(ec2, vpc_id):
    """Find or create the ``brr-cluster`` security group in *vpc_id*.

    A newly created group gets two ingress rules: TCP port 22 from any
    address, and all protocols from members of the group itself.

    Args:
        ec2: boto3 EC2 client.
        vpc_id: ID of the VPC the group belongs to.

    Returns:
        The security group ID, or ``None`` if the group could not be
        created and configured (the error is printed, not raised).
    """
    sg_name = "brr-cluster"

    try:
        resp = ec2.describe_security_groups(
            Filters=[
                {"Name": "group-name", "Values": [sg_name]},
                {"Name": "vpc-id", "Values": [vpc_id]},
            ]
        )
    except Exception as e:
        # The lookup stays best-effort, but the failure is reported instead
        # of being swallowed silently; creation is still attempted below.
        console.print(f"[yellow]Could not look up security group {sg_name}: {e}[/yellow]")
    else:
        if resp["SecurityGroups"]:
            sg_id = resp["SecurityGroups"][0]["GroupId"]
            console.print(f"Using existing security group: [green]{sg_name}[/green] ({sg_id})")
            return sg_id

    console.print(f"Creating security group: [bold cyan]{sg_name}[/bold cyan]...")
    try:
        resp = ec2.create_security_group(
            GroupName=sg_name,
            Description="Ray cluster - SSH + cluster mesh",
            VpcId=vpc_id,
        )
    except Exception as e:
        console.print(f"[red]Error creating security group: {e}[/red]")
        return None
    sg_id = resp["GroupId"]

    try:
        ec2.authorize_security_group_ingress(
            GroupId=sg_id,
            IpPermissions=[
                {
                    # NOTE(review): SSH is open to every IPv4 address.
                    "IpProtocol": "tcp",
                    "FromPort": 22,
                    "ToPort": 22,
                    "IpRanges": [{"CidrIp": "0.0.0.0/0"}],
                },
                {
                    # All traffic between members of this same group.
                    "IpProtocol": "-1",
                    "UserIdGroupPairs": [{"GroupId": sg_id}],
                },
            ],
        )
    except Exception as e:
        console.print(f"[red]Error creating security group: {e}[/red]")
        # Remove the rule-less group; otherwise the next run would find it
        # by name and reuse it without any ingress rules.
        try:
            ec2.delete_security_group(GroupId=sg_id)
        except Exception as cleanup_error:
            console.print(
                f"[yellow]Could not remove partially configured security group "
                f"{sg_id}: {cleanup_error}[/yellow]"
            )
        return None

    console.print(f"Created security group: [green]{sg_id}[/green]")
    return sg_id
113
+
114
+
115
def _wait_for_efs(efs_client, fs_id, timeout=120, poll_interval=5):
    """Poll until the EFS filesystem reports the ``available`` state.

    The filesystem is always checked at least once, even when *timeout* is
    smaller than *poll_interval* (or zero).

    Args:
        efs_client: boto3 EFS client.
        fs_id: ID of the filesystem to wait for.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between two polls.

    Raises:
        TimeoutError: If the filesystem is not available within *timeout*.
    """
    deadline = time.monotonic() + timeout
    while True:
        resp = efs_client.describe_file_systems(FileSystemId=fs_id)
        if resp["FileSystems"][0]["LifeCycleState"] == "available":
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"EFS {fs_id} did not become available within {timeout}s")
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
124
+
125
+
126
def _ensure_mount_targets(efs_client, ec2, fs_id, vpc_id, sg_id):
    """Add an EFS mount target in every AZ of the VPC that lacks one.

    One subnet per availability zone is used (the first one listed by
    ``describe_subnets``). After creating targets, polls up to 60 times at
    5-second intervals until every mount target reports ``available``.

    Args:
        efs_client: boto3 EFS client.
        ec2: boto3 EC2 client.
        fs_id: EFS filesystem ID.
        vpc_id: VPC whose subnets are considered.
        sg_id: Security group attached to each new mount target.
    """
    current = efs_client.describe_mount_targets(FileSystemId=fs_id)
    covered_azs = {target["AvailabilityZoneName"] for target in current["MountTargets"]}

    vpc_subnets = ec2.describe_subnets(Filters=[{"Name": "vpc-id", "Values": [vpc_id]}])
    subnet_for_az = {}
    for entry in vpc_subnets["Subnets"]:
        # setdefault keeps the first subnet seen for each AZ.
        subnet_for_az.setdefault(entry["AvailabilityZone"], entry["SubnetId"])

    new_azs = []
    for zone, subnet_id in subnet_for_az.items():
        if zone not in covered_azs:
            try:
                efs_client.create_mount_target(
                    FileSystemId=fs_id,
                    SubnetId=subnet_id,
                    SecurityGroups=[sg_id],
                )
            except efs_client.exceptions.MountTargetConflict:
                # Conflicting target: not counted as newly created.
                continue
            new_azs.append(zone)

    if not new_azs:
        console.print("EFS mount targets already exist in all AZs")
        return

    console.print(f"Created EFS mount targets in: [green]{', '.join(new_azs)}[/green]")
    attempts_left = 60
    while attempts_left:
        status = efs_client.describe_mount_targets(FileSystemId=fs_id)
        if all(mt["LifeCycleState"] == "available" for mt in status["MountTargets"]):
            return
        time.sleep(5)
        attempts_left -= 1
    console.print("[yellow]Warning: some mount targets may still be initializing[/yellow]")
163
+
164
+
165
def get_or_create_efs(efs_client, ec2, vpc_id, sg_id):
    """Return the ID of the ``brr-shared`` EFS filesystem, creating it if absent.

    The filesystem is looked up by its creation token. Before returning,
    the function waits for the filesystem to be available (when it is not
    already) and ensures mount targets exist in the VPC.

    Args:
        efs_client: boto3 EFS client.
        ec2: boto3 EC2 client, used to discover subnets.
        vpc_id: VPC in which mount targets are ensured.
        sg_id: Security group applied to new mount targets.

    Returns:
        The EFS filesystem ID.
    """
    token = "brr-shared"
    matches = efs_client.describe_file_systems(CreationToken=token)["FileSystems"]

    if not matches:
        console.print("Creating EFS filesystem: [bold cyan]brr-shared[/bold cyan]...")
        created = efs_client.create_file_system(
            CreationToken=token,
            PerformanceMode="generalPurpose",
            ThroughputMode="elastic",
            Encrypted=True,
            Tags=[{"Key": "Name", "Value": "brr-shared"}],
        )
        fs_id = created["FileSystemId"]
        console.print(f"Created EFS: [green]{fs_id}[/green]")
        # A freshly created filesystem is always waited for.
        must_wait = True
    else:
        found = matches[0]
        fs_id = found["FileSystemId"]
        console.print(f"Using existing EFS: [green]{fs_id}[/green]")
        must_wait = found["LifeCycleState"] != "available"

    if must_wait:
        _wait_for_efs(efs_client, fs_id)

    _ensure_mount_targets(efs_client, ec2, fs_id, vpc_id, sg_id)

    return fs_id
190
+
191
+
192
def _add_key_to_github(key_path):
    """Register the public half of *key_path* on GitHub as ``brr-aws`` via ``gh``.

    Best-effort: every failure is reported on the console and the function
    simply returns.
    """
    if not shutil.which("gh"):
        console.print("[yellow]gh CLI not found — skipping GitHub key setup[/yellow]")
        console.print("[yellow]Install gh and run 'brr configure aws' again to add the key to GitHub[/yellow]")
        return

    auth_check = subprocess.run(
        ["gh", "auth", "status"], capture_output=True, text=True
    )
    if auth_check.returncode != 0:
        console.print("[yellow]gh CLI not authenticated — skipping GitHub key setup[/yellow]")
        console.print("[yellow]Run 'gh auth login' and then 'brr configure aws' again[/yellow]")
        return

    list_result = subprocess.run(
        ["gh", "ssh-key", "list"], capture_output=True, text=True
    )
    if list_result.returncode == 0 and any(
        "brr-aws" in line for line in list_result.stdout.splitlines()
    ):
        console.print("SSH key already registered on GitHub: [green]brr-aws[/green]")
        return

    # Without this check a missing binary would raise FileNotFoundError.
    if not shutil.which("ssh-keygen"):
        console.print("[red]Failed to derive public key: ssh-keygen not found[/red]")
        return

    pubkey_result = subprocess.run(
        ["ssh-keygen", "-y", "-f", key_path], capture_output=True, text=True
    )
    if pubkey_result.returncode != 0:
        console.print(f"[red]Failed to derive public key: {pubkey_result.stderr.strip()}[/red]")
        return

    # gh reads the key from a file, so the derived public key is written to
    # a temporary file that is removed afterwards.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".pub", delete=False) as tmp:
        tmp.write(pubkey_result.stdout)
        tmp_path = tmp.name

    try:
        add_result = subprocess.run(
            ["gh", "ssh-key", "add", tmp_path, "--title", "brr-aws"],
            capture_output=True, text=True,
        )
        if add_result.returncode == 0:
            console.print("Added SSH key to GitHub: [green]brr-aws[/green]")
        else:
            console.print(f"[red]Failed to add key to GitHub: {add_result.stderr.strip()}[/red]")
    finally:
        os.unlink(tmp_path)


def setup_github_ssh(region, key_path):
    """Upload the SSH key to Secrets Manager and add its public key to GitHub.

    The private key at *key_path* is stored as the secret
    ``brr-github-ssh-key`` (an existing secret is left untouched). The
    public key is then registered on GitHub through the ``gh`` CLI when
    possible, and the Secrets Manager read policy is attached to the
    cluster role.

    Args:
        region: AWS region for the Secrets Manager client.
        key_path: Path to the private key file.

    Returns:
        The secret name, or ``""`` when creating the secret was denied.

    Raises:
        botocore ``ClientError``: For any Secrets Manager error other than
            "already exists" or access denied.
    """
    import boto3
    secret_name = "brr-github-ssh-key"
    sm = boto3.client("secretsmanager", region_name=region)

    try:
        with open(key_path) as f:
            private_key = f.read()
        sm.create_secret(Name=secret_name, SecretString=private_key)
        console.print(f"Uploaded SSH key to Secrets Manager: [green]{secret_name}[/green]")
    except sm.exceptions.ResourceExistsException:
        console.print(f"SSH key already in Secrets Manager: [green]{secret_name}[/green]")
    except sm.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "AccessDeniedException":
            console.print("[red]Access denied for secretsmanager:CreateSecret[/red]")
            console.print("[yellow]Attach the SecretsManager IAM policy from the README to your IAM user,[/yellow]")
            console.print("[yellow]then re-run 'brr configure aws'.[/yellow]")
            return ""
        raise

    _add_key_to_github(key_path)

    # The secret exists at this point whatever happened on the GitHub side,
    # so the read permission is attached on every path, not only after a
    # full GitHub registration attempt.
    _attach_secretsmanager_policy(region)

    return secret_name
261
+
262
+
263
+
264
def _store_ec2_ssh_key(region, key_path):
    """Store the EC2 SSH private key in Secrets Manager as ``brr-ec2-ssh-key``.

    The secret is created, or its value is replaced when it already exists.

    Args:
        region: AWS region for the Secrets Manager client.
        key_path: Path to the private key file.

    Returns:
        The secret name, or ``""`` when access was denied.

    Raises:
        botocore ``ClientError``: For any Secrets Manager error other than
            "already exists" or access denied.
    """
    import boto3
    secret_name = "brr-ec2-ssh-key"
    sm = boto3.client("secretsmanager", region_name=region)

    with open(key_path) as f:
        private_key = f.read()

    try:
        # The update fallback is nested so that an error raised by
        # put_secret_value is also handled by the ClientError clause below;
        # an exception raised inside a sibling except clause would not be.
        try:
            sm.create_secret(Name=secret_name, SecretString=private_key)
            action = "Stored"
        except sm.exceptions.ResourceExistsException:
            sm.put_secret_value(SecretId=secret_name, SecretString=private_key)
            action = "Updated"
    except sm.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "AccessDeniedException":
            # Name the operation that was actually denied.
            console.print(f"[red]Access denied for secretsmanager:{e.operation_name}[/red]")
            console.print("[yellow]Attach the SecretsManager IAM policy from the README.[/yellow]")
            return ""
        raise

    console.print(f"{action} EC2 SSH key in Secrets Manager: [green]{secret_name}[/green]")
    return secret_name
287
+
288
+
289
def _attach_secretsmanager_policy(region):
    """Grant the ``ray-autoscaler-v1`` role read access to ``brr-*`` secrets.

    Puts the inline policy ``brr-secretsmanager-read`` on the role. If the
    role does not exist, a notice is printed and nothing is raised.

    Args:
        region: Region used in the secret ARN pattern of the policy.
    """
    import boto3

    policy_document = {
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Action": "secretsmanager:GetSecretValue",
            "Resource": f"arn:aws:secretsmanager:{region}:*:secret:brr-*"
        }]
    }

    iam = boto3.client("iam")
    try:
        iam.put_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyName="brr-secretsmanager-read",
            PolicyDocument=json.dumps(policy_document),
        )
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet (created on first cluster launch).[/yellow]")
        console.print("[yellow]Permission will be added automatically on next `brr up`.[/yellow]")
    else:
        console.print("Added Secrets Manager permission to [green]ray-autoscaler-v1[/green] role")
310
+
311
+
312
def _attach_iam_passrole_policy():
    """Put the ``brr-iam-passrole`` inline policy on ``ray-autoscaler-v1``.

    The policy allows ``iam:GetInstanceProfile`` and ``iam:PassRole`` on the
    role and its instance profile, which Ray's autoscaler needs to launch
    new instances with the same instance profile. If the role does not
    exist, a notice is printed and nothing is raised.
    """
    import boto3

    allowed_actions = [
        "iam:GetInstanceProfile",
        "iam:PassRole",
    ]
    target_arns = [
        "arn:aws:iam::*:instance-profile/ray-autoscaler-v1",
        "arn:aws:iam::*:role/ray-autoscaler-v1",
    ]
    policy_document = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": allowed_actions,
                "Resource": target_arns,
            },
        ],
    }

    iam = boto3.client("iam")
    try:
        iam.put_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyName="brr-iam-passrole",
            PolicyDocument=json.dumps(policy_document),
        )
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet.[/yellow]")
    else:
        console.print("Added IAM PassRole permission to [green]ray-autoscaler-v1[/green] role")
344
+
345
+
346
def _attach_ssm_policy():
    """Attach ``AmazonSSMManagedInstanceCore`` to the ``ray-autoscaler-v1`` role.

    This is the managed policy used for Session Manager access. If the role
    does not exist, a notice is printed and nothing is raised.
    """
    import boto3

    managed_policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    iam = boto3.client("iam")
    try:
        iam.attach_role_policy(
            RoleName="ray-autoscaler-v1",
            PolicyArn=managed_policy_arn,
        )
    except iam.exceptions.NoSuchEntityException:
        console.print("[yellow]IAM role 'ray-autoscaler-v1' not found yet.[/yellow]")
    else:
        console.print("Added SSM Session Manager permission to [green]ray-autoscaler-v1[/green] role")
358
+
359
+
360
def configure_aws():
    """Run the interactive AWS setup wizard and persist the result.

    Prompts for region and AMIs, then finds or creates the VPC-level
    resources (security group, key pair, optional EFS), stores the SSH key
    in Secrets Manager, attaches IAM policies, optionally sets up GitHub
    SSH access, and finally merges everything into the saved config.

    Raises:
        click.Abort: When credentials are missing, no VPC is found, or the
            security group cannot be obtained.
    """
    ensure_state_dirs()

    saved = read_config()

    def _default_for(key):
        # Saved value first, packaged default otherwise.
        return saved.get(key, DEFAULTS[key])

    console.print(Panel("AWS configuration", title="brr configure", border_style="cyan"))

    region = click.prompt("AWS region", default=_default_for("AWS_REGION"))
    ami_ubuntu = click.prompt("Ubuntu AMI", default=_default_for("AMI_UBUNTU"))
    ami_dl = click.prompt("Deep Learning AMI", default=_default_for("AMI_DL"))

    want_efs = click.confirm(
        "Enable shared EFS filesystem?",
        default=bool(saved.get("EFS_ID", "")),
    )

    console.print()

    # Imported here rather than at module import time.
    import boto3
    from botocore.exceptions import NoCredentialsError, PartialCredentialsError

    ec2 = boto3.client("ec2", region_name=region)

    try:
        vpc_id = get_default_vpc(ec2)
    except (NoCredentialsError, PartialCredentialsError):
        for message in (
            "[red]AWS credentials not found.[/red]",
            "Run [bold]aws configure[/bold] to set up your credentials first.",
            "See: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html",
        ):
            console.print(message)
        raise click.Abort()

    if not vpc_id:
        console.print("[red]No VPC found[/red]")
        raise click.Abort()

    sg_id = get_or_create_cluster_sg(ec2, vpc_id)
    if not sg_id:
        raise click.Abort()

    key_name, key_path = get_or_create_key(ec2, region)
    ec2_ssh_secret = _store_ec2_ssh_key(region, key_path)
    _attach_iam_passrole_policy()
    _attach_ssm_policy()

    if want_efs:
        efs_client = boto3.client("efs", region_name=region)
        efs_id = get_or_create_efs(efs_client, ec2, vpc_id, sg_id)
    else:
        efs_id = ""

    want_github_ssh = click.confirm(
        "Set up GitHub SSH access for clusters?",
        default=bool(saved.get("GITHUB_SSH_SECRET", "")),
    )
    github_ssh_secret = setup_github_ssh(region, key_path) if want_github_ssh else ""

    updates = {
        "AWS_REGION": region,
        "AWS_SECURITY_GROUP": sg_id,
        "AWS_KEY_NAME": key_name,
        "AWS_SSH_KEY": key_path,
        "EFS_ID": efs_id,
        "AMI_UBUNTU": ami_ubuntu,
        "AMI_DL": ami_dl,
        "GITHUB_SSH_SECRET": github_ssh_secret,
        "EC2_SSH_SECRET": ec2_ssh_secret,
    }

    # Keys not managed by this wizard are preserved from the saved config.
    write_config(dict(saved, **updates))
    console.print(f"\nWrote [green]{CONFIG_PATH}[/green]")

    console.print()
    console.print("[bold green]Done![/bold green] Next steps:")
    for hint in (
        " brr configure tools # select AI coding tools",
        " brr configure general # instance settings",
        " brr up aws:cpu # launch CPU cluster",
    ):
        console.print(hint)
@@ -0,0 +1,77 @@
1
+ {
2
+ "Version": "2012-10-17",
3
+ "Statement": [
4
+ {
5
+ "Sid": "EC2",
6
+ "Effect": "Allow",
7
+ "Action": [
8
+ "ec2:Describe*",
9
+ "ec2:RunInstances",
10
+ "ec2:TerminateInstances",
11
+ "ec2:StartInstances",
12
+ "ec2:StopInstances",
13
+ "ec2:CreateSecurityGroup",
14
+ "ec2:DeleteSecurityGroup",
15
+ "ec2:AuthorizeSecurityGroupIngress",
16
+ "ec2:CreateKeyPair",
17
+ "ec2:DeleteKeyPair",
18
+ "ec2:CreateTags",
19
+ "ec2:CreateVpc",
20
+ "ec2:DeleteVpc",
21
+ "ec2:CreateSubnet",
22
+ "ec2:DeleteSubnet",
23
+ "ec2:CreateInternetGateway",
24
+ "ec2:AttachInternetGateway",
25
+ "ec2:DetachInternetGateway",
26
+ "ec2:DeleteInternetGateway",
27
+ "ec2:CreateRoute",
28
+ "ec2:DeleteRouteTable",
29
+ "ec2:DeleteNatGateway",
30
+ "ec2:DeleteVolume",
31
+ "ec2:DisassociateAddress",
32
+ "ec2:ReleaseAddress",
33
+ "ec2:ModifySubnetAttribute",
34
+ "ec2:ModifyInstanceAttribute"
35
+ ],
36
+ "Resource": "*"
37
+ },
38
+ {
39
+ "Sid": "EFS",
40
+ "Effect": "Allow",
41
+ "Action": [
42
+ "elasticfilesystem:CreateFileSystem",
43
+ "elasticfilesystem:DescribeFileSystems",
44
+ "elasticfilesystem:CreateMountTarget",
45
+ "elasticfilesystem:DescribeMountTargets"
46
+ ],
47
+ "Resource": "*"
48
+ },
49
+ {
50
+ "Sid": "IAM",
51
+ "Effect": "Allow",
52
+ "Action": [
53
+ "iam:CreateRole",
54
+ "iam:GetRole",
55
+ "iam:PutRolePolicy",
56
+ "iam:DeleteRolePolicy",
57
+ "iam:AttachRolePolicy",
58
+ "iam:DetachRolePolicy",
59
+ "iam:CreateInstanceProfile",
60
+ "iam:AddRoleToInstanceProfile",
61
+ "iam:PassRole"
62
+ ],
63
+ "Resource": "*"
64
+ },
65
+ {
66
+ "Sid": "SecretsManager",
67
+ "Effect": "Allow",
68
+ "Action": [
69
+ "secretsmanager:CreateSecret",
70
+ "secretsmanager:PutSecretValue",
71
+ "secretsmanager:DeleteSecret",
72
+ "secretsmanager:GetSecretValue"
73
+ ],
74
+ "Resource": "*"
75
+ }
76
+ ]
77
+ }