anyscale 0.26.65__py3-none-any.whl → 0.26.67__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. anyscale/_private/anyscale_client/common.py +1 -1
  2. anyscale/client/README.md +0 -1
  3. anyscale/client/openapi_client/api/default_api.py +2 -127
  4. anyscale/client/openapi_client/models/baseimagesenum.py +70 -1
  5. anyscale/client/openapi_client/models/cloud_deployment_compute_config.py +29 -1
  6. anyscale/client/openapi_client/models/supportedbaseimagesenum.py +70 -1
  7. anyscale/client/openapi_client/models/task_summary_config.py +29 -3
  8. anyscale/client/openapi_client/models/task_table_config.py +29 -3
  9. anyscale/commands/cloud_commands.py +58 -0
  10. anyscale/commands/setup_k8s.py +1047 -0
  11. anyscale/controllers/cloud_controller.py +2 -1
  12. anyscale/controllers/kubernetes_verifier.py +8 -0
  13. anyscale/sdk/anyscale_client/models/baseimagesenum.py +70 -1
  14. anyscale/sdk/anyscale_client/models/cloud_deployment_compute_config.py +29 -1
  15. anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +70 -1
  16. anyscale/shared_anyscale_utils/latest_ray_version.py +1 -1
  17. anyscale/utils/cloudformation_utils.py +364 -0
  18. anyscale/version.py +1 -1
  19. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/METADATA +1 -1
  20. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/RECORD +25 -23
  21. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/WHEEL +0 -0
  22. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/entry_points.txt +0 -0
  23. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/licenses/LICENSE +0 -0
  24. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/licenses/NOTICE +0 -0
  25. {anyscale-0.26.65.dist-info → anyscale-0.26.67.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1047 @@
1
+ """
2
+ Kubernetes Cloud Setup Command
3
+
4
+ This module provides a streamlined command for setting up Anyscale on Kubernetes clusters.
5
+ It handles infrastructure provisioning, cloud registration, and operator installation.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ import json
10
+ import os
11
+ import re
12
+ import subprocess
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ import click
16
+ import yaml
17
+
18
+ from anyscale.cli_logger import BlockLogger
19
+ from anyscale.client.openapi_client.models import (
20
+ AWSConfig,
21
+ CloudDeployment,
22
+ CloudProviders,
23
+ ComputeStack,
24
+ KubernetesConfig as OpenAPIKubernetesConfig,
25
+ ObjectStorage,
26
+ )
27
+ from anyscale.controllers.cloud_controller import CloudController
28
+ from anyscale.controllers.kubernetes_verifier import (
29
+ KubernetesCloudDeploymentVerifier,
30
+ KubernetesConfig,
31
+ )
32
+ from anyscale.shared_anyscale_utils.conf import ANYSCALE_CORS_ORIGIN, ANYSCALE_HOST
33
+
34
+
35
@dataclass
class ClusterInfo:
    """Information about the target Kubernetes cluster."""

    # kubectl context name; the AWS discovery path fills it with the current context.
    context: str
    # Kubernetes namespace the Anyscale operator will be installed into.
    namespace: str
    # Cloud provider identifier; the AWS discovery path sets it to "aws".
    provider: str
    # Cloud region of the cluster.
    region: str
    # Name of the Kubernetes cluster as passed by the caller.
    cluster_name: str
    # Not populated by the AWS path; presumably the GCP project ID - TODO confirm.
    project_id: Optional[str] = None
    # "arn" field of the EKS describe-cluster response.
    cluster_arn: Optional[str] = None
    # IAM OIDC provider ARN built from the cluster's OIDC issuer.
    oidc_provider: Optional[str] = None
    # Not populated by the AWS path; presumably GCP-specific - TODO confirm.
    cluster_location: Optional[str] = None
    # Not populated by the AWS path; presumably GCP-specific - TODO confirm.
    workload_identity_pool: Optional[str] = None
    # "version" field of the EKS describe-cluster response.
    cluster_version: Optional[str] = None
50
+
51
+
52
@dataclass
class InfrastructureResources:
    """Resources created during infrastructure setup."""

    # Object storage bucket name (the "S3BucketName" stack output on AWS).
    bucket_name: str
    # IAM role ARN for the operator (the "AnyscaleCrossAccountIAMRoleArn" stack output on AWS).
    iam_role_arn: str
    # Region the resources were created in.
    region: str
    # Not set by the AWS path; presumably the GCP project ID - TODO confirm.
    project_id: Optional[str] = None
60
+
61
+
62
+ class KubernetesCloudSetupCommand:
63
+ """Command to setup Kubernetes cloud."""
64
+
65
def __init__(self, logger: Optional[BlockLogger] = None, debug: bool = False):
    """Wire up the logger, the cloud controller and the debug switch.

    Args:
        logger: Logger to use; a new ``BlockLogger`` is created when falsy.
        debug: Enable debug logging. Debug logging is also enabled when the
            ``ANYSCALE_DEBUG`` environment variable equals ``"1"``.
    """
    if not logger:
        logger = BlockLogger()
    self.log = logger
    self.cloud_controller = CloudController(log=self.log)
    self.skip_confirmation = False
    env_requests_debug = os.environ.get("ANYSCALE_DEBUG") == "1"
    self.debug = debug or env_requests_debug
70
+
71
def run(  # noqa: PLR0913
    self,
    provider: str,
    region: str,
    name: str,
    cluster_name: str,
    namespace: str,
    project_id: Optional[str],
    functional_verify: bool,
    yes: bool,
    values_file: Optional[str] = None,
) -> None:
    """
    Drive the full Kubernetes cloud setup from tool check to verification.

    The "Setup" log block is opened first and closed on both success and
    failure; any exception from a step is re-raised unchanged.

    Args:
        provider: Cloud provider (aws, gcp)
        region: AWS/GCP region
        name: Name for the Anyscale cloud
        cluster_name: Kubernetes cluster name/context
        namespace: Namespace for the Anyscale operator
        project_id: GCP project ID (required for GCP)
        functional_verify: Whether to run functional verification
        yes: Skip confirmation prompts
        values_file: Optional custom path for Helm values file
    """
    self.log.open_block(
        "Setup", f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
    )

    # Remember whether prompts should be skipped.
    self.skip_confirmation = yes

    try:
        # Step 0: required CLI tools.
        self._check_required_tools(provider)

        # Step 1: the namespace is settled before any infrastructure exists
        # because the IAM role trust relationship depends on it.
        operator_namespace = self._prompt_for_namespace(
            namespace, skip_confirmation=yes
        )

        # Step 2: cluster discovery and validation.
        cluster = self._discover_cluster(
            cluster_name, operator_namespace, provider, region, project_id
        )

        # Step 3: cloud infrastructure.
        resources = self._setup_infrastructure(provider, region, name, cluster)

        # Step 4: registration with Anyscale.
        registered_cloud_id = self._register_cloud(
            name, provider, region, resources, cluster
        )

        # Step 5: operator installation.
        self._install_operator(
            registered_cloud_id,
            provider,
            region,
            operator_namespace,
            resources,
            values_file,
        )

        # Step 6: optional verification.
        if functional_verify:
            self._verify_installation(
                registered_cloud_id, operator_namespace, cluster
            )

        self.log.close_block("Setup")
        self.log.info(f"Kubernetes cloud '{name}' setup completed successfully!")
    except Exception:  # noqa: BLE001
        self.log.close_block("Setup")
        raise
148
+
149
def _debug(self, *msg: str) -> None:
    """Forward ``msg`` to the logger's debug channel, but only in debug mode."""
    if not self.debug:
        return
    self.log.debug(*msg)
153
+
154
def _check_required_tools(self, provider: str) -> None:
    """Verify that every CLI tool the setup needs can be found.

    ``kubectl`` and ``helm`` are always required; ``aws`` is added for the
    "aws" provider and ``gcloud``/``gsutil`` for "gcp".

    Raises:
        click.ClickException: Listing the tools that are missing.
    """
    provider_tools = {"aws": ["aws"], "gcp": ["gcloud", "gsutil"]}
    required_tools = ["kubectl", "helm", *provider_tools.get(provider, [])]
    tool_list = ", ".join(required_tools)

    self._debug(f"Checking for required tools: {tool_list}")

    missing_tools = [
        tool for tool in required_tools if not self._check_command_available(tool)
    ]

    if missing_tools:
        error_msg = f"Missing required CLI tools: {', '.join(missing_tools)}\n\n"
        raise click.ClickException(error_msg.rstrip())

    self.log.info(
        f"Required CLI tools are installed ({tool_list})", block_label="Setup",
    )
180
+
181
def _check_command_available(self, command: str) -> bool:
    """Return True when ``command`` resolves to an executable on the system PATH.

    The lookup uses ``shutil.which`` rather than spawning the external
    ``which`` program, so it also works where that binary is not installed
    (for example on Windows or in minimal container images).

    Args:
        command: Name of the executable to look for.

    Returns:
        True if the executable was found, False otherwise. Any unexpected
        error during the lookup is treated as "not available".
    """
    import shutil

    try:
        return shutil.which(command) is not None
    except Exception:  # noqa: BLE001
        # Best-effort check: a failed lookup must not abort the setup here.
        return False
190
+
191
def _discover_cluster(
    self,
    cluster_name: str,
    namespace: str,
    provider: str,
    region: str,
    project_id: Optional[str],  # noqa: ARG002
) -> ClusterInfo:
    """Dispatch cluster discovery to the provider-specific implementation.

    Only "aws" is implemented; "gcp" and any other provider raise
    ``click.ClickException``. ``project_id`` is accepted but unused.
    """
    self.log.info(
        f"Discovering {provider.upper()} cluster: {cluster_name}",
        block_label="Setup",
    )

    if provider == "gcp":
        raise click.ClickException(
            "GCP support is not yet implemented. Please use AWS for now."
        )
    if provider != "aws":
        raise click.ClickException(f"Unsupported provider: {provider}")

    return self._discover_aws_cluster(cluster_name, namespace, region)
213
+
214
def _discover_aws_cluster(
    self, cluster_name: str, namespace: str, region: str
) -> ClusterInfo:
    """Discover AWS EKS cluster details and configure kubeconfig.

    Runs four steps in order: fetch the EKS cluster description, resolve
    the OIDC provider ARN, update kubeconfig for the cluster, and verify
    kubeconfig / read the current kubectl context.

    Args:
        cluster_name: EKS cluster name.
        namespace: Namespace recorded on the returned ``ClusterInfo``.
        region: AWS region of the cluster.

    Returns:
        A ``ClusterInfo`` carrying the current kubectl context, the cluster
        ARN and version from the cluster description, and the OIDC
        provider ARN.

    Raises:
        click.ClickException: When any step fails. The original exception
            is attached as the cause so the underlying error stays visible
            in tracebacks.
    """
    try:
        self._debug(f"Fetching EKS cluster info for {cluster_name} in {region}...")
        cluster_info = self._get_eks_cluster_info(cluster_name, region)
        self._debug(f"EKS Cluster ARN: {cluster_info.get('arn', 'Unknown')}")
        self._debug(
            f"EKS Cluster Version: {cluster_info.get('version', 'Unknown')}"
        )
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to get EKS cluster info: {e}")
        raise click.ClickException(
            f"Failed to discover EKS cluster {cluster_name}: {e}"
        ) from e

    try:
        self._debug("Fetching OIDC provider information...")
        oidc_provider = self._get_eks_oidc_provider(cluster_name, region)
        self._debug(f"OIDC Provider: {oidc_provider}")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to get OIDC provider: {e}")
        raise click.ClickException(
            f"Failed to get OIDC provider for cluster {cluster_name}: {e}"
        ) from e

    try:
        self._debug("Configuring kubeconfig for EKS cluster...")
        self._configure_aws_kubeconfig(cluster_name, region)
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to configure kubeconfig: {e}")
        raise click.ClickException(
            f"Failed to configure kubeconfig for EKS cluster: {e}"
        ) from e

    try:
        self._debug("Verifying kubeconfig configuration...")
        self._verify_kubeconfig()
        current_context = self._get_current_kubectl_context()
        self.log.info(f"Cluster discovered: {current_context}", block_label="Setup")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to verify kubeconfig: {e}")
        raise click.ClickException(f"Failed to verify kubeconfig: {e}") from e

    return ClusterInfo(
        context=current_context,
        namespace=namespace,
        provider="aws",
        region=region,
        cluster_name=cluster_name,
        cluster_arn=cluster_info.get("arn"),
        oidc_provider=oidc_provider,
        cluster_version=cluster_info.get("version"),
    )
269
+
270
def _setup_infrastructure(
    self, provider: str, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Dispatch infrastructure setup to the provider-specific implementation.

    Only "aws" is implemented; "gcp" and any other provider raise
    ``click.ClickException``.
    """
    self.log.info(
        f"Setting up {provider.upper()} infrastructure...", block_label="Setup"
    )

    if provider == "gcp":
        raise click.ClickException(
            "GCP support is not yet implemented. Please use AWS for now."
        )
    if provider != "aws":
        raise click.ClickException(f"Unsupported provider: {provider}")

    return self._setup_aws_infrastructure(region, name, cluster_info)
286
+
287
def _setup_aws_infrastructure(  # noqa: PLR0912
    self, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Set up AWS infrastructure for Kubernetes using CloudFormation.

    Generates an identifier from ``name`` plus a random suffix, renders the
    CloudFormation template, creates the stack and reads the bucket name
    and IAM role ARN from its outputs.

    The identifier is lower-cased and has underscores replaced by hyphens
    before it is used anywhere. The template names the S3 bucket
    ``anyscale-<CloudID>``, and S3 bucket names may not contain uppercase
    letters or underscores, so the raw cloud name must not reach the
    ``CloudID`` parameter unnormalised.

    Args:
        region: AWS region for the stack.
        name: Anyscale cloud name used as part of the identifier.
        cluster_info: Discovered cluster; supplies the OIDC provider ARN
            and the operator namespace.

    Returns:
        ``InfrastructureResources`` with the bucket name, IAM role ARN and
        region.

    Raises:
        click.ClickException: When imports, template generation, stack
            creation or output retrieval fail; the original exception is
            attached as the cause.
    """
    try:
        import boto3

        from anyscale.utils.cloudformation_utils import CloudFormationUtils
    except ImportError as e:
        self.log.error(f"Failed to import required modules: {e}")
        raise click.ClickException(
            f"Failed to import required modules: {e}"
        ) from e

    try:
        # One normalised identifier serves as stack name, CloudID parameter
        # and bucket/role name component.
        cloud_id = (
            f"k8s-{name}-{os.urandom(4).hex()}".replace("_", "-").lower()
        )
        stack_name = cloud_id
        self._debug(f"Generated cloud ID: {cloud_id}")
        self._debug(f"CloudFormation stack name: {stack_name}")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to generate cloud ID: {e}")
        raise click.ClickException(f"Failed to generate cloud ID: {e}") from e

    try:
        # The template's trust policy needs the cluster's real OIDC provider.
        if not cluster_info.oidc_provider:
            raise click.ClickException(
                "OIDC provider information not found. Please ensure the EKS cluster has OIDC provider enabled."
            )
        self._debug("Generating CloudFormation template...")
        self._debug(
            f"Using namespace: {cluster_info.namespace} with service account: anyscale-operator"
        )
        cfn_template_body = self._generate_aws_cloudformation_template(
            cloud_id, cluster_info.oidc_provider, cluster_info.namespace,
        )
        self._debug("CloudFormation template generated successfully")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to generate CloudFormation template: {e}")
        raise click.ClickException(
            f"Failed to generate CloudFormation template: {e}"
        ) from e

    try:
        self._debug("Preparing CloudFormation parameters...")
        parameters = [{"ParameterKey": "CloudID", "ParameterValue": cloud_id}]
        self._debug(f"Prepared {len(parameters)} CloudFormation parameters")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to prepare CloudFormation parameters: {e}")
        raise click.ClickException(
            f"Failed to prepare CloudFormation parameters: {e}"
        ) from e

    try:
        with self.log.indent():
            self.log.info(
                "Creating CloudFormation stack (this may take a few minutes)...",
                block_label="Setup",
            )
            boto3_session = boto3.Session(region_name=region)
            cfn_utils = CloudFormationUtils(self.log)
            cfn_utils.create_and_wait_for_stack(
                stack_name=stack_name,
                template_body=cfn_template_body,
                parameters=parameters,
                region=region,
                boto3_session=boto3_session,
                timeout_seconds=600,
            )
        self.log.info("CloudFormation stack created", block_label="Setup")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to create CloudFormation stack: {e}")
        raise click.ClickException(
            f"Failed to create CloudFormation stack: {e}"
        ) from e

    try:
        self._debug("Retrieving CloudFormation stack outputs...")
        stack_outputs = cfn_utils.get_stack_outputs(
            stack_name, region, boto3_session
        )
        # Fall back to the name the template assigns when the output is absent.
        bucket_name = stack_outputs.get("S3BucketName", f"anyscale-{cloud_id}")
        iam_role_arn = stack_outputs.get("AnyscaleCrossAccountIAMRoleArn")

        if not iam_role_arn:
            raise click.ClickException(
                "Failed to get IAM role ARN from CloudFormation stack"
            )

        self._debug(f"S3 Bucket: {bucket_name}")
        self._debug(f"IAM Role ARN: {iam_role_arn}")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to get CloudFormation outputs: {e}")
        raise click.ClickException(
            f"Failed to get CloudFormation outputs: {e}"
        ) from e

    return InfrastructureResources(
        bucket_name=bucket_name, iam_role_arn=iam_role_arn, region=region
    )
382
+
383
def _generate_aws_cloudformation_template(
    self, cloud_id: str, oidc_provider_arn: str, namespace: str,
) -> str:
    """Render the CloudFormation template (as a JSON string) for the AWS setup.

    The template declares a versioned, non-public S3 bucket with a CORS rule
    for ``ANYSCALE_CORS_ORIGIN`` and an IAM role that the
    ``anyscale-operator`` service account in ``namespace`` may assume through
    the given OIDC provider. The role gets object and list access to the
    bucket.

    Raises:
        click.ClickException: If ``oidc_provider_arn`` does not contain
            ``oidc-provider/``.
    """
    # ARN format: arn:aws:iam::ACCOUNT:oidc-provider/oidc.eks.REGION.amazonaws.com/id/XXXXXX
    # The trust condition key needs the part after "oidc-provider/".
    arn_marker = "oidc-provider/"
    if arn_marker not in oidc_provider_arn:
        raise click.ClickException(
            f"Invalid OIDC provider ARN format: {oidc_provider_arn}"
        )
    issuer_path = oidc_provider_arn.split(arn_marker)[-1]

    service_account_name = "anyscale-operator"
    subject = f"system:serviceaccount:{namespace}:{service_account_name}"

    # ANYSCALE_CORS_ORIGIN comes from the shared config.
    cors_rule = {
        "AllowedHeaders": ["*"],
        "AllowedMethods": ["GET", "PUT", "POST", "HEAD", "DELETE"],
        "AllowedOrigins": [ANYSCALE_CORS_ORIGIN],
        "MaxAge": 3600,
    }

    bucket_resource = {
        "Type": "AWS::S3::Bucket",
        "Properties": {
            "BucketName": {"Fn::Sub": "anyscale-${CloudID}"},
            "VersioningConfiguration": {"Status": "Enabled"},
            "PublicAccessBlockConfiguration": {
                "BlockPublicAcls": True,
                "BlockPublicPolicy": True,
                "IgnorePublicAcls": True,
                "RestrictPublicBuckets": True,
            },
            "CorsConfiguration": {"CorsRules": [cors_rule]},
        },
    }

    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Federated": oidc_provider_arn},
                "Action": "sts:AssumeRoleWithWebIdentity",
                "Condition": {"StringEquals": {f"{issuer_path}:sub": subject}},
            }
        ],
    }

    s3_access_policy = {
        "PolicyName": "AnyscaleS3AccessPolicy",
        "PolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": [
                        "s3:GetObject",
                        "s3:PutObject",
                        "s3:DeleteObject",
                        "s3:ListBucket",
                    ],
                    "Resource": [
                        {"Fn::GetAtt": ["AnyscaleBucket", "Arn"]},
                        {"Fn::Sub": "${AnyscaleBucket.Arn}/*"},
                    ],
                }
            ],
        },
    }

    role_resource = {
        "Type": "AWS::IAM::Role",
        "Properties": {
            "RoleName": {"Fn::Sub": "${CloudID}-anyscale-operator-role"},
            "AssumeRolePolicyDocument": trust_policy,
            "Policies": [s3_access_policy],
        },
    }

    template = {
        "AWSTemplateFormatVersion": "2010-09-09",
        "Description": f"Anyscale Kubernetes Cloud Infrastructure for {cloud_id}",
        "Parameters": {
            "CloudID": {
                "Type": "String",
                "Description": "Cloud ID for resource naming",
            }
        },
        "Resources": {
            "AnyscaleBucket": bucket_resource,
            "AnyscaleOperatorRole": role_resource,
        },
        "Outputs": {
            "S3BucketName": {
                "Value": {"Ref": "AnyscaleBucket"},
                "Description": "Name of the S3 bucket",
            },
            "AnyscaleCrossAccountIAMRoleArn": {
                "Value": {"Fn::GetAtt": ["AnyscaleOperatorRole", "Arn"]},
                "Description": "ARN of the Anyscale operator IAM role",
            },
        },
    }

    return json.dumps(template, indent=2)
504
+
505
def _get_eks_cluster_info(self, cluster_name: str, region: str) -> Dict[str, Any]:
    """Run ``aws eks describe-cluster`` and return its ``cluster`` object.

    Returns an empty dict when the response has no ``cluster`` key.

    Raises:
        click.ClickException: If the AWS CLI exits with a non-zero status.
    """
    describe_cmd = [
        "aws",
        "eks",
        "describe-cluster",
        "--name",
        cluster_name,
        "--region",
        region,
    ]
    try:
        completed = subprocess.run(
            describe_cmd, capture_output=True, text=True, check=True,
        )
        response = json.loads(completed.stdout)
        return response.get("cluster", {})
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Failed to get EKS cluster info: {e.stderr}")
526
+
527
def _get_eks_availability_zones(self, cluster_name: str, region: str) -> List[str]:
    """Return the sorted, de-duplicated availability zones of the cluster's subnets.

    The subnet IDs come from the cluster description and are resolved with
    ``aws ec2 describe-subnets``. Whenever no zones can be determined (no
    subnets, empty result, or any error) the ``a``/``b``/``c`` zones of
    ``region`` are returned instead.
    """

    def default_zones() -> List[str]:
        return [region + suffix for suffix in ("a", "b", "c")]

    try:
        vpc_config = self._get_eks_cluster_info(cluster_name, region).get(
            "resourcesVpcConfig", {}
        )
        subnet_ids = vpc_config.get("subnetIds", [])

        if not subnet_ids:
            self._debug(
                "No subnets found in cluster info, falling back to default zones"
            )
            return default_zones()

        # Ask EC2 which zone each subnet lives in.
        describe_cmd = [
            "aws",
            "ec2",
            "describe-subnets",
            "--subnet-ids",
            *subnet_ids,
            "--region",
            region,
            "--query",
            "Subnets[*].AvailabilityZone",
            "--output",
            "json",
        ]
        completed = subprocess.run(
            describe_cmd, capture_output=True, text=True, check=True,
        )

        unique_zones = sorted(set(json.loads(completed.stdout)))

        if not unique_zones:
            self._debug("No availability zones found, falling back to default zones")
            return default_zones()

        self._debug(f"Discovered availability zones: {', '.join(unique_zones)}")
        return unique_zones

    except Exception as e:  # noqa: BLE001
        self._debug(f"Failed to get availability zones: {e}, using default zones")
        return default_zones()
575
+
576
def _get_eks_oidc_provider(self, cluster_name: str, region: str) -> str:
    """Build the IAM OIDC provider ARN for the EKS cluster (used for IRSA).

    The OIDC issuer URL is read from the cluster description; its ID suffix,
    the caller's AWS account ID and ``region`` are combined into the ARN.

    Raises:
        click.ClickException: If the cluster has no OIDC issuer or the
            issuer URL does not look like an EKS issuer.
    """
    cluster = self._get_eks_cluster_info(cluster_name, region)
    oidc_issuer = cluster.get("identity", {}).get("oidc", {}).get("issuer", "")

    if not oidc_issuer:
        raise click.ClickException(
            "Could not find OIDC issuer for EKS cluster. IRSA setup requires OIDC provider."
        )

    # Expected issuer: https://oidc.eks.region.amazonaws.com/id/EXAMPLED539D4633E53CE8D
    is_eks_issuer = "oidc.eks." in oidc_issuer and ".amazonaws.com/id/" in oidc_issuer
    if not is_eks_issuer:
        raise click.ClickException(
            f"Could not parse OIDC provider from issuer URL: {oidc_issuer}"
        )

    provider_id = oidc_issuer.split("/id/")[-1]
    account = self._get_aws_account_id()
    return f"arn:aws:iam::{account}:oidc-provider/oidc.eks.{region}.amazonaws.com/id/{provider_id}"
598
+
599
def _get_aws_account_id(self) -> str:
    """Return the caller's AWS account ID via ``aws sts get-caller-identity``.

    Raises:
        click.ClickException: If the AWS CLI exits with a non-zero status.
    """
    identity_cmd = [
        "aws",
        "sts",
        "get-caller-identity",
        "--query",
        "Account",
        "--output",
        "text",
    ]
    try:
        completed = subprocess.run(
            identity_cmd, capture_output=True, text=True, check=True,
        )
        return completed.stdout.strip()
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Failed to get AWS account ID: {e.stderr}")
619
+
620
def _configure_aws_kubeconfig(self, cluster_name: str, region: str) -> None:
    """Point kubeconfig at the EKS cluster with ``aws eks update-kubeconfig``.

    Raises:
        click.ClickException: If the AWS CLI exits with a non-zero status.
    """
    self.log.info(f"Configuring kubeconfig for EKS cluster: {cluster_name}")

    update_cmd = [
        "aws",
        "eks",
        "update-kubeconfig",
        "--region",
        region,
        "--name",
        cluster_name,
    ]
    try:
        subprocess.run(update_cmd, capture_output=True, text=True, check=True)
        self.log.info("EKS kubeconfig configured successfully")
    except subprocess.CalledProcessError as e:
        raise click.ClickException(
            f"Failed to configure EKS kubeconfig: {e.stderr}"
        )
644
+
645
def _verify_kubeconfig(self) -> None:
    """Check that ``kubectl cluster-info`` succeeds with the current kubeconfig.

    Raises:
        click.ClickException: If kubectl exits with a non-zero status.
    """
    self.log.info("Verifying kubeconfig configuration...")

    probe_cmd = ["kubectl", "cluster-info"]
    try:
        subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
        self.log.info("Kubeconfig verification successful")
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Kubeconfig verification failed: {e.stderr}")
656
+
657
def _get_current_kubectl_context(self) -> str:
    """Return the output of ``kubectl config current-context``, stripped.

    Raises:
        click.ClickException: If kubectl exits with a non-zero status.
    """
    context_cmd = ["kubectl", "config", "current-context"]
    try:
        completed = subprocess.run(
            context_cmd, capture_output=True, text=True, check=True,
        )
        return completed.stdout.strip()
    except subprocess.CalledProcessError as e:
        raise click.ClickException(
            f"Failed to get current kubectl context: {e.stderr}"
        )
671
+
672
def _register_cloud(  # noqa: PLR0912
    self,
    name: str,
    provider: str,
    region: str,
    infrastructure: InfrastructureResources,
    cluster_info: ClusterInfo,
) -> str:
    """Register the cloud with Anyscale.

    Builds a ``CloudDeployment`` for a Kubernetes compute stack on AWS,
    registers it through the cloud controller, then looks the cloud up by
    name to obtain its ID.

    Args:
        name: Name of the Anyscale cloud to register.
        provider: Cloud provider; only "aws" is supported.
        region: Region recorded on the deployment and its object storage.
        infrastructure: Supplies the bucket name and the operator IAM role ARN.
        cluster_info: Supplies the cluster name used to look up zones.

    Returns:
        The ID of the registered cloud.

    Raises:
        click.ClickException: For a non-AWS provider, or wrapping any error
            raised during registration or the cloud ID lookup.
    """
    self.log.info("Registering cloud with Anyscale...", block_label="Setup")

    if provider == "aws":
        # Dynamically determine availability zones from the EKS cluster
        zones = self._get_eks_availability_zones(cluster_info.cluster_name, region)

        cloud_deployment = CloudDeployment(
            name=name,
            provider=CloudProviders.AWS,
            region=region,
            compute_stack=ComputeStack.K8S,
            object_storage=ObjectStorage(
                bucket_name=infrastructure.bucket_name, region=region
            ),
            aws_config=AWSConfig(),
            kubernetes_config=OpenAPIKubernetesConfig(
                anyscale_operator_iam_identity=infrastructure.iam_role_arn,
                zones=zones,
            ),
        )
    else:
        raise click.ClickException(
            "GCP support is not yet implemented. Please use AWS for now."
        )

    # Register the cloud
    try:
        self._debug("Cloud deployment details:")
        self._debug(f" Name: {cloud_deployment.name}")
        self._debug(f" Provider: {cloud_deployment.provider}")
        self._debug(f" Region: {cloud_deployment.region}")
        self._debug(f" Compute Stack: {cloud_deployment.compute_stack}")
        self._debug(f" Bucket Name: {cloud_deployment.object_storage.bucket_name}")
        self._debug(
            f" IAM Identity: {cloud_deployment.kubernetes_config.anyscale_operator_iam_identity}"
        )
        if cloud_deployment.aws_config:
            self._debug(" AWS Config:")
            self._debug(
                f" IAM Role ID: {cloud_deployment.aws_config.anyscale_iam_role_id}"
            )

        # Temporarily suppress cloud controller logging to avoid Helm command output
        # NOTE(review): if CloudController keeps the logger passed in __init__
        # as its `.log`, this also silences self.log.info, including the
        # "Calling register_aws_cloud..." message below - confirm.
        original_log_info = self.cloud_controller.log.info
        self.cloud_controller.log.info = lambda *_args, **_kwargs: None

        try:
            if provider == "aws":
                self.log.info("Calling register_aws_cloud...")
                # Registration runs non-interactively and without the
                # controller's own verification steps.
                self.cloud_controller.register_aws_cloud(
                    name=name,
                    cloud_resource=cloud_deployment,
                    functional_verify=None,
                    yes=True,
                    skip_verifications=True,
                    auto_add_user=True,
                )
            else:
                # Unreachable in practice: non-AWS providers already raised above.
                raise click.ClickException(
                    "GCP support is not yet implemented. Please use AWS for now."
                )
        finally:
            # Restore the original log.info method
            self.cloud_controller.log.info = original_log_info

        self._debug("Cloud registration completed, fetching cloud ID...")
        # The registered cloud is found by matching its name in the cloud list.
        # NOTE(review): only `.results` of a single list call is searched;
        # presumably the endpoint is paginated - confirm the cloud cannot be missed.
        clouds = (
            self.cloud_controller.api_client.list_clouds_api_v2_clouds_get().results
        )
        cloud = next((c for c in clouds if c.name == name), None)
        if not cloud:
            raise click.ClickException("Failed to find registered cloud")

        # The ID is read from `id`, falling back to `cloud_id`.
        cloud_id = getattr(cloud, "id", None) or getattr(cloud, "cloud_id", None)
        if not cloud_id:
            raise click.ClickException(
                "Failed to get cloud ID from registered cloud"
            )

        self.log.info(f"Cloud registered with ID: {cloud_id}", block_label="Setup")

        return cloud_id

    except Exception as e:  # noqa: BLE001
        # Also catches the ClickExceptions raised inside the try block above,
        # so their messages end up prefixed with "Failed to register cloud: ".
        self.log.error(f"Cloud registration failed with error: {e}")
        self.log.error(f"Error type: {type(e).__name__}")
        if hasattr(e, "response"):
            self.log.error(f"Response details: {getattr(e, 'response', 'N/A')}")
        if hasattr(e, "args"):
            self.log.error(f"Error args: {e.args}")
        import traceback

        self.log.error(f"Full traceback: {traceback.format_exc()}")
        raise click.ClickException(f"Failed to register cloud: {e}")
775
+
776
def _install_operator(  # noqa: PLR0913
    self,
    cloud_id: str,
    provider: str,
    region: str,
    namespace: str,
    infrastructure: InfrastructureResources,
    values_file: Optional[str] = None,
) -> None:
    """Install the Anyscale operator using Helm.

    Looks up the cloud resource ID, asks the cloud controller for its Helm
    upgrade command to harvest the ``--set-string`` parameters, writes them
    into a values file and runs a Helm command that only references that
    file.

    The values-file path and the namespace are shell-quoted before being
    interpolated into the command string, so a path containing spaces or
    shell metacharacters stays a single argument. ``shlex.quote`` leaves
    strings made only of safe characters unchanged.

    Args:
        cloud_id: ID of the registered cloud.
        provider: Cloud provider passed through to the Helm parameters.
        region: Region passed through to the Helm parameters.
        namespace: Namespace to install the operator into.
        infrastructure: Supplies the operator IAM role ARN.
        values_file: Optional custom path for the generated values file.

    Raises:
        click.ClickException: If the cloud has no cloud resources.
    """
    import shlex

    self.log.info("Installing Anyscale operator...", block_label="Setup")

    # Get cloud resources to get the cloud resource ID
    cloud_resources = self.cloud_controller.get_decorated_cloud_resources(cloud_id)

    if not cloud_resources:
        raise click.ClickException("No cloud resources found")

    cloud_resource_id = cloud_resources[0].cloud_resource_id

    release_name = "anyscale-operator"

    # Generate Helm command and extract --set-string flags from it
    self._debug("Generating Helm command to extract parameters...")
    helm_command = self.cloud_controller._generate_helm_upgrade_command(  # noqa: SLF001
        provider=provider,
        cloud_deployment_id=cloud_resource_id,
        region=region,
        operator_iam_identity=infrastructure.iam_role_arn,
    )

    set_string_values = self._extract_set_string_values(helm_command)
    self._debug(f"Extracted {len(set_string_values)} --set-string parameters")

    values_file_path = self._generate_helm_values_file(
        provider=provider,
        cloud_deployment_id=cloud_resource_id,
        region=region,
        namespace=namespace,
        infrastructure=infrastructure,
        custom_path=values_file,
        additional_values=set_string_values,
    )

    # Build a simple Helm command that only uses the values file
    self._debug("Generating Helm command...")
    helm_command = (
        f"helm upgrade {release_name} anyscale/anyscale-operator "
        f"--values {shlex.quote(values_file_path)} "
        f"--namespace {shlex.quote(namespace)} "
        f"--create-namespace "
        f"--wait "
        f"-i"
    )

    self._execute_helm_command(helm_command)
832
+
833
def _extract_set_string_values(self, helm_command: str) -> Dict[str, str]:
    """
    Collect the ``--set-string key=value`` pairs found in a Helm command.

    The key is everything up to the first ``=`` after the flag; the value
    runs to the next whitespace. When a key occurs more than once the last
    occurrence wins.

    Args:
        helm_command: The Helm command string to parse

    Returns:
        Dictionary of key-value pairs from --set-string flags
    """
    pairs = re.findall(r"--set-string\s+(\S+?)=(\S+)", helm_command)
    return dict(pairs)
855
+
856
def _prompt_for_namespace(
    self, default_namespace: str, skip_confirmation: bool = False
) -> str:
    """Resolve and validate the namespace for the Anyscale operator.

    With ``skip_confirmation`` the default is used as-is; otherwise the user
    is prompted, with the default pre-filled. In both cases the result is
    validated before it is returned, because the namespace is later embedded
    in the IAM role trust policy and in the Helm command.

    Args:
        default_namespace: Proposed namespace; "anyscale-operator" is used
            when it is empty.
        skip_confirmation: Do not prompt; take the default.

    Returns:
        The validated namespace.

    Raises:
        click.ClickException: If the namespace is empty, longer than 63
            characters, or not a valid DNS-1123 label.
    """
    final_namespace = default_namespace or "anyscale-operator"

    if not skip_confirmation:
        self.log.info("Configuring Kubernetes namespace...")

        self.log.info(
            f"Enter the namespace to use for the Anyscale operator (default: {final_namespace}):"
        )
        final_namespace = click.prompt("", default=final_namespace, show_default=True)

    # Validate namespace (Kubernetes DNS-1123 label requirements):
    # lowercase alphanumerics or hyphens, alphanumeric at both ends, max 63 chars.
    if not final_namespace:
        raise click.ClickException("Namespace cannot be empty")
    if len(final_namespace) > 63:
        raise click.ClickException("Namespace must be 63 characters or less")
    # fullmatch, unlike match with "$", also rejects a trailing newline.
    if not re.fullmatch(r"[a-z0-9]([-a-z0-9]*[a-z0-9])?", final_namespace):
        raise click.ClickException(
            "Namespace must consist of lowercase alphanumeric characters or hyphens, "
            "and must start and end with an alphanumeric character"
        )

    if skip_confirmation:
        self.log.info(f"Using namespace: {final_namespace}", block_label="Setup")
    else:
        self.log.info(f"Using namespace: {final_namespace}")

    return final_namespace
889
+
890
def _generate_helm_values_file(  # noqa: PLR0913
    self,
    provider: str,
    cloud_deployment_id: str,
    region: str,
    namespace: str,
    infrastructure: InfrastructureResources,
    custom_path: Optional[str] = None,
    additional_values: Optional[Dict[str, str]] = None,
) -> str:
    """Write the operator's Helm values to a YAML file and return its path.

    Args:
        provider: Cloud provider, stored as ``cloudProvider``.
        cloud_deployment_id: Stored as ``cloudDeploymentId``.
        region: Stored as ``region``.
        namespace: Used only in the generated file name.
        infrastructure: Its ``iam_role_arn`` is stored as
            ``operatorIamIdentity``.
        custom_path: Destination file. When not given, a timestamped file is
            created in the current working directory.
        additional_values: Extra top-level values. A key that is already
            present among the base values is ignored.

    Returns:
        Path of the file that was written.
    """
    self.log.info("Generating Helm values file...")

    # Base values always present in the file.
    values: Dict[str, Any] = {
        "cloudProvider": provider,
        "cloudDeploymentId": cloud_deployment_id,
        "region": region,
        "operatorIamIdentity": infrastructure.iam_role_arn,
        "ingress-nginx": {"enabled": True},
    }

    # Extra values only fill gaps; they never override a base value.
    for extra_key, extra_value in (additional_values or {}).items():
        values.setdefault(extra_key, extra_value)

    # The control plane URL comes from ANYSCALE_HOST and is applied last, so
    # it takes precedence over any value supplied above.
    if ANYSCALE_HOST:
        values["controlPlaneURL"] = ANYSCALE_HOST
        self._debug(f"Using control plane URL: {ANYSCALE_HOST}")

    if not custom_path:
        # No explicit destination: build a timestamped name in the cwd.
        import datetime

        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        values_file_path = os.path.join(
            os.getcwd(),
            f"anyscale-helm-values-{provider}-{namespace}-{stamp}.yaml",
        )
    else:
        values_file_path = custom_path

    with open(values_file_path, "w") as values_file:
        yaml.dump(values, values_file, default_flow_style=False, sort_keys=False)

    self.log.info(f"Generated Helm values file: {values_file_path}")

    return values_file_path
938
+
939
+ def _execute_helm_command(self, helm_command: str) -> None:
940
+ """Execute the helm command."""
941
+ # Convert multi-line command to single line and execute
942
+ single_line_command = helm_command.replace(" \\\n", " ").replace("\n", " ")
943
+
944
+ self.log.info(f"Executing: {single_line_command}")
945
+
946
+ try:
947
+ subprocess.run(
948
+ single_line_command,
949
+ shell=True,
950
+ check=True,
951
+ capture_output=True,
952
+ text=True,
953
+ )
954
+ self.log.info("Helm installation completed successfully")
955
+ except subprocess.CalledProcessError as e:
956
+ self.log.error(f"Helm installation failed: {e.stderr}")
957
+ raise click.ClickException(
958
+ f"Failed to install Anyscale operator: {e.stderr}"
959
+ )
960
+
961
def _verify_installation(
    self, cloud_id: str, namespace: str, cluster_info: ClusterInfo
) -> None:
    """Verify the Kubernetes installation for the given cloud.

    Looks up the cloud's resources, takes the first one as the deployment to
    check, and runs the Kubernetes deployment verifier against it.

    Args:
        cloud_id: Cloud whose resources are fetched for verification.
        namespace: Operator namespace handed to the verifier.
        cluster_info: Supplies the kubectl context handed to the verifier.

    Raises:
        click.ClickException: If the cloud has no resources, or if the
            verifier reports failure.
    """
    import time

    self.log.info("Verifying installation...")

    deployments = self.cloud_controller.get_cloud_resources(cloud_id)
    if not deployments:
        raise click.ClickException("No cloud resources found for verification")
    target_deployment = deployments[0]

    # Reuse the existing Kubernetes verifier.
    verifier = KubernetesCloudDeploymentVerifier(
        self.log, self.cloud_controller.api_client
    )

    # Hand the verifier the already-discovered context so it does not
    # prompt for one again.
    verifier.k8s_config = KubernetesConfig(
        context=cluster_info.context,
        operator_namespace=namespace,
    )

    # Sleep to avoid a race where the operator has not yet loaded its IAM
    # identity.
    time.sleep(5)

    if not verifier.verify(target_deployment):
        self.log.error("Verification failed - please check the logs above")
        raise click.ClickException("Installation verification failed")

    self.log.info("Verification completed successfully")
999
+
1000
+
1001
def setup_kubernetes_cloud(  # noqa: PLR0913
    provider: str,
    region: str,
    name: str,
    cluster_name: str,
    namespace: str = "anyscale-operator",
    project_id: Optional[str] = None,
    functional_verify: bool = False,
    yes: bool = False,
    values_file: Optional[str] = None,
    debug: bool = False,
) -> None:
    """
    Set up Anyscale on a Kubernetes cluster.

    Shared entry point for the CLI commands that need the core K8s setup
    flow. Any exception raised by the setup is reported on stderr and
    converted into ``click.Abort``.

    Args:
        provider: Cloud provider (aws, gcp)
        region: Cloud region
        name: Name for the Anyscale cloud
        cluster_name: Kubernetes cluster name
        namespace: Namespace for Anyscale operator (default: anyscale-operator)
        project_id: GCP project ID (optional, for future GCP support)
        functional_verify: Whether to run functional verification
        yes: Skip confirmation prompts
        values_file: Optional path for Helm values file
        debug: Enable debug logging

    Raises:
        click.Abort: If the setup fails for any reason.
    """
    # Everything except ``debug`` is forwarded to the command's run();
    # ``debug`` configures the command object itself.
    run_kwargs: Dict[str, Any] = {
        "provider": provider,
        "region": region,
        "name": name,
        "cluster_name": cluster_name,
        "namespace": namespace,
        "project_id": project_id,
        "functional_verify": functional_verify,
        "yes": yes,
        "values_file": values_file,
    }
    command = KubernetesCloudSetupCommand(debug=debug)

    try:
        command.run(**run_kwargs)
    except Exception as exc:  # noqa: BLE001
        click.echo(f"Setup failed: {exc}", err=True)
        raise click.Abort()