anyscale 0.26.66__py3-none-any.whl → 0.26.68__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. anyscale/client/README.md +20 -0
  2. anyscale/client/openapi_client/__init__.py +15 -0
  3. anyscale/client/openapi_client/api/default_api.py +656 -0
  4. anyscale/client/openapi_client/models/__init__.py +15 -0
  5. anyscale/client/openapi_client/models/lineage_artifact.py +383 -0
  6. anyscale/client/openapi_client/models/lineage_artifact_sort_field.py +101 -0
  7. anyscale/client/openapi_client/models/lineage_artifact_type.py +100 -0
  8. anyscale/client/openapi_client/models/lineage_direction.py +101 -0
  9. anyscale/client/openapi_client/models/lineage_graph.py +179 -0
  10. anyscale/client/openapi_client/models/lineage_graph_node.py +439 -0
  11. anyscale/client/openapi_client/models/lineage_node_type.py +100 -0
  12. anyscale/client/openapi_client/models/lineage_workload.py +355 -0
  13. anyscale/client/openapi_client/models/lineage_workload_sort_field.py +101 -0
  14. anyscale/client/openapi_client/models/lineage_workload_type.py +101 -0
  15. anyscale/client/openapi_client/models/lineageartifact_list_response.py +147 -0
  16. anyscale/client/openapi_client/models/lineageartifact_response.py +121 -0
  17. anyscale/client/openapi_client/models/lineagegraph_response.py +121 -0
  18. anyscale/client/openapi_client/models/lineageworkload_list_response.py +147 -0
  19. anyscale/client/openapi_client/models/lineageworkload_response.py +121 -0
  20. anyscale/commands/cloud_commands.py +58 -0
  21. anyscale/commands/setup_k8s.py +1467 -0
  22. anyscale/controllers/cloud_controller.py +11 -10
  23. anyscale/controllers/kubernetes_verifier.py +65 -11
  24. anyscale/utils/cloudformation_utils.py +364 -0
  25. anyscale/version.py +1 -1
  26. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/METADATA +1 -1
  27. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/RECORD +32 -15
  28. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/WHEEL +0 -0
  29. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/entry_points.txt +0 -0
  30. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/licenses/LICENSE +0 -0
  31. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/licenses/NOTICE +0 -0
  32. {anyscale-0.26.66.dist-info → anyscale-0.26.68.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1467 @@
1
+ """
2
+ Kubernetes Cloud Setup Command
3
+
4
+ This module provides a streamlined command for setting up Anyscale on Kubernetes clusters.
5
+ It handles infrastructure provisioning, cloud registration, and operator installation.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ import json
10
+ import os
11
+ import re
12
+ import subprocess
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ import click
16
+ import yaml
17
+
18
+ from anyscale.cli_logger import BlockLogger
19
+ from anyscale.client.openapi_client.models import (
20
+ AWSConfig,
21
+ CloudDeployment,
22
+ CloudProviders,
23
+ ComputeStack,
24
+ KubernetesConfig as OpenAPIKubernetesConfig,
25
+ ObjectStorage,
26
+ )
27
+ from anyscale.controllers.cloud_controller import CloudController
28
+ from anyscale.controllers.kubernetes_verifier import (
29
+ KubernetesCloudDeploymentVerifier,
30
+ KubernetesConfig,
31
+ )
32
+ from anyscale.shared_anyscale_utils.conf import ANYSCALE_CORS_ORIGIN, ANYSCALE_HOST
33
+
34
+
35
@dataclass
class ClusterInfo:
    """Description of the Kubernetes cluster that the setup command targets."""

    # kubectl context that is active once kubeconfig has been configured.
    context: str
    # Namespace chosen for the Anyscale operator.
    namespace: str
    # Cloud provider identifier; the command sets "aws" or "gcp".
    provider: str
    # Cloud region passed to the command.
    region: str
    # Name of the EKS/GKE cluster.
    cluster_name: str
    # GCP project ID; populated on the GCP path only.
    project_id: Optional[str] = None
    # OIDC provider of the EKS cluster; populated on the AWS path only.
    oidc_provider: Optional[str] = None
46
+
47
+
48
@dataclass
class InfrastructureResources:
    """Cloud resources produced by the infrastructure setup step."""

    # Name of the S3 or GCS bucket.
    bucket_name: str
    # IAM role ARN on AWS; on GCP this carries the service account email.
    iam_role_arn: str
    # Region the resources were created in.
    region: str
    # GCP project ID; left as None on the AWS path.
    project_id: Optional[str] = None
56
+
57
+
58
+ class KubernetesCloudSetupCommand:
59
+ """Command to setup Kubernetes cloud."""
60
+
61
def __init__(self, logger: Optional[BlockLogger] = None, debug: bool = False):
    """Create the command with its logger, cloud controller and flags.

    Args:
        logger: Logger used for progress output; a new ``BlockLogger`` is
            created when none (or a falsy one) is supplied.
        debug: Enable debug output. Debug output is also enabled when the
            ``ANYSCALE_DEBUG`` environment variable equals ``"1"``.
    """
    self.log = logger if logger else BlockLogger()
    # The controller shares this command's logger.
    self.cloud_controller = CloudController(log=self.log)
    # Overwritten by run() from its ``yes`` argument.
    self.skip_confirmation = False
    env_debug = os.environ.get("ANYSCALE_DEBUG") == "1"
    self.debug = debug or env_debug
66
+
67
def run(  # noqa: PLR0913
    self,
    provider: str,
    region: str,
    name: str,
    cluster_name: str,
    namespace: str,
    project_id: Optional[str],
    functional_verify: bool,
    yes: bool,
    values_file: Optional[str] = None,
) -> None:
    """
    Run the full Kubernetes cloud setup workflow.

    The steps run in order: tool check, namespace prompt, cluster discovery,
    infrastructure setup, cloud registration, operator installation and
    (optionally) verification. The "Setup" log block is closed on both the
    success and the failure path; any exception is re-raised unchanged.

    Args:
        provider: Cloud provider (aws, gcp)
        region: AWS/GCP region
        name: Name for the Anyscale cloud
        cluster_name: Kubernetes cluster name/context
        namespace: Namespace for the Anyscale operator
        project_id: GCP project ID (required for GCP)
        functional_verify: Whether to run functional verification
        yes: Skip confirmation prompts
        values_file: Optional custom path for Helm values file
    """
    headline = f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
    self.log.open_block("Setup", headline)

    # Remember the confirmation preference for the later steps.
    self.skip_confirmation = yes

    try:
        # Fail fast when kubectl/helm/provider CLIs are missing.
        self._check_required_tools(provider)

        # The namespace has to be settled before infrastructure is created,
        # because the IAM role trust relationship depends on it.
        target_namespace = self._prompt_for_namespace(
            namespace, skip_confirmation=yes
        )

        cluster = self._discover_cluster(
            cluster_name, target_namespace, provider, region, project_id
        )
        resources = self._setup_infrastructure(provider, region, name, cluster)
        registered_cloud_id = self._register_cloud(
            name, provider, region, resources, cluster
        )
        self._install_operator(
            registered_cloud_id,
            provider,
            region,
            target_namespace,
            resources,
            values_file,
        )

        if functional_verify:
            self._verify_installation(registered_cloud_id, target_namespace, cluster)

        self.log.close_block("Setup")
        self.log.info(f"Kubernetes cloud '{name}' setup completed successfully!")
    except Exception:  # noqa: BLE001
        # Close the log block before letting the error propagate.
        self.log.close_block("Setup")
        raise
144
+
145
+ def _debug(self, *msg: str) -> None:
146
+ """Log debug messages only when debug mode is enabled."""
147
+ if self.debug:
148
+ self.log.debug(*msg)
149
+
150
+ def _check_required_tools(self, provider: str) -> None:
151
+ """Check that required CLI tools are installed."""
152
+ # Common tools required for all providers
153
+ required_tools = ["kubectl", "helm"]
154
+
155
+ # Provider-specific tools
156
+ if provider == "aws":
157
+ required_tools.append("aws")
158
+ elif provider == "gcp":
159
+ required_tools.extend(["gcloud", "gsutil"])
160
+
161
+ self._debug(f"Checking for required tools: {', '.join(required_tools)}")
162
+
163
+ missing_tools = []
164
+ for tool in required_tools:
165
+ if not self._check_command_available(tool):
166
+ missing_tools.append(tool)
167
+
168
+ if missing_tools:
169
+ error_msg = f"Missing required CLI tools: {', '.join(missing_tools)}\n\n"
170
+ raise click.ClickException(error_msg.rstrip())
171
+
172
+ self.log.info(
173
+ f"Required CLI tools are installed ({', '.join(required_tools)})",
174
+ block_label="Setup",
175
+ )
176
+
177
+ def _check_command_available(self, command: str) -> bool:
178
+ """Check if a command is available in the system PATH."""
179
+ try:
180
+ result = subprocess.run(
181
+ ["which", command], capture_output=True, text=True, check=False
182
+ )
183
+ return result.returncode == 0
184
+ except Exception: # noqa: BLE001
185
+ return False
186
+
187
def _discover_cluster(
    self,
    cluster_name: str,
    namespace: str,
    provider: str,
    region: str,
    project_id: Optional[str],
) -> ClusterInfo:
    """Dispatch cluster discovery to the provider-specific implementation.

    Raises:
        click.ClickException: if ``provider`` is neither "aws" nor "gcp", or
            if the provider is "gcp" and no ``project_id`` was given.
    """
    self.log.info(
        f"Discovering {provider.upper()} cluster: {cluster_name}",
        block_label="Setup",
    )

    if provider == "aws":
        return self._discover_aws_cluster(cluster_name, namespace, region)

    if provider != "gcp":
        raise click.ClickException(f"Unsupported provider: {provider}")

    # GCP discovery cannot proceed without a project.
    if not project_id:
        raise click.ClickException(
            "GCP project ID is required. Please provide --project-id"
        )
    return self._discover_gcp_cluster(cluster_name, namespace, region, project_id)
213
+
214
def _discover_aws_cluster(
    self, cluster_name: str, namespace: str, region: str
) -> ClusterInfo:
    """Collect EKS cluster details and point kubeconfig at the cluster.

    Runs three stages, each reporting its own failure: look up the OIDC
    provider, configure kubeconfig, then verify kubeconfig and read the
    current kubectl context.

    Raises:
        click.ClickException: if any stage fails; the message names the stage.
    """
    # Stage 1: OIDC provider lookup.
    try:
        self._debug("Fetching OIDC provider information...")
        oidc_value = self._get_eks_oidc_provider(cluster_name, region)
        self._debug(f"OIDC Provider: {oidc_value}")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to get OIDC provider: {err}")
        raise click.ClickException(
            f"Failed to get OIDC provider for cluster {cluster_name}: {err}"
        )

    # Stage 2: kubeconfig setup.
    try:
        self._debug("Configuring kubeconfig for EKS cluster...")
        self._configure_aws_kubeconfig(cluster_name, region)
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to configure kubeconfig: {err}")
        raise click.ClickException(
            f"Failed to configure kubeconfig for EKS cluster: {err}"
        )

    # Stage 3: verification and context lookup.
    try:
        self._debug("Verifying kubeconfig configuration...")
        self._verify_kubeconfig()
        kube_context = self._get_current_kubectl_context()
        self.log.info(f"Cluster discovered: {kube_context}", block_label="Setup")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to verify kubeconfig: {err}")
        raise click.ClickException(f"Failed to verify kubeconfig: {err}")

    return ClusterInfo(
        context=kube_context,
        namespace=namespace,
        provider="aws",
        region=region,
        cluster_name=cluster_name,
        oidc_provider=oidc_value,
    )
255
+
256
def _discover_gcp_cluster(
    self, cluster_name: str, namespace: str, region: str, project_id: str
) -> ClusterInfo:
    """Point kubeconfig at the GKE cluster and describe it.

    Configures kubeconfig, then verifies it and reads the current kubectl
    context.

    Raises:
        click.ClickException: if configuring or verifying kubeconfig fails.
    """
    # Stage 1: kubeconfig setup.
    try:
        self._debug("Configuring kubeconfig for GKE cluster...")
        self._configure_gcp_kubeconfig(cluster_name, region, project_id)
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to configure kubeconfig: {err}")
        raise click.ClickException(
            f"Failed to configure kubeconfig for GKE cluster: {err}"
        )

    # Stage 2: verification and context lookup.
    try:
        self._debug("Verifying kubeconfig configuration...")
        self._verify_kubeconfig()
        kube_context = self._get_current_kubectl_context()
        self.log.info(f"Cluster discovered: {kube_context}", block_label="Setup")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to verify kubeconfig: {err}")
        raise click.ClickException(f"Failed to verify kubeconfig: {err}")

    return ClusterInfo(
        context=kube_context,
        namespace=namespace,
        provider="gcp",
        region=region,
        cluster_name=cluster_name,
        project_id=project_id,
    )
287
+
288
def _setup_infrastructure(
    self, provider: str, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Create the provider-specific cloud resources for the new cloud.

    Raises:
        click.ClickException: if ``provider`` is neither "aws" nor "gcp".
    """
    self.log.info(
        f"Setting up {provider.upper()} infrastructure...", block_label="Setup"
    )

    if provider == "aws":
        return self._setup_aws_infrastructure(region, name, cluster_info)
    if provider == "gcp":
        return self._setup_gcp_infrastructure(region, name, cluster_info)
    raise click.ClickException(f"Unsupported provider: {provider}")
302
+
303
def _setup_aws_infrastructure(  # noqa: PLR0912
    self, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Provision the AWS resources for the cloud through CloudFormation.

    Generates a local cloud ID, renders the CloudFormation template, creates
    the stack (waiting up to 600 seconds) and reads the bucket name and IAM
    role ARN from the stack outputs.

    Raises:
        click.ClickException: if any stage fails; the message names the stage.
    """
    # boto3 and the CloudFormation helper are imported lazily.
    try:
        import boto3

        from anyscale.utils.cloudformation_utils import CloudFormationUtils
    except ImportError as err:
        self.log.error(f"Failed to import required modules: {err}")
        raise click.ClickException(f"Failed to import required modules: {err}")

    try:
        # Random suffix keeps the ID unique across runs with the same name.
        cloud_id = f"k8s-{name}-{os.urandom(4).hex()}"
        stack_name = cloud_id.replace("_", "-").lower()
        self._debug(f"Generated cloud ID: {cloud_id}")
        self._debug(f"CloudFormation stack name: {stack_name}")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to generate cloud ID: {err}")
        raise click.ClickException(f"Failed to generate cloud ID: {err}")

    try:
        # The trust policy in the template is built from the cluster's
        # actual OIDC provider, so it must be known at this point.
        if not cluster_info.oidc_provider:
            raise click.ClickException(
                "OIDC provider information not found. Please ensure the EKS cluster has OIDC provider enabled."
            )
        self._debug("Generating CloudFormation template...")
        self._debug(
            f"Using namespace: {cluster_info.namespace} with service account: anyscale-operator"
        )
        template_text = self._generate_aws_cloudformation_template(
            cloud_id, cluster_info.oidc_provider, cluster_info.namespace,
        )
        self._debug("CloudFormation template generated successfully")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to generate CloudFormation template: {err}")
        raise click.ClickException(
            f"Failed to generate CloudFormation template: {err}"
        )

    try:
        self._debug("Preparing CloudFormation parameters...")
        stack_parameters = [{"ParameterKey": "CloudID", "ParameterValue": cloud_id}]
        self._debug(f"Prepared {len(stack_parameters)} CloudFormation parameters")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to prepare CloudFormation parameters: {err}")
        raise click.ClickException(
            f"Failed to prepare CloudFormation parameters: {err}"
        )

    try:
        with self.log.indent():
            self.log.info(
                "Creating CloudFormation stack (this may take a few minutes)...",
                block_label="Setup",
            )
            boto3_session = boto3.Session(region_name=region)
            cfn_utils = CloudFormationUtils(self.log)
            cfn_utils.create_and_wait_for_stack(
                stack_name=stack_name,
                template_body=template_text,
                parameters=stack_parameters,
                region=region,
                boto3_session=boto3_session,
                timeout_seconds=600,
            )
            self.log.info("CloudFormation stack created", block_label="Setup")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to create CloudFormation stack: {err}")
        raise click.ClickException(f"Failed to create CloudFormation stack: {err}")

    try:
        self._debug("Retrieving CloudFormation stack outputs...")
        outputs = cfn_utils.get_stack_outputs(stack_name, region, boto3_session)
        # The bucket name falls back to the template's naming convention.
        bucket_name = outputs.get("S3BucketName", f"anyscale-{cloud_id}")
        iam_role_arn = outputs.get("AnyscaleCrossAccountIAMRoleArn")

        if not iam_role_arn:
            raise click.ClickException(
                "Failed to get IAM role ARN from CloudFormation stack"
            )

        self._debug(f"S3 Bucket: {bucket_name}")
        self._debug(f"IAM Role ARN: {iam_role_arn}")
    except Exception as err:  # noqa: BLE001
        self.log.error(f"Failed to get CloudFormation outputs: {err}")
        raise click.ClickException(f"Failed to get CloudFormation outputs: {err}")

    return InfrastructureResources(
        bucket_name=bucket_name, iam_role_arn=iam_role_arn, region=region
    )
398
+
399
def _generate_aws_cloudformation_template(
    self, cloud_id: str, oidc_provider_arn: str, namespace: str,
) -> str:
    """Render the CloudFormation template for the AWS Kubernetes setup.

    The template declares an S3 bucket and an IAM role that the
    ``anyscale-operator`` service account in ``namespace`` may assume through
    the cluster's OIDC provider, and exposes both as stack outputs.

    Args:
        cloud_id: Cloud ID, used in the template description.
        oidc_provider_arn: OIDC provider ARN, e.g.
            arn:aws:iam::ACCOUNT:oidc-provider/oidc.eks.REGION.amazonaws.com/id/XXXXXX
        namespace: Namespace of the operator's service account.

    Returns:
        The template serialized as JSON with two-space indentation.

    Raises:
        click.ClickException: if the ARN does not contain "oidc-provider/".
    """
    arn_marker = "oidc-provider/"
    if arn_marker not in oidc_provider_arn:
        raise click.ClickException(
            f"Invalid OIDC provider ARN format: {oidc_provider_arn}"
        )
    # Everything after the marker is the issuer host/path used as the
    # prefix of the trust-policy condition key.
    oidc_provider_url = oidc_provider_arn.split(arn_marker)[-1]

    service_account_name = "anyscale-operator"

    # CORS origin comes from the shared Anyscale configuration.
    allowed_origin = ANYSCALE_CORS_ORIGIN

    bucket_resource = {
        "Type": "AWS::S3::Bucket",
        "Properties": {
            "BucketName": {"Fn::Sub": "anyscale-${CloudID}"},
            "VersioningConfiguration": {"Status": "Enabled"},
            "PublicAccessBlockConfiguration": {
                "BlockPublicAcls": True,
                "BlockPublicPolicy": True,
                "IgnorePublicAcls": True,
                "RestrictPublicBuckets": True,
            },
            "CorsConfiguration": {
                "CorsRules": [
                    {
                        "AllowedHeaders": ["*"],
                        "AllowedMethods": ["GET", "PUT", "POST", "HEAD", "DELETE"],
                        "AllowedOrigins": [allowed_origin],
                        "MaxAge": 3600,
                    }
                ]
            },
        },
    }

    # Only the operator's service account in the chosen namespace may
    # assume the role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Federated": oidc_provider_arn},
                "Action": "sts:AssumeRoleWithWebIdentity",
                "Condition": {
                    "StringEquals": {
                        f"{oidc_provider_url}:sub": f"system:serviceaccount:{namespace}:{service_account_name}"
                    }
                },
            }
        ],
    }

    s3_access_policy = {
        "PolicyName": "AnyscaleS3AccessPolicy",
        "PolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Action": [
                        "s3:GetObject",
                        "s3:PutObject",
                        "s3:DeleteObject",
                        "s3:ListBucket",
                    ],
                    "Resource": [
                        {"Fn::GetAtt": ["AnyscaleBucket", "Arn"]},
                        {"Fn::Sub": "${AnyscaleBucket.Arn}/*"},
                    ],
                }
            ],
        },
    }

    role_resource = {
        "Type": "AWS::IAM::Role",
        "Properties": {
            "RoleName": {"Fn::Sub": "${CloudID}-anyscale-operator-role"},
            "AssumeRolePolicyDocument": trust_policy,
            "Policies": [s3_access_policy],
        },
    }

    template = {
        "AWSTemplateFormatVersion": "2010-09-09",
        "Description": f"Anyscale Kubernetes Cloud Infrastructure for {cloud_id}",
        "Parameters": {
            "CloudID": {
                "Type": "String",
                "Description": "Cloud ID for resource naming",
            }
        },
        "Resources": {
            "AnyscaleBucket": bucket_resource,
            "AnyscaleOperatorRole": role_resource,
        },
        "Outputs": {
            "S3BucketName": {
                "Value": {"Ref": "AnyscaleBucket"},
                "Description": "Name of the S3 bucket",
            },
            "AnyscaleCrossAccountIAMRoleArn": {
                "Value": {"Fn::GetAtt": ["AnyscaleOperatorRole", "Arn"]},
                "Description": "ARN of the Anyscale operator IAM role",
            },
        },
    }

    return json.dumps(template, indent=2)
520
+
521
def _setup_gcp_infrastructure(  # noqa: PLR0912
    self, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Set up GCP infrastructure for Kubernetes using GCP Python SDK.

    Creates a GCS bucket and a service account, binds the operator's
    Kubernetes service account to it via Workload Identity, and grants the
    service account storage access on the bucket.

    Note: Deployment Manager is deprecated so it is unused here.
    Infrastructure Manager was tried but did not work well, so we rely
    on the GCP Python SDK instead.

    Args:
        region: Location for the bucket.
        name: Cloud name; embedded in the generated cloud ID.
        cluster_info: Target cluster; ``project_id`` and ``namespace`` are read.

    Returns:
        The bucket name and, in ``iam_role_arn``, the service account email.

    Raises:
        click.ClickException: if any stage fails. Resources created before
            the failure are not removed by this method.
    """
    import time

    try:
        from anyscale.utils.gcp_utils import get_google_cloud_client_factory
    except ImportError as e:
        self.log.error(f"Failed to import required modules: {e}")
        raise click.ClickException(f"Failed to import required modules: {e}")

    try:
        # Generate a unique cloud ID
        cloud_id = f"k8s-{name}-{os.urandom(4).hex()}"
        deployment_name = cloud_id.replace("_", "-").lower()
        self._debug(f"Generated cloud ID: {cloud_id}")
        self._debug(f"Infrastructure Manager deployment name: {deployment_name}")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to generate cloud ID: {e}")
        raise click.ClickException(f"Failed to generate cloud ID: {e}")

    try:
        # Get Google Cloud client factory
        factory = get_google_cloud_client_factory(self.log, cluster_info.project_id)
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to initialize GCP client: {e}")
        raise click.ClickException(f"Failed to initialize GCP client: {e}")

    try:
        with self.log.indent():
            self.log.warning(
                "NOTE: GCP resources (bucket and service account) created by this command are not managed by Anyscale.",
            )
            self.log.warning(
                "You will need to manually delete these resources when the cloud is no longer needed.",
            )
            self.log.info(
                "Creating GCP resources (bucket, service account, IAM bindings)...",
            )

            # Calculate resource names
            # Service account name: anyscale-operator-<random 8 chars>
            # Max length for GCP service account is 30 characters
            random_suffix = os.urandom(4).hex()  # 8 hex chars
            anyscale_service_account_name = f"anyscale-operator-{random_suffix}"
            bucket_name = f"anyscale-{cloud_id.replace('_', '-').lower()}"

            # Create GCS bucket
            self._debug(f"Creating GCS bucket: {bucket_name}")
            storage_client = factory.storage.Client()
            bucket = storage_client.bucket(bucket_name)
            bucket.location = region
            bucket.storage_class = "REGIONAL"
            bucket.iam_configuration.uniform_bucket_level_access_enabled = True
            bucket.iam_configuration.public_access_prevention = "enforced"
            bucket.versioning_enabled = True
            # Label values may not contain uppercase letters, so the value
            # is lowercased the same way the bucket name is.
            bucket.labels = {
                "anyscale-cloud-id": cloud_id.replace("-", "_").lower()
            }

            # Set CORS
            # Use ANYSCALE_CORS_ORIGIN from shared config
            allowed_origin = ANYSCALE_CORS_ORIGIN
            bucket.cors = [
                {
                    "origin": [allowed_origin],
                    "responseHeader": ["*"],
                    "method": ["GET", "PUT", "POST", "HEAD", "DELETE"],
                    "maxAgeSeconds": 3600,
                }
            ]

            storage_client.create_bucket(bucket, location=region)
            self.log.info(f"Created GCS bucket: {bucket_name}", block_label="Setup")

            # Create service account
            self._debug(
                f"Creating service account: {anyscale_service_account_name}"
            )
            iam_client = factory.build("iam", "v1")
            service_account_body = {
                "accountId": anyscale_service_account_name,
                "serviceAccount": {
                    "displayName": f"{cloud_id} Anyscale operator service account",
                    "description": "Service account for Anyscale Kubernetes operator",
                },
            }

            service_account = (
                iam_client.projects()
                .serviceAccounts()
                .create(
                    name=f"projects/{cluster_info.project_id}",
                    body=service_account_body,
                )
                .execute()
            )

            service_account_email = service_account["email"]
            self.log.info(
                f"Created service account: {service_account_email}",
                block_label="Setup",
            )

            # Wait for service account to propagate through GCP systems
            self._debug("Waiting 10 seconds for service account to propagate...")
            time.sleep(10)

            # Grant Workload Identity binding
            self._debug("Setting up Workload Identity binding")

            # The K8s service account needs:
            # 1. workloadIdentityUser role - to impersonate the GCP service account
            # 2. serviceAccountTokenCreator - to generate tokens (for getOpenIdToken)
            policy_body = {
                "policy": {
                    "bindings": [
                        {
                            "role": "roles/iam.workloadIdentityUser",
                            "members": [
                                f"serviceAccount:{cluster_info.project_id}.svc.id.goog[{cluster_info.namespace}/anyscale-operator]"
                            ],
                        },
                        {
                            "role": "roles/iam.serviceAccountTokenCreator",
                            "members": [f"serviceAccount:{service_account_email}"],
                        },
                    ]
                }
            }

            iam_client.projects().serviceAccounts().setIamPolicy(
                resource=f"projects/{cluster_info.project_id}/serviceAccounts/{service_account_email}",
                body=policy_body,
            ).execute()

            self.log.info(
                "Configured Workload Identity binding", block_label="Setup"
            )

            # Grant storage admin role to service account for the bucket
            # Note: There's often a propagation delay after service account creation
            # We need to retry with exponential backoff
            self._debug("Granting storage permissions")

            max_retries = 5
            retry_delay = 2  # Start with 2 seconds

            for attempt in range(max_retries):
                try:
                    bucket_policy = bucket.get_iam_policy(
                        requested_policy_version=3
                    )
                    bucket_policy.bindings.append(
                        {
                            "role": "roles/storage.admin",
                            "members": {f"serviceAccount:{service_account_email}"},
                        }
                    )
                    bucket.set_iam_policy(bucket_policy)
                    break  # Success!
                except Exception as e:  # noqa: BLE001
                    if "does not exist" in str(e) and attempt < max_retries - 1:
                        self._debug(
                            f"Service account not yet propagated, retrying in {retry_delay}s... (attempt {attempt + 1}/{max_retries})"
                        )
                        time.sleep(retry_delay)
                        retry_delay *= 2  # Exponential backoff
                    else:
                        raise  # Re-raise if it's not a propagation issue or we're out of retries

            self.log.info(
                "Granted storage permissions to service account",
                block_label="Setup",
            )

            self.log.info("GCP resources created successfully", block_label="Setup")
            self.log.warning(
                f"REMINDER: To clean up when no longer needed, delete GCS bucket '{bucket_name}' and service account '{service_account_email}'"
            )
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to create GCP resources: {e}")
        raise click.ClickException(f"Failed to create GCP resources: {e}")

    # Resources were created in the try block above
    # bucket_name and service_account_email are already set
    self._debug(f"GCS Bucket: {bucket_name}")
    self._debug(f"Service Account Email: {service_account_email}")

    return InfrastructureResources(
        bucket_name=bucket_name,
        iam_role_arn=service_account_email,  # For GCP, we use service account email
        region=region,
        project_id=cluster_info.project_id,
    )
724
+
725
def _get_gke_cluster_info(
    self, cluster_name: str, region: str, project_id: str
) -> Dict[str, Any]:
    """Describe the GKE cluster through the gcloud CLI and parse the JSON.

    The cluster is first looked up as a regional cluster; if that command
    exits non-zero, a zonal lookup in ``<region>-a`` is attempted.

    Raises:
        click.ClickException: if the zonal lookup fails or the output is not
            valid JSON.
    """

    def describe_command(location_flag: str) -> List[str]:
        # Same invocation for both lookups; only the location flag differs.
        return [
            "gcloud",
            "container",
            "clusters",
            "describe",
            cluster_name,
            location_flag,
            f"--project={project_id}",
            "--format=json",
        ]

    try:
        regional = subprocess.run(
            describe_command(f"--region={region}"),
            capture_output=True,
            text=True,
            check=False,
        )
        if regional.returncode == 0:
            return json.loads(regional.stdout)

        # Assuming zone 'a' if regional fails
        zone = f"{region}-a"
        zonal = subprocess.run(
            describe_command(f"--zone={zone}"),
            capture_output=True,
            text=True,
            check=True,
        )
        return json.loads(zonal.stdout)
    except subprocess.CalledProcessError as err:
        raise click.ClickException(f"Failed to get GKE cluster info: {err.stderr}")
    except json.JSONDecodeError as err:
        raise click.ClickException(f"Failed to parse GKE cluster info: {err}")
772
+
773
+ def _get_gke_zones(
774
+ self, cluster_name: str, region: str, project_id: str
775
+ ) -> List[str]:
776
+ """Get zones where the GKE cluster's node pools are located."""
777
+ try:
778
+ cluster_info = self._get_gke_cluster_info(cluster_name, region, project_id)
779
+
780
+ # Extract zones from node pools
781
+ zones = []
782
+ node_pools = cluster_info.get("nodePools", [])
783
+
784
+ for pool in node_pools:
785
+ # For zonal clusters, each node pool has locations
786
+ pool_locations = pool.get("locations", [])
787
+ zones.extend(pool_locations)
788
+
789
+ # If no zones found from node pools, try cluster-level locations
790
+ if not zones:
791
+ cluster_locations = cluster_info.get("locations", [])
792
+ if cluster_locations:
793
+ zones = cluster_locations
794
+
795
+ # Remove duplicates and sort
796
+ if zones:
797
+ unique_zones = sorted(set(zones))
798
+ self._debug(f"Discovered zones: {', '.join(unique_zones)}")
799
+ return unique_zones
800
+ else:
801
+ # Fallback to default zones
802
+ self._debug(
803
+ "No zones found in cluster info, falling back to default zones"
804
+ )
805
+ return [region + "-a", region + "-b", region + "-c"]
806
+
807
+ except Exception as e: # noqa: BLE001
808
+ self._debug(f"Failed to get zones: {e}, using default zones")
809
+ return [region + "-a", region + "-b", region + "-c"]
810
+
811
def _configure_gcp_kubeconfig(
    self, cluster_name: str, region: str, project_id: str
) -> None:
    """Fetch GKE credentials into kubeconfig through the gcloud CLI.

    A regional lookup is attempted first; if that command exits non-zero,
    a zonal lookup in ``<region>-a`` is attempted.

    Raises:
        click.ClickException: if the zonal lookup fails as well.
    """
    self.log.info(f"Configuring kubeconfig for GKE cluster: {cluster_name}")

    base_command = [
        "gcloud",
        "container",
        "clusters",
        "get-credentials",
        cluster_name,
    ]
    project_flag = f"--project={project_id}"

    try:
        regional = subprocess.run(
            base_command + [f"--region={region}", project_flag],
            capture_output=True,
            text=True,
            check=False,
        )
        if regional.returncode != 0:
            # Regional lookup failed: retry as a zonal cluster in zone "a".
            zone = f"{region}-a"
            subprocess.run(
                base_command + [f"--zone={zone}", project_flag],
                capture_output=True,
                text=True,
                check=True,
            )
        self.log.info("GKE kubeconfig configured successfully")
    except subprocess.CalledProcessError as err:
        raise click.ClickException(
            f"Failed to configure GKE kubeconfig: {err.stderr}"
        )
858
+
859
def _get_eks_cluster_info(self, cluster_name: str, region: str) -> Dict[str, Any]:
    """Get EKS cluster information using AWS CLI.

    The CLI is invoked with an explicit ``--output json`` so the result can
    be parsed even when the user's AWS configuration sets a different default
    output format (text, yaml, table).

    Args:
        cluster_name: Name of the EKS cluster.
        region: AWS region of the cluster.

    Returns:
        The ``cluster`` object of the describe-cluster response, or an empty
        dict when the response has no such key.

    Raises:
        click.ClickException: if the CLI command fails or its output is not
            valid JSON.
    """
    try:
        result = subprocess.run(
            [
                "aws",
                "eks",
                "describe-cluster",
                "--name",
                cluster_name,
                "--region",
                region,
                "--output",
                "json",
            ],
            capture_output=True,
            text=True,
            check=True,
        )
        cluster_data = json.loads(result.stdout)
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Failed to get EKS cluster info: {e.stderr}")
    except json.JSONDecodeError as e:
        # Mirrors the GKE lookup, which reports unparsable output the same way.
        raise click.ClickException(f"Failed to parse EKS cluster info: {e}")
    return cluster_data.get("cluster", {})
880
+
881
+ def _get_eks_availability_zones(self, cluster_name: str, region: str) -> List[str]:
882
+ """Get availability zones where the EKS cluster's subnets are located."""
883
+ try:
884
+ cluster_info = self._get_eks_cluster_info(cluster_name, region)
885
+ subnet_ids = cluster_info.get("resourcesVpcConfig", {}).get("subnetIds", [])
886
+
887
+ if not subnet_ids:
888
+ self._debug(
889
+ "No subnets found in cluster info, falling back to default zones"
890
+ )
891
+ return [region + "a", region + "b", region + "c"]
892
+
893
+ # Get subnet details to find their availability zones
894
+ result = subprocess.run(
895
+ [
896
+ "aws",
897
+ "ec2",
898
+ "describe-subnets",
899
+ "--subnet-ids",
900
+ *subnet_ids,
901
+ "--region",
902
+ region,
903
+ "--query",
904
+ "Subnets[*].AvailabilityZone",
905
+ "--output",
906
+ "json",
907
+ ],
908
+ capture_output=True,
909
+ text=True,
910
+ check=True,
911
+ )
912
+
913
+ zones = json.loads(result.stdout)
914
+ # Remove duplicates and sort
915
+ unique_zones = sorted(set(zones))
916
+
917
+ if unique_zones:
918
+ self._debug(f"Discovered availability zones: {', '.join(unique_zones)}")
919
+ return unique_zones
920
+ else:
921
+ self._debug(
922
+ "No availability zones found, falling back to default zones"
923
+ )
924
+ return [region + "a", region + "b", region + "c"]
925
+
926
+ except Exception as e: # noqa: BLE001
927
+ self._debug(f"Failed to get availability zones: {e}, using default zones")
928
+ return [region + "a", region + "b", region + "c"]
929
+
930
+ def _get_eks_oidc_provider(self, cluster_name: str, region: str) -> str:
931
+ """Get EKS OIDC provider URL for IRSA."""
932
+ cluster_info = self._get_eks_cluster_info(cluster_name, region)
933
+ identity = cluster_info.get("identity", {})
934
+ oidc_issuer = identity.get("oidc", {}).get("issuer", "")
935
+
936
+ if not oidc_issuer:
937
+ raise click.ClickException(
938
+ "Could not find OIDC issuer for EKS cluster. IRSA setup requires OIDC provider."
939
+ )
940
+
941
+ # Extract OIDC provider ARN
942
+ # OIDC issuer URL format: https://oidc.eks.region.amazonaws.com/id/EXAMPLED539D4633E53CE8D
943
+ if "oidc.eks." in oidc_issuer and ".amazonaws.com/id/" in oidc_issuer:
944
+ oidc_id = oidc_issuer.split("/id/")[-1]
945
+ account_id = self._get_aws_account_id()
946
+ oidc_provider_arn = f"arn:aws:iam::{account_id}:oidc-provider/oidc.eks.{region}.amazonaws.com/id/{oidc_id}"
947
+ return oidc_provider_arn
948
+
949
+ raise click.ClickException(
950
+ f"Could not parse OIDC provider from issuer URL: {oidc_issuer}"
951
+ )
952
+
953
def _get_aws_account_id(self) -> str:
    """Return the AWS account ID reported by ``aws sts get-caller-identity``.

    Returns:
        The CLI's text output with surrounding whitespace stripped.

    Raises:
        click.ClickException: If the AWS CLI exits with a non-zero status.
    """
    identity_cmd = [
        "aws",
        "sts",
        "get-caller-identity",
        "--query",
        "Account",
        "--output",
        "text",
    ]
    try:
        completed = subprocess.run(
            identity_cmd, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Failed to get AWS account ID: {e.stderr}")
    return completed.stdout.strip()
973
+
974
def _configure_aws_kubeconfig(self, cluster_name: str, region: str) -> None:
    """Configure kubeconfig for an AWS EKS cluster via ``aws eks update-kubeconfig``.

    Args:
        cluster_name: Name of the EKS cluster.
        region: AWS region of the cluster.

    Raises:
        click.ClickException: If the AWS CLI exits with a non-zero status.
    """
    self.log.info(f"Configuring kubeconfig for EKS cluster: {cluster_name}")

    update_cmd = [
        "aws",
        "eks",
        "update-kubeconfig",
        "--region",
        region,
        "--name",
        cluster_name,
    ]
    try:
        subprocess.run(update_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        raise click.ClickException(
            f"Failed to configure EKS kubeconfig: {e.stderr}"
        )
    else:
        self.log.info("EKS kubeconfig configured successfully")
998
+
999
def _verify_kubeconfig(self) -> None:
    """Check that the current kubeconfig works by running ``kubectl cluster-info``.

    Raises:
        click.ClickException: If kubectl exits with a non-zero status.
    """
    self.log.info("Verifying kubeconfig configuration...")

    probe_cmd = ["kubectl", "cluster-info"]
    try:
        subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        raise click.ClickException(f"Kubeconfig verification failed: {e.stderr}")
    else:
        self.log.info("Kubeconfig verification successful")
1010
+
1011
def _get_current_kubectl_context(self) -> str:
    """Return the name printed by ``kubectl config current-context``.

    Returns:
        The command's stdout with surrounding whitespace stripped.

    Raises:
        click.ClickException: If kubectl exits with a non-zero status.
    """
    context_cmd = ["kubectl", "config", "current-context"]
    try:
        completed = subprocess.run(
            context_cmd, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as e:
        raise click.ClickException(
            f"Failed to get current kubectl context: {e.stderr}"
        )
    return completed.stdout.strip()
1025
+
1026
def _register_cloud(  # noqa: PLR0912
    self,
    name: str,
    provider: str,
    region: str,
    infrastructure: InfrastructureResources,
    cluster_info: ClusterInfo,
) -> str:
    """Register the cloud with Anyscale and return its cloud ID.

    Builds a ``CloudDeployment`` for the given provider (zones are discovered
    from the cluster), calls the matching ``register_*_cloud`` method on the
    cloud controller, then looks the new cloud up by name to obtain its ID.

    Args:
        name: Name of the Anyscale cloud to create.
        provider: ``"aws"`` or ``"gcp"``.
        region: Cloud region of the cluster and bucket.
        infrastructure: Provisioned resources (bucket name, IAM identity and,
            for GCP, the project ID).
        cluster_info: Information about the target Kubernetes cluster.

    Returns:
        The ID of the newly registered cloud.

    Raises:
        click.ClickException: If the provider is unsupported, the GCP project
            ID is missing, or registration / lookup fails for any reason.
    """
    self.log.info("Registering cloud with Anyscale...", block_label="Setup")

    if provider == "aws":
        # Dynamically determine availability zones from the EKS cluster
        zones = self._get_eks_availability_zones(cluster_info.cluster_name, region)

        cloud_deployment = CloudDeployment(
            name=name,
            provider=CloudProviders.AWS,
            region=region,
            compute_stack=ComputeStack.K8S,
            object_storage=ObjectStorage(
                bucket_name=infrastructure.bucket_name, region=region
            ),
            aws_config=AWSConfig(),
            kubernetes_config=OpenAPIKubernetesConfig(
                anyscale_operator_iam_identity=infrastructure.iam_role_arn,
                zones=zones,
            ),
        )
    elif provider == "gcp":
        # Explicit check instead of `assert`: assertions are stripped when
        # Python runs with -O, which would let a missing project ID through.
        if not infrastructure.project_id:
            raise click.ClickException("Project ID is required for GCP")

        from anyscale.client.openapi_client.models import GCPConfig

        # Dynamically determine zones from the GKE cluster
        zones = self._get_gke_zones(
            cluster_info.cluster_name, region, infrastructure.project_id
        )

        cloud_deployment = CloudDeployment(
            name=name,
            provider=CloudProviders.GCP,
            region=region,
            compute_stack=ComputeStack.K8S,
            object_storage=ObjectStorage(
                bucket_name=infrastructure.bucket_name, region=region
            ),
            gcp_config=GCPConfig(project_id=infrastructure.project_id,),
            kubernetes_config=OpenAPIKubernetesConfig(
                anyscale_operator_iam_identity=infrastructure.iam_role_arn,
                zones=zones,
            ),
        )
    else:
        raise click.ClickException(f"Unsupported provider: {provider}")

    # Register the cloud
    try:
        self._debug("Cloud deployment details:")
        self._debug(f"  Name: {cloud_deployment.name}")
        self._debug(f"  Provider: {cloud_deployment.provider}")
        self._debug(f"  Region: {cloud_deployment.region}")
        self._debug(f"  Compute Stack: {cloud_deployment.compute_stack}")
        self._debug(f"  Bucket Name: {cloud_deployment.object_storage.bucket_name}")
        self._debug(
            f"  IAM Identity: {cloud_deployment.kubernetes_config.anyscale_operator_iam_identity}"
        )
        if cloud_deployment.aws_config:
            self._debug("  AWS Config:")
            self._debug(
                f"    IAM Role ID: {cloud_deployment.aws_config.anyscale_iam_role_id}"
            )

        # The provider was validated above, so only "aws" and "gcp" reach here.
        if provider == "aws":
            register = self.cloud_controller.register_aws_cloud
        else:
            register = self.cloud_controller.register_gcp_cloud

        # Temporarily suppress cloud controller logging to avoid Helm command output
        original_log_info = self.cloud_controller.log.info
        self.cloud_controller.log.info = lambda *_args, **_kwargs: None

        try:
            self.log.info(f"Calling register_{provider}_cloud...")
            register(
                name=name,
                cloud_resource=cloud_deployment,
                functional_verify=None,
                yes=True,
                skip_verifications=True,
                auto_add_user=True,
            )
        finally:
            # Restore the original log.info method
            self.cloud_controller.log.info = original_log_info

        self._debug("Cloud registration completed, fetching cloud ID...")
        clouds = (
            self.cloud_controller.api_client.list_clouds_api_v2_clouds_get().results
        )
        cloud = next((c for c in clouds if c.name == name), None)
        if not cloud:
            raise click.ClickException("Failed to find registered cloud")

        cloud_id = getattr(cloud, "id", None) or getattr(cloud, "cloud_id", None)
        if not cloud_id:
            raise click.ClickException(
                "Failed to get cloud ID from registered cloud"
            )

        self.log.info(f"Cloud registered with ID: {cloud_id}", block_label="Setup")

        return cloud_id

    except Exception as e:  # noqa: BLE001
        import traceback

        self.log.error(f"Cloud registration failed with error: {e}")
        self.log.error(f"Error type: {type(e).__name__}")
        if hasattr(e, "response"):
            self.log.error(f"Response details: {getattr(e, 'response', 'N/A')}")
        # Every exception has .args, so no hasattr guard is needed.
        self.log.error(f"Error args: {e.args}")
        self.log.error(f"Full traceback: {traceback.format_exc()}")
        raise click.ClickException(f"Failed to register cloud: {e}") from e
1159
+
1160
def _install_operator(  # noqa: PLR0913
    self,
    cloud_id: str,
    provider: str,
    region: str,
    namespace: str,
    infrastructure: InfrastructureResources,
    values_file: Optional[str] = None,
) -> None:
    """Install the Anyscale operator using Helm.

    Looks up the cloud resource ID, writes a Helm values file (seeded with the
    ``--set-string`` parameters of the controller-generated Helm command),
    makes sure the Anyscale Helm repo is present, and runs ``helm upgrade -i``
    against that values file.

    Args:
        cloud_id: ID of the registered Anyscale cloud.
        provider: Cloud provider name forwarded to the Helm values.
        region: Cloud region forwarded to the Helm values.
        namespace: Kubernetes namespace to install the operator into.
        infrastructure: Provisioned resources; supplies the operator IAM identity.
        values_file: Optional path for the generated Helm values file.

    Raises:
        click.ClickException: If the cloud has no resources, or a Helm step fails.
    """
    import shlex

    self.log.info("Installing Anyscale operator...", block_label="Setup")

    # Get cloud resources to get the cloud resource ID
    cloud_resources = self.cloud_controller.get_decorated_cloud_resources(cloud_id)

    if not cloud_resources:
        raise click.ClickException("No cloud resources found")

    cloud_resource_id = cloud_resources[0].cloud_resource_id

    release_name = "anyscale-operator"

    # Generate Helm command and extract --set-string flags from it
    self._debug("Generating Helm command to extract parameters...")
    reference_command = self.cloud_controller._generate_helm_upgrade_command(  # noqa: SLF001
        provider=provider,
        cloud_deployment_id=cloud_resource_id,
        region=region,
        operator_iam_identity=infrastructure.iam_role_arn,
    )

    set_string_values = self._extract_set_string_values(reference_command)
    self._debug(f"Extracted {len(set_string_values)} --set-string parameters")

    values_file_path = self._generate_helm_values_file(
        provider=provider,
        cloud_deployment_id=cloud_resource_id,
        region=region,
        namespace=namespace,
        infrastructure=infrastructure,
        custom_path=values_file,
        additional_values=set_string_values,
    )

    # Add Helm repo before installing
    self._debug("Adding Anyscale Helm repository...")
    self._add_helm_repo()

    # The command string is run through a shell by _execute_helm_command, so
    # user-influenced arguments (the values file path may contain spaces, e.g.
    # from the current working directory) must be quoted for that shell.
    def _quote(argument: str) -> str:
        if os.name == "nt":
            return subprocess.list2cmdline([argument])
        return shlex.quote(argument)

    # Build a simple Helm command that only uses the values file
    self._debug("Generating Helm command...")
    helm_command = (
        f"helm upgrade {release_name} anyscale/anyscale-operator "
        f"--values {_quote(values_file_path)} "
        f"--namespace {_quote(namespace)} "
        f"--create-namespace "
        f"--wait "
        f"-i"
    )

    self._execute_helm_command(helm_command)
1220
+
1221
def _add_helm_repo(self) -> None:
    """Add the Anyscale Helm repository and refresh its index.

    The ``helm repo add`` result is not checked, so an already-registered repo
    does not count as a failure; the following ``helm repo update`` is checked.

    Raises:
        click.ClickException: If ``helm repo update anyscale`` exits non-zero.
    """
    add_cmd = [
        "helm",
        "repo",
        "add",
        "anyscale",
        "https://anyscale.github.io/helm-charts",
    ]
    update_cmd = ["helm", "repo", "update", "anyscale"]

    try:
        self.log.info("Adding Anyscale Helm repository...", block_label="Setup")
        # check=False: don't fail if the repo already exists.
        subprocess.run(add_cmd, capture_output=True, text=True, check=False)

        self.log.info("Updating Helm repositories...", block_label="Setup")
        subprocess.run(update_cmd, capture_output=True, text=True, check=True)

        self.log.info(
            "Helm repository configured successfully", block_label="Setup"
        )
    except subprocess.CalledProcessError as e:
        failure = f"Failed to configure Helm repository: {e.stderr}"
        self.log.error(failure)
        raise click.ClickException(failure)
1255
+
1256
+ def _extract_set_string_values(self, helm_command: str) -> Dict[str, str]:
1257
+ """
1258
+ Extract all --set-string key=value pairs from a Helm command.
1259
+
1260
+ Args:
1261
+ helm_command: The Helm command string to parse
1262
+
1263
+ Returns:
1264
+ Dictionary of key-value pairs from --set-string flags
1265
+ """
1266
+ import re
1267
+
1268
+ set_string_values = {}
1269
+
1270
+ # Pattern to match --set-string key=value
1271
+ pattern = r"--set-string\s+(\S+?)=(\S+)"
1272
+
1273
+ matches = re.findall(pattern, helm_command)
1274
+ for key, value in matches:
1275
+ set_string_values[key] = value
1276
+
1277
+ return set_string_values
1278
+
1279
+ def _prompt_for_namespace(
1280
+ self, default_namespace: str, skip_confirmation: bool = False
1281
+ ) -> str:
1282
+ """Prompt user for namespace confirmation."""
1283
+ final_namespace = default_namespace or "anyscale-operator"
1284
+
1285
+ if skip_confirmation:
1286
+ self.log.info(f"Using namespace: {final_namespace}", block_label="Setup")
1287
+ return final_namespace
1288
+
1289
+ self.log.info("Configuring Kubernetes namespace...")
1290
+
1291
+ self.log.info(
1292
+ f"Enter the namespace to use for the Anyscale operator (default: {final_namespace}):"
1293
+ )
1294
+ final_namespace = click.prompt("", default=final_namespace, show_default=True)
1295
+
1296
+ # Validate namespace (Kubernetes DNS-1123 label requirements)
1297
+ # Must be lowercase alphanumeric or hyphens, start and end with alphanumeric, max 63 chars
1298
+
1299
+ if not final_namespace:
1300
+ raise click.ClickException("Namespace cannot be empty")
1301
+ if len(final_namespace) > 63:
1302
+ raise click.ClickException("Namespace must be 63 characters or less")
1303
+ if not re.match(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", final_namespace):
1304
+ raise click.ClickException(
1305
+ "Namespace must consist of lowercase alphanumeric characters or hyphens, "
1306
+ "and must start and end with an alphanumeric character"
1307
+ )
1308
+
1309
+ self.log.info(f"Using namespace: {final_namespace}")
1310
+
1311
+ return final_namespace
1312
+
1313
def _generate_helm_values_file(  # noqa: PLR0913
    self,
    provider: str,
    cloud_deployment_id: str,
    region: str,
    namespace: str,
    infrastructure: InfrastructureResources,
    custom_path: Optional[str] = None,
    additional_values: Optional[Dict[str, str]] = None,
) -> str:
    """Generate the Helm values file and save it locally.

    Args:
        provider: Cloud provider written to ``global.cloudProvider``.
        cloud_deployment_id: Written to ``global.cloudDeploymentId``.
        region: Written to ``global.region``.
        namespace: Only used in the auto-generated file name.
        infrastructure: Supplies ``global.auth.iamIdentity``.
        custom_path: Where to write the file; defaults to a timestamped file
            in the current working directory.
        additional_values: Extra values keyed by Helm ``--set`` style paths
            (``a.b.c``). They are expanded into nested mappings and never
            override a value that is already present.

    Returns:
        Path of the written values file.
    """
    self.log.info("Generating Helm values file...")

    # Create values dictionary starting with base values
    values: Dict[str, Any] = {
        "global": {
            "cloudDeploymentId": cloud_deployment_id,
            "cloudProvider": provider,
            "region": region,
            "auth": {"iamIdentity": infrastructure.iam_role_arn,},
        },
        "ingress-nginx": {"enabled": True},
    }

    def _set_nested_default(path: str, value: str) -> None:
        """Set ``value`` at a dotted Helm path unless something is already there."""
        # In Helm's --set syntax "." separates nesting levels and "\." is a
        # literal dot. A flat "a.b" key in a values file would NOT be read by
        # Helm as a.b, so the path must be expanded into nested mappings.
        # List-index syntax (a[0].b) is not interpreted here.
        parts = [
            part.replace("\0", ".")
            for part in path.replace("\\.", "\0").split(".")
        ]
        node = values
        for part in parts[:-1]:
            child = node.setdefault(part, {})
            if not isinstance(child, dict):
                # An existing scalar occupies this path; base values win.
                return
            node = child
        node.setdefault(parts[-1], value)

    for key, value in (additional_values or {}).items():
        _set_nested_default(key, value)

    # Add control plane URL from ANYSCALE_HOST environment variable
    if ANYSCALE_HOST:
        values["controlPlaneURL"] = ANYSCALE_HOST
        self.log.info(f"Using control plane URL: {ANYSCALE_HOST}")

    if custom_path:
        values_file_path = custom_path
    else:
        # Create filename with timestamp
        import datetime

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"anyscale-helm-values-{provider}-{namespace}-{timestamp}.yaml"
        values_file_path = os.path.join(os.getcwd(), filename)

    # Explicit encoding so the file does not depend on the platform default.
    with open(values_file_path, "w", encoding="utf-8") as f:
        yaml.dump(values, f, default_flow_style=False, sort_keys=False)

    self.log.info(f"Generated Helm values file: {values_file_path}")

    return values_file_path
1363
+
1364
def _execute_helm_command(self, helm_command: str) -> None:
    """Run a Helm command string through the shell.

    Backslash-continued and multi-line commands are first flattened onto one
    line, which is logged and then executed.

    Args:
        helm_command: The Helm command line, possibly spanning several lines.

    Raises:
        click.ClickException: If the command exits with a non-zero status.
    """
    # Join " \<newline>" continuations first, then any remaining newlines.
    without_continuations = helm_command.replace(" \\\n", " ")
    single_line_command = without_continuations.replace("\n", " ")

    self.log.info(f"Executing: {single_line_command}")

    # NOTE(review): the string is interpreted by the shell (shell=True), so
    # callers are responsible for quoting any argument they interpolate.
    try:
        subprocess.run(
            single_line_command,
            shell=True,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        self.log.error(f"Helm installation failed: {e.stderr}")
        raise click.ClickException(
            f"Failed to install Anyscale operator: {e.stderr}"
        )
    else:
        self.log.info("Helm installation completed successfully")
1385
+
1386
def _verify_installation(
    self, cloud_id: str, namespace: str, cluster_info: ClusterInfo
) -> None:
    """Verify the Kubernetes installation with the existing deployment verifier.

    Args:
        cloud_id: ID of the registered Anyscale cloud.
        namespace: Namespace the operator was installed into.
        cluster_info: Cluster information; its ``context`` is handed to the verifier.

    Raises:
        click.ClickException: If the cloud has no resources, or the verifier
            reports failure.
    """
    self.log.info("Verifying installation...")

    # Get the cloud deployment
    cloud_resources = self.cloud_controller.get_cloud_resources(cloud_id)
    if not cloud_resources:
        raise click.ClickException("No cloud resources found for verification")
    cloud_deployment = cloud_resources[0]

    # Use the existing Kubernetes verifier
    verifier = KubernetesCloudDeploymentVerifier(
        self.log, self.cloud_controller.api_client
    )

    # Pre-populate the kubectl config with the discovered context so the
    # verifier does not prompt for it again.
    verifier.k8s_config = KubernetesConfig(
        context=cluster_info.context, operator_namespace=namespace,
    )

    # Run verification
    if not verifier.verify(cloud_deployment):
        self.log.error("Verification failed - please check the logs above")
        raise click.ClickException("Installation verification failed")

    self.log.info("Verification completed successfully")
1419
+
1420
+
1421
def setup_kubernetes_cloud(  # noqa: PLR0913
    provider: str,
    region: str,
    name: str,
    cluster_name: str,
    namespace: str = "anyscale-operator",
    project_id: Optional[str] = None,
    functional_verify: bool = False,
    yes: bool = False,
    values_file: Optional[str] = None,
    debug: bool = False,
) -> None:
    """
    Set up Anyscale on a Kubernetes cluster.

    Shared entry point for CLI commands: it builds a
    ``KubernetesCloudSetupCommand`` and forwards every option to its ``run``
    method. Any failure is echoed to stderr and turned into ``click.Abort``.

    Args:
        provider: Cloud provider (aws, gcp)
        region: Cloud region
        name: Name for the Anyscale cloud
        cluster_name: Kubernetes cluster name
        namespace: Namespace for Anyscale operator (default: anyscale-operator)
        project_id: GCP project ID (only relevant for the gcp provider)
        functional_verify: Whether to run functional verification
        yes: Skip confirmation prompts
        values_file: Optional path for Helm values file
        debug: Enable debug logging

    Raises:
        click.Abort: If the setup command raises any exception.
    """
    run_options = {
        "provider": provider,
        "region": region,
        "name": name,
        "cluster_name": cluster_name,
        "namespace": namespace,
        "project_id": project_id,
        "functional_verify": functional_verify,
        "yes": yes,
        "values_file": values_file,
    }

    cmd = KubernetesCloudSetupCommand(debug=debug)

    try:
        cmd.run(**run_options)
    except Exception as e:  # noqa: BLE001
        click.echo(f"Setup failed: {e}", err=True)
        raise click.Abort()