anyscale 0.26.67__py3-none-any.whl → 0.26.69__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. anyscale/client/README.md +22 -0
  2. anyscale/client/openapi_client/__init__.py +16 -0
  3. anyscale/client/openapi_client/api/default_api.py +801 -19
  4. anyscale/client/openapi_client/models/__init__.py +16 -0
  5. anyscale/client/openapi_client/models/clusterdashboardnode_response.py +121 -0
  6. anyscale/client/openapi_client/models/lineage_artifact.py +383 -0
  7. anyscale/client/openapi_client/models/lineage_artifact_sort_field.py +101 -0
  8. anyscale/client/openapi_client/models/lineage_artifact_type.py +100 -0
  9. anyscale/client/openapi_client/models/lineage_direction.py +101 -0
  10. anyscale/client/openapi_client/models/lineage_graph.py +179 -0
  11. anyscale/client/openapi_client/models/lineage_graph_node.py +467 -0
  12. anyscale/client/openapi_client/models/lineage_node_type.py +100 -0
  13. anyscale/client/openapi_client/models/lineage_workload.py +383 -0
  14. anyscale/client/openapi_client/models/lineage_workload_sort_field.py +101 -0
  15. anyscale/client/openapi_client/models/lineage_workload_type.py +101 -0
  16. anyscale/client/openapi_client/models/lineageartifact_list_response.py +147 -0
  17. anyscale/client/openapi_client/models/lineageartifact_response.py +121 -0
  18. anyscale/client/openapi_client/models/lineagegraph_response.py +121 -0
  19. anyscale/client/openapi_client/models/lineageworkload_list_response.py +147 -0
  20. anyscale/client/openapi_client/models/lineageworkload_response.py +121 -0
  21. anyscale/commands/cloud_commands.py +15 -9
  22. anyscale/commands/command_examples.py +53 -0
  23. anyscale/commands/setup_k8s.py +521 -50
  24. anyscale/controllers/cloud_controller.py +13 -12
  25. anyscale/controllers/kubernetes_verifier.py +57 -11
  26. anyscale/version.py +1 -1
  27. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/METADATA +1 -1
  28. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/RECORD +33 -17
  29. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/WHEEL +0 -0
  30. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/entry_points.txt +0 -0
  31. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/licenses/LICENSE +0 -0
  32. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/licenses/NOTICE +0 -0
  33. {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/top_level.txt +0 -0
@@ -42,11 +42,7 @@ class ClusterInfo:
42
42
  region: str
43
43
  cluster_name: str
44
44
  project_id: Optional[str] = None
45
- cluster_arn: Optional[str] = None
46
45
  oidc_provider: Optional[str] = None
47
- cluster_location: Optional[str] = None
48
- workload_identity_pool: Optional[str] = None
49
- cluster_version: Optional[str] = None
50
46
 
51
47
 
52
48
  @dataclass
@@ -79,6 +75,7 @@ class KubernetesCloudSetupCommand:
79
75
  functional_verify: bool,
80
76
  yes: bool,
81
77
  values_file: Optional[str] = None,
78
+ operator_chart: Optional[str] = None,
82
79
  ) -> None:
83
80
  """
84
81
  Main entry point for Kubernetes cloud setup.
@@ -93,6 +90,7 @@ class KubernetesCloudSetupCommand:
93
90
  functional_verify: Whether to run functional verification
94
91
  yes: Skip confirmation prompts
95
92
  values_file: Optional custom path for Helm values file
93
+ operator_chart: Optional path to operator chart (skips helm repo add/update)
96
94
  """
97
95
  self.log.open_block(
98
96
  "Setup", f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
@@ -134,6 +132,7 @@ class KubernetesCloudSetupCommand:
134
132
  final_namespace,
135
133
  infrastructure,
136
134
  values_file,
135
+ operator_chart,
137
136
  )
138
137
 
139
138
  # Step 6: Verify installation
@@ -194,7 +193,7 @@ class KubernetesCloudSetupCommand:
194
193
  namespace: str,
195
194
  provider: str,
196
195
  region: str,
197
- project_id: Optional[str], # noqa: ARG002
196
+ project_id: Optional[str],
198
197
  ) -> ClusterInfo:
199
198
  """Discover and validate the target Kubernetes cluster using cloud provider APIs."""
200
199
  self.log.info(
@@ -205,8 +204,12 @@ class KubernetesCloudSetupCommand:
205
204
  if provider == "aws":
206
205
  return self._discover_aws_cluster(cluster_name, namespace, region)
207
206
  elif provider == "gcp":
208
- raise click.ClickException(
209
- "GCP support is not yet implemented. Please use AWS for now."
207
+ if not project_id:
208
+ raise click.ClickException(
209
+ "GCP project ID is required. Please provide --project-id"
210
+ )
211
+ return self._discover_gcp_cluster(
212
+ cluster_name, namespace, region, project_id
210
213
  )
211
214
  else:
212
215
  raise click.ClickException(f"Unsupported provider: {provider}")
@@ -215,18 +218,6 @@ class KubernetesCloudSetupCommand:
215
218
  self, cluster_name: str, namespace: str, region: str
216
219
  ) -> ClusterInfo:
217
220
  """Discover AWS EKS cluster details and configure kubeconfig."""
218
- try:
219
- self._debug(f"Fetching EKS cluster info for {cluster_name} in {region}...")
220
- cluster_info = self._get_eks_cluster_info(cluster_name, region)
221
- self._debug(f"EKS Cluster ARN: {cluster_info.get('arn', 'Unknown')}")
222
- self._debug(
223
- f"EKS Cluster Version: {cluster_info.get('version', 'Unknown')}"
224
- )
225
- except Exception as e: # noqa: BLE001
226
- self.log.error(f"Failed to get EKS cluster info: {e}")
227
- raise click.ClickException(
228
- f"Failed to discover EKS cluster {cluster_name}: {e}"
229
- )
230
221
 
231
222
  try:
232
223
  self._debug("Fetching OIDC provider information...")
@@ -262,9 +253,39 @@ class KubernetesCloudSetupCommand:
262
253
  provider="aws",
263
254
  region=region,
264
255
  cluster_name=cluster_name,
265
- cluster_arn=cluster_info.get("arn"),
266
256
  oidc_provider=oidc_provider,
267
- cluster_version=cluster_info.get("version"),
257
+ )
258
+
259
def _discover_gcp_cluster(
    self, cluster_name: str, namespace: str, region: str, project_id: str
) -> ClusterInfo:
    """Point kubeconfig at a GKE cluster and describe it as a ClusterInfo.

    Two phases run in order: kubeconfig is configured for the cluster,
    then the configuration is verified and the active kubectl context is
    read back. A failure in either phase is logged and re-raised as a
    click.ClickException.

    Args:
        cluster_name: Name of the GKE cluster.
        namespace: Namespace recorded on the returned ClusterInfo.
        region: Region passed to the kubeconfig step and recorded on the result.
        project_id: GCP project passed to the kubeconfig step and recorded
            on the result.

    Returns:
        ClusterInfo with provider "gcp" and the active kubectl context.

    Raises:
        click.ClickException: If configuring or verifying kubeconfig fails.
    """
    # Phase 1: write credentials for the cluster into kubeconfig.
    try:
        self._debug("Configuring kubeconfig for GKE cluster...")
        self._configure_gcp_kubeconfig(cluster_name, region, project_id)
    except Exception as configure_error:  # noqa: BLE001
        self.log.error(f"Failed to configure kubeconfig: {configure_error}")
        raise click.ClickException(
            f"Failed to configure kubeconfig for GKE cluster: {configure_error}"
        )

    # Phase 2: confirm kubeconfig works and capture the active context.
    try:
        self._debug("Verifying kubeconfig configuration...")
        self._verify_kubeconfig()
        active_context = self._get_current_kubectl_context()
        self.log.info(f"Cluster discovered: {active_context}", block_label="Setup")
    except Exception as verify_error:  # noqa: BLE001
        self.log.error(f"Failed to verify kubeconfig: {verify_error}")
        raise click.ClickException(f"Failed to verify kubeconfig: {verify_error}")

    discovered = ClusterInfo(
        context=active_context,
        namespace=namespace,
        provider="gcp",
        region=region,
        cluster_name=cluster_name,
        project_id=project_id,
    )
    return discovered
269
290
 
270
291
  def _setup_infrastructure(
@@ -278,9 +299,7 @@ class KubernetesCloudSetupCommand:
278
299
  if provider == "aws":
279
300
  return self._setup_aws_infrastructure(region, name, cluster_info)
280
301
  elif provider == "gcp":
281
- raise click.ClickException(
282
- "GCP support is not yet implemented. Please use AWS for now."
283
- )
302
+ return self._setup_gcp_infrastructure(region, name, cluster_info)
284
303
  else:
285
304
  raise click.ClickException(f"Unsupported provider: {provider}")
286
305
 
@@ -502,6 +521,344 @@ class KubernetesCloudSetupCommand:
502
521
 
503
522
  return json.dumps(template, indent=2)
504
523
 
524
def _setup_gcp_infrastructure(  # noqa: PLR0912
    self, region: str, name: str, cluster_info: ClusterInfo,
) -> InfrastructureResources:
    """Set up GCP infrastructure for Kubernetes using GCP Python SDK.

    Note: Deployment Manager is deprecated so it is unused here.
    Infrastructure Manager was tried but did not work well, so we rely
    on the GCP Python SDK instead.

    Steps, in order:
      1. Generate a cloud ID of the form ``k8s-<name>-<8 hex chars>``.
      2. Create a GCS bucket named ``anyscale-<cloud id>`` in ``region``.
      3. Create a service account named ``anyscale-operator-<8 hex chars>``.
      4. Set the service account's IAM policy (workloadIdentityUser for the
         ``<namespace>/anyscale-operator`` Kubernetes service account, and
         serviceAccountTokenCreator for the service account itself).
      5. Grant the service account ``roles/storage.admin`` on the bucket,
         retrying while the account is reported as not existing.

    Args:
        region: Used as the bucket location and echoed in the result.
        name: Cloud name, embedded in the generated cloud ID.
        cluster_info: Supplies ``project_id`` and ``namespace``.

    Returns:
        InfrastructureResources carrying the bucket name, the service
        account email (in ``iam_role_arn``), the region and the project ID.

    Raises:
        click.ClickException: If the GCP helper import fails, the cloud ID
            cannot be generated, the client factory cannot be created, or
            any resource-creation step fails.
    """
    try:
        from anyscale.utils.gcp_utils import get_google_cloud_client_factory
    except ImportError as e:
        self.log.error(f"Failed to import required modules: {e}")
        raise click.ClickException(f"Failed to import required modules: {e}")

    try:
        # Generate a unique cloud ID
        cloud_id = f"k8s-{name}-{os.urandom(4).hex()}"
        # deployment_name is only used in the debug message below.
        deployment_name = cloud_id.replace("_", "-").lower()
        self._debug(f"Generated cloud ID: {cloud_id}")
        self._debug(f"Infrastructure Manager deployment name: {deployment_name}")
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to generate cloud ID: {e}")
        raise click.ClickException(f"Failed to generate cloud ID: {e}")

    try:
        # Get Google Cloud client factory
        factory = get_google_cloud_client_factory(self.log, cluster_info.project_id)
    except Exception as e:  # noqa: BLE001
        self.log.error(f"Failed to initialize GCP client: {e}")
        raise click.ClickException(f"Failed to initialize GCP client: {e}")

    try:
        # NOTE(review): indentation was not recoverable from the diff view;
        # this `with` block is assumed to cover the three log calls below — confirm.
        with self.log.indent():
            self.log.warning(
                "NOTE: GCP resources (bucket and service account) created by this command are not managed by Anyscale.",
            )
            self.log.warning(
                "You will need to manually delete these resources when the cloud is no longer needed.",
            )
            self.log.info(
                "Creating GCP resources (bucket, service account, IAM bindings)...",
            )

        # Calculate resource names
        # Service account name: anyscale-operator-<random 8 chars>
        # Max length for GCP service account is 30 characters
        random_suffix = os.urandom(4).hex()  # 8 hex chars
        anyscale_service_account_name = f"anyscale-operator-{random_suffix}"
        bucket_name = f"anyscale-{cloud_id.replace('_', '-').lower()}"

        # Create GCS bucket
        self._debug(f"Creating GCS bucket: {bucket_name}")
        storage_client = factory.storage.Client()
        bucket = storage_client.bucket(bucket_name)
        bucket.location = region
        bucket.storage_class = "REGIONAL"
        bucket.iam_configuration.uniform_bucket_level_access_enabled = True
        bucket.iam_configuration.public_access_prevention = "enforced"
        bucket.versioning_enabled = True
        # The label value is the cloud ID with dashes swapped for underscores.
        bucket.labels = {"anyscale-cloud-id": cloud_id.replace("-", "_")}

        # Set CORS
        # Use ANYSCALE_CORS_ORIGIN from shared config
        # This respects the ANYSCALE_HOST environment variable
        allowed_origin = ANYSCALE_CORS_ORIGIN
        bucket.cors = [
            {
                "origin": [allowed_origin],
                "responseHeader": ["*"],
                "method": ["GET", "PUT", "POST", "HEAD", "DELETE"],
                "maxAgeSeconds": 3600,
            }
        ]

        storage_client.create_bucket(bucket, location=region)
        self.log.info(f"Created GCS bucket: {bucket_name}", block_label="Setup")

        # Create service account
        self._debug(
            f"Creating service account: {anyscale_service_account_name}"
        )
        iam_client = factory.build("iam", "v1")
        service_account_body = {
            "accountId": anyscale_service_account_name,
            "serviceAccount": {
                "displayName": f"{cloud_id} Anyscale operator service account",
                "description": "Service account for Anyscale Kubernetes operator",
            },
        }

        service_account = (
            iam_client.projects()
            .serviceAccounts()
            .create(
                name=f"projects/{cluster_info.project_id}",
                body=service_account_body,
            )
            .execute()
        )

        # The email from the create response identifies the account in every
        # later step and is what this method returns as the operator identity.
        service_account_email = service_account["email"]
        self.log.info(
            f"Created service account: {service_account_email}",
            block_label="Setup",
        )

        # Wait for service account to propagate through GCP systems
        import time

        self._debug("Waiting 10 seconds for service account to propagate...")
        time.sleep(10)

        # Grant Workload Identity binding
        self._debug("Setting up Workload Identity binding")

        # The K8s service account needs:
        # 1. workloadIdentityUser role - to impersonate the GCP service account
        # 2. serviceAccountTokenCreator - to generate tokens (for getOpenIdToken)

        # The policy sent below contains exactly these two bindings.
        policy_body = {
            "policy": {
                "bindings": [
                    {
                        "role": "roles/iam.workloadIdentityUser",
                        "members": [
                            f"serviceAccount:{cluster_info.project_id}.svc.id.goog[{cluster_info.namespace}/anyscale-operator]"
                        ],
                    },
                    {
                        "role": "roles/iam.serviceAccountTokenCreator",
                        "members": [f"serviceAccount:{service_account_email}"],
                    },
                ]
            }
        }

        iam_client.projects().serviceAccounts().setIamPolicy(
            resource=f"projects/{cluster_info.project_id}/serviceAccounts/{service_account_email}",
            body=policy_body,
        ).execute()

        self.log.info(
            "Configured Workload Identity binding", block_label="Setup"
        )

        # Grant storage admin role to service account for the bucket
        # Note: There's often a propagation delay after service account creation
        # We need to retry with exponential backoff
        self._debug("Granting storage permissions")

        # NOTE(review): `time` is already imported earlier in this function;
        # this second import is redundant.
        import time

        max_retries = 5
        retry_delay = 2  # Start with 2 seconds

        # With these values the waits between attempts are 2, 4, 8 and 16
        # seconds; a failure on the fifth attempt is re-raised.
        for attempt in range(max_retries):
            try:
                bucket_policy = bucket.get_iam_policy(
                    requested_policy_version=3
                )
                bucket_policy.bindings.append(
                    {
                        "role": "roles/storage.admin",
                        "members": {f"serviceAccount:{service_account_email}"},
                    }
                )
                bucket.set_iam_policy(bucket_policy)
                break  # Success!
            except Exception as e:  # noqa: BLE001
                # Only errors whose text contains "does not exist" are retried.
                if "does not exist" in str(e) and attempt < max_retries - 1:
                    self._debug(
                        f"Service account not yet propagated, retrying in {retry_delay}s... (attempt {attempt + 1}/{max_retries})"
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    raise  # Re-raise if it's not a propagation issue or we're out of retries

        self.log.info(
            "Granted storage permissions to service account",
            block_label="Setup",
        )

        self.log.info("GCP resources created successfully", block_label="Setup")
        self.log.warning(
            f"REMINDER: To clean up when no longer needed, delete GCS bucket '{bucket_name}' and service account '{service_account_email}'"
        )
    except Exception as e:  # noqa: BLE001
        # Nothing created earlier in this block is deleted here before the
        # error is re-raised.
        self.log.error(f"Failed to create GCP resources: {e}")
        raise click.ClickException(f"Failed to create GCP resources: {e}")

    # Resources were created in the try block above
    # bucket_name and service_account_email are already set
    self._debug(f"GCS Bucket: {bucket_name}")
    self._debug(f"Service Account Email: {service_account_email}")

    return InfrastructureResources(
        bucket_name=bucket_name,
        iam_role_arn=service_account_email,  # For GCP, we use service account email
        region=region,
        project_id=cluster_info.project_id,
    )
727
+
728
def _get_gke_cluster_info(
    self, cluster_name: str, region: str, project_id: str
) -> Dict[str, Any]:
    """Get GKE cluster information using gcloud CLI.

    The cluster is looked up as a regional cluster first. If that fails,
    each candidate zone ``<region>-a`` .. ``<region>-f`` is tried in turn,
    because not every region has a zone ``a`` (for example us-east1 only
    has b, c and d).

    Args:
        cluster_name: Name of the GKE cluster.
        region: GCP region the cluster lives in.
        project_id: GCP project that owns the cluster.

    Returns:
        The parsed JSON output of ``gcloud container clusters describe``.

    Raises:
        click.ClickException: If no location yields the cluster, or the
            gcloud output is not valid JSON.
    """
    # Regional lookup first, then every plausible zone in the region.
    location_flags = [f"--region={region}"] + [
        f"--zone={region}-{suffix}" for suffix in "abcdef"
    ]

    last_stderr = ""
    for location_flag in location_flags:
        result = subprocess.run(
            [
                "gcloud",
                "container",
                "clusters",
                "describe",
                cluster_name,
                location_flag,
                f"--project={project_id}",
                "--format=json",
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        if result.returncode != 0:
            # Remember why this location failed and move on to the next one.
            last_stderr = result.stderr
            continue

        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError as e:
            raise click.ClickException(
                f"Failed to parse GKE cluster info: {e}"
            ) from e

    raise click.ClickException(f"Failed to get GKE cluster info: {last_stderr}")
775
+
776
+ def _get_gke_zones(
777
+ self, cluster_name: str, region: str, project_id: str
778
+ ) -> List[str]:
779
+ """Get zones where the GKE cluster's node pools are located."""
780
+ try:
781
+ cluster_info = self._get_gke_cluster_info(cluster_name, region, project_id)
782
+
783
+ # Extract zones from node pools
784
+ zones = []
785
+ node_pools = cluster_info.get("nodePools", [])
786
+
787
+ for pool in node_pools:
788
+ # For zonal clusters, each node pool has locations
789
+ pool_locations = pool.get("locations", [])
790
+ zones.extend(pool_locations)
791
+
792
+ # If no zones found from node pools, try cluster-level locations
793
+ if not zones:
794
+ cluster_locations = cluster_info.get("locations", [])
795
+ if cluster_locations:
796
+ zones = cluster_locations
797
+
798
+ # Remove duplicates and sort
799
+ if zones:
800
+ unique_zones = sorted(set(zones))
801
+ self._debug(f"Discovered zones: {', '.join(unique_zones)}")
802
+ return unique_zones
803
+ else:
804
+ # Fallback to default zones
805
+ self._debug(
806
+ "No zones found in cluster info, falling back to default zones"
807
+ )
808
+ return [region + "-a", region + "-b", region + "-c"]
809
+
810
+ except Exception as e: # noqa: BLE001
811
+ self._debug(f"Failed to get zones: {e}, using default zones")
812
+ return [region + "-a", region + "-b", region + "-c"]
813
+
814
def _configure_gcp_kubeconfig(
    self, cluster_name: str, region: str, project_id: str
) -> None:
    """Configure kubeconfig for GCP GKE cluster.

    Runs ``gcloud container clusters get-credentials`` against the region
    first. If that fails, each candidate zone ``<region>-a`` ..
    ``<region>-f`` is tried in turn, because not every region has a zone
    ``a`` (for example us-east1 only has b, c and d).

    Args:
        cluster_name: Name of the GKE cluster.
        region: GCP region the cluster lives in.
        project_id: GCP project that owns the cluster.

    Raises:
        click.ClickException: If credentials cannot be fetched for any
            location; the message carries the stderr of the last attempt.
    """
    self.log.info(f"Configuring kubeconfig for GKE cluster: {cluster_name}")

    # Regional lookup first, then every plausible zone in the region.
    location_flags = [f"--region={region}"] + [
        f"--zone={region}-{suffix}" for suffix in "abcdef"
    ]

    last_stderr = ""
    for location_flag in location_flags:
        result = subprocess.run(
            [
                "gcloud",
                "container",
                "clusters",
                "get-credentials",
                cluster_name,
                location_flag,
                f"--project={project_id}",
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        if result.returncode == 0:
            self.log.info("GKE kubeconfig configured successfully")
            return
        # Remember why this location failed and move on to the next one.
        last_stderr = result.stderr

    raise click.ClickException(
        f"Failed to configure GKE kubeconfig: {last_stderr}"
    )
861
+
505
862
  def _get_eks_cluster_info(self, cluster_name: str, region: str) -> Dict[str, Any]:
506
863
  """Get EKS cluster information using AWS CLI."""
507
864
  try:
@@ -698,11 +1055,33 @@ class KubernetesCloudSetupCommand:
698
1055
  zones=zones,
699
1056
  ),
700
1057
  )
701
- else:
702
- raise click.ClickException(
703
- "GCP support is not yet implemented. Please use AWS for now."
1058
+ elif provider == "gcp":
1059
+ assert infrastructure.project_id, "Project ID is required for GCP"
1060
+
1061
+ from anyscale.client.openapi_client.models import GCPConfig
1062
+
1063
+ # Dynamically determine zones from the GKE cluster
1064
+ zones = self._get_gke_zones(
1065
+ cluster_info.cluster_name, region, infrastructure.project_id
704
1066
  )
705
1067
 
1068
+ cloud_deployment = CloudDeployment(
1069
+ name=name,
1070
+ provider=CloudProviders.GCP,
1071
+ region=region,
1072
+ compute_stack=ComputeStack.K8S,
1073
+ object_storage=ObjectStorage(
1074
+ bucket_name=infrastructure.bucket_name, region=region
1075
+ ),
1076
+ gcp_config=GCPConfig(project_id=infrastructure.project_id,),
1077
+ kubernetes_config=OpenAPIKubernetesConfig(
1078
+ anyscale_operator_iam_identity=infrastructure.iam_role_arn,
1079
+ zones=zones,
1080
+ ),
1081
+ )
1082
+ else:
1083
+ raise click.ClickException(f"Unsupported provider: {provider}")
1084
+
706
1085
  # Register the cloud
707
1086
  try:
708
1087
  self._debug("Cloud deployment details:")
@@ -735,10 +1114,18 @@ class KubernetesCloudSetupCommand:
735
1114
  skip_verifications=True,
736
1115
  auto_add_user=True,
737
1116
  )
738
- else:
739
- raise click.ClickException(
740
- "GCP support is not yet implemented. Please use AWS for now."
1117
+ elif provider == "gcp":
1118
+ self.log.info("Calling register_gcp_cloud...")
1119
+ self.cloud_controller.register_gcp_cloud(
1120
+ name=name,
1121
+ cloud_resource=cloud_deployment,
1122
+ functional_verify=None,
1123
+ yes=True,
1124
+ skip_verifications=True,
1125
+ auto_add_user=True,
741
1126
  )
1127
+ else:
1128
+ raise click.ClickException(f"Unsupported provider: {provider}")
742
1129
  finally:
743
1130
  # Restore the original log.info method
744
1131
  self.cloud_controller.log.info = original_log_info
@@ -781,6 +1168,7 @@ class KubernetesCloudSetupCommand:
781
1168
  namespace: str,
782
1169
  infrastructure: InfrastructureResources,
783
1170
  values_file: Optional[str] = None,
1171
+ operator_chart: Optional[str] = None,
784
1172
  ) -> None:
785
1173
  """Install the Anyscale operator using Helm."""
786
1174
  self.log.info("Installing Anyscale operator...", block_label="Setup")
@@ -817,10 +1205,21 @@ class KubernetesCloudSetupCommand:
817
1205
  additional_values=set_string_values,
818
1206
  )
819
1207
 
1208
+ # Determine chart reference based on operator_chart parameter
1209
+ if operator_chart:
1210
+ # Use the provided chart path directly
1211
+ self._debug(f"Using operator chart from: {operator_chart}")
1212
+ chart_reference = operator_chart
1213
+ else:
1214
+ # Add Helm repo before installing
1215
+ self._debug("Adding Anyscale Helm repository...")
1216
+ self._add_helm_repo()
1217
+ chart_reference = "anyscale/anyscale-operator"
1218
+
820
1219
  # Build a simple Helm command that only uses the values file
821
1220
  self._debug("Generating Helm command...")
822
1221
  helm_command = (
823
- f"helm upgrade {release_name} anyscale/anyscale-operator "
1222
+ f"helm upgrade {release_name} {chart_reference} "
824
1223
  f"--values {values_file_path} "
825
1224
  f"--namespace {namespace} "
826
1225
  f"--create-namespace "
@@ -830,6 +1229,41 @@ class KubernetesCloudSetupCommand:
830
1229
 
831
1230
  self._execute_helm_command(helm_command)
832
1231
 
1232
def _add_helm_repo(self) -> None:
    """Add and update the Anyscale Helm repository.

    ``helm repo add`` is best-effort: a non-zero exit does not abort setup,
    but its stderr is written to the debug log so that a later update
    failure can be traced back to it. ``helm repo update`` must succeed.

    Raises:
        click.ClickException: If ``helm repo update anyscale`` fails.
    """
    # Add the Anyscale Helm repository
    self.log.info("Adding Anyscale Helm repository...", block_label="Setup")
    add_result = subprocess.run(
        [
            "helm",
            "repo",
            "add",
            "anyscale",
            "https://anyscale.github.io/helm-charts",
        ],
        capture_output=True,
        text=True,
        check=False,  # Don't fail if repo already exists
    )
    if add_result.returncode != 0:
        # Keep going, but leave a trace instead of discarding the reason.
        self._debug(
            f"'helm repo add' exited with code {add_result.returncode}: "
            f"{(add_result.stderr or '').strip()}"
        )

    try:
        # Update the Helm repository
        self.log.info("Updating Helm repositories...", block_label="Setup")
        subprocess.run(
            ["helm", "repo", "update", "anyscale"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        self.log.error(f"Failed to configure Helm repository: {e.stderr}")
        raise click.ClickException(
            f"Failed to configure Helm repository: {e.stderr}"
        ) from e

    self.log.info("Helm repository configured successfully", block_label="Setup")
1266
+
833
1267
  def _extract_set_string_values(self, helm_command: str) -> Dict[str, str]:
834
1268
  """
835
1269
  Extract all --set-string key=value pairs from a Helm command.
@@ -853,6 +1287,35 @@ class KubernetesCloudSetupCommand:
853
1287
 
854
1288
  return set_string_values
855
1289
 
1290
+ def _set_nested_value(self, d: Dict[str, Any], key_path: str, value: Any) -> None:
1291
+ """
1292
+ Set a value in a nested dictionary using a dotted key path.
1293
+
1294
+ Args:
1295
+ d: The dictionary to modify
1296
+ key_path: Dotted key path (e.g., "workloads.serviceaccount.name")
1297
+ value: The value to set
1298
+
1299
+ Example:
1300
+ _set_nested_value({}, "workloads.serviceaccount.name", "my-sa")
1301
+ # Results in: {"workloads": {"serviceaccount": {"name": "my-sa"}}}
1302
+ """
1303
+ keys = key_path.split(".")
1304
+ current = d
1305
+
1306
+ # Navigate/create the nested structure
1307
+ for key in keys[:-1]:
1308
+ if key not in current:
1309
+ current[key] = {}
1310
+ elif not isinstance(current[key], dict):
1311
+ # If the key exists but isn't a dict, we have a conflict
1312
+ # In this case, we'll overwrite it with a dict
1313
+ current[key] = {}
1314
+ current = current[key]
1315
+
1316
+ # Set the final value
1317
+ current[keys[-1]] = value
1318
+
856
1319
  def _prompt_for_namespace(
857
1320
  self, default_namespace: str, skip_confirmation: bool = False
858
1321
  ) -> str:
@@ -864,10 +1327,12 @@ class KubernetesCloudSetupCommand:
864
1327
  return final_namespace
865
1328
 
866
1329
  self.log.info("Configuring Kubernetes namespace...")
867
-
868
1330
  self.log.info(
869
- f"Enter the namespace to use for the Anyscale operator (default: {final_namespace}):"
1331
+ f"Specify the namespace to use for the Anyscale operator (leave blank for default: {final_namespace})."
870
1332
  )
1333
+ self.log.info("If the namespace does not exist, it will be created.")
1334
+ self.log.info("Enter your namespace:")
1335
+
871
1336
  final_namespace = click.prompt("", default=final_namespace, show_default=True)
872
1337
 
873
1338
  # Validate namespace (Kubernetes DNS-1123 label requirements)
@@ -900,24 +1365,32 @@ class KubernetesCloudSetupCommand:
900
1365
  """Generate Helm values file and save it locally."""
901
1366
  self.log.info("Generating Helm values file...")
902
1367
 
903
- # Create values dictionary starting with base values
904
- values: Dict[str, Any] = {
905
- "cloudProvider": provider,
906
- "cloudDeploymentId": cloud_deployment_id,
907
- "region": region,
908
- "operatorIamIdentity": infrastructure.iam_role_arn,
909
- "ingress-nginx": {"enabled": True},
910
- }
1368
+ # Start with an empty dictionary to build up values
1369
+ values: Dict[str, Any] = {}
911
1370
 
1371
+ # First, parse and merge additional_values with nested keys
912
1372
  if additional_values:
913
1373
  for key, value in additional_values.items():
914
- if key not in values:
915
- values[key] = value
1374
+ self._set_nested_value(values, key, value)
1375
+
1376
+ # Now overlay our constants on top (these take precedence)
1377
+ # Use _set_nested_value to ensure proper nesting
1378
+ self._set_nested_value(values, "global.cloudDeploymentId", cloud_deployment_id)
1379
+ self._set_nested_value(values, "global.cloudProvider", provider)
1380
+ self._set_nested_value(
1381
+ values, "global.auth.iamIdentity", infrastructure.iam_role_arn
1382
+ )
1383
+ self._set_nested_value(values, "ingress-nginx.enabled", True)
1384
+
1385
+ # Add region for AWS only (using global.aws.region)
1386
+ # Region field is deprecated for other providers
1387
+ if provider == "aws":
1388
+ self._set_nested_value(values, "global.aws.region", region)
916
1389
 
917
1390
  # Add control plane URL from ANYSCALE_HOST environment variable
918
1391
  if ANYSCALE_HOST:
919
- values["controlPlaneURL"] = ANYSCALE_HOST
920
- self._debug(f"Using control plane URL: {ANYSCALE_HOST}")
1392
+ self._set_nested_value(values, "global.controlPlaneURL", ANYSCALE_HOST)
1393
+ self.log.info(f"Using control plane URL: {ANYSCALE_HOST}")
921
1394
 
922
1395
  if custom_path:
923
1396
  values_file_path = custom_path
@@ -983,11 +1456,6 @@ class KubernetesCloudSetupCommand:
983
1456
  operator_namespace=namespace,
984
1457
  )
985
1458
 
986
- # Sleep to avoid race condition where operator has not loaded its IAM identity
987
- import time
988
-
989
- time.sleep(5)
990
-
991
1459
  # Run verification
992
1460
  success = verifier.verify(cloud_deployment)
993
1461
 
@@ -1009,6 +1477,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1009
1477
  yes: bool = False,
1010
1478
  values_file: Optional[str] = None,
1011
1479
  debug: bool = False,
1480
+ operator_chart: Optional[str] = None,
1012
1481
  ) -> None:
1013
1482
  """
1014
1483
  Set up Anyscale on a Kubernetes cluster.
@@ -1027,6 +1496,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1027
1496
  yes: Skip confirmation prompts
1028
1497
  values_file: Optional path for Helm values file
1029
1498
  debug: Enable debug logging
1499
+ operator_chart: Optional path to operator chart (skips helm repo add/update)
1030
1500
  """
1031
1501
  cmd = KubernetesCloudSetupCommand(debug=debug)
1032
1502
 
@@ -1041,6 +1511,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1041
1511
  functional_verify=functional_verify,
1042
1512
  yes=yes,
1043
1513
  values_file=values_file,
1514
+ operator_chart=operator_chart,
1044
1515
  )
1045
1516
  except Exception as e: # noqa: BLE001
1046
1517
  click.echo(f"Setup failed: {e}", err=True)