anyscale 0.26.67__py3-none-any.whl → 0.26.69__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anyscale/client/README.md +22 -0
- anyscale/client/openapi_client/__init__.py +16 -0
- anyscale/client/openapi_client/api/default_api.py +801 -19
- anyscale/client/openapi_client/models/__init__.py +16 -0
- anyscale/client/openapi_client/models/clusterdashboardnode_response.py +121 -0
- anyscale/client/openapi_client/models/lineage_artifact.py +383 -0
- anyscale/client/openapi_client/models/lineage_artifact_sort_field.py +101 -0
- anyscale/client/openapi_client/models/lineage_artifact_type.py +100 -0
- anyscale/client/openapi_client/models/lineage_direction.py +101 -0
- anyscale/client/openapi_client/models/lineage_graph.py +179 -0
- anyscale/client/openapi_client/models/lineage_graph_node.py +467 -0
- anyscale/client/openapi_client/models/lineage_node_type.py +100 -0
- anyscale/client/openapi_client/models/lineage_workload.py +383 -0
- anyscale/client/openapi_client/models/lineage_workload_sort_field.py +101 -0
- anyscale/client/openapi_client/models/lineage_workload_type.py +101 -0
- anyscale/client/openapi_client/models/lineageartifact_list_response.py +147 -0
- anyscale/client/openapi_client/models/lineageartifact_response.py +121 -0
- anyscale/client/openapi_client/models/lineagegraph_response.py +121 -0
- anyscale/client/openapi_client/models/lineageworkload_list_response.py +147 -0
- anyscale/client/openapi_client/models/lineageworkload_response.py +121 -0
- anyscale/commands/cloud_commands.py +15 -9
- anyscale/commands/command_examples.py +53 -0
- anyscale/commands/setup_k8s.py +521 -50
- anyscale/controllers/cloud_controller.py +13 -12
- anyscale/controllers/kubernetes_verifier.py +57 -11
- anyscale/version.py +1 -1
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/METADATA +1 -1
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/RECORD +33 -17
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/WHEEL +0 -0
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/entry_points.txt +0 -0
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/licenses/LICENSE +0 -0
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/licenses/NOTICE +0 -0
- {anyscale-0.26.67.dist-info → anyscale-0.26.69.dist-info}/top_level.txt +0 -0
anyscale/commands/setup_k8s.py
CHANGED
|
@@ -42,11 +42,7 @@ class ClusterInfo:
|
|
|
42
42
|
region: str
|
|
43
43
|
cluster_name: str
|
|
44
44
|
project_id: Optional[str] = None
|
|
45
|
-
cluster_arn: Optional[str] = None
|
|
46
45
|
oidc_provider: Optional[str] = None
|
|
47
|
-
cluster_location: Optional[str] = None
|
|
48
|
-
workload_identity_pool: Optional[str] = None
|
|
49
|
-
cluster_version: Optional[str] = None
|
|
50
46
|
|
|
51
47
|
|
|
52
48
|
@dataclass
|
|
@@ -79,6 +75,7 @@ class KubernetesCloudSetupCommand:
|
|
|
79
75
|
functional_verify: bool,
|
|
80
76
|
yes: bool,
|
|
81
77
|
values_file: Optional[str] = None,
|
|
78
|
+
operator_chart: Optional[str] = None,
|
|
82
79
|
) -> None:
|
|
83
80
|
"""
|
|
84
81
|
Main entry point for Kubernetes cloud setup.
|
|
@@ -93,6 +90,7 @@ class KubernetesCloudSetupCommand:
|
|
|
93
90
|
functional_verify: Whether to run functional verification
|
|
94
91
|
yes: Skip confirmation prompts
|
|
95
92
|
values_file: Optional custom path for Helm values file
|
|
93
|
+
operator_chart: Optional path to operator chart (skips helm repo add/update)
|
|
96
94
|
"""
|
|
97
95
|
self.log.open_block(
|
|
98
96
|
"Setup", f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
|
|
@@ -134,6 +132,7 @@ class KubernetesCloudSetupCommand:
|
|
|
134
132
|
final_namespace,
|
|
135
133
|
infrastructure,
|
|
136
134
|
values_file,
|
|
135
|
+
operator_chart,
|
|
137
136
|
)
|
|
138
137
|
|
|
139
138
|
# Step 6: Verify installation
|
|
@@ -194,7 +193,7 @@ class KubernetesCloudSetupCommand:
|
|
|
194
193
|
namespace: str,
|
|
195
194
|
provider: str,
|
|
196
195
|
region: str,
|
|
197
|
-
project_id: Optional[str],
|
|
196
|
+
project_id: Optional[str],
|
|
198
197
|
) -> ClusterInfo:
|
|
199
198
|
"""Discover and validate the target Kubernetes cluster using cloud provider APIs."""
|
|
200
199
|
self.log.info(
|
|
@@ -205,8 +204,12 @@ class KubernetesCloudSetupCommand:
|
|
|
205
204
|
if provider == "aws":
|
|
206
205
|
return self._discover_aws_cluster(cluster_name, namespace, region)
|
|
207
206
|
elif provider == "gcp":
|
|
208
|
-
|
|
209
|
-
|
|
207
|
+
if not project_id:
|
|
208
|
+
raise click.ClickException(
|
|
209
|
+
"GCP project ID is required. Please provide --project-id"
|
|
210
|
+
)
|
|
211
|
+
return self._discover_gcp_cluster(
|
|
212
|
+
cluster_name, namespace, region, project_id
|
|
210
213
|
)
|
|
211
214
|
else:
|
|
212
215
|
raise click.ClickException(f"Unsupported provider: {provider}")
|
|
@@ -215,18 +218,6 @@ class KubernetesCloudSetupCommand:
|
|
|
215
218
|
self, cluster_name: str, namespace: str, region: str
|
|
216
219
|
) -> ClusterInfo:
|
|
217
220
|
"""Discover AWS EKS cluster details and configure kubeconfig."""
|
|
218
|
-
try:
|
|
219
|
-
self._debug(f"Fetching EKS cluster info for {cluster_name} in {region}...")
|
|
220
|
-
cluster_info = self._get_eks_cluster_info(cluster_name, region)
|
|
221
|
-
self._debug(f"EKS Cluster ARN: {cluster_info.get('arn', 'Unknown')}")
|
|
222
|
-
self._debug(
|
|
223
|
-
f"EKS Cluster Version: {cluster_info.get('version', 'Unknown')}"
|
|
224
|
-
)
|
|
225
|
-
except Exception as e: # noqa: BLE001
|
|
226
|
-
self.log.error(f"Failed to get EKS cluster info: {e}")
|
|
227
|
-
raise click.ClickException(
|
|
228
|
-
f"Failed to discover EKS cluster {cluster_name}: {e}"
|
|
229
|
-
)
|
|
230
221
|
|
|
231
222
|
try:
|
|
232
223
|
self._debug("Fetching OIDC provider information...")
|
|
@@ -262,9 +253,39 @@ class KubernetesCloudSetupCommand:
|
|
|
262
253
|
provider="aws",
|
|
263
254
|
region=region,
|
|
264
255
|
cluster_name=cluster_name,
|
|
265
|
-
cluster_arn=cluster_info.get("arn"),
|
|
266
256
|
oidc_provider=oidc_provider,
|
|
267
|
-
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
def _discover_gcp_cluster(
|
|
260
|
+
self, cluster_name: str, namespace: str, region: str, project_id: str
|
|
261
|
+
) -> ClusterInfo:
|
|
262
|
+
"""Discover GCP GKE cluster details and configure kubeconfig."""
|
|
263
|
+
|
|
264
|
+
try:
|
|
265
|
+
self._debug("Configuring kubeconfig for GKE cluster...")
|
|
266
|
+
self._configure_gcp_kubeconfig(cluster_name, region, project_id)
|
|
267
|
+
except Exception as e: # noqa: BLE001
|
|
268
|
+
self.log.error(f"Failed to configure kubeconfig: {e}")
|
|
269
|
+
raise click.ClickException(
|
|
270
|
+
f"Failed to configure kubeconfig for GKE cluster: {e}"
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
try:
|
|
274
|
+
self._debug("Verifying kubeconfig configuration...")
|
|
275
|
+
self._verify_kubeconfig()
|
|
276
|
+
current_context = self._get_current_kubectl_context()
|
|
277
|
+
self.log.info(f"Cluster discovered: {current_context}", block_label="Setup")
|
|
278
|
+
except Exception as e: # noqa: BLE001
|
|
279
|
+
self.log.error(f"Failed to verify kubeconfig: {e}")
|
|
280
|
+
raise click.ClickException(f"Failed to verify kubeconfig: {e}")
|
|
281
|
+
|
|
282
|
+
return ClusterInfo(
|
|
283
|
+
context=current_context,
|
|
284
|
+
namespace=namespace,
|
|
285
|
+
provider="gcp",
|
|
286
|
+
region=region,
|
|
287
|
+
cluster_name=cluster_name,
|
|
288
|
+
project_id=project_id,
|
|
268
289
|
)
|
|
269
290
|
|
|
270
291
|
def _setup_infrastructure(
|
|
@@ -278,9 +299,7 @@ class KubernetesCloudSetupCommand:
|
|
|
278
299
|
if provider == "aws":
|
|
279
300
|
return self._setup_aws_infrastructure(region, name, cluster_info)
|
|
280
301
|
elif provider == "gcp":
|
|
281
|
-
|
|
282
|
-
"GCP support is not yet implemented. Please use AWS for now."
|
|
283
|
-
)
|
|
302
|
+
return self._setup_gcp_infrastructure(region, name, cluster_info)
|
|
284
303
|
else:
|
|
285
304
|
raise click.ClickException(f"Unsupported provider: {provider}")
|
|
286
305
|
|
|
@@ -502,6 +521,344 @@ class KubernetesCloudSetupCommand:
|
|
|
502
521
|
|
|
503
522
|
return json.dumps(template, indent=2)
|
|
504
523
|
|
|
524
|
+
def _setup_gcp_infrastructure( # noqa: PLR0912
|
|
525
|
+
self, region: str, name: str, cluster_info: ClusterInfo,
|
|
526
|
+
) -> InfrastructureResources:
|
|
527
|
+
"""Set up GCP infrastructure for Kubernetes using GCP Python SDK.
|
|
528
|
+
|
|
529
|
+
Note: Deployment Manager is deprecated so it is unused here.
|
|
530
|
+
Infrastructure Manager was tried but did not work well, so we rely
|
|
531
|
+
on the GCP Python SDK instead.
|
|
532
|
+
"""
|
|
533
|
+
try:
|
|
534
|
+
from anyscale.utils.gcp_utils import get_google_cloud_client_factory
|
|
535
|
+
except ImportError as e:
|
|
536
|
+
self.log.error(f"Failed to import required modules: {e}")
|
|
537
|
+
raise click.ClickException(f"Failed to import required modules: {e}")
|
|
538
|
+
|
|
539
|
+
try:
|
|
540
|
+
# Generate a unique cloud ID
|
|
541
|
+
cloud_id = f"k8s-{name}-{os.urandom(4).hex()}"
|
|
542
|
+
deployment_name = cloud_id.replace("_", "-").lower()
|
|
543
|
+
self._debug(f"Generated cloud ID: {cloud_id}")
|
|
544
|
+
self._debug(f"Infrastructure Manager deployment name: {deployment_name}")
|
|
545
|
+
except Exception as e: # noqa: BLE001
|
|
546
|
+
self.log.error(f"Failed to generate cloud ID: {e}")
|
|
547
|
+
raise click.ClickException(f"Failed to generate cloud ID: {e}")
|
|
548
|
+
|
|
549
|
+
try:
|
|
550
|
+
# Get Google Cloud client factory
|
|
551
|
+
factory = get_google_cloud_client_factory(self.log, cluster_info.project_id)
|
|
552
|
+
except Exception as e: # noqa: BLE001
|
|
553
|
+
self.log.error(f"Failed to initialize GCP client: {e}")
|
|
554
|
+
raise click.ClickException(f"Failed to initialize GCP client: {e}")
|
|
555
|
+
|
|
556
|
+
try:
|
|
557
|
+
with self.log.indent():
|
|
558
|
+
self.log.warning(
|
|
559
|
+
"NOTE: GCP resources (bucket and service account) created by this command are not managed by Anyscale.",
|
|
560
|
+
)
|
|
561
|
+
self.log.warning(
|
|
562
|
+
"You will need to manually delete these resources when the cloud is no longer needed.",
|
|
563
|
+
)
|
|
564
|
+
self.log.info(
|
|
565
|
+
"Creating GCP resources (bucket, service account, IAM bindings)...",
|
|
566
|
+
)
|
|
567
|
+
|
|
568
|
+
# Calculate resource names
|
|
569
|
+
# Service account name: anyscale-operator-<random 8 chars>
|
|
570
|
+
# Max length for GCP service account is 30 characters
|
|
571
|
+
random_suffix = os.urandom(4).hex() # 8 hex chars
|
|
572
|
+
anyscale_service_account_name = f"anyscale-operator-{random_suffix}"
|
|
573
|
+
bucket_name = f"anyscale-{cloud_id.replace('_', '-').lower()}"
|
|
574
|
+
|
|
575
|
+
# Create GCS bucket
|
|
576
|
+
self._debug(f"Creating GCS bucket: {bucket_name}")
|
|
577
|
+
storage_client = factory.storage.Client()
|
|
578
|
+
bucket = storage_client.bucket(bucket_name)
|
|
579
|
+
bucket.location = region
|
|
580
|
+
bucket.storage_class = "REGIONAL"
|
|
581
|
+
bucket.iam_configuration.uniform_bucket_level_access_enabled = True
|
|
582
|
+
bucket.iam_configuration.public_access_prevention = "enforced"
|
|
583
|
+
bucket.versioning_enabled = True
|
|
584
|
+
bucket.labels = {"anyscale-cloud-id": cloud_id.replace("-", "_")}
|
|
585
|
+
|
|
586
|
+
# Set CORS
|
|
587
|
+
# Use ANYSCALE_CORS_ORIGIN from shared config
|
|
588
|
+
# This respects the ANYSCALE_HOST environment variable
|
|
589
|
+
allowed_origin = ANYSCALE_CORS_ORIGIN
|
|
590
|
+
bucket.cors = [
|
|
591
|
+
{
|
|
592
|
+
"origin": [allowed_origin],
|
|
593
|
+
"responseHeader": ["*"],
|
|
594
|
+
"method": ["GET", "PUT", "POST", "HEAD", "DELETE"],
|
|
595
|
+
"maxAgeSeconds": 3600,
|
|
596
|
+
}
|
|
597
|
+
]
|
|
598
|
+
|
|
599
|
+
storage_client.create_bucket(bucket, location=region)
|
|
600
|
+
self.log.info(f"Created GCS bucket: {bucket_name}", block_label="Setup")
|
|
601
|
+
|
|
602
|
+
# Create service account
|
|
603
|
+
self._debug(
|
|
604
|
+
f"Creating service account: {anyscale_service_account_name}"
|
|
605
|
+
)
|
|
606
|
+
iam_client = factory.build("iam", "v1")
|
|
607
|
+
service_account_body = {
|
|
608
|
+
"accountId": anyscale_service_account_name,
|
|
609
|
+
"serviceAccount": {
|
|
610
|
+
"displayName": f"{cloud_id} Anyscale operator service account",
|
|
611
|
+
"description": "Service account for Anyscale Kubernetes operator",
|
|
612
|
+
},
|
|
613
|
+
}
|
|
614
|
+
|
|
615
|
+
service_account = (
|
|
616
|
+
iam_client.projects()
|
|
617
|
+
.serviceAccounts()
|
|
618
|
+
.create(
|
|
619
|
+
name=f"projects/{cluster_info.project_id}",
|
|
620
|
+
body=service_account_body,
|
|
621
|
+
)
|
|
622
|
+
.execute()
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
service_account_email = service_account["email"]
|
|
626
|
+
self.log.info(
|
|
627
|
+
f"Created service account: {service_account_email}",
|
|
628
|
+
block_label="Setup",
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
# Wait for service account to propagate through GCP systems
|
|
632
|
+
import time
|
|
633
|
+
|
|
634
|
+
self._debug("Waiting 10 seconds for service account to propagate...")
|
|
635
|
+
time.sleep(10)
|
|
636
|
+
|
|
637
|
+
# Grant Workload Identity binding
|
|
638
|
+
self._debug("Setting up Workload Identity binding")
|
|
639
|
+
|
|
640
|
+
# The K8s service account needs:
|
|
641
|
+
# 1. workloadIdentityUser role - to impersonate the GCP service account
|
|
642
|
+
# 2. serviceAccountTokenCreator - to generate tokens (for getOpenIdToken)
|
|
643
|
+
|
|
644
|
+
policy_body = {
|
|
645
|
+
"policy": {
|
|
646
|
+
"bindings": [
|
|
647
|
+
{
|
|
648
|
+
"role": "roles/iam.workloadIdentityUser",
|
|
649
|
+
"members": [
|
|
650
|
+
f"serviceAccount:{cluster_info.project_id}.svc.id.goog[{cluster_info.namespace}/anyscale-operator]"
|
|
651
|
+
],
|
|
652
|
+
},
|
|
653
|
+
{
|
|
654
|
+
"role": "roles/iam.serviceAccountTokenCreator",
|
|
655
|
+
"members": [f"serviceAccount:{service_account_email}"],
|
|
656
|
+
},
|
|
657
|
+
]
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
iam_client.projects().serviceAccounts().setIamPolicy(
|
|
662
|
+
resource=f"projects/{cluster_info.project_id}/serviceAccounts/{service_account_email}",
|
|
663
|
+
body=policy_body,
|
|
664
|
+
).execute()
|
|
665
|
+
|
|
666
|
+
self.log.info(
|
|
667
|
+
"Configured Workload Identity binding", block_label="Setup"
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
# Grant storage admin role to service account for the bucket
|
|
671
|
+
# Note: There's often a propagation delay after service account creation
|
|
672
|
+
# We need to retry with exponential backoff
|
|
673
|
+
self._debug("Granting storage permissions")
|
|
674
|
+
|
|
675
|
+
import time
|
|
676
|
+
|
|
677
|
+
max_retries = 5
|
|
678
|
+
retry_delay = 2 # Start with 2 seconds
|
|
679
|
+
|
|
680
|
+
for attempt in range(max_retries):
|
|
681
|
+
try:
|
|
682
|
+
bucket_policy = bucket.get_iam_policy(
|
|
683
|
+
requested_policy_version=3
|
|
684
|
+
)
|
|
685
|
+
bucket_policy.bindings.append(
|
|
686
|
+
{
|
|
687
|
+
"role": "roles/storage.admin",
|
|
688
|
+
"members": {f"serviceAccount:{service_account_email}"},
|
|
689
|
+
}
|
|
690
|
+
)
|
|
691
|
+
bucket.set_iam_policy(bucket_policy)
|
|
692
|
+
break # Success!
|
|
693
|
+
except Exception as e: # noqa: BLE001
|
|
694
|
+
if "does not exist" in str(e) and attempt < max_retries - 1:
|
|
695
|
+
self._debug(
|
|
696
|
+
f"Service account not yet propagated, retrying in {retry_delay}s... (attempt {attempt + 1}/{max_retries})"
|
|
697
|
+
)
|
|
698
|
+
time.sleep(retry_delay)
|
|
699
|
+
retry_delay *= 2 # Exponential backoff
|
|
700
|
+
else:
|
|
701
|
+
raise # Re-raise if it's not a propagation issue or we're out of retries
|
|
702
|
+
|
|
703
|
+
self.log.info(
|
|
704
|
+
"Granted storage permissions to service account",
|
|
705
|
+
block_label="Setup",
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
self.log.info("GCP resources created successfully", block_label="Setup")
|
|
709
|
+
self.log.warning(
|
|
710
|
+
f"REMINDER: To clean up when no longer needed, delete GCS bucket '{bucket_name}' and service account '{service_account_email}'"
|
|
711
|
+
)
|
|
712
|
+
except Exception as e: # noqa: BLE001
|
|
713
|
+
self.log.error(f"Failed to create GCP resources: {e}")
|
|
714
|
+
raise click.ClickException(f"Failed to create GCP resources: {e}")
|
|
715
|
+
|
|
716
|
+
# Resources were created in the try block above
|
|
717
|
+
# bucket_name and service_account_email are already set
|
|
718
|
+
self._debug(f"GCS Bucket: {bucket_name}")
|
|
719
|
+
self._debug(f"Service Account Email: {service_account_email}")
|
|
720
|
+
|
|
721
|
+
return InfrastructureResources(
|
|
722
|
+
bucket_name=bucket_name,
|
|
723
|
+
iam_role_arn=service_account_email, # For GCP, we use service account email
|
|
724
|
+
region=region,
|
|
725
|
+
project_id=cluster_info.project_id,
|
|
726
|
+
)
|
|
727
|
+
|
|
728
|
+
def _get_gke_cluster_info(
|
|
729
|
+
self, cluster_name: str, region: str, project_id: str
|
|
730
|
+
) -> Dict[str, Any]:
|
|
731
|
+
"""Get GKE cluster information using gcloud CLI."""
|
|
732
|
+
try:
|
|
733
|
+
# Try regional cluster first
|
|
734
|
+
result = subprocess.run(
|
|
735
|
+
[
|
|
736
|
+
"gcloud",
|
|
737
|
+
"container",
|
|
738
|
+
"clusters",
|
|
739
|
+
"describe",
|
|
740
|
+
cluster_name,
|
|
741
|
+
f"--region={region}",
|
|
742
|
+
f"--project={project_id}",
|
|
743
|
+
"--format=json",
|
|
744
|
+
],
|
|
745
|
+
capture_output=True,
|
|
746
|
+
text=True,
|
|
747
|
+
check=False,
|
|
748
|
+
)
|
|
749
|
+
if result.returncode == 0:
|
|
750
|
+
return json.loads(result.stdout)
|
|
751
|
+
|
|
752
|
+
# Try zonal cluster
|
|
753
|
+
# Assuming zone 'a' if regional fails
|
|
754
|
+
zone = f"{region}-a"
|
|
755
|
+
result = subprocess.run(
|
|
756
|
+
[
|
|
757
|
+
"gcloud",
|
|
758
|
+
"container",
|
|
759
|
+
"clusters",
|
|
760
|
+
"describe",
|
|
761
|
+
cluster_name,
|
|
762
|
+
f"--zone={zone}",
|
|
763
|
+
f"--project={project_id}",
|
|
764
|
+
"--format=json",
|
|
765
|
+
],
|
|
766
|
+
capture_output=True,
|
|
767
|
+
text=True,
|
|
768
|
+
check=True,
|
|
769
|
+
)
|
|
770
|
+
return json.loads(result.stdout)
|
|
771
|
+
except subprocess.CalledProcessError as e:
|
|
772
|
+
raise click.ClickException(f"Failed to get GKE cluster info: {e.stderr}")
|
|
773
|
+
except json.JSONDecodeError as e:
|
|
774
|
+
raise click.ClickException(f"Failed to parse GKE cluster info: {e}")
|
|
775
|
+
|
|
776
|
+
def _get_gke_zones(
|
|
777
|
+
self, cluster_name: str, region: str, project_id: str
|
|
778
|
+
) -> List[str]:
|
|
779
|
+
"""Get zones where the GKE cluster's node pools are located."""
|
|
780
|
+
try:
|
|
781
|
+
cluster_info = self._get_gke_cluster_info(cluster_name, region, project_id)
|
|
782
|
+
|
|
783
|
+
# Extract zones from node pools
|
|
784
|
+
zones = []
|
|
785
|
+
node_pools = cluster_info.get("nodePools", [])
|
|
786
|
+
|
|
787
|
+
for pool in node_pools:
|
|
788
|
+
# For zonal clusters, each node pool has locations
|
|
789
|
+
pool_locations = pool.get("locations", [])
|
|
790
|
+
zones.extend(pool_locations)
|
|
791
|
+
|
|
792
|
+
# If no zones found from node pools, try cluster-level locations
|
|
793
|
+
if not zones:
|
|
794
|
+
cluster_locations = cluster_info.get("locations", [])
|
|
795
|
+
if cluster_locations:
|
|
796
|
+
zones = cluster_locations
|
|
797
|
+
|
|
798
|
+
# Remove duplicates and sort
|
|
799
|
+
if zones:
|
|
800
|
+
unique_zones = sorted(set(zones))
|
|
801
|
+
self._debug(f"Discovered zones: {', '.join(unique_zones)}")
|
|
802
|
+
return unique_zones
|
|
803
|
+
else:
|
|
804
|
+
# Fallback to default zones
|
|
805
|
+
self._debug(
|
|
806
|
+
"No zones found in cluster info, falling back to default zones"
|
|
807
|
+
)
|
|
808
|
+
return [region + "-a", region + "-b", region + "-c"]
|
|
809
|
+
|
|
810
|
+
except Exception as e: # noqa: BLE001
|
|
811
|
+
self._debug(f"Failed to get zones: {e}, using default zones")
|
|
812
|
+
return [region + "-a", region + "-b", region + "-c"]
|
|
813
|
+
|
|
814
|
+
def _configure_gcp_kubeconfig(
|
|
815
|
+
self, cluster_name: str, region: str, project_id: str
|
|
816
|
+
) -> None:
|
|
817
|
+
"""Configure kubeconfig for GCP GKE cluster."""
|
|
818
|
+
self.log.info(f"Configuring kubeconfig for GKE cluster: {cluster_name}")
|
|
819
|
+
|
|
820
|
+
try:
|
|
821
|
+
# Try regional cluster first
|
|
822
|
+
result = subprocess.run(
|
|
823
|
+
[
|
|
824
|
+
"gcloud",
|
|
825
|
+
"container",
|
|
826
|
+
"clusters",
|
|
827
|
+
"get-credentials",
|
|
828
|
+
cluster_name,
|
|
829
|
+
f"--region={region}",
|
|
830
|
+
f"--project={project_id}",
|
|
831
|
+
],
|
|
832
|
+
capture_output=True,
|
|
833
|
+
text=True,
|
|
834
|
+
check=False,
|
|
835
|
+
)
|
|
836
|
+
if result.returncode == 0:
|
|
837
|
+
self.log.info("GKE kubeconfig configured successfully")
|
|
838
|
+
return
|
|
839
|
+
|
|
840
|
+
# Try zonal cluster
|
|
841
|
+
zone = f"{region}-a"
|
|
842
|
+
subprocess.run(
|
|
843
|
+
[
|
|
844
|
+
"gcloud",
|
|
845
|
+
"container",
|
|
846
|
+
"clusters",
|
|
847
|
+
"get-credentials",
|
|
848
|
+
cluster_name,
|
|
849
|
+
f"--zone={zone}",
|
|
850
|
+
f"--project={project_id}",
|
|
851
|
+
],
|
|
852
|
+
capture_output=True,
|
|
853
|
+
text=True,
|
|
854
|
+
check=True,
|
|
855
|
+
)
|
|
856
|
+
self.log.info("GKE kubeconfig configured successfully")
|
|
857
|
+
except subprocess.CalledProcessError as e:
|
|
858
|
+
raise click.ClickException(
|
|
859
|
+
f"Failed to configure GKE kubeconfig: {e.stderr}"
|
|
860
|
+
)
|
|
861
|
+
|
|
505
862
|
def _get_eks_cluster_info(self, cluster_name: str, region: str) -> Dict[str, Any]:
|
|
506
863
|
"""Get EKS cluster information using AWS CLI."""
|
|
507
864
|
try:
|
|
@@ -698,11 +1055,33 @@ class KubernetesCloudSetupCommand:
|
|
|
698
1055
|
zones=zones,
|
|
699
1056
|
),
|
|
700
1057
|
)
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
1058
|
+
elif provider == "gcp":
|
|
1059
|
+
assert infrastructure.project_id, "Project ID is required for GCP"
|
|
1060
|
+
|
|
1061
|
+
from anyscale.client.openapi_client.models import GCPConfig
|
|
1062
|
+
|
|
1063
|
+
# Dynamically determine zones from the GKE cluster
|
|
1064
|
+
zones = self._get_gke_zones(
|
|
1065
|
+
cluster_info.cluster_name, region, infrastructure.project_id
|
|
704
1066
|
)
|
|
705
1067
|
|
|
1068
|
+
cloud_deployment = CloudDeployment(
|
|
1069
|
+
name=name,
|
|
1070
|
+
provider=CloudProviders.GCP,
|
|
1071
|
+
region=region,
|
|
1072
|
+
compute_stack=ComputeStack.K8S,
|
|
1073
|
+
object_storage=ObjectStorage(
|
|
1074
|
+
bucket_name=infrastructure.bucket_name, region=region
|
|
1075
|
+
),
|
|
1076
|
+
gcp_config=GCPConfig(project_id=infrastructure.project_id,),
|
|
1077
|
+
kubernetes_config=OpenAPIKubernetesConfig(
|
|
1078
|
+
anyscale_operator_iam_identity=infrastructure.iam_role_arn,
|
|
1079
|
+
zones=zones,
|
|
1080
|
+
),
|
|
1081
|
+
)
|
|
1082
|
+
else:
|
|
1083
|
+
raise click.ClickException(f"Unsupported provider: {provider}")
|
|
1084
|
+
|
|
706
1085
|
# Register the cloud
|
|
707
1086
|
try:
|
|
708
1087
|
self._debug("Cloud deployment details:")
|
|
@@ -735,10 +1114,18 @@ class KubernetesCloudSetupCommand:
|
|
|
735
1114
|
skip_verifications=True,
|
|
736
1115
|
auto_add_user=True,
|
|
737
1116
|
)
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
1117
|
+
elif provider == "gcp":
|
|
1118
|
+
self.log.info("Calling register_gcp_cloud...")
|
|
1119
|
+
self.cloud_controller.register_gcp_cloud(
|
|
1120
|
+
name=name,
|
|
1121
|
+
cloud_resource=cloud_deployment,
|
|
1122
|
+
functional_verify=None,
|
|
1123
|
+
yes=True,
|
|
1124
|
+
skip_verifications=True,
|
|
1125
|
+
auto_add_user=True,
|
|
741
1126
|
)
|
|
1127
|
+
else:
|
|
1128
|
+
raise click.ClickException(f"Unsupported provider: {provider}")
|
|
742
1129
|
finally:
|
|
743
1130
|
# Restore the original log.info method
|
|
744
1131
|
self.cloud_controller.log.info = original_log_info
|
|
@@ -781,6 +1168,7 @@ class KubernetesCloudSetupCommand:
|
|
|
781
1168
|
namespace: str,
|
|
782
1169
|
infrastructure: InfrastructureResources,
|
|
783
1170
|
values_file: Optional[str] = None,
|
|
1171
|
+
operator_chart: Optional[str] = None,
|
|
784
1172
|
) -> None:
|
|
785
1173
|
"""Install the Anyscale operator using Helm."""
|
|
786
1174
|
self.log.info("Installing Anyscale operator...", block_label="Setup")
|
|
@@ -817,10 +1205,21 @@ class KubernetesCloudSetupCommand:
|
|
|
817
1205
|
additional_values=set_string_values,
|
|
818
1206
|
)
|
|
819
1207
|
|
|
1208
|
+
# Determine chart reference based on operator_chart parameter
|
|
1209
|
+
if operator_chart:
|
|
1210
|
+
# Use the provided chart path directly
|
|
1211
|
+
self._debug(f"Using operator chart from: {operator_chart}")
|
|
1212
|
+
chart_reference = operator_chart
|
|
1213
|
+
else:
|
|
1214
|
+
# Add Helm repo before installing
|
|
1215
|
+
self._debug("Adding Anyscale Helm repository...")
|
|
1216
|
+
self._add_helm_repo()
|
|
1217
|
+
chart_reference = "anyscale/anyscale-operator"
|
|
1218
|
+
|
|
820
1219
|
# Build a simple Helm command that only uses the values file
|
|
821
1220
|
self._debug("Generating Helm command...")
|
|
822
1221
|
helm_command = (
|
|
823
|
-
f"helm upgrade {release_name}
|
|
1222
|
+
f"helm upgrade {release_name} {chart_reference} "
|
|
824
1223
|
f"--values {values_file_path} "
|
|
825
1224
|
f"--namespace {namespace} "
|
|
826
1225
|
f"--create-namespace "
|
|
@@ -830,6 +1229,41 @@ class KubernetesCloudSetupCommand:
|
|
|
830
1229
|
|
|
831
1230
|
self._execute_helm_command(helm_command)
|
|
832
1231
|
|
|
1232
|
+
def _add_helm_repo(self) -> None:
|
|
1233
|
+
"""Add and update the Anyscale Helm repository."""
|
|
1234
|
+
try:
|
|
1235
|
+
# Add the Anyscale Helm repository
|
|
1236
|
+
self.log.info("Adding Anyscale Helm repository...", block_label="Setup")
|
|
1237
|
+
subprocess.run(
|
|
1238
|
+
[
|
|
1239
|
+
"helm",
|
|
1240
|
+
"repo",
|
|
1241
|
+
"add",
|
|
1242
|
+
"anyscale",
|
|
1243
|
+
"https://anyscale.github.io/helm-charts",
|
|
1244
|
+
],
|
|
1245
|
+
capture_output=True,
|
|
1246
|
+
text=True,
|
|
1247
|
+
check=False, # Don't fail if repo already exists
|
|
1248
|
+
)
|
|
1249
|
+
|
|
1250
|
+
# Update the Helm repository
|
|
1251
|
+
self.log.info("Updating Helm repositories...", block_label="Setup")
|
|
1252
|
+
subprocess.run(
|
|
1253
|
+
["helm", "repo", "update", "anyscale"],
|
|
1254
|
+
capture_output=True,
|
|
1255
|
+
text=True,
|
|
1256
|
+
check=True,
|
|
1257
|
+
)
|
|
1258
|
+
self.log.info(
|
|
1259
|
+
"Helm repository configured successfully", block_label="Setup"
|
|
1260
|
+
)
|
|
1261
|
+
except subprocess.CalledProcessError as e:
|
|
1262
|
+
self.log.error(f"Failed to configure Helm repository: {e.stderr}")
|
|
1263
|
+
raise click.ClickException(
|
|
1264
|
+
f"Failed to configure Helm repository: {e.stderr}"
|
|
1265
|
+
)
|
|
1266
|
+
|
|
833
1267
|
def _extract_set_string_values(self, helm_command: str) -> Dict[str, str]:
|
|
834
1268
|
"""
|
|
835
1269
|
Extract all --set-string key=value pairs from a Helm command.
|
|
@@ -853,6 +1287,35 @@ class KubernetesCloudSetupCommand:
|
|
|
853
1287
|
|
|
854
1288
|
return set_string_values
|
|
855
1289
|
|
|
1290
|
+
def _set_nested_value(self, d: Dict[str, Any], key_path: str, value: Any) -> None:
|
|
1291
|
+
"""
|
|
1292
|
+
Set a value in a nested dictionary using a dotted key path.
|
|
1293
|
+
|
|
1294
|
+
Args:
|
|
1295
|
+
d: The dictionary to modify
|
|
1296
|
+
key_path: Dotted key path (e.g., "workloads.serviceaccount.name")
|
|
1297
|
+
value: The value to set
|
|
1298
|
+
|
|
1299
|
+
Example:
|
|
1300
|
+
_set_nested_value({}, "workloads.serviceaccount.name", "my-sa")
|
|
1301
|
+
# Results in: {"workloads": {"serviceaccount": {"name": "my-sa"}}}
|
|
1302
|
+
"""
|
|
1303
|
+
keys = key_path.split(".")
|
|
1304
|
+
current = d
|
|
1305
|
+
|
|
1306
|
+
# Navigate/create the nested structure
|
|
1307
|
+
for key in keys[:-1]:
|
|
1308
|
+
if key not in current:
|
|
1309
|
+
current[key] = {}
|
|
1310
|
+
elif not isinstance(current[key], dict):
|
|
1311
|
+
# If the key exists but isn't a dict, we have a conflict
|
|
1312
|
+
# In this case, we'll overwrite it with a dict
|
|
1313
|
+
current[key] = {}
|
|
1314
|
+
current = current[key]
|
|
1315
|
+
|
|
1316
|
+
# Set the final value
|
|
1317
|
+
current[keys[-1]] = value
|
|
1318
|
+
|
|
856
1319
|
def _prompt_for_namespace(
|
|
857
1320
|
self, default_namespace: str, skip_confirmation: bool = False
|
|
858
1321
|
) -> str:
|
|
@@ -864,10 +1327,12 @@ class KubernetesCloudSetupCommand:
|
|
|
864
1327
|
return final_namespace
|
|
865
1328
|
|
|
866
1329
|
self.log.info("Configuring Kubernetes namespace...")
|
|
867
|
-
|
|
868
1330
|
self.log.info(
|
|
869
|
-
f"
|
|
1331
|
+
f"Specify the namespace to use for the Anyscale operator (leave blank for default: {final_namespace})."
|
|
870
1332
|
)
|
|
1333
|
+
self.log.info("If the namespace does not exist, it will be created.")
|
|
1334
|
+
self.log.info("Enter your namespace:")
|
|
1335
|
+
|
|
871
1336
|
final_namespace = click.prompt("", default=final_namespace, show_default=True)
|
|
872
1337
|
|
|
873
1338
|
# Validate namespace (Kubernetes DNS-1123 label requirements)
|
|
@@ -900,24 +1365,32 @@ class KubernetesCloudSetupCommand:
|
|
|
900
1365
|
"""Generate Helm values file and save it locally."""
|
|
901
1366
|
self.log.info("Generating Helm values file...")
|
|
902
1367
|
|
|
903
|
-
#
|
|
904
|
-
values: Dict[str, Any] = {
|
|
905
|
-
"cloudProvider": provider,
|
|
906
|
-
"cloudDeploymentId": cloud_deployment_id,
|
|
907
|
-
"region": region,
|
|
908
|
-
"operatorIamIdentity": infrastructure.iam_role_arn,
|
|
909
|
-
"ingress-nginx": {"enabled": True},
|
|
910
|
-
}
|
|
1368
|
+
# Start with an empty dictionary to build up values
|
|
1369
|
+
values: Dict[str, Any] = {}
|
|
911
1370
|
|
|
1371
|
+
# First, parse and merge additional_values with nested keys
|
|
912
1372
|
if additional_values:
|
|
913
1373
|
for key, value in additional_values.items():
|
|
914
|
-
|
|
915
|
-
|
|
1374
|
+
self._set_nested_value(values, key, value)
|
|
1375
|
+
|
|
1376
|
+
# Now overlay our constants on top (these take precedence)
|
|
1377
|
+
# Use _set_nested_value to ensure proper nesting
|
|
1378
|
+
self._set_nested_value(values, "global.cloudDeploymentId", cloud_deployment_id)
|
|
1379
|
+
self._set_nested_value(values, "global.cloudProvider", provider)
|
|
1380
|
+
self._set_nested_value(
|
|
1381
|
+
values, "global.auth.iamIdentity", infrastructure.iam_role_arn
|
|
1382
|
+
)
|
|
1383
|
+
self._set_nested_value(values, "ingress-nginx.enabled", True)
|
|
1384
|
+
|
|
1385
|
+
# Add region for AWS only (using global.aws.region)
|
|
1386
|
+
# Region field is deprecated for other providers
|
|
1387
|
+
if provider == "aws":
|
|
1388
|
+
self._set_nested_value(values, "global.aws.region", region)
|
|
916
1389
|
|
|
917
1390
|
# Add control plane URL from ANYSCALE_HOST environment variable
|
|
918
1391
|
if ANYSCALE_HOST:
|
|
919
|
-
values
|
|
920
|
-
self.
|
|
1392
|
+
self._set_nested_value(values, "global.controlPlaneURL", ANYSCALE_HOST)
|
|
1393
|
+
self.log.info(f"Using control plane URL: {ANYSCALE_HOST}")
|
|
921
1394
|
|
|
922
1395
|
if custom_path:
|
|
923
1396
|
values_file_path = custom_path
|
|
@@ -983,11 +1456,6 @@ class KubernetesCloudSetupCommand:
|
|
|
983
1456
|
operator_namespace=namespace,
|
|
984
1457
|
)
|
|
985
1458
|
|
|
986
|
-
# Sleep to avoid race condition where operator has not loaded its IAM identity
|
|
987
|
-
import time
|
|
988
|
-
|
|
989
|
-
time.sleep(5)
|
|
990
|
-
|
|
991
1459
|
# Run verification
|
|
992
1460
|
success = verifier.verify(cloud_deployment)
|
|
993
1461
|
|
|
@@ -1009,6 +1477,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
|
|
|
1009
1477
|
yes: bool = False,
|
|
1010
1478
|
values_file: Optional[str] = None,
|
|
1011
1479
|
debug: bool = False,
|
|
1480
|
+
operator_chart: Optional[str] = None,
|
|
1012
1481
|
) -> None:
|
|
1013
1482
|
"""
|
|
1014
1483
|
Set up Anyscale on a Kubernetes cluster.
|
|
@@ -1027,6 +1496,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
|
|
|
1027
1496
|
yes: Skip confirmation prompts
|
|
1028
1497
|
values_file: Optional path for Helm values file
|
|
1029
1498
|
debug: Enable debug logging
|
|
1499
|
+
operator_chart: Optional path to operator chart (skips helm repo add/update)
|
|
1030
1500
|
"""
|
|
1031
1501
|
cmd = KubernetesCloudSetupCommand(debug=debug)
|
|
1032
1502
|
|
|
@@ -1041,6 +1511,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
|
|
|
1041
1511
|
functional_verify=functional_verify,
|
|
1042
1512
|
yes=yes,
|
|
1043
1513
|
values_file=values_file,
|
|
1514
|
+
operator_chart=operator_chart,
|
|
1044
1515
|
)
|
|
1045
1516
|
except Exception as e: # noqa: BLE001
|
|
1046
1517
|
click.echo(f"Setup failed: {e}", err=True)
|