anyscale 0.26.68__py3-none-any.whl → 0.26.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. anyscale/_private/anyscale_client/anyscale_client.py +67 -1
  2. anyscale/_private/anyscale_client/common.py +20 -1
  3. anyscale/_private/anyscale_client/fake_anyscale_client.py +77 -10
  4. anyscale/client/README.md +16 -4
  5. anyscale/client/openapi_client/__init__.py +12 -4
  6. anyscale/client/openapi_client/api/default_api.py +588 -23
  7. anyscale/client/openapi_client/models/__init__.py +12 -4
  8. anyscale/client/openapi_client/models/api_key_info.py +29 -3
  9. anyscale/client/openapi_client/models/apply_autoscaling_config_update_model.py +350 -0
  10. anyscale/client/openapi_client/models/apply_production_service_multi_version_v2_model.py +207 -0
  11. anyscale/client/openapi_client/models/apply_production_service_v2_model.py +31 -3
  12. anyscale/client/openapi_client/models/baseimagesenum.py +70 -1
  13. anyscale/client/openapi_client/models/cloud_data_bucket_file_type.py +2 -1
  14. anyscale/client/openapi_client/models/{oauthconnectionresponse_response.py → clouddeployment_response.py} +11 -11
  15. anyscale/client/openapi_client/models/clusterdashboardnode_response.py +121 -0
  16. anyscale/client/openapi_client/models/create_experimental_workspace.py +29 -1
  17. anyscale/client/openapi_client/models/create_workspace_from_template.py +29 -1
  18. anyscale/client/openapi_client/models/create_workspace_template_version.py +31 -3
  19. anyscale/client/openapi_client/models/decorated_list_service_api_model.py +58 -1
  20. anyscale/client/openapi_client/models/decorated_production_service_v2_api_model.py +60 -3
  21. anyscale/client/openapi_client/models/decorated_service_event_api_model.py +3 -3
  22. anyscale/client/openapi_client/models/describe_machine_pool_machines_filters.py +33 -5
  23. anyscale/client/openapi_client/models/describe_machine_pool_workloads_filters.py +33 -5
  24. anyscale/client/openapi_client/models/{service_event_level.py → entity_type.py} +9 -9
  25. anyscale/client/openapi_client/models/event_level.py +2 -1
  26. anyscale/client/openapi_client/models/job_event_fields.py +206 -0
  27. anyscale/client/openapi_client/models/lineage_graph_node.py +70 -42
  28. anyscale/client/openapi_client/models/lineage_workload.py +31 -3
  29. anyscale/client/openapi_client/models/machine_type_partition_filter.py +152 -0
  30. anyscale/client/openapi_client/models/partition_info.py +30 -1
  31. anyscale/client/openapi_client/models/production_job_event.py +3 -3
  32. anyscale/client/openapi_client/models/rollout_strategy.py +2 -1
  33. anyscale/client/openapi_client/models/service_event_fields.py +318 -0
  34. anyscale/client/openapi_client/models/supportedbaseimagesenum.py +70 -1
  35. anyscale/client/openapi_client/models/task_summary_config.py +29 -3
  36. anyscale/client/openapi_client/models/task_table_config.py +29 -3
  37. anyscale/client/openapi_client/models/unified_event.py +377 -0
  38. anyscale/client/openapi_client/models/{ha_job_event_level.py → unified_origin_filter.py} +21 -9
  39. anyscale/client/openapi_client/models/unifiedevent_list_response.py +147 -0
  40. anyscale/client/openapi_client/models/workspace_event_fields.py +122 -0
  41. anyscale/client/openapi_client/models/workspace_template_version.py +30 -1
  42. anyscale/client/openapi_client/models/workspace_template_version_data_object.py +30 -1
  43. anyscale/cloud/models.py +2 -2
  44. anyscale/commands/cloud_commands.py +148 -11
  45. anyscale/commands/command_examples.py +53 -0
  46. anyscale/commands/job_commands.py +1 -1
  47. anyscale/commands/service_commands.py +130 -67
  48. anyscale/commands/setup_k8s.py +615 -49
  49. anyscale/controllers/cloud_controller.py +19 -5
  50. anyscale/controllers/kubernetes_verifier.py +80 -66
  51. anyscale/job/_private/job_sdk.py +47 -1
  52. anyscale/job/commands.py +3 -0
  53. anyscale/sdk/anyscale_client/models/apply_production_service_v2_model.py +31 -3
  54. anyscale/sdk/anyscale_client/models/apply_service_model.py +31 -3
  55. anyscale/sdk/anyscale_client/models/baseimagesenum.py +70 -1
  56. anyscale/sdk/anyscale_client/models/rollout_strategy.py +2 -1
  57. anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +70 -1
  58. anyscale/service/__init__.py +11 -3
  59. anyscale/service/_private/service_sdk.py +361 -35
  60. anyscale/service/commands.py +15 -3
  61. anyscale/service/models.py +12 -0
  62. anyscale/shared_anyscale_utils/latest_ray_version.py +1 -1
  63. anyscale/version.py +1 -1
  64. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/METADATA +1 -1
  65. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/RECORD +70 -62
  66. anyscale/client/openapi_client/models/o_auth_connection_response.py +0 -229
  67. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/WHEEL +0 -0
  68. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/entry_points.txt +0 -0
  69. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/licenses/LICENSE +0 -0
  70. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/licenses/NOTICE +0 -0
  71. {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/top_level.txt +0 -0
@@ -75,30 +75,59 @@ class KubernetesCloudSetupCommand:
75
75
  functional_verify: bool,
76
76
  yes: bool,
77
77
  values_file: Optional[str] = None,
78
+ operator_chart: Optional[str] = None,
79
+ cloud_id: Optional[str] = None,
80
+ resource_name: Optional[str] = None,
78
81
  ) -> None:
79
82
  """
80
83
  Main entry point for Kubernetes cloud setup.
81
84
 
85
+ This method handles both:
86
+ 1. Creating a new cloud (when cloud_id is None)
87
+ 2. Adding a resource to an existing cloud (when cloud_id is provided)
88
+
82
89
  Args:
83
90
  provider: Cloud provider (aws, gcp)
84
91
  region: AWS/GCP region
85
- name: Name for the Anyscale cloud
92
+ name: Name for the Anyscale cloud. If cloud_id is not provided, this will be used to create a new cloud.
86
93
  cluster_name: Kubernetes cluster name/context
87
94
  namespace: Namespace for the Anyscale operator
88
95
  project_id: GCP project ID (required for GCP)
89
96
  functional_verify: Whether to run functional verification
90
97
  yes: Skip confirmation prompts
91
98
  values_file: Optional custom path for Helm values file
99
+ operator_chart: Optional path to operator chart (skips helm repo add/update)
100
+ cloud_id: Optional cloud ID for the Anyscale cloud to add the resource to
101
+ resource_name: Optional name for the cloud resource (will be auto-generated if not provided)
92
102
  """
93
- self.log.open_block(
94
- "Setup", f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
95
- )
103
+ # Determine if we're creating a new cloud or adding to existing
104
+ create_cloud = cloud_id is None
105
+
106
+ # Validate cloud_id is provided when adding to existing cloud
107
+ if not create_cloud:
108
+ assert (
109
+ cloud_id
110
+ ), "cloud_id is required when adding a resource to an existing cloud"
111
+
112
+ # Set up logging message based on mode
113
+ if create_cloud:
114
+ setup_message = (
115
+ f"Setting up Kubernetes cloud '{name}' on {provider.upper()}"
116
+ )
117
+ else:
118
+ setup_message = f"Setting up Kubernetes cloud resource for '{name}' on {provider.upper()}"
119
+
120
+ self.log.open_block("Setup", setup_message)
96
121
 
97
122
  # Set confirmation flag
98
123
  self.skip_confirmation = yes
99
124
 
125
+ # Track what resources were created for cleanup messaging
126
+ infrastructure = None
127
+ cluster_info = None
128
+ cloud_resource_id = None
129
+
100
130
  try:
101
- # Step 0: Check required CLI tools are installed
102
131
  self._check_required_tools(provider)
103
132
 
104
133
  # Step 1: Prompt for namespace BEFORE infrastructure setup
@@ -117,29 +146,74 @@ class KubernetesCloudSetupCommand:
117
146
  provider, region, name, cluster_info
118
147
  )
119
148
 
120
- # Step 4: Register cloud with Anyscale
121
- cloud_id = self._register_cloud(
122
- name, provider, region, infrastructure, cluster_info
123
- )
149
+ # Step 4: Register cloud OR create cloud resource
150
+ if create_cloud:
151
+ # Register new cloud with Anyscale
152
+ cloud_id = self._register_cloud(
153
+ name, provider, region, infrastructure, cluster_info
154
+ )
155
+
156
+ # Get the cloud resource ID from the newly registered cloud
157
+ cloud_resources = self.cloud_controller.get_decorated_cloud_resources(
158
+ cloud_id
159
+ )
160
+ if not cloud_resources:
161
+ raise click.ClickException(
162
+ "No cloud resources found after registration"
163
+ )
164
+ cloud_resource_id = cloud_resources[0].cloud_resource_id
165
+ else:
166
+ # Should have been validated earlier, but just in case
167
+ assert (
168
+ cloud_id
169
+ ), "cloud_id is required when adding a resource to an existing cloud"
170
+
171
+ # Create cloud resource in existing cloud
172
+ cloud_resource_id = self._create_cloud_resource(
173
+ cloud_id,
174
+ provider,
175
+ region,
176
+ infrastructure,
177
+ cluster_info,
178
+ resource_name,
179
+ )
124
180
 
125
181
  # Step 5: Install Anyscale operator
126
182
  self._install_operator(
127
- cloud_id,
183
+ cloud_resource_id,
128
184
  provider,
129
185
  region,
130
186
  final_namespace,
131
187
  infrastructure,
132
188
  values_file,
189
+ operator_chart,
190
+ skip_confirmation=yes,
133
191
  )
134
192
 
135
193
  # Step 6: Verify installation
136
194
  if functional_verify:
137
- self._verify_installation(cloud_id, final_namespace, cluster_info)
195
+ self._verify_installation(
196
+ cloud_id, final_namespace, cluster_info, cloud_resource_id
197
+ )
138
198
 
139
199
  self.log.close_block("Setup")
140
- self.log.info(f"Kubernetes cloud '{name}' setup completed successfully!")
200
+ if create_cloud:
201
+ self.log.info(
202
+ f"Kubernetes cloud '{name}' setup completed successfully!"
203
+ )
204
+ else:
205
+ self.log.info(
206
+ f"Kubernetes cloud resource setup for '{name}' completed successfully!"
207
+ )
141
208
  except Exception: # noqa: BLE001
142
209
  self.log.close_block("Setup")
210
+ self._handle_setup_failure(
211
+ provider,
212
+ infrastructure,
213
+ cloud_id,
214
+ name,
215
+ is_cloud_resource_setup=not create_cloud,
216
+ )
143
217
  raise
144
218
 
145
219
  def _debug(self, *msg: str) -> None:
@@ -1128,14 +1202,16 @@ class KubernetesCloudSetupCommand:
1128
1202
  self.cloud_controller.log.info = original_log_info
1129
1203
 
1130
1204
  self._debug("Cloud registration completed, fetching cloud ID...")
1131
- clouds = (
1132
- self.cloud_controller.api_client.list_clouds_api_v2_clouds_get().results
1133
- )
1134
- cloud = next((c for c in clouds if c.name == name), None)
1135
- if not cloud:
1136
- raise click.ClickException("Failed to find registered cloud")
1205
+ # Use get_cloud_id_and_name helper to fetch the registered cloud
1206
+ from anyscale.cloud_utils import get_cloud_id_and_name
1207
+
1208
+ try:
1209
+ cloud_id, _ = get_cloud_id_and_name(
1210
+ self.cloud_controller.api_client, cloud_name=name
1211
+ )
1212
+ except Exception as e: # noqa: BLE001
1213
+ raise click.ClickException(f"Failed to find registered cloud: {e}")
1137
1214
 
1138
- cloud_id = getattr(cloud, "id", None) or getattr(cloud, "cloud_id", None)
1139
1215
  if not cloud_id:
1140
1216
  raise click.ClickException(
1141
1217
  "Failed to get cloud ID from registered cloud"
@@ -1159,26 +1235,23 @@ class KubernetesCloudSetupCommand:
1159
1235
 
1160
1236
  def _install_operator( # noqa: PLR0913
1161
1237
  self,
1162
- cloud_id: str,
1238
+ cloud_resource_id: str,
1163
1239
  provider: str,
1164
1240
  region: str,
1165
1241
  namespace: str,
1166
1242
  infrastructure: InfrastructureResources,
1167
1243
  values_file: Optional[str] = None,
1244
+ operator_chart: Optional[str] = None,
1245
+ skip_confirmation: bool = False,
1168
1246
  ) -> None:
1169
1247
  """Install the Anyscale operator using Helm."""
1170
1248
  self.log.info("Installing Anyscale operator...", block_label="Setup")
1171
1249
 
1172
- # Get cloud resources to get the cloud resource ID
1173
- cloud_resources = self.cloud_controller.get_decorated_cloud_resources(cloud_id)
1174
-
1175
- if not cloud_resources:
1176
- raise click.ClickException("No cloud resources found")
1177
-
1178
- cloud_resource_id = cloud_resources[0].cloud_resource_id
1179
-
1180
1250
  release_name = "anyscale-operator"
1181
1251
 
1252
+ # Prompt user about nginx ingress installation
1253
+ install_nginx = self._prompt_for_nginx_ingress(skip_confirmation)
1254
+
1182
1255
  # Generate Helm command and extract --set-string flags from it
1183
1256
  self._debug("Generating Helm command to extract parameters...")
1184
1257
  helm_command = self.cloud_controller._generate_helm_upgrade_command( # noqa: SLF001
@@ -1199,16 +1272,24 @@ class KubernetesCloudSetupCommand:
1199
1272
  infrastructure=infrastructure,
1200
1273
  custom_path=values_file,
1201
1274
  additional_values=set_string_values,
1275
+ install_nginx_ingress=install_nginx,
1202
1276
  )
1203
1277
 
1204
- # Add Helm repo before installing
1205
- self._debug("Adding Anyscale Helm repository...")
1206
- self._add_helm_repo()
1278
+ # Determine chart reference based on operator_chart parameter
1279
+ if operator_chart:
1280
+ # Use the provided chart path directly
1281
+ self._debug(f"Using operator chart from: {operator_chart}")
1282
+ chart_reference = operator_chart
1283
+ else:
1284
+ # Add Helm repo before installing
1285
+ self._debug("Adding Anyscale Helm repository...")
1286
+ self._add_helm_repo()
1287
+ chart_reference = "anyscale/anyscale-operator"
1207
1288
 
1208
1289
  # Build a simple Helm command that only uses the values file
1209
1290
  self._debug("Generating Helm command...")
1210
1291
  helm_command = (
1211
- f"helm upgrade {release_name} anyscale/anyscale-operator "
1292
+ f"helm upgrade {release_name} {chart_reference} "
1212
1293
  f"--values {values_file_path} "
1213
1294
  f"--namespace {namespace} "
1214
1295
  f"--create-namespace "
@@ -1276,6 +1357,35 @@ class KubernetesCloudSetupCommand:
1276
1357
 
1277
1358
  return set_string_values
1278
1359
 
1360
+ def _set_nested_value(self, d: Dict[str, Any], key_path: str, value: Any) -> None:
1361
+ """
1362
+ Set a value in a nested dictionary using a dotted key path.
1363
+
1364
+ Args:
1365
+ d: The dictionary to modify
1366
+ key_path: Dotted key path (e.g., "workloads.serviceaccount.name")
1367
+ value: The value to set
1368
+
1369
+ Example:
1370
+ _set_nested_value({}, "workloads.serviceaccount.name", "my-sa")
1371
+ # Results in: {"workloads": {"serviceaccount": {"name": "my-sa"}}}
1372
+ """
1373
+ keys = key_path.split(".")
1374
+ current = d
1375
+
1376
+ # Navigate/create the nested structure
1377
+ for key in keys[:-1]:
1378
+ if key not in current:
1379
+ current[key] = {}
1380
+ elif not isinstance(current[key], dict):
1381
+ # If the key exists but isn't a dict, we have a conflict
1382
+ # In this case, we'll overwrite it with a dict
1383
+ current[key] = {}
1384
+ current = current[key]
1385
+
1386
+ # Set the final value
1387
+ current[keys[-1]] = value
1388
+
1279
1389
  def _prompt_for_namespace(
1280
1390
  self, default_namespace: str, skip_confirmation: bool = False
1281
1391
  ) -> str:
@@ -1287,10 +1397,12 @@ class KubernetesCloudSetupCommand:
1287
1397
  return final_namespace
1288
1398
 
1289
1399
  self.log.info("Configuring Kubernetes namespace...")
1290
-
1291
1400
  self.log.info(
1292
- f"Enter the namespace to use for the Anyscale operator (default: {final_namespace}):"
1401
+ f"Specify the namespace to use for the Anyscale operator (leave blank for default: {final_namespace})."
1293
1402
  )
1403
+ self.log.info("If the namespace does not exist, it will be created.")
1404
+ self.log.info("Enter your namespace:")
1405
+
1294
1406
  final_namespace = click.prompt("", default=final_namespace, show_default=True)
1295
1407
 
1296
1408
  # Validate namespace (Kubernetes DNS-1123 label requirements)
@@ -1310,6 +1422,36 @@ class KubernetesCloudSetupCommand:
1310
1422
 
1311
1423
  return final_namespace
1312
1424
 
1425
+ def _prompt_for_nginx_ingress(self, skip_confirmation: bool = False) -> bool:
1426
+ """Prompt user whether to install nginx ingress subchart."""
1427
+ if skip_confirmation:
1428
+ self.log.info("Using default: nginx ingress subchart will be installed")
1429
+ return True
1430
+
1431
+ self.log.info(
1432
+ "The Anyscale operator can install an nginx ingress controller as part of the setup.",
1433
+ block_label="Setup",
1434
+ )
1435
+ self.log.info(
1436
+ "If you already have an ingress controller installed, you may want to skip this.",
1437
+ block_label="Setup",
1438
+ )
1439
+
1440
+ response = click.confirm(
1441
+ "Do you want to install the nginx ingress subchart?", default=True
1442
+ )
1443
+
1444
+ if response:
1445
+ self.log.info(
1446
+ "nginx ingress subchart will be installed", block_label="Setup"
1447
+ )
1448
+ else:
1449
+ self.log.info(
1450
+ "nginx ingress subchart will NOT be installed", block_label="Setup"
1451
+ )
1452
+
1453
+ return response
1454
+
1313
1455
  def _generate_helm_values_file( # noqa: PLR0913
1314
1456
  self,
1315
1457
  provider: str,
@@ -1319,29 +1461,36 @@ class KubernetesCloudSetupCommand:
1319
1461
  infrastructure: InfrastructureResources,
1320
1462
  custom_path: Optional[str] = None,
1321
1463
  additional_values: Optional[Dict[str, str]] = None,
1464
+ install_nginx_ingress: bool = True,
1322
1465
  ) -> str:
1323
1466
  """Generate Helm values file and save it locally."""
1324
1467
  self.log.info("Generating Helm values file...")
1325
1468
 
1326
- # Create values dictionary starting with base values
1327
- values: Dict[str, Any] = {
1328
- "global": {
1329
- "cloudDeploymentId": cloud_deployment_id,
1330
- "cloudProvider": provider,
1331
- "region": region,
1332
- "auth": {"iamIdentity": infrastructure.iam_role_arn,},
1333
- },
1334
- "ingress-nginx": {"enabled": True},
1335
- }
1469
+ # Start with an empty dictionary to build up values
1470
+ values: Dict[str, Any] = {}
1336
1471
 
1472
+ # First, parse and merge additional_values with nested keys
1337
1473
  if additional_values:
1338
1474
  for key, value in additional_values.items():
1339
- if key not in values:
1340
- values[key] = value
1475
+ self._set_nested_value(values, key, value)
1476
+
1477
+ # Now overlay our constants on top (these take precedence)
1478
+ # Use _set_nested_value to ensure proper nesting
1479
+ self._set_nested_value(values, "global.cloudDeploymentId", cloud_deployment_id)
1480
+ self._set_nested_value(values, "global.cloudProvider", provider)
1481
+ self._set_nested_value(
1482
+ values, "global.auth.iamIdentity", infrastructure.iam_role_arn
1483
+ )
1484
+ self._set_nested_value(values, "ingress-nginx.enabled", install_nginx_ingress)
1485
+
1486
+ # Add region for AWS only (using global.aws.region)
1487
+ # Region field is deprecated for other providers
1488
+ if provider == "aws":
1489
+ self._set_nested_value(values, "global.aws.region", region)
1341
1490
 
1342
1491
  # Add control plane URL from ANYSCALE_HOST environment variable
1343
1492
  if ANYSCALE_HOST:
1344
- values["controlPlaneURL"] = ANYSCALE_HOST
1493
+ self._set_nested_value(values, "global.controlPlaneURL", ANYSCALE_HOST)
1345
1494
  self.log.info(f"Using control plane URL: {ANYSCALE_HOST}")
1346
1495
 
1347
1496
  if custom_path:
@@ -1384,7 +1533,11 @@ class KubernetesCloudSetupCommand:
1384
1533
  )
1385
1534
 
1386
1535
  def _verify_installation(
1387
- self, cloud_id: str, namespace: str, cluster_info: ClusterInfo
1536
+ self,
1537
+ cloud_id: str,
1538
+ namespace: str,
1539
+ cluster_info: ClusterInfo,
1540
+ cloud_resource_id: Optional[str] = None,
1388
1541
  ) -> None:
1389
1542
  """Verify the Kubernetes installation."""
1390
1543
  self.log.info("Verifying installation...")
@@ -1395,7 +1548,20 @@ class KubernetesCloudSetupCommand:
1395
1548
  if not cloud_resources:
1396
1549
  raise click.ClickException("No cloud resources found for verification")
1397
1550
 
1398
- cloud_deployment = cloud_resources[0]
1551
+ # Find the specific cloud resource if cloud_resource_id is provided
1552
+ cloud_deployment = None
1553
+ if cloud_resource_id:
1554
+ for resource in cloud_resources:
1555
+ if resource.cloud_resource_id == cloud_resource_id:
1556
+ cloud_deployment = resource
1557
+ break
1558
+ if not cloud_deployment:
1559
+ raise click.ClickException(
1560
+ f"Could not find cloud resource with ID {cloud_resource_id}"
1561
+ )
1562
+ else:
1563
+ # Fallback to first resource for backward compatibility
1564
+ cloud_deployment = cloud_resources[0]
1399
1565
 
1400
1566
  # Use the existing Kubernetes verifier
1401
1567
  verifier = KubernetesCloudDeploymentVerifier(
@@ -1417,6 +1583,314 @@ class KubernetesCloudSetupCommand:
1417
1583
  self.log.error("Verification failed - please check the logs above")
1418
1584
  raise click.ClickException("Installation verification failed")
1419
1585
 
1586
def _create_cloud_resource(  # noqa: PLR0912
    self,
    cloud_id: str,
    provider: str,
    region: str,
    infrastructure: InfrastructureResources,
    cluster_info: ClusterInfo,
    resource_name: Optional[str],
) -> str:
    """
    Create a cloud resource in an existing cloud and return the cloud_resource_id.

    A ``CloudDeployment`` spec is built for the provider, written to a
    temporary YAML file, and handed to
    ``cloud_controller.create_cloud_resource``. The temporary file is removed
    afterwards, including when serialization or the controller call fails.

    Args:
        cloud_id: ID of the existing cloud
        provider: Cloud provider (aws, gcp)
        region: Cloud region
        infrastructure: Infrastructure resources created
        cluster_info: Cluster information
        resource_name: Name for the cloud resource (optional, will be auto-generated if not provided)

    Returns:
        The cloud_resource_id of the created resource

    Raises:
        click.ClickException: If the provider is unsupported, or if writing
            the spec / creating the resource fails (the original error is
            chained as the cause).
    """
    self.log.info("Creating cloud resource in Anyscale...", block_label="Setup")
    if resource_name:
        self.log.info(f"Using resource name: {resource_name}", block_label="Setup")
    else:
        self.log.info("Resource name will be auto-generated", block_label="Setup")

    if provider == "aws":
        # Dynamically determine availability zones from the EKS cluster
        zones = self._get_eks_availability_zones(cluster_info.cluster_name, region)

        cloud_deployment = CloudDeployment(
            name=resource_name,
            provider=CloudProviders.AWS,
            region=region,
            compute_stack=ComputeStack.K8S,
            object_storage=ObjectStorage(
                bucket_name=infrastructure.bucket_name, region=region
            ),
            aws_config=AWSConfig(),
            kubernetes_config=OpenAPIKubernetesConfig(
                anyscale_operator_iam_identity=infrastructure.iam_role_arn,
                zones=zones,
            ),
        )
    elif provider == "gcp":
        assert infrastructure.project_id, "Project ID is required for GCP"

        from anyscale.client.openapi_client.models import GCPConfig

        # Dynamically determine zones from the GKE cluster
        zones = self._get_gke_zones(
            cluster_info.cluster_name, region, infrastructure.project_id
        )

        cloud_deployment = CloudDeployment(
            name=resource_name,
            provider=CloudProviders.GCP,
            region=region,
            compute_stack=ComputeStack.K8S,
            object_storage=ObjectStorage(
                bucket_name=infrastructure.bucket_name, region=region
            ),
            gcp_config=GCPConfig(project_id=infrastructure.project_id),
            kubernetes_config=OpenAPIKubernetesConfig(
                anyscale_operator_iam_identity=infrastructure.iam_role_arn,
                zones=zones,
            ),
        )
    else:
        raise click.ClickException(f"Unsupported provider: {provider}")

    # Create cloud resource using the API
    try:
        self._debug("Cloud deployment details:")
        self._debug(f" Provider: {cloud_deployment.provider}")
        self._debug(f" Region: {cloud_deployment.region}")
        self._debug(f" Compute Stack: {cloud_deployment.compute_stack}")
        self._debug(f" Bucket Name: {cloud_deployment.object_storage.bucket_name}")
        self._debug(
            f" IAM Identity: {cloud_deployment.kubernetes_config.anyscale_operator_iam_identity}"
        )

        # Save cloud deployment to a temporary file and use create_cloud_resource
        self._debug("Saving cloud deployment to temporary file...")
        import os as os_module
        import tempfile

        # Track the path as soon as the file exists so the finally-clause can
        # remove it even if serialization fails before the controller call.
        temp_file_path: Optional[str] = None
        try:
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".yaml", delete=False
            ) as temp_file:
                temp_file_path = temp_file.name
                # Convert CloudDeployment to dict for YAML serialization
                deployment_dict = cloud_deployment.to_dict()
                yaml.dump(deployment_dict, temp_file, default_flow_style=False)

            self._debug(f"Created temporary spec file: {temp_file_path}")
            self._debug("Calling cloud_controller.create_cloud_resource...")

            # Use cloud_controller's create_cloud_resource method which now returns the cloud_resource_id
            cloud_resource_id = self.cloud_controller.create_cloud_resource(
                cloud=None,
                cloud_id=cloud_id,
                spec_file=temp_file_path,
                skip_verification=True,  # We will do verification in the _verify_installation method
                yes=True,  # Skip confirmation prompts
            )
        finally:
            # Clean up the temporary file (best effort; never mask the real error)
            if temp_file_path is not None:
                try:
                    os_module.unlink(temp_file_path)
                    self._debug(f"Cleaned up temporary file: {temp_file_path}")
                except OSError as cleanup_err:
                    self._debug(f"Failed to clean up temporary file: {cleanup_err}")

        self.log.info(
            f"Cloud resource created with ID: {cloud_resource_id}",
            block_label="Setup",
        )

        return cloud_resource_id

    except Exception as e:  # noqa: BLE001
        self.log.error(f"Cloud resource creation failed with error: {e}")
        self.log.error(f"Error type: {type(e).__name__}")
        if hasattr(e, "response"):
            self.log.error(f"Response details: {getattr(e, 'response', 'N/A')}")
        # Every exception has ``args``, so this is logged unconditionally.
        self.log.error(f"Error args: {e.args}")
        import traceback

        self.log.error(f"Full traceback: {traceback.format_exc()}")
        # Chain the cause so the original failure stays visible in tracebacks.
        raise click.ClickException(f"Failed to create cloud resource: {e}") from e
1723
+
1724
+ def _handle_setup_failure(
1725
+ self,
1726
+ provider: str,
1727
+ infrastructure: Optional[InfrastructureResources],
1728
+ cloud_id: Optional[str],
1729
+ name: str,
1730
+ is_cloud_resource_setup: bool = False,
1731
+ ) -> None:
1732
+ """Handle setup failure by providing cleanup instructions to the user."""
1733
+ self.log.error("")
1734
+ self.log.error("=" * 80)
1735
+ self.log.error("SETUP FAILED - MANUAL CLEANUP REQUIRED")
1736
+ self.log.error("=" * 80)
1737
+ self.log.error("")
1738
+
1739
+ if is_cloud_resource_setup:
1740
+ self.log.error(
1741
+ "The Kubernetes cloud resource setup failed, leaving resources in an incomplete state."
1742
+ )
1743
+ else:
1744
+ self.log.error(
1745
+ "The Kubernetes cloud setup failed, leaving resources in an incomplete state."
1746
+ )
1747
+
1748
+ self.log.error(
1749
+ "You must manually clean up the following resources to avoid charges:"
1750
+ )
1751
+ self.log.error("")
1752
+
1753
+ if provider == "aws":
1754
+ self._log_aws_cleanup_instructions(
1755
+ infrastructure, cloud_id, name, is_cloud_resource_setup
1756
+ )
1757
+ elif provider == "gcp":
1758
+ self._log_gcp_cleanup_instructions(
1759
+ infrastructure, cloud_id, name, is_cloud_resource_setup
1760
+ )
1761
+
1762
+ self.log.error("")
1763
+ self.log.error("=" * 80)
1764
+
1765
+ def _log_aws_cleanup_instructions(
1766
+ self,
1767
+ infrastructure: Optional[InfrastructureResources],
1768
+ cloud_id: Optional[str],
1769
+ name: str,
1770
+ is_cloud_resource_setup: bool = False,
1771
+ ) -> None:
1772
+ """Log AWS-specific cleanup instructions."""
1773
+ self.log.error("AWS Resources to clean up:")
1774
+ self.log.error("")
1775
+
1776
+ if infrastructure:
1777
+ self.log.error("1. CloudFormation Stack:")
1778
+ # Stack name pattern: k8s-{name}-{random} with underscores replaced by hyphens
1779
+ stack_name_pattern = f"k8s-{name}-*".replace("_", "-").lower()
1780
+ self.log.error(
1781
+ f" - Find and delete the CloudFormation stack matching pattern: {stack_name_pattern}"
1782
+ )
1783
+ self.log.error(f" - Region: {infrastructure.region}")
1784
+ self.log.error(
1785
+ " - AWS Console: CloudFormation > Stacks > Select stack > Delete"
1786
+ )
1787
+ self.log.error(
1788
+ f" - AWS CLI: aws cloudformation delete-stack --stack-name <stack-name> --region {infrastructure.region}"
1789
+ )
1790
+ self.log.error("")
1791
+ self.log.error(" This will automatically delete:")
1792
+ self.log.error(f" - S3 Bucket: {infrastructure.bucket_name}")
1793
+ self.log.error(f" - IAM Role: {infrastructure.iam_role_arn}")
1794
+ self.log.error("")
1795
+
1796
+ if cloud_id:
1797
+ if is_cloud_resource_setup:
1798
+ self.log.error("2. Anyscale Cloud Resource:")
1799
+ self.log.error(
1800
+ f" - Delete the cloud resource from cloud '{name}' (ID: {cloud_id})"
1801
+ )
1802
+ self.log.error(
1803
+ f" - CLI: anyscale cloud resource delete --cloud '{name}' --resource <resource-name>"
1804
+ )
1805
+ self.log.error(
1806
+ " - To find the resource name, run: anyscale cloud get --name '{name}'"
1807
+ )
1808
+ self.log.error(
1809
+ f" - Console: {ANYSCALE_HOST}/clouds (if using custom host)"
1810
+ )
1811
+ else:
1812
+ self.log.error("2. Anyscale Cloud Registration:")
1813
+ self.log.error(
1814
+ f" - Delete the cloud '{name}' (ID: {cloud_id}) from Anyscale"
1815
+ )
1816
+ self.log.error(f" - CLI: anyscale cloud delete --name '{name}'")
1817
+ self.log.error(
1818
+ f" - Console: {ANYSCALE_HOST}/clouds (if using custom host)"
1819
+ )
1820
+ self.log.error("")
1821
+
1822
+ if not infrastructure:
1823
+ self.log.error(
1824
+ "No infrastructure resources were created before the failure."
1825
+ )
1826
+ self.log.error("")
1827
+
1828
+ def _log_gcp_cleanup_instructions(
1829
+ self,
1830
+ infrastructure: Optional[InfrastructureResources],
1831
+ cloud_id: Optional[str],
1832
+ name: str,
1833
+ is_cloud_resource_setup: bool = False,
1834
+ ) -> None:
1835
+ """Log GCP-specific cleanup instructions."""
1836
+ self.log.error("GCP Resources to clean up:")
1837
+ self.log.error("")
1838
+
1839
+ if infrastructure:
1840
+ self.log.error("1. GCS Bucket:")
1841
+ self.log.error(f" - Bucket: {infrastructure.bucket_name}")
1842
+ self.log.error(f" - Project: {infrastructure.project_id}")
1843
+ self.log.error(
1844
+ " - GCP Console: Cloud Storage > Buckets > Select bucket > Delete"
1845
+ )
1846
+ self.log.error(
1847
+ f" - gcloud CLI: gsutil rm -r gs://{infrastructure.bucket_name}"
1848
+ )
1849
+ self.log.error("")
1850
+
1851
+ self.log.error("2. Service Account:")
1852
+ self.log.error(f" - Service Account: {infrastructure.iam_role_arn}")
1853
+ self.log.error(f" - Project: {infrastructure.project_id}")
1854
+ self.log.error(
1855
+ " - GCP Console: IAM & Admin > Service Accounts > Select account > Delete"
1856
+ )
1857
+ self.log.error(
1858
+ f" - gcloud CLI: gcloud iam service-accounts delete {infrastructure.iam_role_arn} --project={infrastructure.project_id}"
1859
+ )
1860
+ self.log.error("")
1861
+
1862
+ if cloud_id:
1863
+ if is_cloud_resource_setup:
1864
+ self.log.error("3. Anyscale Cloud Resource:")
1865
+ self.log.error(
1866
+ f" - Delete the cloud resource from cloud '{name}' (ID: {cloud_id})"
1867
+ )
1868
+ self.log.error(
1869
+ f" - CLI: anyscale cloud resource delete --cloud '{name}' --resource <resource-name>"
1870
+ )
1871
+ self.log.error(
1872
+ " - To find the resource name, run: anyscale cloud get --name '{name}'"
1873
+ )
1874
+ self.log.error(
1875
+ f" - Console: {ANYSCALE_HOST}/clouds (if using custom host)"
1876
+ )
1877
+ else:
1878
+ self.log.error("3. Anyscale Cloud Registration:")
1879
+ self.log.error(
1880
+ f" - Delete the cloud '{name}' (ID: {cloud_id}) from Anyscale"
1881
+ )
1882
+ self.log.error(f" - CLI: anyscale cloud delete --name '{name}'")
1883
+ self.log.error(
1884
+ f" - Console: {ANYSCALE_HOST}/clouds (if using custom host)"
1885
+ )
1886
+ self.log.error("")
1887
+
1888
+ if not infrastructure:
1889
+ self.log.error(
1890
+ "No infrastructure resources were created before the failure."
1891
+ )
1892
+ self.log.error("")
1893
+
1420
1894
 
1421
1895
  def setup_kubernetes_cloud( # noqa: PLR0913
1422
1896
  provider: str,
@@ -1429,6 +1903,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1429
1903
  yes: bool = False,
1430
1904
  values_file: Optional[str] = None,
1431
1905
  debug: bool = False,
1906
+ operator_chart: Optional[str] = None,
1432
1907
  ) -> None:
1433
1908
  """
1434
1909
  Set up Anyscale on a Kubernetes cluster.
@@ -1447,6 +1922,7 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1447
1922
  yes: Skip confirmation prompts
1448
1923
  values_file: Optional path for Helm values file
1449
1924
  debug: Enable debug logging
1925
+ operator_chart: Optional path to operator chart (skips helm repo add/update)
1450
1926
  """
1451
1927
  cmd = KubernetesCloudSetupCommand(debug=debug)
1452
1928
 
@@ -1461,6 +1937,96 @@ def setup_kubernetes_cloud( # noqa: PLR0913
1461
1937
  functional_verify=functional_verify,
1462
1938
  yes=yes,
1463
1939
  values_file=values_file,
1940
+ operator_chart=operator_chart,
1941
+ )
1942
+ except Exception as e: # noqa: BLE001
1943
+ click.echo(f"Setup failed: {e}", err=True)
1944
+ raise click.Abort()
1945
+
1946
+
1947
+ def setup_kubernetes_cloud_resource( # noqa: PLR0913
1948
+ provider: str,
1949
+ region: str,
1950
+ cloud_name: Optional[str],
1951
+ cloud_id: Optional[str],
1952
+ cluster_name: str,
1953
+ resource_name: Optional[str],
1954
+ namespace: str = "anyscale-operator",
1955
+ project_id: Optional[str] = None,
1956
+ functional_verify: bool = False,
1957
+ yes: bool = False,
1958
+ values_file: Optional[str] = None,
1959
+ debug: bool = False,
1960
+ operator_chart: Optional[str] = None,
1961
+ ) -> None:
1962
+ """
1963
+ Set up cloud resources for an existing Anyscale cloud on a Kubernetes cluster.
1964
+
1965
+ This function sets up infrastructure and installs the operator without
1966
+ registering a new cloud.
1967
+
1968
+ Args:
1969
+ provider: Cloud provider (aws, gcp)
1970
+ region: Cloud region
1971
+ cloud_name: Name of existing Anyscale cloud
1972
+ cloud_id: ID of existing Anyscale cloud
1973
+ cluster_name: Kubernetes cluster name
1974
+ resource_name: Name for the cloud resource (optional, will be auto-generated if not provided)
1975
+ namespace: Namespace for Anyscale operator (default: anyscale-operator)
1976
+ project_id: GCP project ID (optional, for GCP)
1977
+ functional_verify: Whether to run functional verification
1978
+ yes: Skip confirmation prompts
1979
+ values_file: Optional path for Helm values file
1980
+ debug: Enable debug logging
1981
+ operator_chart: Optional path to operator chart (skips helm repo add/update)
1982
+ """
1983
+ cmd = KubernetesCloudSetupCommand(debug=debug)
1984
+
1985
+ # Preprocessing: Fetch full cloud info to ensure cloud exists and get both name and ID
1986
+ if not cloud_id and not cloud_name:
1987
+ click.echo("Either cloud_name or cloud_id must be provided", err=True)
1988
+ raise click.Abort()
1989
+
1990
+ if cloud_id and cloud_name:
1991
+ click.echo("Only one of cloud_name or cloud_id can be provided", err=True)
1992
+ raise click.Abort()
1993
+
1994
+ # Use get_cloud_id_and_name to validate cloud exists and get both ID and name
1995
+ try:
1996
+ from anyscale.cloud_utils import get_cloud_id_and_name
1997
+
1998
+ if cloud_id:
1999
+ cloud_id, cloud_name = get_cloud_id_and_name(
2000
+ cmd.cloud_controller.api_client, cloud_id=cloud_id
2001
+ )
2002
+ else:
2003
+ cloud_id, cloud_name = get_cloud_id_and_name(
2004
+ cmd.cloud_controller.api_client, cloud_name=cloud_name
2005
+ )
2006
+
2007
+ except Exception as e: # noqa: BLE001
2008
+ click.echo(f"Failed to fetch cloud information: {e}", err=True)
2009
+ raise click.Abort()
2010
+
2011
+ if not cloud_id or not cloud_name:
2012
+ click.echo("Could not find cloud with provided name or ID", err=True)
2013
+ raise click.Abort()
2014
+
2015
+ try:
2016
+ # Use the unified run method with cloud_id to indicate resource-only mode
2017
+ cmd.run(
2018
+ provider=provider,
2019
+ region=region,
2020
+ name=cloud_name,
2021
+ cluster_name=cluster_name,
2022
+ namespace=namespace,
2023
+ project_id=project_id,
2024
+ functional_verify=functional_verify,
2025
+ yes=yes,
2026
+ values_file=values_file,
2027
+ operator_chart=operator_chart,
2028
+ cloud_id=cloud_id,
2029
+ resource_name=resource_name,
1464
2030
  )
1465
2031
  except Exception as e: # noqa: BLE001
1466
2032
  click.echo(f"Setup failed: {e}", err=True)