anyscale 0.26.69__py3-none-any.whl → 0.26.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anyscale/_private/anyscale_client/anyscale_client.py +126 -3
- anyscale/_private/anyscale_client/common.py +51 -2
- anyscale/_private/anyscale_client/fake_anyscale_client.py +103 -11
- anyscale/client/README.md +43 -4
- anyscale/client/openapi_client/__init__.py +30 -4
- anyscale/client/openapi_client/api/default_api.py +1769 -27
- anyscale/client/openapi_client/models/__init__.py +30 -4
- anyscale/client/openapi_client/models/api_key_info.py +29 -3
- anyscale/client/openapi_client/models/apply_autoscaling_config_update_model.py +350 -0
- anyscale/client/openapi_client/models/apply_multi_version_update_weights_update_model.py +152 -0
- anyscale/client/openapi_client/models/apply_production_service_multi_version_v2_model.py +207 -0
- anyscale/client/openapi_client/models/apply_production_service_v2_model.py +31 -3
- anyscale/client/openapi_client/models/apply_version_weight_update_model.py +181 -0
- anyscale/client/openapi_client/models/backend_server_api_product_models_catalog_client_models_table_metadata.py +546 -0
- anyscale/client/openapi_client/models/backend_server_api_product_models_data_catalogs_table_metadata.py +178 -0
- anyscale/client/openapi_client/models/baseimagesenum.py +139 -1
- anyscale/client/openapi_client/models/catalog_metadata.py +150 -0
- anyscale/client/openapi_client/models/cloud_data_bucket_file_type.py +2 -1
- anyscale/client/openapi_client/models/{oauthconnectionresponse_response.py → clouddeployment_response.py} +11 -11
- anyscale/client/openapi_client/models/column_info.py +265 -0
- anyscale/client/openapi_client/models/compute_node_type.py +29 -1
- anyscale/client/openapi_client/models/connection_metadata.py +206 -0
- anyscale/client/openapi_client/models/create_experimental_workspace.py +29 -1
- anyscale/client/openapi_client/models/create_workspace_from_template.py +29 -1
- anyscale/client/openapi_client/models/create_workspace_template_version.py +59 -3
- anyscale/client/openapi_client/models/data_catalog.py +45 -31
- anyscale/client/openapi_client/models/data_catalog_connection.py +74 -58
- anyscale/client/openapi_client/models/{ha_job_event_level.py → data_catalog_object_type.py} +7 -8
- anyscale/client/openapi_client/models/data_catalog_schema.py +324 -0
- anyscale/client/openapi_client/models/data_catalog_table.py +437 -0
- anyscale/client/openapi_client/models/data_catalog_volume.py +437 -0
- anyscale/client/openapi_client/models/datacatalogschema_list_response.py +147 -0
- anyscale/client/openapi_client/models/datacatalogtable_list_response.py +147 -0
- anyscale/client/openapi_client/models/datacatalogvolume_list_response.py +147 -0
- anyscale/client/openapi_client/models/decorated_list_service_api_model.py +58 -1
- anyscale/client/openapi_client/models/decorated_production_service_v2_api_model.py +60 -3
- anyscale/client/openapi_client/models/decorated_serve_deployment.py +27 -1
- anyscale/client/openapi_client/models/decorated_service_event_api_model.py +3 -3
- anyscale/client/openapi_client/models/decoratedproductionservicev2_versionapimodel_response.py +121 -0
- anyscale/client/openapi_client/models/describe_machine_pool_machines_filters.py +33 -5
- anyscale/client/openapi_client/models/describe_machine_pool_requests_filters.py +33 -5
- anyscale/client/openapi_client/models/describe_machine_pool_workloads_filters.py +33 -5
- anyscale/client/openapi_client/models/{service_event_level.py → entity_type.py} +9 -9
- anyscale/client/openapi_client/models/event_level.py +2 -1
- anyscale/client/openapi_client/models/job_event_fields.py +206 -0
- anyscale/client/openapi_client/models/machine_type_partition_filter.py +152 -0
- anyscale/client/openapi_client/models/partition_info.py +30 -1
- anyscale/client/openapi_client/models/physical_resources.py +178 -0
- anyscale/client/openapi_client/models/production_job_event.py +3 -3
- anyscale/client/openapi_client/models/rollout_strategy.py +2 -1
- anyscale/client/openapi_client/models/schema_metadata.py +150 -0
- anyscale/client/openapi_client/models/service_event_fields.py +318 -0
- anyscale/client/openapi_client/models/sso_config.py +18 -18
- anyscale/client/openapi_client/models/supportedbaseimagesenum.py +139 -1
- anyscale/client/openapi_client/models/table_data_preview.py +209 -0
- anyscale/client/openapi_client/models/task_summary_config.py +29 -3
- anyscale/client/openapi_client/models/task_table_config.py +29 -3
- anyscale/client/openapi_client/models/unified_event.py +377 -0
- anyscale/client/openapi_client/models/unified_origin_filter.py +113 -0
- anyscale/client/openapi_client/models/unifiedevent_list_response.py +147 -0
- anyscale/client/openapi_client/models/volume_metadata.py +150 -0
- anyscale/client/openapi_client/models/worker_node_type.py +29 -1
- anyscale/client/openapi_client/models/workspace_event_fields.py +122 -0
- anyscale/client/openapi_client/models/workspace_template_version.py +58 -1
- anyscale/client/openapi_client/models/workspace_template_version_data_object.py +58 -1
- anyscale/cloud/models.py +2 -2
- anyscale/commands/cloud_commands.py +133 -2
- anyscale/commands/job_commands.py +121 -1
- anyscale/commands/job_queue_commands.py +99 -2
- anyscale/commands/service_commands.py +267 -67
- anyscale/commands/setup_k8s.py +546 -31
- anyscale/commands/util.py +104 -1
- anyscale/commands/workspace_commands.py +123 -5
- anyscale/commands/workspace_commands_v2.py +17 -1
- anyscale/compute_config/_private/compute_config_sdk.py +25 -12
- anyscale/compute_config/models.py +15 -0
- anyscale/controllers/cloud_controller.py +15 -2
- anyscale/controllers/job_controller.py +12 -0
- anyscale/controllers/kubernetes_verifier.py +80 -66
- anyscale/controllers/workspace_controller.py +67 -5
- anyscale/job/_private/job_sdk.py +50 -2
- anyscale/job/commands.py +3 -0
- anyscale/job/models.py +16 -0
- anyscale/job_queue/__init__.py +37 -1
- anyscale/job_queue/_private/job_queue_sdk.py +28 -1
- anyscale/job_queue/commands.py +61 -1
- anyscale/sdk/anyscale_client/__init__.py +1 -0
- anyscale/sdk/anyscale_client/api/default_api.py +12 -2
- anyscale/sdk/anyscale_client/models/__init__.py +1 -0
- anyscale/sdk/anyscale_client/models/apply_production_service_v2_model.py +31 -3
- anyscale/sdk/anyscale_client/models/apply_service_model.py +31 -3
- anyscale/sdk/anyscale_client/models/baseimagesenum.py +139 -1
- anyscale/sdk/anyscale_client/models/compute_node_type.py +29 -1
- anyscale/sdk/anyscale_client/models/physical_resources.py +178 -0
- anyscale/sdk/anyscale_client/models/rollout_strategy.py +2 -1
- anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +139 -1
- anyscale/sdk/anyscale_client/models/worker_node_type.py +29 -1
- anyscale/service/__init__.py +51 -3
- anyscale/service/_private/service_sdk.py +481 -58
- anyscale/service/commands.py +90 -4
- anyscale/service/models.py +56 -0
- anyscale/shared_anyscale_utils/latest_ray_version.py +1 -1
- anyscale/version.py +1 -1
- anyscale/workspace/_private/workspace_sdk.py +1 -0
- anyscale/workspace/models.py +19 -0
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/METADATA +1 -1
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/RECORD +112 -85
- anyscale/client/openapi_client/models/o_auth_connection_response.py +0 -229
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/WHEEL +0 -0
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/entry_points.txt +0 -0
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/licenses/LICENSE +0 -0
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/licenses/NOTICE +0 -0
- {anyscale-0.26.69.dist-info → anyscale-0.26.71.dist-info}/top_level.txt +0 -0
|
anyscale/controllers/kubernetes_verifier.py
CHANGED
|
@@ -12,6 +12,7 @@ Handles verification of Kubernetes-based cloud deployments including:
|
|
|
12
12
|
|
|
13
13
|
from contextlib import contextmanager, suppress
|
|
14
14
|
from dataclasses import dataclass
|
|
15
|
+
from enum import Enum
|
|
15
16
|
import json
|
|
16
17
|
import os
|
|
17
18
|
import shutil
|
|
@@ -90,8 +91,19 @@ KUBECTL_COMMON_PATHS = [
|
|
|
90
91
|
# Status and result strings
|
|
91
92
|
PASSED_STATUS = "PASSED"
|
|
92
93
|
FAILED_STATUS = "FAILED"
|
|
94
|
+
SKIPPED_STATUS = "SKIPPED"
|
|
93
95
|
RUNNING_STATUS = "Running"
|
|
94
96
|
|
|
97
|
+
|
|
98
|
+
# Verification status enum
|
|
99
|
+
class VerificationStatus(Enum):
|
|
100
|
+
"""Status of a verification check."""
|
|
101
|
+
|
|
102
|
+
PASSED = "PASSED"
|
|
103
|
+
FAILED = "FAILED"
|
|
104
|
+
SKIPPED = "SKIPPED"
|
|
105
|
+
|
|
106
|
+
|
|
95
107
|
# Verification component names (for consistent reporting)
|
|
96
108
|
class VerificationComponents:
|
|
97
109
|
OPERATOR_POD_INSTALLED = "Operator Pod Installed"
|
|
@@ -206,15 +218,15 @@ class ResourceNotFoundError(KubernetesVerificationError):
|
|
|
206
218
|
class VerificationResults:
|
|
207
219
|
"""Tracks the results of all verification steps."""
|
|
208
220
|
|
|
209
|
-
operator_pod_installed:
|
|
210
|
-
operator_health:
|
|
211
|
-
operator_identity:
|
|
212
|
-
file_storage:
|
|
213
|
-
gateway_support:
|
|
214
|
-
nginx_ingress:
|
|
221
|
+
operator_pod_installed: VerificationStatus = VerificationStatus.FAILED
|
|
222
|
+
operator_health: VerificationStatus = VerificationStatus.FAILED
|
|
223
|
+
operator_identity: VerificationStatus = VerificationStatus.FAILED
|
|
224
|
+
file_storage: VerificationStatus = VerificationStatus.FAILED
|
|
225
|
+
gateway_support: VerificationStatus = VerificationStatus.FAILED
|
|
226
|
+
nginx_ingress: VerificationStatus = VerificationStatus.FAILED
|
|
215
227
|
|
|
216
|
-
def to_dict(self) -> Dict[str,
|
|
217
|
-
"""Convert to dictionary format
|
|
228
|
+
def to_dict(self) -> Dict[str, VerificationStatus]:
|
|
229
|
+
"""Convert to dictionary format for reporting."""
|
|
218
230
|
return {
|
|
219
231
|
VerificationComponents.OPERATOR_POD_INSTALLED: self.operator_pod_installed,
|
|
220
232
|
VerificationComponents.OPERATOR_HEALTH: self.operator_health,
|
|
@@ -226,9 +238,10 @@ class VerificationResults:
|
|
|
226
238
|
|
|
227
239
|
@property
|
|
228
240
|
def overall_success(self) -> bool:
|
|
229
|
-
"""Return True if all verification steps passed."""
|
|
241
|
+
"""Return True if all verification steps passed or were skipped."""
|
|
230
242
|
return all(
|
|
231
|
-
|
|
243
|
+
status in (VerificationStatus.PASSED, VerificationStatus.SKIPPED)
|
|
244
|
+
for status in [
|
|
232
245
|
self.operator_pod_installed,
|
|
233
246
|
self.operator_health,
|
|
234
247
|
self.operator_identity,
|
|
@@ -654,24 +667,24 @@ class OperatorVerifier:
|
|
|
654
667
|
port=OPERATOR_HEALTH_PORT,
|
|
655
668
|
)
|
|
656
669
|
|
|
657
|
-
def verify_operator_health(self, operator_data: OperatorData) ->
|
|
670
|
+
def verify_operator_health(self, operator_data: OperatorData) -> VerificationStatus:
|
|
658
671
|
"""Verify operator health using pre-fetched data."""
|
|
659
672
|
if operator_data.health.is_healthy:
|
|
660
|
-
return
|
|
673
|
+
return VerificationStatus.PASSED
|
|
661
674
|
else:
|
|
662
675
|
self.log.error(
|
|
663
676
|
f"Health check failed - HTTP {operator_data.health.status_code}"
|
|
664
677
|
)
|
|
665
678
|
if operator_data.health.response_text:
|
|
666
679
|
self.log.error(f"Response: {operator_data.health.response_text}")
|
|
667
|
-
return
|
|
680
|
+
return VerificationStatus.FAILED
|
|
668
681
|
|
|
669
682
|
def verify_operator_identity(
|
|
670
683
|
self,
|
|
671
684
|
operator_data: OperatorData,
|
|
672
685
|
kubernetes_config: OpenAPIKubernetesConfig,
|
|
673
686
|
cloud_provider: Optional[CloudProviders],
|
|
674
|
-
) ->
|
|
687
|
+
) -> VerificationStatus:
|
|
675
688
|
"""Verify operator identity using pre-fetched config data."""
|
|
676
689
|
# Validate kubernetes_config contents
|
|
677
690
|
expected_identity = kubernetes_config.anyscale_operator_iam_identity
|
|
@@ -679,7 +692,7 @@ class OperatorVerifier:
|
|
|
679
692
|
self.log.error(
|
|
680
693
|
"Missing 'anyscale_operator_iam_identity' in kubernetes config"
|
|
681
694
|
)
|
|
682
|
-
return
|
|
695
|
+
return VerificationStatus.FAILED
|
|
683
696
|
|
|
684
697
|
# Validate config response
|
|
685
698
|
if not operator_data.config.is_valid:
|
|
@@ -688,32 +701,34 @@ class OperatorVerifier:
|
|
|
688
701
|
)
|
|
689
702
|
if operator_data.config.response_text:
|
|
690
703
|
self.log.error(f"Response: {operator_data.config.response_text}")
|
|
691
|
-
return
|
|
704
|
+
return VerificationStatus.FAILED
|
|
692
705
|
|
|
693
706
|
# Extract actual identity from config
|
|
694
707
|
if operator_data.config.config_data is None:
|
|
695
708
|
self.log.error("Operator config data is None")
|
|
696
|
-
return
|
|
709
|
+
return VerificationStatus.FAILED
|
|
697
710
|
|
|
698
711
|
actual_identity = operator_data.config.config_data.get("iamIdentity")
|
|
699
712
|
if not actual_identity:
|
|
700
713
|
self.log.error("Operator config missing 'iamIdentity' field")
|
|
701
|
-
return
|
|
714
|
+
return VerificationStatus.FAILED
|
|
702
715
|
|
|
703
716
|
# Perform identity comparison
|
|
704
717
|
if self._evaluate_identity_match(
|
|
705
718
|
expected_identity, actual_identity, cloud_provider
|
|
706
719
|
):
|
|
720
|
+
# Get cloud provider string for display
|
|
721
|
+
provider_str = str(cloud_provider) if cloud_provider else "AWS"
|
|
707
722
|
self.log.info(
|
|
708
|
-
f"
|
|
723
|
+
f"{provider_str} identity match: Expected identity matches (Expected: {expected_identity})"
|
|
709
724
|
)
|
|
710
|
-
self.log.info("Expected
|
|
711
|
-
return
|
|
725
|
+
self.log.info("Expected identity matches actual identity")
|
|
726
|
+
return VerificationStatus.PASSED
|
|
712
727
|
else:
|
|
713
728
|
self.log.error("Operator identity mismatch")
|
|
714
729
|
self.log.error(f"Expected: {expected_identity}")
|
|
715
730
|
self.log.error(f"Actual: {actual_identity}")
|
|
716
|
-
return
|
|
731
|
+
return VerificationStatus.FAILED
|
|
717
732
|
|
|
718
733
|
@contextmanager
|
|
719
734
|
def _port_forward_to_operator(self, pod_name: str):
|
|
@@ -980,8 +995,12 @@ class StorageVerifier:
|
|
|
980
995
|
|
|
981
996
|
def verify_file_storage(
|
|
982
997
|
self, file_storage: FileStorage, cloud_deployment: CloudDeployment
|
|
983
|
-
) ->
|
|
984
|
-
"""Verify file storage configuration (non-functional checks only).
|
|
998
|
+
) -> VerificationStatus:
|
|
999
|
+
"""Verify file storage configuration (non-functional checks only).
|
|
1000
|
+
|
|
1001
|
+
Returns:
|
|
1002
|
+
VerificationStatus enum value
|
|
1003
|
+
"""
|
|
985
1004
|
self.log.info("Verifying file storage configuration...")
|
|
986
1005
|
verification_results = []
|
|
987
1006
|
|
|
@@ -1014,12 +1033,15 @@ class StorageVerifier:
|
|
|
1014
1033
|
f"Cloud provider API error while verifying file storage: {e}"
|
|
1015
1034
|
) from e
|
|
1016
1035
|
|
|
1017
|
-
# Return overall
|
|
1036
|
+
# Return overall status
|
|
1018
1037
|
if verification_results:
|
|
1019
|
-
|
|
1038
|
+
if all(result for _, result in verification_results):
|
|
1039
|
+
return VerificationStatus.PASSED
|
|
1040
|
+
else:
|
|
1041
|
+
return VerificationStatus.FAILED
|
|
1020
1042
|
else:
|
|
1021
|
-
self.log.info("
|
|
1022
|
-
return
|
|
1043
|
+
self.log.info("No file storage components found to verify")
|
|
1044
|
+
return VerificationStatus.SKIPPED
|
|
1023
1045
|
|
|
1024
1046
|
def _verify_csi_driver(self, driver_name: str) -> bool:
|
|
1025
1047
|
"""Check if CSI driver exists on cluster."""
|
|
@@ -1135,13 +1157,17 @@ class GatewayVerifier:
|
|
|
1135
1157
|
self.config = k8s_config
|
|
1136
1158
|
self.log = logger
|
|
1137
1159
|
|
|
1138
|
-
def verify_gateway_support(self, operator_data: OperatorData) ->
|
|
1139
|
-
"""Verify gateway support using pre-fetched config data.
|
|
1160
|
+
def verify_gateway_support(self, operator_data: OperatorData) -> VerificationStatus:
|
|
1161
|
+
"""Verify gateway support using pre-fetched config data.
|
|
1162
|
+
|
|
1163
|
+
Returns:
|
|
1164
|
+
VerificationStatus enum value
|
|
1165
|
+
"""
|
|
1140
1166
|
if not operator_data.config.is_valid:
|
|
1141
|
-
self.log.
|
|
1167
|
+
self.log.info(
|
|
1142
1168
|
"Could not retrieve operator configuration - skipping gateway verification"
|
|
1143
1169
|
)
|
|
1144
|
-
return
|
|
1170
|
+
return VerificationStatus.SKIPPED
|
|
1145
1171
|
|
|
1146
1172
|
# Extract gateway configuration from operator data
|
|
1147
1173
|
gateway_config = GatewayConfig.from_operator_config(
|
|
@@ -1152,21 +1178,24 @@ class GatewayVerifier:
|
|
|
1152
1178
|
self.log.info(
|
|
1153
1179
|
"Gateway support is not enabled - skipping gateway verification"
|
|
1154
1180
|
)
|
|
1155
|
-
return
|
|
1181
|
+
return VerificationStatus.SKIPPED
|
|
1156
1182
|
|
|
1157
1183
|
if not gateway_config.requires_verification:
|
|
1158
1184
|
self.log.error(
|
|
1159
1185
|
"Gateway is enabled but no gateway name found in operator configuration"
|
|
1160
1186
|
)
|
|
1161
|
-
return
|
|
1187
|
+
return VerificationStatus.FAILED
|
|
1162
1188
|
|
|
1163
1189
|
# Verify gateway exists in cluster
|
|
1164
1190
|
assert (
|
|
1165
1191
|
gateway_config.name is not None
|
|
1166
1192
|
) # guaranteed by requires_verification check
|
|
1167
|
-
|
|
1193
|
+
if self._verify_gateway_exists(gateway_config.name):
|
|
1194
|
+
return VerificationStatus.PASSED
|
|
1195
|
+
else:
|
|
1196
|
+
return VerificationStatus.FAILED
|
|
1168
1197
|
|
|
1169
|
-
def verify_nginx_ingress(self) ->
|
|
1198
|
+
def verify_nginx_ingress(self) -> VerificationStatus:
|
|
1170
1199
|
"""Check for NGINX ingress controller (warning only)."""
|
|
1171
1200
|
try:
|
|
1172
1201
|
self.log.info("Checking for NGINX ingress controller...")
|
|
@@ -1182,7 +1211,7 @@ class GatewayVerifier:
|
|
|
1182
1211
|
f"PASSED: Found running NGINX ingress controller: {nginx_pod} "
|
|
1183
1212
|
f"(namespace: {config_dict['namespace']})"
|
|
1184
1213
|
)
|
|
1185
|
-
return
|
|
1214
|
+
return VerificationStatus.PASSED
|
|
1186
1215
|
else:
|
|
1187
1216
|
pod_status = self.kubectl.get_pod_status(
|
|
1188
1217
|
nginx_pod, config_dict["namespace"]
|
|
@@ -1194,14 +1223,14 @@ class GatewayVerifier:
|
|
|
1194
1223
|
|
|
1195
1224
|
# Try fallback search by name patterns
|
|
1196
1225
|
if self._find_nginx_by_name_pattern():
|
|
1197
|
-
return
|
|
1226
|
+
return VerificationStatus.PASSED
|
|
1198
1227
|
|
|
1199
1228
|
# No NGINX ingress controller found
|
|
1200
1229
|
self.log.warning("No NGINX ingress controller found")
|
|
1201
1230
|
self.log.warning("This may impact ingress routing capabilities")
|
|
1202
1231
|
self.log.warning("Available ingress controllers:")
|
|
1203
1232
|
self._list_available_ingress_controllers()
|
|
1204
|
-
return
|
|
1233
|
+
return VerificationStatus.FAILED
|
|
1205
1234
|
|
|
1206
1235
|
except (KubectlError, ResourceNotFoundError) as e:
|
|
1207
1236
|
self.log.warning(f"WARNING: Could not verify NGINX ingress controller: {e}")
|
|
@@ -1437,10 +1466,6 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1437
1466
|
self.log.error(f"Data parsing error during verification: {e}")
|
|
1438
1467
|
return False
|
|
1439
1468
|
|
|
1440
|
-
def _passed_or_failed_str_from_bool(self, is_passing: bool) -> str:
|
|
1441
|
-
"""Return PASSED or FAILED string for verification results, matching VM verification format."""
|
|
1442
|
-
return PASSED_STATUS if is_passing else FAILED_STATUS
|
|
1443
|
-
|
|
1444
1469
|
@contextmanager
|
|
1445
1470
|
def _verification_step(self, step_name: str):
|
|
1446
1471
|
"""Context manager for verification steps that indents detailed output."""
|
|
@@ -1467,7 +1492,7 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1467
1492
|
with self._verification_step("Finding operator pod"):
|
|
1468
1493
|
try:
|
|
1469
1494
|
operator_pod = operator_verifier.find_operator_pod()
|
|
1470
|
-
self.results.operator_pod_installed =
|
|
1495
|
+
self.results.operator_pod_installed = VerificationStatus.PASSED
|
|
1471
1496
|
except OperatorPodNotFoundError as e:
|
|
1472
1497
|
self.log.error(
|
|
1473
1498
|
"Failed to find operator pod, please make sure the operator is running"
|
|
@@ -1490,56 +1515,47 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1490
1515
|
self.results.operator_health = operator_verifier.verify_operator_health(
|
|
1491
1516
|
operator_data
|
|
1492
1517
|
)
|
|
1493
|
-
self.log.info(
|
|
1494
|
-
f"Operator Health: {self._passed_or_failed_str_from_bool(self.results.operator_health)}"
|
|
1495
|
-
)
|
|
1518
|
+
self.log.info(f"Operator Health: {self.results.operator_health.value}")
|
|
1496
1519
|
|
|
1497
1520
|
self.log.info("Verifying operator identity...")
|
|
1498
1521
|
if cloud_deployment.kubernetes_config is None:
|
|
1499
1522
|
self.log.error(
|
|
1500
1523
|
"Kubernetes configuration is missing from cloud deployment"
|
|
1501
1524
|
)
|
|
1502
|
-
self.results.operator_identity =
|
|
1525
|
+
self.results.operator_identity = VerificationStatus.FAILED
|
|
1503
1526
|
else:
|
|
1504
1527
|
self.results.operator_identity = operator_verifier.verify_operator_identity(
|
|
1505
1528
|
operator_data,
|
|
1506
1529
|
cloud_deployment.kubernetes_config,
|
|
1507
1530
|
cloud_deployment.provider,
|
|
1508
1531
|
)
|
|
1509
|
-
self.log.info(
|
|
1510
|
-
f"Operator Identity: {self._passed_or_failed_str_from_bool(self.results.operator_identity)}"
|
|
1511
|
-
)
|
|
1532
|
+
self.log.info(f"Operator Identity: {self.results.operator_identity.value}")
|
|
1512
1533
|
|
|
1513
1534
|
# Step 4: Check file storage
|
|
1514
1535
|
with self._verification_step("Checking file storage"):
|
|
1515
1536
|
if cloud_deployment.file_storage is None:
|
|
1516
1537
|
self.log.info(
|
|
1517
|
-
"
|
|
1538
|
+
"No file storage configured - skipping file storage verification"
|
|
1518
1539
|
)
|
|
1519
|
-
self.results.file_storage =
|
|
1540
|
+
self.results.file_storage = VerificationStatus.SKIPPED
|
|
1520
1541
|
else:
|
|
1521
1542
|
self.results.file_storage = storage_verifier.verify_file_storage(
|
|
1522
1543
|
cloud_deployment.file_storage, cloud_deployment
|
|
1523
1544
|
)
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
)
|
|
1545
|
+
|
|
1546
|
+
self.log.info(f"File Storage: {self.results.file_storage.value}")
|
|
1527
1547
|
|
|
1528
1548
|
# Step 5: Verify gateway support
|
|
1529
|
-
with self._verification_step("
|
|
1549
|
+
with self._verification_step("Checking gateway support"):
|
|
1530
1550
|
self.results.gateway_support = gateway_verifier.verify_gateway_support(
|
|
1531
1551
|
operator_data
|
|
1532
1552
|
)
|
|
1533
|
-
self.log.info(
|
|
1534
|
-
f"Gateway Support: {self._passed_or_failed_str_from_bool(self.results.gateway_support)}"
|
|
1535
|
-
)
|
|
1553
|
+
self.log.info(f"Gateway Support: {self.results.gateway_support.value}")
|
|
1536
1554
|
|
|
1537
1555
|
# Step 6: Check NGINX ingress (warning only)
|
|
1538
1556
|
with self._verification_step("Checking NGINX ingress controller"):
|
|
1539
1557
|
self.results.nginx_ingress = gateway_verifier.verify_nginx_ingress()
|
|
1540
|
-
self.log.info(
|
|
1541
|
-
f"NGINX Ingress: {self._passed_or_failed_str_from_bool(self.results.nginx_ingress)}"
|
|
1542
|
-
)
|
|
1558
|
+
self.log.info(f"NGINX Ingress: {self.results.nginx_ingress.value}")
|
|
1543
1559
|
|
|
1544
1560
|
self._show_verification_summary()
|
|
1545
1561
|
|
|
@@ -1557,9 +1573,7 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1557
1573
|
verification_result_summary = ["Verification result:"]
|
|
1558
1574
|
|
|
1559
1575
|
for component, result in self.results.to_dict().items():
|
|
1560
|
-
verification_result_summary.append(
|
|
1561
|
-
f"{component}: {self._passed_or_failed_str_from_bool(result)}"
|
|
1562
|
-
)
|
|
1576
|
+
verification_result_summary.append(f"{component}: {result.value}")
|
|
1563
1577
|
|
|
1564
1578
|
self.log.info("\n".join(verification_result_summary))
|
|
1565
1579
|
|
|
anyscale/controllers/workspace_controller.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import shlex
|
|
3
3
|
import subprocess
|
|
4
|
-
from typing import Any, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
import tabulate
|
|
@@ -10,9 +10,19 @@ from anyscale.cli_logger import BlockLogger
|
|
|
10
10
|
from anyscale.client.openapi_client.models.create_experimental_workspace import (
|
|
11
11
|
CreateExperimentalWorkspace,
|
|
12
12
|
)
|
|
13
|
+
from anyscale.client.openapi_client.models.delete_resource_tags_request import (
|
|
14
|
+
DeleteResourceTagsRequest,
|
|
15
|
+
)
|
|
13
16
|
from anyscale.client.openapi_client.models.experimental_workspace import (
|
|
14
17
|
ExperimentalWorkspace,
|
|
15
18
|
)
|
|
19
|
+
from anyscale.client.openapi_client.models.resource_tag_resource_type import (
|
|
20
|
+
ResourceTagResourceType,
|
|
21
|
+
)
|
|
22
|
+
from anyscale.client.openapi_client.models.upsert_resource_tags_request import (
|
|
23
|
+
UpsertResourceTagsRequest,
|
|
24
|
+
)
|
|
25
|
+
from anyscale.commands.util import flatten_tag_dict_to_api_list
|
|
16
26
|
from anyscale.controllers.base_controller import BaseController
|
|
17
27
|
from anyscale.feature_flags import FLAG_DEFAULT_WORKING_DIR_FOR_PROJ
|
|
18
28
|
from anyscale.project_utils import get_default_project
|
|
@@ -84,13 +94,21 @@ class WorkspaceController(BaseController):
|
|
|
84
94
|
)
|
|
85
95
|
)
|
|
86
96
|
|
|
87
|
-
def list(self) -> None:
|
|
97
|
+
def list(self, tags_filter: Optional[Dict[str, List[str]]] = None) -> None:
|
|
88
98
|
"""
|
|
89
99
|
prints a non-exhaustive tabular list of information about non-deleted workspaces.
|
|
90
100
|
"""
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
101
|
+
params = {}
|
|
102
|
+
if tags_filter:
|
|
103
|
+
api_tag_filter = flatten_tag_dict_to_api_list(tags_filter)
|
|
104
|
+
if api_tag_filter:
|
|
105
|
+
params["tag_filter"] = api_tag_filter
|
|
106
|
+
|
|
107
|
+
workspaces_data: List[ExperimentalWorkspace] = (
|
|
108
|
+
self.api_client.list_workspaces_api_v2_experimental_workspaces_get(
|
|
109
|
+
**params
|
|
110
|
+
).results
|
|
111
|
+
)
|
|
94
112
|
|
|
95
113
|
workspaces_table: List[List[Any]] = [
|
|
96
114
|
[
|
|
@@ -109,6 +127,50 @@ class WorkspaceController(BaseController):
|
|
|
109
127
|
|
|
110
128
|
print(f"Workspaces:\n{table}")
|
|
111
129
|
|
|
130
|
+
def _resolve_workspace_id(
|
|
131
|
+
self, *, workspace_id: Optional[str], name: Optional[str]
|
|
132
|
+
) -> str:
|
|
133
|
+
if workspace_id:
|
|
134
|
+
return workspace_id
|
|
135
|
+
if not name:
|
|
136
|
+
raise click.ClickException("Provide either --id or --name for workspace.")
|
|
137
|
+
results = self.api_client.list_workspaces_api_v2_experimental_workspaces_get(
|
|
138
|
+
name=name
|
|
139
|
+
).results
|
|
140
|
+
if len(results) == 0:
|
|
141
|
+
raise click.ClickException(f"No workspace with name '{name}' found.")
|
|
142
|
+
if len(results) > 1:
|
|
143
|
+
raise click.ClickException(
|
|
144
|
+
f"Multiple workspaces with name '{name}' found. Please use --id."
|
|
145
|
+
)
|
|
146
|
+
return results[0].id
|
|
147
|
+
|
|
148
|
+
def add_tags(
|
|
149
|
+
self,
|
|
150
|
+
*,
|
|
151
|
+
workspace_id: Optional[str] = None,
|
|
152
|
+
name: Optional[str] = None,
|
|
153
|
+
tags: Dict[str, str],
|
|
154
|
+
) -> None:
|
|
155
|
+
wid = self._resolve_workspace_id(workspace_id=workspace_id, name=name)
|
|
156
|
+
req = UpsertResourceTagsRequest(
|
|
157
|
+
resource_type=ResourceTagResourceType.WORKSPACE, resource_id=wid, tags=tags,
|
|
158
|
+
)
|
|
159
|
+
self.api_client.upsert_resource_tags_api_v2_tags_resource_put(req)
|
|
160
|
+
|
|
161
|
+
def remove_tags(
|
|
162
|
+
self,
|
|
163
|
+
*,
|
|
164
|
+
workspace_id: Optional[str] = None,
|
|
165
|
+
name: Optional[str] = None,
|
|
166
|
+
keys: List[str],
|
|
167
|
+
) -> None:
|
|
168
|
+
wid = self._resolve_workspace_id(workspace_id=workspace_id, name=name)
|
|
169
|
+
req = DeleteResourceTagsRequest(
|
|
170
|
+
resource_type=ResourceTagResourceType.WORKSPACE, resource_id=wid, keys=keys,
|
|
171
|
+
)
|
|
172
|
+
self.api_client.delete_resource_tags_api_v2_tags_resource_delete(req)
|
|
173
|
+
|
|
112
174
|
def clone(self, workspace: ExperimentalWorkspace) -> None:
|
|
113
175
|
dir_name = workspace.name
|
|
114
176
|
os.makedirs(dir_name)
|
anyscale/job/_private/job_sdk.py
CHANGED
|
@@ -46,7 +46,8 @@ HA_JOB_STATE_TO_JOB_STATE = {
|
|
|
46
46
|
HaJobStates.PENDING: JobState.STARTING,
|
|
47
47
|
HaJobStates.AWAITING_CLUSTER_START: JobState.STARTING,
|
|
48
48
|
HaJobStates.SUCCESS: JobState.SUCCEEDED,
|
|
49
|
-
|
|
49
|
+
# ERRORED is a transient state that can transition to RESTARTING when retries remain.
|
|
50
|
+
HaJobStates.ERRORED: JobState.RUNNING,
|
|
50
51
|
HaJobStates.TERMINATED: JobState.FAILED,
|
|
51
52
|
HaJobStates.BROKEN: JobState.FAILED,
|
|
52
53
|
HaJobStates.OUT_OF_RETRIES: JobState.FAILED,
|
|
@@ -280,6 +281,7 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
280
281
|
workspace_id=self.client.get_current_workspace_id(),
|
|
281
282
|
config=prod_job_config,
|
|
282
283
|
job_queue_config=job_queue_config,
|
|
284
|
+
tags=getattr(config, "tags", None),
|
|
283
285
|
)
|
|
284
286
|
)
|
|
285
287
|
|
|
@@ -438,7 +440,35 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
438
440
|
self.logger.info(f"Job {job_model.id} is successfully archived.")
|
|
439
441
|
return job_model.id
|
|
440
442
|
|
|
441
|
-
def
|
|
443
|
+
def _stream_logs_for_job_run(
|
|
444
|
+
self, job_run_id: str, next_page_token: Optional[str] = None,
|
|
445
|
+
) -> Optional[str]:
|
|
446
|
+
"""Stream logs for a job run and return updated pagination state.
|
|
447
|
+
|
|
448
|
+
Args:
|
|
449
|
+
job_run_id: The ID of the job run to stream logs for
|
|
450
|
+
next_page_token: Token for fetching next page of logs
|
|
451
|
+
|
|
452
|
+
Returns:
|
|
453
|
+
next_page_token for the next iteration
|
|
454
|
+
"""
|
|
455
|
+
try:
|
|
456
|
+
logs, next_page_token = self.client.stream_logs_for_job_run(
|
|
457
|
+
job_run_id=job_run_id, next_page_token=next_page_token,
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
# Print logs line by line
|
|
461
|
+
for line in logs.splitlines():
|
|
462
|
+
if line: # Skip empty lines
|
|
463
|
+
print(line)
|
|
464
|
+
|
|
465
|
+
except Exception as e: # noqa: BLE001
|
|
466
|
+
# Don't fail if log streaming fails
|
|
467
|
+
self.logger.warning(f"Error streaming logs: {e}")
|
|
468
|
+
|
|
469
|
+
return next_page_token
|
|
470
|
+
|
|
471
|
+
def wait( # noqa: PLR0912
|
|
442
472
|
self,
|
|
443
473
|
*,
|
|
444
474
|
name: Optional[str] = None,
|
|
@@ -448,6 +478,7 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
448
478
|
state: Union[str, JobState] = JobState.SUCCEEDED,
|
|
449
479
|
timeout_s: float = 1800,
|
|
450
480
|
interval_s: float = _POLLING_INTERVAL_SECONDS,
|
|
481
|
+
follow: bool = False,
|
|
451
482
|
):
|
|
452
483
|
if not isinstance(timeout_s, (int, float)):
|
|
453
484
|
raise TypeError("timeout_s must be a float")
|
|
@@ -471,6 +502,11 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
471
502
|
self.logger.info(
|
|
472
503
|
f"Waiting for job '{job_id_or_name}' to reach target state {state}, currently in state: {curr_state}"
|
|
473
504
|
)
|
|
505
|
+
|
|
506
|
+
next_page_token = None
|
|
507
|
+
job_run_id = None
|
|
508
|
+
logs_started = False
|
|
509
|
+
|
|
474
510
|
for _ in self.timer.poll(timeout_s=timeout_s, interval_s=interval_s):
|
|
475
511
|
job_model = self._resolve_to_job_model(
|
|
476
512
|
name=name, job_id=job_id, cloud=cloud, project=project
|
|
@@ -483,6 +519,18 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
483
519
|
)
|
|
484
520
|
curr_state = new_state
|
|
485
521
|
|
|
522
|
+
# Stream logs if enabled and job has a job run
|
|
523
|
+
if follow and job_model.last_job_run_id:
|
|
524
|
+
if not logs_started:
|
|
525
|
+
job_run_id = job_model.last_job_run_id
|
|
526
|
+
self.logger.info(f"Starting log stream for job run {job_run_id}")
|
|
527
|
+
logs_started = True
|
|
528
|
+
|
|
529
|
+
if job_run_id:
|
|
530
|
+
next_page_token = self._stream_logs_for_job_run(
|
|
531
|
+
job_run_id=job_run_id, next_page_token=next_page_token,
|
|
532
|
+
)
|
|
533
|
+
|
|
486
534
|
if curr_state == state:
|
|
487
535
|
self.logger.info(
|
|
488
536
|
f"Job '{job_id_or_name}' reached target state, exiting"
|
anyscale/job/commands.py
CHANGED
|
@@ -187,6 +187,7 @@ _WAIT_ARG_DOCSTRINGS = {
|
|
|
187
187
|
"project": "Named project to use for the job. If not provided, the default project for the cloud will be used (or, if running in a workspace, the project of the workspace).",
|
|
188
188
|
"state": "Target state of the job",
|
|
189
189
|
"timeout_s": "Number of seconds to wait before timing out, this timeout will not affect job execution",
|
|
190
|
+
"follow": "Whether to follow the logs of the job. If True, the logs will be streamed to the console.",
|
|
190
191
|
}
|
|
191
192
|
|
|
192
193
|
|
|
@@ -204,6 +205,7 @@ def wait(
|
|
|
204
205
|
project: Optional[str] = None,
|
|
205
206
|
state: Union[JobState, str] = JobState.SUCCEEDED,
|
|
206
207
|
timeout_s: float = 1800,
|
|
208
|
+
follow: bool = False,
|
|
207
209
|
_private_sdk: Optional[PrivateJobSDK] = None,
|
|
208
210
|
**_kwargs: Dict[str, Any],
|
|
209
211
|
):
|
|
@@ -216,6 +218,7 @@ def wait(
|
|
|
216
218
|
project=project,
|
|
217
219
|
state=state,
|
|
218
220
|
timeout_s=timeout_s,
|
|
221
|
+
follow=follow,
|
|
219
222
|
)
|
|
220
223
|
|
|
221
224
|
|
anyscale/job/models.py
CHANGED
|
@@ -300,6 +300,9 @@ py_modules: # (Optional) A list of local directories or remote URIs that will be
|
|
|
300
300
|
cloud: anyscale-prod # (Optional) The name of the Anyscale Cloud.
|
|
301
301
|
project: my-project # (Optional) The name of the Anyscale Project.
|
|
302
302
|
max_retries: 3 # (Optional) Maximum number of times the job will be retried before being marked failed. Defaults to `1`.
|
|
303
|
+
tags:
|
|
304
|
+
team: mlops
|
|
305
|
+
purpose: training
|
|
303
306
|
|
|
304
307
|
"""
|
|
305
308
|
|
|
@@ -377,6 +380,19 @@ max_retries: 3 # (Optional) Maximum number of times the job will be retried befo
|
|
|
377
380
|
if timeout_s < 0:
|
|
378
381
|
raise ValueError("'timeout_s' must be >= 0.")
|
|
379
382
|
|
|
383
|
+
tags: Optional[Dict[str, str]] = field(
|
|
384
|
+
default=None, metadata={"docstring": "Tags to associate with the job."},
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
def _validate_tags(self, tags: Optional[Dict[str, str]]):
    """Check that `tags` is either unset or a plain string-to-string mapping.

    Args:
        tags: The value of the `tags` field; `None` means no tags were given.

    Raises:
        TypeError: If `tags` is not a dict, or if any key or value in it is
            not a `str`.
    """
    # An unset field is always acceptable.
    if tags is None:
        return

    # The container itself must be a dict before its items can be inspected.
    if not isinstance(tags, dict):
        raise TypeError("'tags' must be a Dict[str, str].")

    # Every key and every value has to be a string; stop at the first offender.
    all_strings = all(
        isinstance(key, str) and isinstance(value, str)
        for key, value in tags.items()
    )
    if not all_strings:
        raise TypeError("'tags' must be a Dict[str, str].")
|
|
395
|
+
|
|
380
396
|
|
|
381
397
|
class JobRunState(ModelEnum):
|
|
382
398
|
"""Current state of an individual job run."""
|
anyscale/job_queue/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Optional
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
2
|
|
|
3
3
|
from anyscale._private.anyscale_client import AnyscaleClient
|
|
4
4
|
from anyscale._private.models.model_base import ResultIterator
|
|
@@ -16,9 +16,15 @@ from anyscale.job_queue.commands import (
|
|
|
16
16
|
_LIST_EXAMPLE,
|
|
17
17
|
_STATUS_ARG_DOCSTRINGS,
|
|
18
18
|
_STATUS_EXAMPLE,
|
|
19
|
+
_TAGS_ADD_ARG_DOCSTRINGS,
|
|
20
|
+
_TAGS_ADD_EXAMPLE,
|
|
21
|
+
_TAGS_REMOVE_ARG_DOCSTRINGS,
|
|
22
|
+
_TAGS_REMOVE_EXAMPLE,
|
|
19
23
|
_UPDATE_ARG_DOCSTRINGS,
|
|
20
24
|
_UPDATE_EXAMPLE,
|
|
25
|
+
add_tags,
|
|
21
26
|
list,
|
|
27
|
+
remove_tags,
|
|
22
28
|
status,
|
|
23
29
|
update,
|
|
24
30
|
)
|
|
@@ -49,6 +55,7 @@ class JobQueueSDK:
|
|
|
49
55
|
cloud: Optional[str] = None,
|
|
50
56
|
project: Optional[str] = None,
|
|
51
57
|
cluster_status: Optional[SessionState] = None,
|
|
58
|
+
tags_filter: Optional[dict[str, List[str]]] = None,
|
|
52
59
|
page_size: Optional[int] = None,
|
|
53
60
|
max_items: Optional[int] = None,
|
|
54
61
|
sorting_directives: Optional[List[JobQueueSortDirective]] = None,
|
|
@@ -61,6 +68,7 @@ class JobQueueSDK:
|
|
|
61
68
|
cloud=cloud,
|
|
62
69
|
project=project,
|
|
63
70
|
cluster_status=cluster_status,
|
|
71
|
+
tags_filter=tags_filter,
|
|
64
72
|
page_size=page_size,
|
|
65
73
|
max_items=max_items,
|
|
66
74
|
sorting_directives=sorting_directives,
|
|
@@ -87,3 +95,31 @@ class JobQueueSDK:
|
|
|
87
95
|
max_concurrency=max_concurrency,
|
|
88
96
|
idle_timeout_s=idle_timeout_s,
|
|
89
97
|
)
|
|
98
|
+
|
|
99
|
+
@sdk_docs(
    doc_py_example=_TAGS_ADD_EXAMPLE, arg_docstrings=_TAGS_ADD_ARG_DOCSTRINGS,
)
def add_tags(  # noqa: F811
    self,
    *,
    job_queue_id: Optional[str] = None,
    name: Optional[str] = None,
    tags: Dict[str, str],
) -> None:
    """Upsert (add/update) tag key/value pairs for a job queue."""
    # Pure pass-through: all keyword arguments are handed unchanged to the
    # private SDK, and whatever it returns is returned to the caller.
    private_sdk = self._private_sdk
    return private_sdk.add_tags(job_queue_id=job_queue_id, name=name, tags=tags)
|
|
111
|
+
|
|
112
|
+
@sdk_docs(
    doc_py_example=_TAGS_REMOVE_EXAMPLE,
    arg_docstrings=_TAGS_REMOVE_ARG_DOCSTRINGS,
)
def remove_tags(  # noqa: F811
    self,
    *,
    job_queue_id: Optional[str] = None,
    name: Optional[str] = None,
    keys: List[str],
) -> None:
    """Remove tags by key from a job queue."""
    # Pure pass-through: all keyword arguments are handed unchanged to the
    # private SDK, and whatever it returns is returned to the caller.
    private_sdk = self._private_sdk
    return private_sdk.remove_tags(job_queue_id=job_queue_id, name=name, keys=keys)
|