anyscale 0.26.68__py3-none-any.whl → 0.26.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anyscale/_private/anyscale_client/anyscale_client.py +67 -1
- anyscale/_private/anyscale_client/common.py +20 -1
- anyscale/_private/anyscale_client/fake_anyscale_client.py +77 -10
- anyscale/client/README.md +16 -4
- anyscale/client/openapi_client/__init__.py +12 -4
- anyscale/client/openapi_client/api/default_api.py +588 -23
- anyscale/client/openapi_client/models/__init__.py +12 -4
- anyscale/client/openapi_client/models/api_key_info.py +29 -3
- anyscale/client/openapi_client/models/apply_autoscaling_config_update_model.py +350 -0
- anyscale/client/openapi_client/models/apply_production_service_multi_version_v2_model.py +207 -0
- anyscale/client/openapi_client/models/apply_production_service_v2_model.py +31 -3
- anyscale/client/openapi_client/models/baseimagesenum.py +70 -1
- anyscale/client/openapi_client/models/cloud_data_bucket_file_type.py +2 -1
- anyscale/client/openapi_client/models/{oauthconnectionresponse_response.py → clouddeployment_response.py} +11 -11
- anyscale/client/openapi_client/models/clusterdashboardnode_response.py +121 -0
- anyscale/client/openapi_client/models/create_experimental_workspace.py +29 -1
- anyscale/client/openapi_client/models/create_workspace_from_template.py +29 -1
- anyscale/client/openapi_client/models/create_workspace_template_version.py +31 -3
- anyscale/client/openapi_client/models/decorated_list_service_api_model.py +58 -1
- anyscale/client/openapi_client/models/decorated_production_service_v2_api_model.py +60 -3
- anyscale/client/openapi_client/models/decorated_service_event_api_model.py +3 -3
- anyscale/client/openapi_client/models/describe_machine_pool_machines_filters.py +33 -5
- anyscale/client/openapi_client/models/describe_machine_pool_workloads_filters.py +33 -5
- anyscale/client/openapi_client/models/{service_event_level.py → entity_type.py} +9 -9
- anyscale/client/openapi_client/models/event_level.py +2 -1
- anyscale/client/openapi_client/models/job_event_fields.py +206 -0
- anyscale/client/openapi_client/models/lineage_graph_node.py +70 -42
- anyscale/client/openapi_client/models/lineage_workload.py +31 -3
- anyscale/client/openapi_client/models/machine_type_partition_filter.py +152 -0
- anyscale/client/openapi_client/models/partition_info.py +30 -1
- anyscale/client/openapi_client/models/production_job_event.py +3 -3
- anyscale/client/openapi_client/models/rollout_strategy.py +2 -1
- anyscale/client/openapi_client/models/service_event_fields.py +318 -0
- anyscale/client/openapi_client/models/supportedbaseimagesenum.py +70 -1
- anyscale/client/openapi_client/models/task_summary_config.py +29 -3
- anyscale/client/openapi_client/models/task_table_config.py +29 -3
- anyscale/client/openapi_client/models/unified_event.py +377 -0
- anyscale/client/openapi_client/models/{ha_job_event_level.py → unified_origin_filter.py} +21 -9
- anyscale/client/openapi_client/models/unifiedevent_list_response.py +147 -0
- anyscale/client/openapi_client/models/workspace_event_fields.py +122 -0
- anyscale/client/openapi_client/models/workspace_template_version.py +30 -1
- anyscale/client/openapi_client/models/workspace_template_version_data_object.py +30 -1
- anyscale/cloud/models.py +2 -2
- anyscale/commands/cloud_commands.py +148 -11
- anyscale/commands/command_examples.py +53 -0
- anyscale/commands/job_commands.py +1 -1
- anyscale/commands/service_commands.py +130 -67
- anyscale/commands/setup_k8s.py +615 -49
- anyscale/controllers/cloud_controller.py +19 -5
- anyscale/controllers/kubernetes_verifier.py +80 -66
- anyscale/job/_private/job_sdk.py +47 -1
- anyscale/job/commands.py +3 -0
- anyscale/sdk/anyscale_client/models/apply_production_service_v2_model.py +31 -3
- anyscale/sdk/anyscale_client/models/apply_service_model.py +31 -3
- anyscale/sdk/anyscale_client/models/baseimagesenum.py +70 -1
- anyscale/sdk/anyscale_client/models/rollout_strategy.py +2 -1
- anyscale/sdk/anyscale_client/models/supportedbaseimagesenum.py +70 -1
- anyscale/service/__init__.py +11 -3
- anyscale/service/_private/service_sdk.py +361 -35
- anyscale/service/commands.py +15 -3
- anyscale/service/models.py +12 -0
- anyscale/shared_anyscale_utils/latest_ray_version.py +1 -1
- anyscale/version.py +1 -1
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/METADATA +1 -1
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/RECORD +70 -62
- anyscale/client/openapi_client/models/o_auth_connection_response.py +0 -229
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/WHEEL +0 -0
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/entry_points.txt +0 -0
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/licenses/LICENSE +0 -0
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/licenses/NOTICE +0 -0
- {anyscale-0.26.68.dist-info → anyscale-0.26.70.dist-info}/top_level.txt +0 -0
|
@@ -1746,7 +1746,7 @@ class CloudController(BaseController):
|
|
|
1746
1746
|
spec_file: str,
|
|
1747
1747
|
skip_verification: bool = False,
|
|
1748
1748
|
yes: bool = False,
|
|
1749
|
-
):
|
|
1749
|
+
) -> str:
|
|
1750
1750
|
cloud_id, _ = get_cloud_id_and_name(
|
|
1751
1751
|
self.api_client, cloud_id=cloud_id, cloud_name=cloud
|
|
1752
1752
|
)
|
|
@@ -1797,16 +1797,21 @@ class CloudController(BaseController):
|
|
|
1797
1797
|
|
|
1798
1798
|
# Add the resource.
|
|
1799
1799
|
try:
|
|
1800
|
-
self.api_client.add_cloud_resource_api_v2_clouds_cloud_id_add_resource_put(
|
|
1800
|
+
response = self.api_client.add_cloud_resource_api_v2_clouds_cloud_id_add_resource_put(
|
|
1801
1801
|
cloud_id=cloud_id, cloud_deployment=new_deployment,
|
|
1802
1802
|
)
|
|
1803
1803
|
except Exception as e: # noqa: BLE001
|
|
1804
1804
|
raise ClickException(f"Failed to add cloud resource: {e}")
|
|
1805
1805
|
|
|
1806
|
+
# Extract cloud_resource_id from the response
|
|
1807
|
+
cloud_resource_id = response.result.cloud_resource_id
|
|
1808
|
+
|
|
1806
1809
|
self.log.info(
|
|
1807
1810
|
f"Successfully created cloud resource{' ' + (new_deployment.name or '')} in cloud {cloud or cloud_id}!"
|
|
1808
1811
|
)
|
|
1809
1812
|
|
|
1813
|
+
return cloud_resource_id
|
|
1814
|
+
|
|
1810
1815
|
def update_cloud_resources( # noqa: PLR0912, C901
|
|
1811
1816
|
self,
|
|
1812
1817
|
cloud_name: Optional[str],
|
|
@@ -1828,6 +1833,14 @@ class CloudController(BaseController):
|
|
|
1828
1833
|
|
|
1829
1834
|
spec = yaml.safe_load(path.read_text())
|
|
1830
1835
|
|
|
1836
|
+
# Normalize spec to a list
|
|
1837
|
+
if isinstance(spec, dict):
|
|
1838
|
+
spec = [spec]
|
|
1839
|
+
elif not isinstance(spec, list):
|
|
1840
|
+
raise ClickException(
|
|
1841
|
+
"Invalid cloud resources file format. Must contain either a single CloudResource or a list of CloudResources."
|
|
1842
|
+
)
|
|
1843
|
+
|
|
1831
1844
|
# Get the existing spec.
|
|
1832
1845
|
existing_resources = self.get_cloud_resources(cloud_id=cloud_id)
|
|
1833
1846
|
|
|
@@ -4197,9 +4210,10 @@ class CloudController(BaseController):
|
|
|
4197
4210
|
f" --set-string global.cloudProvider={provider}",
|
|
4198
4211
|
]
|
|
4199
4212
|
|
|
4200
|
-
# Add region for
|
|
4201
|
-
|
|
4202
|
-
|
|
4213
|
+
# Add region only for AWS (using global.aws.region)
|
|
4214
|
+
# Region field is deprecated for other providers
|
|
4215
|
+
if region and provider == "aws":
|
|
4216
|
+
command_parts.append(f" --set-string global.aws.region={region}")
|
|
4203
4217
|
|
|
4204
4218
|
# Add provider-specific parameters
|
|
4205
4219
|
if provider == "gcp" and operator_iam_identity:
|
|
@@ -12,6 +12,7 @@ Handles verification of Kubernetes-based cloud deployments including:
|
|
|
12
12
|
|
|
13
13
|
from contextlib import contextmanager, suppress
|
|
14
14
|
from dataclasses import dataclass
|
|
15
|
+
from enum import Enum
|
|
15
16
|
import json
|
|
16
17
|
import os
|
|
17
18
|
import shutil
|
|
@@ -90,8 +91,19 @@ KUBECTL_COMMON_PATHS = [
|
|
|
90
91
|
# Status and result strings
|
|
91
92
|
PASSED_STATUS = "PASSED"
|
|
92
93
|
FAILED_STATUS = "FAILED"
|
|
94
|
+
SKIPPED_STATUS = "SKIPPED"
|
|
93
95
|
RUNNING_STATUS = "Running"
|
|
94
96
|
|
|
97
|
+
|
|
98
|
+
# Verification status enum
|
|
99
|
+
class VerificationStatus(Enum):
|
|
100
|
+
"""Status of a verification check."""
|
|
101
|
+
|
|
102
|
+
PASSED = "PASSED"
|
|
103
|
+
FAILED = "FAILED"
|
|
104
|
+
SKIPPED = "SKIPPED"
|
|
105
|
+
|
|
106
|
+
|
|
95
107
|
# Verification component names (for consistent reporting)
|
|
96
108
|
class VerificationComponents:
|
|
97
109
|
OPERATOR_POD_INSTALLED = "Operator Pod Installed"
|
|
@@ -206,15 +218,15 @@ class ResourceNotFoundError(KubernetesVerificationError):
|
|
|
206
218
|
class VerificationResults:
|
|
207
219
|
"""Tracks the results of all verification steps."""
|
|
208
220
|
|
|
209
|
-
operator_pod_installed:
|
|
210
|
-
operator_health:
|
|
211
|
-
operator_identity:
|
|
212
|
-
file_storage:
|
|
213
|
-
gateway_support:
|
|
214
|
-
nginx_ingress:
|
|
221
|
+
operator_pod_installed: VerificationStatus = VerificationStatus.FAILED
|
|
222
|
+
operator_health: VerificationStatus = VerificationStatus.FAILED
|
|
223
|
+
operator_identity: VerificationStatus = VerificationStatus.FAILED
|
|
224
|
+
file_storage: VerificationStatus = VerificationStatus.FAILED
|
|
225
|
+
gateway_support: VerificationStatus = VerificationStatus.FAILED
|
|
226
|
+
nginx_ingress: VerificationStatus = VerificationStatus.FAILED
|
|
215
227
|
|
|
216
|
-
def to_dict(self) -> Dict[str,
|
|
217
|
-
"""Convert to dictionary format
|
|
228
|
+
def to_dict(self) -> Dict[str, VerificationStatus]:
|
|
229
|
+
"""Convert to dictionary format for reporting."""
|
|
218
230
|
return {
|
|
219
231
|
VerificationComponents.OPERATOR_POD_INSTALLED: self.operator_pod_installed,
|
|
220
232
|
VerificationComponents.OPERATOR_HEALTH: self.operator_health,
|
|
@@ -226,9 +238,10 @@ class VerificationResults:
|
|
|
226
238
|
|
|
227
239
|
@property
|
|
228
240
|
def overall_success(self) -> bool:
|
|
229
|
-
"""Return True if all verification steps passed."""
|
|
241
|
+
"""Return True if all verification steps passed or were skipped."""
|
|
230
242
|
return all(
|
|
231
|
-
|
|
243
|
+
status in (VerificationStatus.PASSED, VerificationStatus.SKIPPED)
|
|
244
|
+
for status in [
|
|
232
245
|
self.operator_pod_installed,
|
|
233
246
|
self.operator_health,
|
|
234
247
|
self.operator_identity,
|
|
@@ -654,24 +667,24 @@ class OperatorVerifier:
|
|
|
654
667
|
port=OPERATOR_HEALTH_PORT,
|
|
655
668
|
)
|
|
656
669
|
|
|
657
|
-
def verify_operator_health(self, operator_data: OperatorData) ->
|
|
670
|
+
def verify_operator_health(self, operator_data: OperatorData) -> VerificationStatus:
|
|
658
671
|
"""Verify operator health using pre-fetched data."""
|
|
659
672
|
if operator_data.health.is_healthy:
|
|
660
|
-
return
|
|
673
|
+
return VerificationStatus.PASSED
|
|
661
674
|
else:
|
|
662
675
|
self.log.error(
|
|
663
676
|
f"Health check failed - HTTP {operator_data.health.status_code}"
|
|
664
677
|
)
|
|
665
678
|
if operator_data.health.response_text:
|
|
666
679
|
self.log.error(f"Response: {operator_data.health.response_text}")
|
|
667
|
-
return
|
|
680
|
+
return VerificationStatus.FAILED
|
|
668
681
|
|
|
669
682
|
def verify_operator_identity(
|
|
670
683
|
self,
|
|
671
684
|
operator_data: OperatorData,
|
|
672
685
|
kubernetes_config: OpenAPIKubernetesConfig,
|
|
673
686
|
cloud_provider: Optional[CloudProviders],
|
|
674
|
-
) ->
|
|
687
|
+
) -> VerificationStatus:
|
|
675
688
|
"""Verify operator identity using pre-fetched config data."""
|
|
676
689
|
# Validate kubernetes_config contents
|
|
677
690
|
expected_identity = kubernetes_config.anyscale_operator_iam_identity
|
|
@@ -679,7 +692,7 @@ class OperatorVerifier:
|
|
|
679
692
|
self.log.error(
|
|
680
693
|
"Missing 'anyscale_operator_iam_identity' in kubernetes config"
|
|
681
694
|
)
|
|
682
|
-
return
|
|
695
|
+
return VerificationStatus.FAILED
|
|
683
696
|
|
|
684
697
|
# Validate config response
|
|
685
698
|
if not operator_data.config.is_valid:
|
|
@@ -688,32 +701,34 @@ class OperatorVerifier:
|
|
|
688
701
|
)
|
|
689
702
|
if operator_data.config.response_text:
|
|
690
703
|
self.log.error(f"Response: {operator_data.config.response_text}")
|
|
691
|
-
return
|
|
704
|
+
return VerificationStatus.FAILED
|
|
692
705
|
|
|
693
706
|
# Extract actual identity from config
|
|
694
707
|
if operator_data.config.config_data is None:
|
|
695
708
|
self.log.error("Operator config data is None")
|
|
696
|
-
return
|
|
709
|
+
return VerificationStatus.FAILED
|
|
697
710
|
|
|
698
711
|
actual_identity = operator_data.config.config_data.get("iamIdentity")
|
|
699
712
|
if not actual_identity:
|
|
700
713
|
self.log.error("Operator config missing 'iamIdentity' field")
|
|
701
|
-
return
|
|
714
|
+
return VerificationStatus.FAILED
|
|
702
715
|
|
|
703
716
|
# Perform identity comparison
|
|
704
717
|
if self._evaluate_identity_match(
|
|
705
718
|
expected_identity, actual_identity, cloud_provider
|
|
706
719
|
):
|
|
720
|
+
# Get cloud provider string for display
|
|
721
|
+
provider_str = str(cloud_provider) if cloud_provider else "AWS"
|
|
707
722
|
self.log.info(
|
|
708
|
-
f"
|
|
723
|
+
f"{provider_str} identity match: Expected identity matches (Expected: {expected_identity})"
|
|
709
724
|
)
|
|
710
|
-
self.log.info("Expected
|
|
711
|
-
return
|
|
725
|
+
self.log.info("Expected identity matches actual identity")
|
|
726
|
+
return VerificationStatus.PASSED
|
|
712
727
|
else:
|
|
713
728
|
self.log.error("Operator identity mismatch")
|
|
714
729
|
self.log.error(f"Expected: {expected_identity}")
|
|
715
730
|
self.log.error(f"Actual: {actual_identity}")
|
|
716
|
-
return
|
|
731
|
+
return VerificationStatus.FAILED
|
|
717
732
|
|
|
718
733
|
@contextmanager
|
|
719
734
|
def _port_forward_to_operator(self, pod_name: str):
|
|
@@ -980,8 +995,12 @@ class StorageVerifier:
|
|
|
980
995
|
|
|
981
996
|
def verify_file_storage(
|
|
982
997
|
self, file_storage: FileStorage, cloud_deployment: CloudDeployment
|
|
983
|
-
) ->
|
|
984
|
-
"""Verify file storage configuration (non-functional checks only).
|
|
998
|
+
) -> VerificationStatus:
|
|
999
|
+
"""Verify file storage configuration (non-functional checks only).
|
|
1000
|
+
|
|
1001
|
+
Returns:
|
|
1002
|
+
VerificationStatus enum value
|
|
1003
|
+
"""
|
|
985
1004
|
self.log.info("Verifying file storage configuration...")
|
|
986
1005
|
verification_results = []
|
|
987
1006
|
|
|
@@ -1014,12 +1033,15 @@ class StorageVerifier:
|
|
|
1014
1033
|
f"Cloud provider API error while verifying file storage: {e}"
|
|
1015
1034
|
) from e
|
|
1016
1035
|
|
|
1017
|
-
# Return overall
|
|
1036
|
+
# Return overall status
|
|
1018
1037
|
if verification_results:
|
|
1019
|
-
|
|
1038
|
+
if all(result for _, result in verification_results):
|
|
1039
|
+
return VerificationStatus.PASSED
|
|
1040
|
+
else:
|
|
1041
|
+
return VerificationStatus.FAILED
|
|
1020
1042
|
else:
|
|
1021
|
-
self.log.info("
|
|
1022
|
-
return
|
|
1043
|
+
self.log.info("No file storage components found to verify")
|
|
1044
|
+
return VerificationStatus.SKIPPED
|
|
1023
1045
|
|
|
1024
1046
|
def _verify_csi_driver(self, driver_name: str) -> bool:
|
|
1025
1047
|
"""Check if CSI driver exists on cluster."""
|
|
@@ -1135,13 +1157,17 @@ class GatewayVerifier:
|
|
|
1135
1157
|
self.config = k8s_config
|
|
1136
1158
|
self.log = logger
|
|
1137
1159
|
|
|
1138
|
-
def verify_gateway_support(self, operator_data: OperatorData) ->
|
|
1139
|
-
"""Verify gateway support using pre-fetched config data.
|
|
1160
|
+
def verify_gateway_support(self, operator_data: OperatorData) -> VerificationStatus:
|
|
1161
|
+
"""Verify gateway support using pre-fetched config data.
|
|
1162
|
+
|
|
1163
|
+
Returns:
|
|
1164
|
+
VerificationStatus enum value
|
|
1165
|
+
"""
|
|
1140
1166
|
if not operator_data.config.is_valid:
|
|
1141
|
-
self.log.
|
|
1167
|
+
self.log.info(
|
|
1142
1168
|
"Could not retrieve operator configuration - skipping gateway verification"
|
|
1143
1169
|
)
|
|
1144
|
-
return
|
|
1170
|
+
return VerificationStatus.SKIPPED
|
|
1145
1171
|
|
|
1146
1172
|
# Extract gateway configuration from operator data
|
|
1147
1173
|
gateway_config = GatewayConfig.from_operator_config(
|
|
@@ -1152,21 +1178,24 @@ class GatewayVerifier:
|
|
|
1152
1178
|
self.log.info(
|
|
1153
1179
|
"Gateway support is not enabled - skipping gateway verification"
|
|
1154
1180
|
)
|
|
1155
|
-
return
|
|
1181
|
+
return VerificationStatus.SKIPPED
|
|
1156
1182
|
|
|
1157
1183
|
if not gateway_config.requires_verification:
|
|
1158
1184
|
self.log.error(
|
|
1159
1185
|
"Gateway is enabled but no gateway name found in operator configuration"
|
|
1160
1186
|
)
|
|
1161
|
-
return
|
|
1187
|
+
return VerificationStatus.FAILED
|
|
1162
1188
|
|
|
1163
1189
|
# Verify gateway exists in cluster
|
|
1164
1190
|
assert (
|
|
1165
1191
|
gateway_config.name is not None
|
|
1166
1192
|
) # guaranteed by requires_verification check
|
|
1167
|
-
|
|
1193
|
+
if self._verify_gateway_exists(gateway_config.name):
|
|
1194
|
+
return VerificationStatus.PASSED
|
|
1195
|
+
else:
|
|
1196
|
+
return VerificationStatus.FAILED
|
|
1168
1197
|
|
|
1169
|
-
def verify_nginx_ingress(self) ->
|
|
1198
|
+
def verify_nginx_ingress(self) -> VerificationStatus:
|
|
1170
1199
|
"""Check for NGINX ingress controller (warning only)."""
|
|
1171
1200
|
try:
|
|
1172
1201
|
self.log.info("Checking for NGINX ingress controller...")
|
|
@@ -1182,7 +1211,7 @@ class GatewayVerifier:
|
|
|
1182
1211
|
f"PASSED: Found running NGINX ingress controller: {nginx_pod} "
|
|
1183
1212
|
f"(namespace: {config_dict['namespace']})"
|
|
1184
1213
|
)
|
|
1185
|
-
return
|
|
1214
|
+
return VerificationStatus.PASSED
|
|
1186
1215
|
else:
|
|
1187
1216
|
pod_status = self.kubectl.get_pod_status(
|
|
1188
1217
|
nginx_pod, config_dict["namespace"]
|
|
@@ -1194,14 +1223,14 @@ class GatewayVerifier:
|
|
|
1194
1223
|
|
|
1195
1224
|
# Try fallback search by name patterns
|
|
1196
1225
|
if self._find_nginx_by_name_pattern():
|
|
1197
|
-
return
|
|
1226
|
+
return VerificationStatus.PASSED
|
|
1198
1227
|
|
|
1199
1228
|
# No NGINX ingress controller found
|
|
1200
1229
|
self.log.warning("No NGINX ingress controller found")
|
|
1201
1230
|
self.log.warning("This may impact ingress routing capabilities")
|
|
1202
1231
|
self.log.warning("Available ingress controllers:")
|
|
1203
1232
|
self._list_available_ingress_controllers()
|
|
1204
|
-
return
|
|
1233
|
+
return VerificationStatus.FAILED
|
|
1205
1234
|
|
|
1206
1235
|
except (KubectlError, ResourceNotFoundError) as e:
|
|
1207
1236
|
self.log.warning(f"WARNING: Could not verify NGINX ingress controller: {e}")
|
|
@@ -1437,10 +1466,6 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1437
1466
|
self.log.error(f"Data parsing error during verification: {e}")
|
|
1438
1467
|
return False
|
|
1439
1468
|
|
|
1440
|
-
def _passed_or_failed_str_from_bool(self, is_passing: bool) -> str:
|
|
1441
|
-
"""Return PASSED or FAILED string for verification results, matching VM verification format."""
|
|
1442
|
-
return PASSED_STATUS if is_passing else FAILED_STATUS
|
|
1443
|
-
|
|
1444
1469
|
@contextmanager
|
|
1445
1470
|
def _verification_step(self, step_name: str):
|
|
1446
1471
|
"""Context manager for verification steps that indents detailed output."""
|
|
@@ -1467,7 +1492,7 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1467
1492
|
with self._verification_step("Finding operator pod"):
|
|
1468
1493
|
try:
|
|
1469
1494
|
operator_pod = operator_verifier.find_operator_pod()
|
|
1470
|
-
self.results.operator_pod_installed =
|
|
1495
|
+
self.results.operator_pod_installed = VerificationStatus.PASSED
|
|
1471
1496
|
except OperatorPodNotFoundError as e:
|
|
1472
1497
|
self.log.error(
|
|
1473
1498
|
"Failed to find operator pod, please make sure the operator is running"
|
|
@@ -1490,56 +1515,47 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1490
1515
|
self.results.operator_health = operator_verifier.verify_operator_health(
|
|
1491
1516
|
operator_data
|
|
1492
1517
|
)
|
|
1493
|
-
self.log.info(
|
|
1494
|
-
f"Operator Health: {self._passed_or_failed_str_from_bool(self.results.operator_health)}"
|
|
1495
|
-
)
|
|
1518
|
+
self.log.info(f"Operator Health: {self.results.operator_health.value}")
|
|
1496
1519
|
|
|
1497
1520
|
self.log.info("Verifying operator identity...")
|
|
1498
1521
|
if cloud_deployment.kubernetes_config is None:
|
|
1499
1522
|
self.log.error(
|
|
1500
1523
|
"Kubernetes configuration is missing from cloud deployment"
|
|
1501
1524
|
)
|
|
1502
|
-
self.results.operator_identity =
|
|
1525
|
+
self.results.operator_identity = VerificationStatus.FAILED
|
|
1503
1526
|
else:
|
|
1504
1527
|
self.results.operator_identity = operator_verifier.verify_operator_identity(
|
|
1505
1528
|
operator_data,
|
|
1506
1529
|
cloud_deployment.kubernetes_config,
|
|
1507
1530
|
cloud_deployment.provider,
|
|
1508
1531
|
)
|
|
1509
|
-
self.log.info(
|
|
1510
|
-
f"Operator Identity: {self._passed_or_failed_str_from_bool(self.results.operator_identity)}"
|
|
1511
|
-
)
|
|
1532
|
+
self.log.info(f"Operator Identity: {self.results.operator_identity.value}")
|
|
1512
1533
|
|
|
1513
1534
|
# Step 4: Check file storage
|
|
1514
1535
|
with self._verification_step("Checking file storage"):
|
|
1515
1536
|
if cloud_deployment.file_storage is None:
|
|
1516
1537
|
self.log.info(
|
|
1517
|
-
"
|
|
1538
|
+
"No file storage configured - skipping file storage verification"
|
|
1518
1539
|
)
|
|
1519
|
-
self.results.file_storage =
|
|
1540
|
+
self.results.file_storage = VerificationStatus.SKIPPED
|
|
1520
1541
|
else:
|
|
1521
1542
|
self.results.file_storage = storage_verifier.verify_file_storage(
|
|
1522
1543
|
cloud_deployment.file_storage, cloud_deployment
|
|
1523
1544
|
)
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
)
|
|
1545
|
+
|
|
1546
|
+
self.log.info(f"File Storage: {self.results.file_storage.value}")
|
|
1527
1547
|
|
|
1528
1548
|
# Step 5: Verify gateway support
|
|
1529
|
-
with self._verification_step("
|
|
1549
|
+
with self._verification_step("Checking gateway support"):
|
|
1530
1550
|
self.results.gateway_support = gateway_verifier.verify_gateway_support(
|
|
1531
1551
|
operator_data
|
|
1532
1552
|
)
|
|
1533
|
-
self.log.info(
|
|
1534
|
-
f"Gateway Support: {self._passed_or_failed_str_from_bool(self.results.gateway_support)}"
|
|
1535
|
-
)
|
|
1553
|
+
self.log.info(f"Gateway Support: {self.results.gateway_support.value}")
|
|
1536
1554
|
|
|
1537
1555
|
# Step 6: Check NGINX ingress (warning only)
|
|
1538
1556
|
with self._verification_step("Checking NGINX ingress controller"):
|
|
1539
1557
|
self.results.nginx_ingress = gateway_verifier.verify_nginx_ingress()
|
|
1540
|
-
self.log.info(
|
|
1541
|
-
f"NGINX Ingress: {self._passed_or_failed_str_from_bool(self.results.nginx_ingress)}"
|
|
1542
|
-
)
|
|
1558
|
+
self.log.info(f"NGINX Ingress: {self.results.nginx_ingress.value}")
|
|
1543
1559
|
|
|
1544
1560
|
self._show_verification_summary()
|
|
1545
1561
|
|
|
@@ -1557,9 +1573,7 @@ class KubernetesCloudDeploymentVerifier:
|
|
|
1557
1573
|
verification_result_summary = ["Verification result:"]
|
|
1558
1574
|
|
|
1559
1575
|
for component, result in self.results.to_dict().items():
|
|
1560
|
-
verification_result_summary.append(
|
|
1561
|
-
f"{component}: {self._passed_or_failed_str_from_bool(result)}"
|
|
1562
|
-
)
|
|
1576
|
+
verification_result_summary.append(f"{component}: {result.value}")
|
|
1563
1577
|
|
|
1564
1578
|
self.log.info("\n".join(verification_result_summary))
|
|
1565
1579
|
|
anyscale/job/_private/job_sdk.py
CHANGED
|
@@ -438,7 +438,35 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
438
438
|
self.logger.info(f"Job {job_model.id} is successfully archived.")
|
|
439
439
|
return job_model.id
|
|
440
440
|
|
|
441
|
-
def
|
|
441
|
+
def _stream_logs_for_job_run(
|
|
442
|
+
self, job_run_id: str, next_page_token: Optional[str] = None,
|
|
443
|
+
) -> Optional[str]:
|
|
444
|
+
"""Stream logs for a job run and return updated pagination state.
|
|
445
|
+
|
|
446
|
+
Args:
|
|
447
|
+
job_run_id: The ID of the job run to stream logs for
|
|
448
|
+
next_page_token: Token for fetching next page of logs
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
next_page_token for the next iteration
|
|
452
|
+
"""
|
|
453
|
+
try:
|
|
454
|
+
logs, next_page_token = self.client.stream_logs_for_job_run(
|
|
455
|
+
job_run_id=job_run_id, next_page_token=next_page_token,
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
# Print logs line by line
|
|
459
|
+
for line in logs.splitlines():
|
|
460
|
+
if line: # Skip empty lines
|
|
461
|
+
print(line)
|
|
462
|
+
|
|
463
|
+
except Exception as e: # noqa: BLE001
|
|
464
|
+
# Don't fail if log streaming fails
|
|
465
|
+
self.logger.warning(f"Error streaming logs: {e}")
|
|
466
|
+
|
|
467
|
+
return next_page_token
|
|
468
|
+
|
|
469
|
+
def wait( # noqa: PLR0912
|
|
442
470
|
self,
|
|
443
471
|
*,
|
|
444
472
|
name: Optional[str] = None,
|
|
@@ -448,6 +476,7 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
448
476
|
state: Union[str, JobState] = JobState.SUCCEEDED,
|
|
449
477
|
timeout_s: float = 1800,
|
|
450
478
|
interval_s: float = _POLLING_INTERVAL_SECONDS,
|
|
479
|
+
follow: bool = False,
|
|
451
480
|
):
|
|
452
481
|
if not isinstance(timeout_s, (int, float)):
|
|
453
482
|
raise TypeError("timeout_s must be a float")
|
|
@@ -471,6 +500,11 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
471
500
|
self.logger.info(
|
|
472
501
|
f"Waiting for job '{job_id_or_name}' to reach target state {state}, currently in state: {curr_state}"
|
|
473
502
|
)
|
|
503
|
+
|
|
504
|
+
next_page_token = None
|
|
505
|
+
job_run_id = None
|
|
506
|
+
logs_started = False
|
|
507
|
+
|
|
474
508
|
for _ in self.timer.poll(timeout_s=timeout_s, interval_s=interval_s):
|
|
475
509
|
job_model = self._resolve_to_job_model(
|
|
476
510
|
name=name, job_id=job_id, cloud=cloud, project=project
|
|
@@ -483,6 +517,18 @@ class PrivateJobSDK(WorkloadSDK):
|
|
|
483
517
|
)
|
|
484
518
|
curr_state = new_state
|
|
485
519
|
|
|
520
|
+
# Stream logs if enabled and job has a job run
|
|
521
|
+
if follow and job_model.last_job_run_id:
|
|
522
|
+
if not logs_started:
|
|
523
|
+
job_run_id = job_model.last_job_run_id
|
|
524
|
+
self.logger.info(f"Starting log stream for job run {job_run_id}")
|
|
525
|
+
logs_started = True
|
|
526
|
+
|
|
527
|
+
if job_run_id:
|
|
528
|
+
next_page_token = self._stream_logs_for_job_run(
|
|
529
|
+
job_run_id=job_run_id, next_page_token=next_page_token,
|
|
530
|
+
)
|
|
531
|
+
|
|
486
532
|
if curr_state == state:
|
|
487
533
|
self.logger.info(
|
|
488
534
|
f"Job '{job_id_or_name}' reached target state, exiting"
|
anyscale/job/commands.py
CHANGED
|
@@ -187,6 +187,7 @@ _WAIT_ARG_DOCSTRINGS = {
|
|
|
187
187
|
"project": "Named project to use for the job. If not provided, the default project for the cloud will be used (or, if running in a workspace, the project of the workspace).",
|
|
188
188
|
"state": "Target state of the job",
|
|
189
189
|
"timeout_s": "Number of seconds to wait before timing out, this timeout will not affect job execution",
|
|
190
|
+
"follow": "Whether to follow the logs of the job. If True, the logs will be streamed to the console.",
|
|
190
191
|
}
|
|
191
192
|
|
|
192
193
|
|
|
@@ -204,6 +205,7 @@ def wait(
|
|
|
204
205
|
project: Optional[str] = None,
|
|
205
206
|
state: Union[JobState, str] = JobState.SUCCEEDED,
|
|
206
207
|
timeout_s: float = 1800,
|
|
208
|
+
follow: bool = False,
|
|
207
209
|
_private_sdk: Optional[PrivateJobSDK] = None,
|
|
208
210
|
**_kwargs: Dict[str, Any],
|
|
209
211
|
):
|
|
@@ -216,6 +218,7 @@ def wait(
|
|
|
216
218
|
project=project,
|
|
217
219
|
state=state,
|
|
218
220
|
timeout_s=timeout_s,
|
|
221
|
+
follow=follow,
|
|
219
222
|
)
|
|
220
223
|
|
|
221
224
|
|
|
@@ -47,7 +47,8 @@ class ApplyProductionServiceV2Model(object):
|
|
|
47
47
|
'tracing_config': 'TracingConfig',
|
|
48
48
|
'auto_complete_rollout': 'bool',
|
|
49
49
|
'max_surge_percent': 'int',
|
|
50
|
-
'tags': 'dict(str, str)'
|
|
50
|
+
'tags': 'dict(str, str)',
|
|
51
|
+
'traffic_percent': 'int'
|
|
51
52
|
}
|
|
52
53
|
|
|
53
54
|
attribute_map = {
|
|
@@ -65,10 +66,11 @@ class ApplyProductionServiceV2Model(object):
|
|
|
65
66
|
'tracing_config': 'tracing_config',
|
|
66
67
|
'auto_complete_rollout': 'auto_complete_rollout',
|
|
67
68
|
'max_surge_percent': 'max_surge_percent',
|
|
68
|
-
'tags': 'tags'
|
|
69
|
+
'tags': 'tags',
|
|
70
|
+
'traffic_percent': 'traffic_percent'
|
|
69
71
|
}
|
|
70
72
|
|
|
71
|
-
def __init__(self, name=None, description=None, project_id=None, version=None, canary_percent=None, ray_serve_config=None, build_id=None, compute_config_id=None, config=None, rollout_strategy=None, ray_gcs_external_storage_config=None, tracing_config=None, auto_complete_rollout=True, max_surge_percent=None, tags=None, local_vars_configuration=None): # noqa: E501
|
|
73
|
+
def __init__(self, name=None, description=None, project_id=None, version=None, canary_percent=None, ray_serve_config=None, build_id=None, compute_config_id=None, config=None, rollout_strategy=None, ray_gcs_external_storage_config=None, tracing_config=None, auto_complete_rollout=True, max_surge_percent=None, tags=None, traffic_percent=None, local_vars_configuration=None): # noqa: E501
|
|
72
74
|
"""ApplyProductionServiceV2Model - a model defined in OpenAPI""" # noqa: E501
|
|
73
75
|
if local_vars_configuration is None:
|
|
74
76
|
local_vars_configuration = Configuration()
|
|
@@ -89,6 +91,7 @@ class ApplyProductionServiceV2Model(object):
|
|
|
89
91
|
self._auto_complete_rollout = None
|
|
90
92
|
self._max_surge_percent = None
|
|
91
93
|
self._tags = None
|
|
94
|
+
self._traffic_percent = None
|
|
92
95
|
self.discriminator = None
|
|
93
96
|
|
|
94
97
|
self.name = name
|
|
@@ -117,6 +120,8 @@ class ApplyProductionServiceV2Model(object):
|
|
|
117
120
|
self.max_surge_percent = max_surge_percent
|
|
118
121
|
if tags is not None:
|
|
119
122
|
self.tags = tags
|
|
123
|
+
if traffic_percent is not None:
|
|
124
|
+
self.traffic_percent = traffic_percent
|
|
120
125
|
|
|
121
126
|
@property
|
|
122
127
|
def name(self):
|
|
@@ -471,6 +476,29 @@ class ApplyProductionServiceV2Model(object):
|
|
|
471
476
|
|
|
472
477
|
self._tags = tags
|
|
473
478
|
|
|
479
|
+
@property
|
|
480
|
+
def traffic_percent(self):
|
|
481
|
+
"""Gets the traffic_percent of this ApplyProductionServiceV2Model. # noqa: E501
|
|
482
|
+
|
|
483
|
+
Percentage of traffic forwarded to a particular service version from the ALB. # noqa: E501
|
|
484
|
+
|
|
485
|
+
:return: The traffic_percent of this ApplyProductionServiceV2Model. # noqa: E501
|
|
486
|
+
:rtype: int
|
|
487
|
+
"""
|
|
488
|
+
return self._traffic_percent
|
|
489
|
+
|
|
490
|
+
@traffic_percent.setter
|
|
491
|
+
def traffic_percent(self, traffic_percent):
|
|
492
|
+
"""Sets the traffic_percent of this ApplyProductionServiceV2Model.
|
|
493
|
+
|
|
494
|
+
Percentage of traffic forwarded to a particular service version from the ALB. # noqa: E501
|
|
495
|
+
|
|
496
|
+
:param traffic_percent: The traffic_percent of this ApplyProductionServiceV2Model. # noqa: E501
|
|
497
|
+
:type: int
|
|
498
|
+
"""
|
|
499
|
+
|
|
500
|
+
self._traffic_percent = traffic_percent
|
|
501
|
+
|
|
474
502
|
def to_dict(self):
|
|
475
503
|
"""Returns the model properties as a dict"""
|
|
476
504
|
result = {}
|
|
@@ -47,7 +47,8 @@ class ApplyServiceModel(object):
|
|
|
47
47
|
'tracing_config': 'TracingConfig',
|
|
48
48
|
'auto_complete_rollout': 'bool',
|
|
49
49
|
'max_surge_percent': 'int',
|
|
50
|
-
'tags': 'dict(str, str)'
|
|
50
|
+
'tags': 'dict(str, str)',
|
|
51
|
+
'traffic_percent': 'int'
|
|
51
52
|
}
|
|
52
53
|
|
|
53
54
|
attribute_map = {
|
|
@@ -65,10 +66,11 @@ class ApplyServiceModel(object):
|
|
|
65
66
|
'tracing_config': 'tracing_config',
|
|
66
67
|
'auto_complete_rollout': 'auto_complete_rollout',
|
|
67
68
|
'max_surge_percent': 'max_surge_percent',
|
|
68
|
-
'tags': 'tags'
|
|
69
|
+
'tags': 'tags',
|
|
70
|
+
'traffic_percent': 'traffic_percent'
|
|
69
71
|
}
|
|
70
72
|
|
|
71
|
-
def __init__(self, name=None, description=None, project_id=None, version=None, canary_percent=None, ray_serve_config=None, build_id=None, compute_config_id=None, config=None, rollout_strategy=None, ray_gcs_external_storage_config=None, tracing_config=None, auto_complete_rollout=True, max_surge_percent=None, tags=None, local_vars_configuration=None): # noqa: E501
|
|
73
|
+
def __init__(self, name=None, description=None, project_id=None, version=None, canary_percent=None, ray_serve_config=None, build_id=None, compute_config_id=None, config=None, rollout_strategy=None, ray_gcs_external_storage_config=None, tracing_config=None, auto_complete_rollout=True, max_surge_percent=None, tags=None, traffic_percent=None, local_vars_configuration=None): # noqa: E501
|
|
72
74
|
"""ApplyServiceModel - a model defined in OpenAPI""" # noqa: E501
|
|
73
75
|
if local_vars_configuration is None:
|
|
74
76
|
local_vars_configuration = Configuration()
|
|
@@ -89,6 +91,7 @@ class ApplyServiceModel(object):
|
|
|
89
91
|
self._auto_complete_rollout = None
|
|
90
92
|
self._max_surge_percent = None
|
|
91
93
|
self._tags = None
|
|
94
|
+
self._traffic_percent = None
|
|
92
95
|
self.discriminator = None
|
|
93
96
|
|
|
94
97
|
self.name = name
|
|
@@ -117,6 +120,8 @@ class ApplyServiceModel(object):
|
|
|
117
120
|
self.max_surge_percent = max_surge_percent
|
|
118
121
|
if tags is not None:
|
|
119
122
|
self.tags = tags
|
|
123
|
+
if traffic_percent is not None:
|
|
124
|
+
self.traffic_percent = traffic_percent
|
|
120
125
|
|
|
121
126
|
@property
|
|
122
127
|
def name(self):
|
|
@@ -471,6 +476,29 @@ class ApplyServiceModel(object):
|
|
|
471
476
|
|
|
472
477
|
self._tags = tags
|
|
473
478
|
|
|
479
|
+
@property
|
|
480
|
+
def traffic_percent(self):
|
|
481
|
+
"""Gets the traffic_percent of this ApplyServiceModel. # noqa: E501
|
|
482
|
+
|
|
483
|
+
Percentage of traffic forwarded to a particular service version from the ALB. # noqa: E501
|
|
484
|
+
|
|
485
|
+
:return: The traffic_percent of this ApplyServiceModel. # noqa: E501
|
|
486
|
+
:rtype: int
|
|
487
|
+
"""
|
|
488
|
+
return self._traffic_percent
|
|
489
|
+
|
|
490
|
+
@traffic_percent.setter
|
|
491
|
+
def traffic_percent(self, traffic_percent):
|
|
492
|
+
"""Sets the traffic_percent of this ApplyServiceModel.
|
|
493
|
+
|
|
494
|
+
Percentage of traffic forwarded to a particular service version from the ALB. # noqa: E501
|
|
495
|
+
|
|
496
|
+
:param traffic_percent: The traffic_percent of this ApplyServiceModel. # noqa: E501
|
|
497
|
+
:type: int
|
|
498
|
+
"""
|
|
499
|
+
|
|
500
|
+
self._traffic_percent = traffic_percent
|
|
501
|
+
|
|
474
502
|
def to_dict(self):
|
|
475
503
|
"""Returns the model properties as a dict"""
|
|
476
504
|
result = {}
|