skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -13,50 +13,33 @@ from typing import Optional
 import sky
 from sky import backends
 from sky import exceptions
+from sky import execution
 from sky import global_user_state
 from sky import sky_logging
-from sky import status_lib
 from sky.backends import backend_utils
+from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import registry
+from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    from sky import resources
     from sky import task as task_lib
 
 logger = sky_logging.init_logger(__name__)
 
-RECOVERY_STRATEGIES = {}
-DEFAULT_RECOVERY_STRATEGY = None
-
 # Waiting time for job from INIT/PENDING to RUNNING
 # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
 MAX_JOB_CHECKING_RETRY = 10
 
-
-
-
-
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except ValueError:
-            # The cluster is already down.
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
+# Minutes to job cluster autodown. This should be significantly larger than
+# managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
+# cluster before its status can be updated by the job controller.
+_AUTODOWN_MINUTES = 5
 
 
 class StrategyExecutor:
@@ -65,14 +48,14 @@ class StrategyExecutor:
     RETRY_INIT_GAP_SECONDS = 60
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
            cluster_name: The name of the cluster.
            backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
-           retry_until_up: Whether to retry until the cluster is up.
        """
        assert isinstance(backend, backends.CloudVmRayBackend), (
            'Only CloudVMRayBackend is supported.')
@@ -80,19 +63,13 @@ class StrategyExecutor:
         self.dag.add(task)
         self.cluster_name = cluster_name
         self.backend = backend
-        self.
-
-
-        RECOVERY_STRATEGIES[name] = cls
-        if default:
-            global DEFAULT_RECOVERY_STRATEGY
-            assert DEFAULT_RECOVERY_STRATEGY is None, (
-                'Only one strategy can be default.')
-            DEFAULT_RECOVERY_STRATEGY = name
+        self.max_restarts_on_errors = max_restarts_on_errors
+        self.job_id = job_id
+        self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task',
+             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -108,8 +85,19 @@ class StrategyExecutor:
         # set the new_task_resources to be the same type (list or set) as the
         # original task.resources
         task.set_resources(type(task.resources)(new_resources_list))
-
-
+        if isinstance(job_recovery, dict):
+            job_recovery_name = job_recovery.pop(
+                'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
+            max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
+                                                      0)
+        else:
+            job_recovery_name = job_recovery
+            max_restarts_on_errors = 0
+        job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
+                                 from_str(job_recovery_name))
+        assert job_recovery_strategy is not None, job_recovery_name
+        return job_recovery_strategy(cluster_name, backend, task,
+                                     max_restarts_on_errors, job_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -123,10 +111,7 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-
-            job_submit_at = self._launch(max_retry=None)
-        else:
-            job_submit_at = self._launch()
+        job_submit_at = self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
@@ -141,6 +126,8 @@
         raise NotImplementedError
 
     def _try_cancel_all_jobs(self):
+        from sky import core  # pylint: disable=import-outside-toplevel
+
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
         if handle is None:
@@ -166,9 +153,9 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-
-
-
+            core.cancel(cluster_name=self.cluster_name,
+                        all=True,
+                        _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -176,7 +163,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -270,8 +257,8 @@ class StrategyExecutor:
               1. The optimizer cannot find a feasible solution.
               2. Precheck errors: invalid cluster name, failure in getting
                 cloud user identity, or unsupported feature.
-            exceptions.
-                all prechecks passed but the maximum number of retries is
+            exceptions.ManagedJobReachedMaxRetriesError: This will be raised
+                when all prechecks passed but the maximum number of retries is
                 reached for `sky.launch`. The failure of `sky.launch` can be
                 due to:
               1. Any of the underlying failover exceptions is due to resources
@@ -285,104 +272,128 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # after failing over through all the candidates.
-                # Please refer to the docstring of `sky.launch` for more
-                # details of how the exception will be structured.
-                if not any(
-                        isinstance(err, exceptions.ResourcesUnavailableError)
-                        for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly, if
-                    # none of the failover reasons were because of resource
-                    # unavailability or no failover was attempted (the optimizer
-                    # cannot find feasible resources for requested resources),
-                    # i.e., e.failover_history is empty.
-                    # Failing directly avoids the infinite loop of retrying
-                    # the launch when, e.g., an invalid cluster name is used
-                    # and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err) for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            with scheduler.scheduled_launch(self.job_id):
+                try:
+                    usage_lib.messages.usage.set_internal()
+                    # Detach setup, so that the setup failure can be detected
+                    # by the controller process (job_status -> FAILED_SETUP).
+                    execution.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as the job
+                        # is finished. However, in case the controller dies, set
+                        # autodown to try and avoid a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        _is_launched_by_jobs_controller=True)
+                    logger.info('Managed job cluster launched.')
+                except (exceptions.InvalidClusterNameError,
+                        exceptions.NoCloudAccessError,
+                        exceptions.ResourcesMismatchError) as e:
+                    logger.error('Failure happened before provisioning. '
+                                 f'{common_utils.format_exception(e)}')
                     if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(
-                            reasons=reasons)
-                    return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may be
-                # UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors or '
-                    'the cluster being preempted during job submission.')
-
-            terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch clusters '
-                            f'after {max_retry} retries.')
-                else:
+                        raise exceptions.ProvisionPrechecksError(reasons=[e])
                     return None
+                except exceptions.ResourcesUnavailableError as e:
+                    # This is raised when the launch fails due to prechecks or
+                    # after failing over through all the candidates.
+                    # Please refer to the docstring of `sky.launch` for more
+                    # details of how the exception will be structured.
+                    if not any(
+                            isinstance(err,
+                                       exceptions.ResourcesUnavailableError)
+                            for err in e.failover_history):
+                        # _launch() (this function) should fail/exit directly,
+                        # if none of the failover reasons were because of
+                        # resource unavailability or no failover was attempted
+                        # (the optimizer cannot find feasible resources for
+                        # requested resources), i.e., e.failover_history is
+                        # empty. Failing directly avoids the infinite loop of
+                        # retrying the launch when, e.g., an invalid cluster
+                        # name is used and --retry-until-up is specified.
+                        reasons = (e.failover_history
+                                   if e.failover_history else [e])
+                        reasons_str = '; '.join(
+                            common_utils.format_exception(err)
+                            for err in reasons)
+                        logger.error(
+                            'Failure happened before provisioning. Failover '
+                            f'reasons: {reasons_str}')
+                        if raise_on_failure:
+                            raise exceptions.ProvisionPrechecksError(reasons)
+                        return None
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                except Exception as e:  # pylint: disable=broad-except
+                    # If the launch fails, it will be recovered by the following
+                    # code.
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                    with ux_utils.enable_traceback():
+                        logger.info(f'  Traceback: {traceback.format_exc()}')
+                else:  # No exception, the launch succeeds.
+                    # At this point, a sky.launch() has succeeded. Cluster may
+                    # be UP (no preemption since) or DOWN (newly preempted).
+                    job_submitted_at = self._wait_until_job_starts_on_cluster()
+                    if job_submitted_at is not None:
+                        return job_submitted_at
+                    # The job fails to start on the cluster, retry the launch.
+                    # TODO(zhwu): log the unexpected error to usage collection
+                    # for future debugging.
+                    logger.info(
+                        'Failed to successfully submit the job to the '
+                        'launched cluster, due to unexpected submission errors '
+                        'or the cluster being preempted during job submission.')
+
+                # If we get here, the launch did not succeed. Tear down the
+                # cluster and retry.
+                managed_job_utils.terminate_cluster(self.cluster_name)
+                if max_retry is not None and retry_cnt >= max_retry:
+                    # Retry forever if max_retry is None.
+                    if raise_on_failure:
+                        with ux_utils.print_exception_no_traceback():
+                            raise exceptions.ManagedJobReachedMaxRetriesError(
+                                'Resources unavailable: failed to launch '
+                                f'clusters after {max_retry} retries.')
+                    else:
+                        return None
+            # Exit the scheduled_launch context so that the scheulde state is
+            # ALIVE during the backoff. This allows other jobs to launch.
             gap_seconds = backoff.current_backoff()
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
 
+    def should_restart_on_failure(self) -> bool:
+        """Increments counter & checks if job should be restarted on a failure.
 
-
-
+        Returns:
+            True if the job should be restarted, otherwise False.
+        """
+        self.restart_cnt_on_failure += 1
+        if self.restart_cnt_on_failure > self.max_restarts_on_errors:
+            return False
+        return True
+
+
+@registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(name='FAILOVER',
+                                                        default=False)
+class FailoverStrategyExecutor(StrategyExecutor):
     """Failover strategy: wait in same region and failover after timeout."""
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
-
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
+        super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+                         job_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
-        self._launched_resources: Optional['
+        self._launched_resources: Optional['resources.Resources'] = None
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
@@ -431,7 +442,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
         # Step 2
         logger.debug('Terminating unhealthy cluster and reset cloud '
                      'region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 3
         logger.debug('Relaunch the cluster without constraining to prior '
@@ -441,23 +452,18 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
             return job_submitted_at
 
 
-
-
-
+@registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(
+    name='EAGER_NEXT_REGION', default=True)
+class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
     """Eager failover strategy.
 
     This strategy is an extension of the FAILOVER strategy. Instead of waiting
@@ -494,7 +500,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -529,15 +535,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
             return job_submitted_at