skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -1,29 +1,36 @@
-"""Controller: handles the life cycle of a managed job."""
+"""Controller: handles the life cycle of a managed job.
+
+TODO(cooperc): Document lifecycle, and multiprocess layout.
+"""
 import argparse
 import multiprocessing
 import os
 import pathlib
+import shutil
 import time
 import traceback
 import typing
-from typing import Tuple
+from typing import Optional, Tuple
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky import status_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
+from sky.data import data_utils
 from sky.jobs import recovery_strategy
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
@@ -46,12 +53,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""
 
-    def __init__(self, job_id: int, dag_yaml: str,
-                 retry_until_up: bool) -> None:
+    def __init__(self, job_id: int, dag_yaml: str) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
-        self._retry_until_up = retry_until_up
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 
@@ -64,8 +69,9 @@ class JobsController:
         if len(self._dag.tasks) <= 1:
             task_name = self._dag_name
         else:
+            assert task.name is not None, task
             task_name = task.name
-            # This is guaranteed by the
+            # This is guaranteed by the jobs.launch API, where we fill in
             # the task.name with
             # dag_utils.maybe_infer_and_fill_dag_and_task_names.
         assert task_name is not None, self._dag
@@ -86,18 +92,28 @@ class JobsController:
         task.update_envs(task_envs)
 
     def _download_log_and_stream(
-            self,
-            handle: cloud_vm_ray_backend.CloudVmRayResourceHandle) -> None:
-        """Downloads and streams the logs of the latest job.
+        self, task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+    ) -> None:
+        """Downloads and streams the logs of the current job with given task ID.
 
         We do not stream the logs from the cluster directly, as the
         donwload and stream should be faster, and more robust against
         preemptions or ssh disconnection during the streaming.
         """
+        if handle is None:
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
+            return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'managed_jobs')
-        controller_utils.download_and_stream_latest_job_log(
+        log_file = controller_utils.download_and_stream_latest_job_log(
             self._backend, handle, managed_job_logs_dir)
+        if log_file is not None:
+            # Set the path of the log file for the current task, so it can be
+            # accessed even after the job is finished
+            managed_job_state.set_local_log_file(self._job_id, task_id,
+                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -124,8 +140,8 @@ class JobsController:
                 1. The optimizer cannot find a feasible solution.
                 2. Precheck errors: invalid cluster name, failure in getting
                 cloud user identity, or unsupported feature.
-            exceptions.
-                all prechecks passed but the maximum number of retries is
+            exceptions.ManagedJobReachedMaxRetriesError: This will be raised
+                when all prechecks passed but the maximum number of retries is
                 reached for `sky.launch`. The failure of `sky.launch` can be
                 due to:
                 1. Any of the underlying failover exceptions is due to resources
@@ -159,6 +175,11 @@ class JobsController:
         if task_id == 0:
             submitted_at = backend_utils.get_timestamp_from_run_timestamp(
                 self._backend.run_timestamp)
+        assert task.name is not None, task
+        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+            task.name, self._job_id)
+        self._strategy_executor = recovery_strategy.StrategyExecutor.make(
+            cluster_name, self._backend, task, self._job_id)
         managed_job_state.set_submitted(
             self._job_id,
             task_id,
@@ -166,15 +187,14 @@ class JobsController:
             submitted_at,
             resources_str=backend_utils.get_task_resources_str(
                 task, is_managed_job=True),
+            specs={
+                'max_restarts_on_errors':
+                    self._strategy_executor.max_restarts_on_errors
+            },
             callback_func=callback_func)
         logger.info(
             f'Submitted managed job {self._job_id} (task: {task_id}, name: '
             f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
-        assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
-        self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._retry_until_up)
 
         logger.info('Started monitoring.')
         managed_job_state.set_starting(job_id=self._job_id,
@@ -187,6 +207,7 @@ class JobsController:
                                        task_id=task_id,
                                        start_time=remote_job_submitted_at,
                                        callback_func=callback_func)
+
         while True:
             time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
@@ -206,22 +227,39 @@ class JobsController:
                 self._backend, cluster_name)
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
-                end_time = managed_job_utils.get_job_timestamp(
-                    self._backend, cluster_name, get_end_time=True)
-                # The job is done.
+                end_time = managed_job_utils.try_to_get_job_end_time(
+                    self._backend, cluster_name)
+                # The job is done. Set the job to SUCCEEDED first before start
+                # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
                                                 end_time=end_time,
                                                 callback_func=callback_func)
                 logger.info(
-                    f'
+                    f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
+                try:
+                    clusters = backend_utils.get_clusters(
+                        cluster_names=[cluster_name],
+                        refresh=common.StatusRefreshMode.NONE,
+                        all_users=True)
+                    if clusters:
+                        assert len(clusters) == 1, (clusters, cluster_name)
+                        handle = clusters[0].get('handle')
+                        # Best effort to download and stream the logs.
+                        self._download_log_and_stream(task_id, handle)
+                except Exception as e:  # pylint: disable=broad-except
+                    # We don't want to crash here, so just log and continue.
+                    logger.warning(
+                        f'Failed to download and stream logs: '
+                        f'{common_utils.format_exception(e)}',
+                        exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
                 return True
 
-            # For single-node jobs,
+            # For single-node jobs, non-terminated job_status indicates a
             # healthy cluster. We can safely continue monitoring.
             # For multi-node jobs, since the job may not be set to FAILED
             # immediately (depending on user program) when only some of the
@@ -231,9 +269,7 @@ class JobsController:
                     task.num_nodes == 1):
                 continue
 
-            if job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            if job_status in job_lib.JobStatus.user_code_failure_states():
                 # Add a grace period before the check of preemption to avoid
                 # false alarm for job failure.
                 time.sleep(5)
@@ -263,17 +299,15 @@ class JobsController:
             if job_status is not None and not job_status.is_terminal():
                 # The multi-node job is still running, continue monitoring.
                 continue
-            elif job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            elif job_status in job_lib.JobStatus.user_code_failure_states():
                 # The user code has probably crashed, fail immediately.
-                end_time = managed_job_utils.get_job_timestamp(
-                    self._backend, cluster_name, get_end_time=True)
+                end_time = managed_job_utils.try_to_get_job_end_time(
+                    self._backend, cluster_name)
                 logger.info(
                     'The user job failed. Please check the logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
-                self._download_log_and_stream(handle)
+                self._download_log_and_stream(task_id, handle)
                 managed_job_status = (
                     managed_job_state.ManagedJobStatus.FAILED)
                 if job_status == job_lib.JobStatus.FAILED_SETUP:
@@ -282,23 +316,35 @@ class JobsController:
                     failure_reason = (
                         'To see the details, run: '
                         f'sky jobs logs --controller {self._job_id}')
-
-                managed_job_state.set_failed(
-                    self._job_id,
-                    task_id,
-                    failure_type=managed_job_status,
-                    failure_reason=failure_reason,
-                    end_time=end_time,
-                    callback_func=callback_func)
-                return False
-
-            # Although the cluster is healthy, we fail to access the
-            # job status. Try to recover the job (will not restart the
-            # cluster, if the cluster is healthy).
-            assert job_status is None, job_status
-            logger.info('Failed to fetch the job status while the '
-                        'cluster is healthy. Try to recover the job '
-                        '(the cluster will not be restarted).')
+                should_restart_on_failure = (
+                    self._strategy_executor.should_restart_on_failure())
+                if should_restart_on_failure:
+                    max_restarts = (
+                        self._strategy_executor.max_restarts_on_errors)
+                    logger.info(
+                        f'User program crashed '
+                        f'({managed_job_status.value}). '
+                        f'Retry the job as max_restarts_on_errors is '
+                        f'set to {max_restarts}. '
+                        f'[{self._strategy_executor.restart_cnt_on_failure}'
+                        f'/{max_restarts}]')
+                else:
+                    managed_job_state.set_failed(
+                        self._job_id,
+                        task_id,
+                        failure_type=managed_job_status,
+                        failure_reason=failure_reason,
+                        end_time=end_time,
+                        callback_func=callback_func)
+                    return False
+            else:
+                # Although the cluster is healthy, we fail to access the
+                # job status. Try to recover the job (will not restart the
+                # cluster, if the cluster is healthy).
+                assert job_status is None, job_status
+                logger.info('Failed to fetch the job status while the '
+                            'cluster is healthy. Try to recover the job '
+                            '(the cluster will not be restarted).')
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources
@@ -309,7 +355,7 @@ class JobsController:
                 # those clusters again may fail.
                 logger.info('Cleaning up the preempted or failed cluster'
                             '...')
-                recovery_strategy.terminate_cluster(cluster_name)
+                managed_job_utils.terminate_cluster(cluster_name)
 
             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -339,48 +385,28 @@ class JobsController:
                     common_utils.format_exception(reason, use_bracket=True)
                     for reason in e.reasons))
                 logger.error(failure_reason)
-                managed_job_state.set_failed(
-                    self._job_id,
-                    task_id=task_id,
-                    failure_type=managed_job_state.ManagedJobStatus.
-                    FAILED_PRECHECKS,
-                    failure_reason=failure_reason,
-                    callback_func=managed_job_utils.event_callback_func(
-                        job_id=self._job_id,
-                        task_id=task_id,
-                        task=self._dag.tasks[task_id]))
+                self._update_failed_task_state(
+                    task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
+                    failure_reason)
             except exceptions.ManagedJobReachedMaxRetriesError as e:
                 # Please refer to the docstring of self._run for the cases when
                 # this exception can occur.
-                logger.error(common_utils.format_exception(e))
+                failure_reason = common_utils.format_exception(e)
+                logger.error(failure_reason)
                 # The managed job should be marked as FAILED_NO_RESOURCE, as the
                 # managed job may be able to launch next time.
-                managed_job_state.set_failed(
-                    self._job_id,
-                    task_id=task_id,
-                    failure_type=managed_job_state.ManagedJobStatus.
-                    FAILED_NO_RESOURCE,
-                    failure_reason=common_utils.format_exception(e),
-                    callback_func=managed_job_utils.event_callback_func(
-                        job_id=self._job_id,
-                        task_id=task_id,
-                        task=self._dag.tasks[task_id]))
+                self._update_failed_task_state(
+                    task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
+                    failure_reason)
             except (Exception, SystemExit) as e:  # pylint: disable=broad-except
                 with ux_utils.enable_traceback():
                     logger.error(traceback.format_exc())
-                msg = ('Unexpected error occurred: '
-                       f'{common_utils.format_exception(e, use_bracket=True)}')
+                msg = ('Unexpected error occurred: ' +
+                       common_utils.format_exception(e, use_bracket=True))
                 logger.error(msg)
-                managed_job_state.set_failed(
-                    self._job_id,
-                    task_id=task_id,
-                    failure_type=managed_job_state.ManagedJobStatus.
-                    FAILED_CONTROLLER,
-                    failure_reason=msg,
-                    callback_func=managed_job_utils.event_callback_func(
-                        job_id=self._job_id,
-                        task_id=task_id,
-                        task=self._dag.tasks[task_id]))
+                self._update_failed_task_state(
+                    task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
+                    msg)
             finally:
                 # This will set all unfinished tasks to CANCELLING, and will not
                 # affect the jobs in terminal states.
@@ -395,12 +421,27 @@ class JobsController:
             managed_job_state.set_cancelled(job_id=self._job_id,
                                             callback_func=callback_func)
 
+    def _update_failed_task_state(
+            self, task_id: int,
+            failure_type: managed_job_state.ManagedJobStatus,
+            failure_reason: str):
+        """Update the state of the failed task."""
+        managed_job_state.set_failed(
+            self._job_id,
+            task_id=task_id,
+            failure_type=failure_type,
+            failure_reason=failure_reason,
+            callback_func=managed_job_utils.event_callback_func(
+                job_id=self._job_id,
+                task_id=task_id,
+                task=self._dag.tasks[task_id]))
 
-def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
+
+def _run_controller(job_id: int, dag_yaml: str):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
+    jobs_controller = JobsController(job_id, dag_yaml)
     jobs_controller.run()
 
 
@@ -443,23 +484,44 @@ def _cleanup(job_id: int, dag_yaml: str):
     when reaching here, as we currently only support chain DAGs, and only
     task is executed at a time.
     """
-    # NOTE: The code to get cluster name is same as what we did in the spot
-    # controller, we should keep it in sync with JobsController.__init__()
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
+        assert task.name is not None, task
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, job_id)
-        recovery_strategy.terminate_cluster(cluster_name)
+        managed_job_utils.terminate_cluster(cluster_name)
+
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
         backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        # Need to re-construct storage object in the controller process
+        # because when SkyPilot API server machine sends the yaml config to the
+        # controller machine, only storage metadata is sent, not the storage
+        # object itself.
+        for storage in task.storage_mounts.values():
+            storage.construct()
         backend.teardown_ephemeral_storage(task)
 
-
-def start(job_id, dag_yaml, retry_until_up):
+        # Clean up any files mounted from the local disk, such as two-hop file
+        # mounts.
+        for file_mount in (task.file_mounts or {}).values():
+            try:
+                if not data_utils.is_cloud_store_url(file_mount):
+                    path = os.path.expanduser(file_mount)
+                    if os.path.isdir(path):
+                        shutil.rmtree(path)
+                    else:
+                        os.remove(path)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(
+                    f'Failed to clean up file mount {file_mount}: {e}')
+
+
+def start(job_id, dag_yaml):
     """Start the controller."""
     controller_process = None
     cancelling = False
+    task_id = None
     try:
         _handle_signal(job_id)
         # TODO(suquark): In theory, we should make controller process a
@@ -469,8 +531,7 @@ def start(job_id, dag_yaml, retry_until_up):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml,
-                                                           retry_until_up))
+                                                     args=(job_id, dag_yaml))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -478,6 +539,7 @@ def start(job_id, dag_yaml, retry_until_up):
     except exceptions.ManagedJobUserCancelledError:
         dag, _ = _get_dag_and_name(dag_yaml)
         task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+        assert task_id is not None, job_id
         logger.info(
             f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
         managed_job_state.set_cancelling(
@@ -492,8 +554,8 @@ def start(job_id, dag_yaml, retry_until_up):
             # Kill the controller process first; if its child process is
             # killed first, then the controller process will raise errors.
             # Kill any possible remaining children processes recursively.
-            subprocess_utils.kill_children_processes(
-                controller_process.pid, force=True)
+            subprocess_utils.kill_children_processes(
+                parent_pids=[controller_process.pid], force=True)
             controller_process.join()
             logger.info(f'Controller process {controller_process.pid} killed.')
 
@@ -509,6 +571,7 @@ def start(job_id, dag_yaml, retry_until_up):
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
     if cancelling:
+        assert task_id is not None, job_id  # Since it's set with cancelling
         managed_job_state.set_cancelled(
             job_id=job_id,
             callback_func=managed_job_utils.event_callback_func(
@@ -530,6 +593,8 @@ def start(job_id, dag_yaml, retry_until_up):
                 failure_reason=('Unexpected error occurred. For details, '
                                 f'run: sky jobs logs --controller {job_id}'))
 
+    scheduler.job_done(job_id)
+
 
 
 if __name__ == '__main__':
@@ -537,9 +602,6 @@ if __name__ == '__main__':
                         required=True,
                         type=int,
                         help='Job id for the controller job.')
-    parser.add_argument('--retry-until-up',
-                        action='store_true',
-                        help='Retry until the cluster is up.')
    parser.add_argument('dag_yaml',
                        type=str,
                        help='The path to the user job yaml file.')
@@ -547,4 +609,4 @@ if __name__ == '__main__':
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml, args.retry_until_up)
+    start(args.job_id, args.dag_yaml)