skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -1,29 +1,36 @@
1
- """Controller: handles the life cycle of a managed job."""
1
+ """Controller: handles the life cycle of a managed job.
2
+
3
+ TODO(cooperc): Document lifecycle, and multiprocess layout.
4
+ """
2
5
  import argparse
3
6
  import multiprocessing
4
7
  import os
5
8
  import pathlib
9
+ import shutil
6
10
  import time
7
11
  import traceback
8
12
  import typing
9
- from typing import Tuple
13
+ from typing import Optional, Tuple
10
14
 
11
15
  import filelock
12
16
 
13
17
  from sky import exceptions
14
18
  from sky import sky_logging
15
- from sky import status_lib
16
19
  from sky.backends import backend_utils
17
20
  from sky.backends import cloud_vm_ray_backend
21
+ from sky.data import data_utils
18
22
  from sky.jobs import recovery_strategy
23
+ from sky.jobs import scheduler
19
24
  from sky.jobs import state as managed_job_state
20
25
  from sky.jobs import utils as managed_job_utils
21
26
  from sky.skylet import constants
22
27
  from sky.skylet import job_lib
23
28
  from sky.usage import usage_lib
29
+ from sky.utils import common
24
30
  from sky.utils import common_utils
25
31
  from sky.utils import controller_utils
26
32
  from sky.utils import dag_utils
33
+ from sky.utils import status_lib
27
34
  from sky.utils import subprocess_utils
28
35
  from sky.utils import ux_utils
29
36
 
@@ -46,12 +53,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
46
53
  class JobsController:
47
54
  """Each jobs controller manages the life cycle of one managed job."""
48
55
 
49
- def __init__(self, job_id: int, dag_yaml: str,
50
- retry_until_up: bool) -> None:
56
+ def __init__(self, job_id: int, dag_yaml: str) -> None:
51
57
  self._job_id = job_id
52
58
  self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
53
59
  logger.info(self._dag)
54
- self._retry_until_up = retry_until_up
55
60
  # TODO(zhwu): this assumes the specific backend.
56
61
  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
57
62
 
@@ -64,8 +69,9 @@ class JobsController:
64
69
  if len(self._dag.tasks) <= 1:
65
70
  task_name = self._dag_name
66
71
  else:
72
+ assert task.name is not None, task
67
73
  task_name = task.name
68
- # This is guaranteed by the spot_launch API, where we fill in
74
+ # This is guaranteed by the jobs.launch API, where we fill in
69
75
  # the task.name with
70
76
  # dag_utils.maybe_infer_and_fill_dag_and_task_names.
71
77
  assert task_name is not None, self._dag
@@ -86,18 +92,28 @@ class JobsController:
86
92
  task.update_envs(task_envs)
87
93
 
88
94
  def _download_log_and_stream(
89
- self,
90
- handle: cloud_vm_ray_backend.CloudVmRayResourceHandle) -> None:
91
- """Downloads and streams the logs of the latest job.
95
+ self, task_id: Optional[int],
96
+ handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
97
+ ) -> None:
98
+ """Downloads and streams the logs of the current job with given task ID.
92
99
 
93
100
  We do not stream the logs from the cluster directly, as the
94
101
  download and stream should be faster, and more robust against
95
102
  preemptions or ssh disconnection during the streaming.
96
103
  """
104
+ if handle is None:
105
+ logger.info(f'Cluster for job {self._job_id} is not found. '
106
+ 'Skipping downloading and streaming the logs.')
107
+ return
97
108
  managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
98
109
  'managed_jobs')
99
- controller_utils.download_and_stream_latest_job_log(
110
+ log_file = controller_utils.download_and_stream_latest_job_log(
100
111
  self._backend, handle, managed_job_logs_dir)
112
+ if log_file is not None:
113
+ # Set the path of the log file for the current task, so it can be
114
+ # accessed even after the job is finished
115
+ managed_job_state.set_local_log_file(self._job_id, task_id,
116
+ log_file)
101
117
  logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
102
118
 
103
119
  def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -124,8 +140,8 @@ class JobsController:
124
140
  1. The optimizer cannot find a feasible solution.
125
141
  2. Precheck errors: invalid cluster name, failure in getting
126
142
  cloud user identity, or unsupported feature.
127
- exceptions.SpotJobReachedMaxRetryError: This will be raised when
128
- all prechecks passed but the maximum number of retries is
143
+ exceptions.ManagedJobReachedMaxRetriesError: This will be raised
144
+ when all prechecks passed but the maximum number of retries is
129
145
  reached for `sky.launch`. The failure of `sky.launch` can be
130
146
  due to:
131
147
  1. Any of the underlying failover exceptions is due to resources
@@ -159,6 +175,11 @@ class JobsController:
159
175
  if task_id == 0:
160
176
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
161
177
  self._backend.run_timestamp)
178
+ assert task.name is not None, task
179
+ cluster_name = managed_job_utils.generate_managed_job_cluster_name(
180
+ task.name, self._job_id)
181
+ self._strategy_executor = recovery_strategy.StrategyExecutor.make(
182
+ cluster_name, self._backend, task, self._job_id)
162
183
  managed_job_state.set_submitted(
163
184
  self._job_id,
164
185
  task_id,
@@ -166,15 +187,14 @@ class JobsController:
166
187
  submitted_at,
167
188
  resources_str=backend_utils.get_task_resources_str(
168
189
  task, is_managed_job=True),
190
+ specs={
191
+ 'max_restarts_on_errors':
192
+ self._strategy_executor.max_restarts_on_errors
193
+ },
169
194
  callback_func=callback_func)
170
195
  logger.info(
171
196
  f'Submitted managed job {self._job_id} (task: {task_id}, name: '
172
197
  f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
173
- assert task.name is not None, task
174
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
175
- task.name, self._job_id)
176
- self._strategy_executor = recovery_strategy.StrategyExecutor.make(
177
- cluster_name, self._backend, task, self._retry_until_up)
178
198
 
179
199
  logger.info('Started monitoring.')
180
200
  managed_job_state.set_starting(job_id=self._job_id,
@@ -187,6 +207,7 @@ class JobsController:
187
207
  task_id=task_id,
188
208
  start_time=remote_job_submitted_at,
189
209
  callback_func=callback_func)
210
+
190
211
  while True:
191
212
  time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
192
213
 
@@ -206,22 +227,39 @@ class JobsController:
206
227
  self._backend, cluster_name)
207
228
 
208
229
  if job_status == job_lib.JobStatus.SUCCEEDED:
209
- end_time = managed_job_utils.get_job_timestamp(
210
- self._backend, cluster_name, get_end_time=True)
211
- # The job is done.
230
+ end_time = managed_job_utils.try_to_get_job_end_time(
231
+ self._backend, cluster_name)
232
+ # The job is done. Set the job to SUCCEEDED first before start
233
+ # downloading and streaming the logs to make it more responsive.
212
234
  managed_job_state.set_succeeded(self._job_id,
213
235
  task_id,
214
236
  end_time=end_time,
215
237
  callback_func=callback_func)
216
238
  logger.info(
217
- f'Spot job {self._job_id} (task: {task_id}) SUCCEEDED. '
239
+ f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
218
240
  f'Cleaning up the cluster {cluster_name}.')
241
+ try:
242
+ clusters = backend_utils.get_clusters(
243
+ cluster_names=[cluster_name],
244
+ refresh=common.StatusRefreshMode.NONE,
245
+ all_users=True)
246
+ if clusters:
247
+ assert len(clusters) == 1, (clusters, cluster_name)
248
+ handle = clusters[0].get('handle')
249
+ # Best effort to download and stream the logs.
250
+ self._download_log_and_stream(task_id, handle)
251
+ except Exception as e: # pylint: disable=broad-except
252
+ # We don't want to crash here, so just log and continue.
253
+ logger.warning(
254
+ f'Failed to download and stream logs: '
255
+ f'{common_utils.format_exception(e)}',
256
+ exc_info=True)
219
257
  # Only clean up the cluster, not the storages, because tasks may
220
258
  # share storages.
221
- recovery_strategy.terminate_cluster(cluster_name=cluster_name)
259
+ managed_job_utils.terminate_cluster(cluster_name=cluster_name)
222
260
  return True
223
261
 
224
- # For single-node jobs, nonterminated job_status indicates a
262
+ # For single-node jobs, non-terminated job_status indicates a
225
263
  # healthy cluster. We can safely continue monitoring.
226
264
  # For multi-node jobs, since the job may not be set to FAILED
227
265
  # immediately (depending on user program) when only some of the
@@ -231,9 +269,7 @@ class JobsController:
231
269
  task.num_nodes == 1):
232
270
  continue
233
271
 
234
- if job_status in [
235
- job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
236
- ]:
272
+ if job_status in job_lib.JobStatus.user_code_failure_states():
237
273
  # Add a grace period before the check of preemption to avoid
238
274
  # false alarm for job failure.
239
275
  time.sleep(5)
@@ -263,17 +299,15 @@ class JobsController:
263
299
  if job_status is not None and not job_status.is_terminal():
264
300
  # The multi-node job is still running, continue monitoring.
265
301
  continue
266
- elif job_status in [
267
- job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
268
- ]:
302
+ elif job_status in job_lib.JobStatus.user_code_failure_states():
269
303
  # The user code has probably crashed, fail immediately.
270
- end_time = managed_job_utils.get_job_timestamp(
271
- self._backend, cluster_name, get_end_time=True)
304
+ end_time = managed_job_utils.try_to_get_job_end_time(
305
+ self._backend, cluster_name)
272
306
  logger.info(
273
307
  'The user job failed. Please check the logs below.\n'
274
308
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
275
309
 
276
- self._download_log_and_stream(handle)
310
+ self._download_log_and_stream(task_id, handle)
277
311
  managed_job_status = (
278
312
  managed_job_state.ManagedJobStatus.FAILED)
279
313
  if job_status == job_lib.JobStatus.FAILED_SETUP:
@@ -282,23 +316,35 @@ class JobsController:
282
316
  failure_reason = (
283
317
  'To see the details, run: '
284
318
  f'sky jobs logs --controller {self._job_id}')
285
-
286
- managed_job_state.set_failed(
287
- self._job_id,
288
- task_id,
289
- failure_type=managed_job_status,
290
- failure_reason=failure_reason,
291
- end_time=end_time,
292
- callback_func=callback_func)
293
- return False
294
- # Although the cluster is healthy, we fail to access the
295
- # job status. Try to recover the job (will not restart the
296
- # cluster, if the cluster is healthy).
297
- assert job_status is None, job_status
298
- logger.info('Failed to fetch the job status while the '
299
- 'cluster is healthy. Try to recover the job '
300
- '(the cluster will not be restarted).')
301
-
319
+ should_restart_on_failure = (
320
+ self._strategy_executor.should_restart_on_failure())
321
+ if should_restart_on_failure:
322
+ max_restarts = (
323
+ self._strategy_executor.max_restarts_on_errors)
324
+ logger.info(
325
+ f'User program crashed '
326
+ f'({managed_job_status.value}). '
327
+ f'Retry the job as max_restarts_on_errors is '
328
+ f'set to {max_restarts}. '
329
+ f'[{self._strategy_executor.restart_cnt_on_failure}'
330
+ f'/{max_restarts}]')
331
+ else:
332
+ managed_job_state.set_failed(
333
+ self._job_id,
334
+ task_id,
335
+ failure_type=managed_job_status,
336
+ failure_reason=failure_reason,
337
+ end_time=end_time,
338
+ callback_func=callback_func)
339
+ return False
340
+ else:
341
+ # Although the cluster is healthy, we fail to access the
342
+ # job status. Try to recover the job (will not restart the
343
+ # cluster, if the cluster is healthy).
344
+ assert job_status is None, job_status
345
+ logger.info('Failed to fetch the job status while the '
346
+ 'cluster is healthy. Try to recover the job '
347
+ '(the cluster will not be restarted).')
302
348
  # When the handle is None, the cluster should be cleaned up already.
303
349
  if handle is not None:
304
350
  resources = handle.launched_resources
@@ -309,7 +355,7 @@ class JobsController:
309
355
  # those clusters again may fail.
310
356
  logger.info('Cleaning up the preempted or failed cluster'
311
357
  '...')
312
- recovery_strategy.terminate_cluster(cluster_name)
358
+ managed_job_utils.terminate_cluster(cluster_name)
313
359
 
314
360
  # Try to recover the managed jobs, when the cluster is preempted or
315
361
  # failed or the job status is failed to be fetched.
@@ -339,48 +385,28 @@ class JobsController:
339
385
  common_utils.format_exception(reason, use_bracket=True)
340
386
  for reason in e.reasons))
341
387
  logger.error(failure_reason)
342
- managed_job_state.set_failed(
343
- self._job_id,
344
- task_id=task_id,
345
- failure_type=managed_job_state.ManagedJobStatus.
346
- FAILED_PRECHECKS,
347
- failure_reason=failure_reason,
348
- callback_func=managed_job_utils.event_callback_func(
349
- job_id=self._job_id,
350
- task_id=task_id,
351
- task=self._dag.tasks[task_id]))
388
+ self._update_failed_task_state(
389
+ task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
390
+ failure_reason)
352
391
  except exceptions.ManagedJobReachedMaxRetriesError as e:
353
392
  # Please refer to the docstring of self._run for the cases when
354
393
  # this exception can occur.
355
- logger.error(common_utils.format_exception(e))
394
+ failure_reason = common_utils.format_exception(e)
395
+ logger.error(failure_reason)
356
396
  # The managed job should be marked as FAILED_NO_RESOURCE, as the
357
397
  # managed job may be able to launch next time.
358
- managed_job_state.set_failed(
359
- self._job_id,
360
- task_id=task_id,
361
- failure_type=managed_job_state.ManagedJobStatus.
362
- FAILED_NO_RESOURCE,
363
- failure_reason=common_utils.format_exception(e),
364
- callback_func=managed_job_utils.event_callback_func(
365
- job_id=self._job_id,
366
- task_id=task_id,
367
- task=self._dag.tasks[task_id]))
398
+ self._update_failed_task_state(
399
+ task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
400
+ failure_reason)
368
401
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
369
402
  with ux_utils.enable_traceback():
370
403
  logger.error(traceback.format_exc())
371
- msg = ('Unexpected error occurred: '
372
- f'{common_utils.format_exception(e, use_bracket=True)}')
404
+ msg = ('Unexpected error occurred: ' +
405
+ common_utils.format_exception(e, use_bracket=True))
373
406
  logger.error(msg)
374
- managed_job_state.set_failed(
375
- self._job_id,
376
- task_id=task_id,
377
- failure_type=managed_job_state.ManagedJobStatus.
378
- FAILED_CONTROLLER,
379
- failure_reason=msg,
380
- callback_func=managed_job_utils.event_callback_func(
381
- job_id=self._job_id,
382
- task_id=task_id,
383
- task=self._dag.tasks[task_id]))
407
+ self._update_failed_task_state(
408
+ task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
409
+ msg)
384
410
  finally:
385
411
  # This will set all unfinished tasks to CANCELLING, and will not
386
412
  # affect the jobs in terminal states.
@@ -395,12 +421,27 @@ class JobsController:
395
421
  managed_job_state.set_cancelled(job_id=self._job_id,
396
422
  callback_func=callback_func)
397
423
 
424
+ def _update_failed_task_state(
425
+ self, task_id: int,
426
+ failure_type: managed_job_state.ManagedJobStatus,
427
+ failure_reason: str):
428
+ """Update the state of the failed task."""
429
+ managed_job_state.set_failed(
430
+ self._job_id,
431
+ task_id=task_id,
432
+ failure_type=failure_type,
433
+ failure_reason=failure_reason,
434
+ callback_func=managed_job_utils.event_callback_func(
435
+ job_id=self._job_id,
436
+ task_id=task_id,
437
+ task=self._dag.tasks[task_id]))
398
438
 
399
- def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
439
+
440
+ def _run_controller(job_id: int, dag_yaml: str):
400
441
  """Runs the controller in a remote process for interruption."""
401
442
  # The controller needs to be instantiated in the remote process, since
402
443
  # the controller is not serializable.
403
- jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
444
+ jobs_controller = JobsController(job_id, dag_yaml)
404
445
  jobs_controller.run()
405
446
 
406
447
 
@@ -443,23 +484,44 @@ def _cleanup(job_id: int, dag_yaml: str):
443
484
  when reaching here, as we currently only support chain DAGs, and only
444
485
  task is executed at a time.
445
486
  """
446
- # NOTE: The code to get cluster name is same as what we did in the spot
447
- # controller, we should keep it in sync with JobsController.__init__()
448
487
  dag, _ = _get_dag_and_name(dag_yaml)
449
488
  for task in dag.tasks:
489
+ assert task.name is not None, task
450
490
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
451
491
  task.name, job_id)
452
- recovery_strategy.terminate_cluster(cluster_name)
492
+ managed_job_utils.terminate_cluster(cluster_name)
493
+
453
494
  # Clean up Storages with persistent=False.
454
495
  # TODO(zhwu): this assumes the specific backend.
455
496
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
497
+ # Need to re-construct storage object in the controller process
498
+ # because when SkyPilot API server machine sends the yaml config to the
499
+ # controller machine, only storage metadata is sent, not the storage
500
+ # object itself.
501
+ for storage in task.storage_mounts.values():
502
+ storage.construct()
456
503
  backend.teardown_ephemeral_storage(task)
457
504
 
458
-
459
- def start(job_id, dag_yaml, retry_until_up):
505
+ # Clean up any files mounted from the local disk, such as two-hop file
506
+ # mounts.
507
+ for file_mount in (task.file_mounts or {}).values():
508
+ try:
509
+ if not data_utils.is_cloud_store_url(file_mount):
510
+ path = os.path.expanduser(file_mount)
511
+ if os.path.isdir(path):
512
+ shutil.rmtree(path)
513
+ else:
514
+ os.remove(path)
515
+ except Exception as e: # pylint: disable=broad-except
516
+ logger.warning(
517
+ f'Failed to clean up file mount {file_mount}: {e}')
518
+
519
+
520
+ def start(job_id, dag_yaml):
460
521
  """Start the controller."""
461
522
  controller_process = None
462
523
  cancelling = False
524
+ task_id = None
463
525
  try:
464
526
  _handle_signal(job_id)
465
527
  # TODO(suquark): In theory, we should make controller process a
@@ -469,8 +531,7 @@ def start(job_id, dag_yaml, retry_until_up):
469
531
  # So we can only enable daemon after we no longer need to
470
532
  # start daemon processes like Ray.
471
533
  controller_process = multiprocessing.Process(target=_run_controller,
472
- args=(job_id, dag_yaml,
473
- retry_until_up))
534
+ args=(job_id, dag_yaml))
474
535
  controller_process.start()
475
536
  while controller_process.is_alive():
476
537
  _handle_signal(job_id)
@@ -478,6 +539,7 @@ def start(job_id, dag_yaml, retry_until_up):
478
539
  except exceptions.ManagedJobUserCancelledError:
479
540
  dag, _ = _get_dag_and_name(dag_yaml)
480
541
  task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
542
+ assert task_id is not None, job_id
481
543
  logger.info(
482
544
  f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
483
545
  managed_job_state.set_cancelling(
@@ -492,8 +554,8 @@ def start(job_id, dag_yaml, retry_until_up):
492
554
  # Kill the controller process first; if its child process is
493
555
  # killed first, then the controller process will raise errors.
494
556
  # Kill any possible remaining children processes recursively.
495
- subprocess_utils.kill_children_processes(controller_process.pid,
496
- force=True)
557
+ subprocess_utils.kill_children_processes(
558
+ parent_pids=[controller_process.pid], force=True)
497
559
  controller_process.join()
498
560
  logger.info(f'Controller process {controller_process.pid} killed.')
499
561
 
@@ -509,6 +571,7 @@ def start(job_id, dag_yaml, retry_until_up):
509
571
  logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
510
572
 
511
573
  if cancelling:
574
+ assert task_id is not None, job_id # Since it's set with cancelling
512
575
  managed_job_state.set_cancelled(
513
576
  job_id=job_id,
514
577
  callback_func=managed_job_utils.event_callback_func(
@@ -530,6 +593,8 @@ def start(job_id, dag_yaml, retry_until_up):
530
593
  failure_reason=('Unexpected error occurred. For details, '
531
594
  f'run: sky jobs logs --controller {job_id}'))
532
595
 
596
+ scheduler.job_done(job_id)
597
+
533
598
 
534
599
  if __name__ == '__main__':
535
600
  parser = argparse.ArgumentParser()
@@ -537,9 +602,6 @@ if __name__ == '__main__':
537
602
  required=True,
538
603
  type=int,
539
604
  help='Job id for the controller job.')
540
- parser.add_argument('--retry-until-up',
541
- action='store_true',
542
- help='Retry until the cluster is up.')
543
605
  parser.add_argument('dag_yaml',
544
606
  type=str,
545
607
  help='The path to the user job yaml file.')
@@ -547,4 +609,4 @@ if __name__ == '__main__':
547
609
  # We start process with 'spawn', because 'fork' could result in weird
548
610
  # behaviors; 'spawn' is also cross-platform.
549
611
  multiprocessing.set_start_method('spawn', force=True)
550
- start(args.job_id, args.dag_yaml, args.retry_until_up)
612
+ start(args.job_id, args.dag_yaml)