skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -13,50 +13,33 @@ from typing import Optional
13
13
  import sky
14
14
  from sky import backends
15
15
  from sky import exceptions
16
+ from sky import execution
16
17
  from sky import global_user_state
17
18
  from sky import sky_logging
18
- from sky import status_lib
19
19
  from sky.backends import backend_utils
20
+ from sky.jobs import scheduler
20
21
  from sky.jobs import utils as managed_job_utils
21
22
  from sky.skylet import job_lib
22
23
  from sky.usage import usage_lib
23
24
  from sky.utils import common_utils
25
+ from sky.utils import registry
26
+ from sky.utils import status_lib
24
27
  from sky.utils import ux_utils
25
28
 
26
29
  if typing.TYPE_CHECKING:
30
+ from sky import resources
27
31
  from sky import task as task_lib
28
32
 
29
33
  logger = sky_logging.init_logger(__name__)
30
34
 
31
- RECOVERY_STRATEGIES = {}
32
- DEFAULT_RECOVERY_STRATEGY = None
33
-
34
35
  # Waiting time for job from INIT/PENDING to RUNNING
35
36
  # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
36
37
  MAX_JOB_CHECKING_RETRY = 10
37
38
 
38
-
39
- def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
40
- """Terminate the cluster."""
41
- retry_cnt = 0
42
- while True:
43
- try:
44
- usage_lib.messages.usage.set_internal()
45
- sky.down(cluster_name)
46
- return
47
- except ValueError:
48
- # The cluster is already down.
49
- return
50
- except Exception as e: # pylint: disable=broad-except
51
- retry_cnt += 1
52
- if retry_cnt >= max_retry:
53
- raise RuntimeError(
54
- f'Failed to terminate the cluster {cluster_name}.') from e
55
- logger.error(
56
- f'Failed to terminate the cluster {cluster_name}. Retrying.'
57
- f'Details: {common_utils.format_exception(e)}')
58
- with ux_utils.enable_traceback():
59
- logger.error(f' Traceback: {traceback.format_exc()}')
39
+ # Minutes to job cluster autodown. This should be significantly larger than
40
+ # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
41
+ # cluster before its status can be updated by the job controller.
42
+ _AUTODOWN_MINUTES = 5
60
43
 
61
44
 
62
45
  class StrategyExecutor:
@@ -65,14 +48,14 @@ class StrategyExecutor:
65
48
  RETRY_INIT_GAP_SECONDS = 60
66
49
 
67
50
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
68
- task: 'task_lib.Task', retry_until_up: bool) -> None:
51
+ task: 'task_lib.Task', max_restarts_on_errors: int,
52
+ job_id: int) -> None:
69
53
  """Initialize the strategy executor.
70
54
 
71
55
  Args:
72
56
  cluster_name: The name of the cluster.
73
57
  backend: The backend to use. Only CloudVMRayBackend is supported.
74
58
  task: The task to execute.
75
- retry_until_up: Whether to retry until the cluster is up.
76
59
  """
77
60
  assert isinstance(backend, backends.CloudVmRayBackend), (
78
61
  'Only CloudVMRayBackend is supported.')
@@ -80,19 +63,13 @@ class StrategyExecutor:
80
63
  self.dag.add(task)
81
64
  self.cluster_name = cluster_name
82
65
  self.backend = backend
83
- self.retry_until_up = retry_until_up
84
-
85
- def __init_subclass__(cls, name: str, default: bool = False):
86
- RECOVERY_STRATEGIES[name] = cls
87
- if default:
88
- global DEFAULT_RECOVERY_STRATEGY
89
- assert DEFAULT_RECOVERY_STRATEGY is None, (
90
- 'Only one strategy can be default.')
91
- DEFAULT_RECOVERY_STRATEGY = name
66
+ self.max_restarts_on_errors = max_restarts_on_errors
67
+ self.job_id = job_id
68
+ self.restart_cnt_on_failure = 0
92
69
 
93
70
  @classmethod
94
71
  def make(cls, cluster_name: str, backend: 'backends.Backend',
95
- task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor':
72
+ task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
96
73
  """Create a strategy from a task."""
97
74
 
98
75
  resource_list = list(task.resources)
@@ -108,8 +85,19 @@ class StrategyExecutor:
108
85
  # set the new_task_resources to be the same type (list or set) as the
109
86
  # original task.resources
110
87
  task.set_resources(type(task.resources)(new_resources_list))
111
- return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
112
- retry_until_up)
88
+ if isinstance(job_recovery, dict):
89
+ job_recovery_name = job_recovery.pop(
90
+ 'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
91
+ max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
92
+ 0)
93
+ else:
94
+ job_recovery_name = job_recovery
95
+ max_restarts_on_errors = 0
96
+ job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
97
+ from_str(job_recovery_name))
98
+ assert job_recovery_strategy is not None, job_recovery_name
99
+ return job_recovery_strategy(cluster_name, backend, task,
100
+ max_restarts_on_errors, job_id)
113
101
 
114
102
  def launch(self) -> float:
115
103
  """Launch the cluster for the first time.
@@ -123,10 +111,7 @@ class StrategyExecutor:
123
111
  Raises: Please refer to the docstring of self._launch().
124
112
  """
125
113
 
126
- if self.retry_until_up:
127
- job_submit_at = self._launch(max_retry=None)
128
- else:
129
- job_submit_at = self._launch()
114
+ job_submit_at = self._launch(max_retry=None)
130
115
  assert job_submit_at is not None
131
116
  return job_submit_at
132
117
 
@@ -141,6 +126,8 @@ class StrategyExecutor:
141
126
  raise NotImplementedError
142
127
 
143
128
  def _try_cancel_all_jobs(self):
129
+ from sky import core # pylint: disable=import-outside-toplevel
130
+
144
131
  handle = global_user_state.get_handle_from_cluster_name(
145
132
  self.cluster_name)
146
133
  if handle is None:
@@ -166,9 +153,9 @@ class StrategyExecutor:
166
153
  # should be functional with the `_try_cancel_if_cluster_is_init`
167
154
  # flag, i.e. it sends the cancel signal to the head node, which will
168
155
  # then kill the user process on remaining worker nodes.
169
- sky.cancel(cluster_name=self.cluster_name,
170
- all=True,
171
- _try_cancel_if_cluster_is_init=True)
156
+ core.cancel(cluster_name=self.cluster_name,
157
+ all=True,
158
+ _try_cancel_if_cluster_is_init=True)
172
159
  except Exception as e: # pylint: disable=broad-except
173
160
  logger.info('Failed to cancel the job on the cluster. The cluster '
174
161
  'might be already down or the head node is preempted.'
@@ -176,7 +163,7 @@ class StrategyExecutor:
176
163
  f'{common_utils.format_exception(e)}\n'
177
164
  'Terminating the cluster explicitly to ensure no '
178
165
  'remaining job process interferes with recovery.')
179
- terminate_cluster(self.cluster_name)
166
+ managed_job_utils.terminate_cluster(self.cluster_name)
180
167
 
181
168
  def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
182
169
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -270,8 +257,8 @@ class StrategyExecutor:
270
257
  1. The optimizer cannot find a feasible solution.
271
258
  2. Precheck errors: invalid cluster name, failure in getting
272
259
  cloud user identity, or unsupported feature.
273
- exceptions.SpotJobReachedMaxRetryError: This will be raised when
274
- all prechecks passed but the maximum number of retries is
260
+ exceptions.ManagedJobReachedMaxRetriesError: This will be raised
261
+ when all prechecks passed but the maximum number of retries is
275
262
  reached for `sky.launch`. The failure of `sky.launch` can be
276
263
  due to:
277
264
  1. Any of the underlying failover exceptions is due to resources
@@ -285,104 +272,128 @@ class StrategyExecutor:
285
272
  backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
286
273
  while True:
287
274
  retry_cnt += 1
288
- try:
289
- usage_lib.messages.usage.set_internal()
290
- # Detach setup, so that the setup failure can be detected
291
- # by the controller process (job_status -> FAILED_SETUP).
292
- sky.launch(self.dag,
293
- cluster_name=self.cluster_name,
294
- detach_setup=True,
295
- detach_run=True,
296
- _is_launched_by_jobs_controller=True)
297
- logger.info('Managed job cluster launched.')
298
- except (exceptions.InvalidClusterNameError,
299
- exceptions.NoCloudAccessError,
300
- exceptions.ResourcesMismatchError) as e:
301
- logger.error('Failure happened before provisioning. '
302
- f'{common_utils.format_exception(e)}')
303
- if raise_on_failure:
304
- raise exceptions.ProvisionPrechecksError(reasons=[e])
305
- return None
306
- except exceptions.ResourcesUnavailableError as e:
307
- # This is raised when the launch fails due to prechecks or
308
- # after failing over through all the candidates.
309
- # Please refer to the docstring of `sky.launch` for more
310
- # details of how the exception will be structured.
311
- if not any(
312
- isinstance(err, exceptions.ResourcesUnavailableError)
313
- for err in e.failover_history):
314
- # _launch() (this function) should fail/exit directly, if
315
- # none of the failover reasons were because of resource
316
- # unavailability or no failover was attempted (the optimizer
317
- # cannot find feasible resources for requested resources),
318
- # i.e., e.failover_history is empty.
319
- # Failing directly avoids the infinite loop of retrying
320
- # the launch when, e.g., an invalid cluster name is used
321
- # and --retry-until-up is specified.
322
- reasons = (e.failover_history
323
- if e.failover_history else [e])
324
- reasons_str = '; '.join(
325
- common_utils.format_exception(err) for err in reasons)
326
- logger.error(
327
- 'Failure happened before provisioning. Failover '
328
- f'reasons: {reasons_str}')
275
+ with scheduler.scheduled_launch(self.job_id):
276
+ try:
277
+ usage_lib.messages.usage.set_internal()
278
+ # Detach setup, so that the setup failure can be detected
279
+ # by the controller process (job_status -> FAILED_SETUP).
280
+ execution.launch(
281
+ self.dag,
282
+ cluster_name=self.cluster_name,
283
+ # We expect to tear down the cluster as soon as the job
284
+ # is finished. However, in case the controller dies, set
285
+ # autodown to try and avoid a resource leak.
286
+ idle_minutes_to_autostop=_AUTODOWN_MINUTES,
287
+ down=True,
288
+ _is_launched_by_jobs_controller=True)
289
+ logger.info('Managed job cluster launched.')
290
+ except (exceptions.InvalidClusterNameError,
291
+ exceptions.NoCloudAccessError,
292
+ exceptions.ResourcesMismatchError) as e:
293
+ logger.error('Failure happened before provisioning. '
294
+ f'{common_utils.format_exception(e)}')
329
295
  if raise_on_failure:
330
- raise exceptions.ProvisionPrechecksError(
331
- reasons=reasons)
332
- return None
333
- logger.info('Failed to launch a cluster with error: '
334
- f'{common_utils.format_exception(e)})')
335
- except Exception as e: # pylint: disable=broad-except
336
- # If the launch fails, it will be recovered by the following
337
- # code.
338
- logger.info('Failed to launch a cluster with error: '
339
- f'{common_utils.format_exception(e)})')
340
- with ux_utils.enable_traceback():
341
- logger.info(f' Traceback: {traceback.format_exc()}')
342
- else: # No exception, the launch succeeds.
343
- # At this point, a sky.launch() has succeeded. Cluster may be
344
- # UP (no preemption since) or DOWN (newly preempted).
345
- job_submitted_at = self._wait_until_job_starts_on_cluster()
346
- if job_submitted_at is not None:
347
- return job_submitted_at
348
- # The job fails to start on the cluster, retry the launch.
349
- # TODO(zhwu): log the unexpected error to usage collection
350
- # for future debugging.
351
- logger.info(
352
- 'Failed to successfully submit the job to the '
353
- 'launched cluster, due to unexpected submission errors or '
354
- 'the cluster being preempted during job submission.')
355
-
356
- terminate_cluster(self.cluster_name)
357
- if max_retry is not None and retry_cnt >= max_retry:
358
- # Retry forever if max_retry is None.
359
- if raise_on_failure:
360
- with ux_utils.print_exception_no_traceback():
361
- raise exceptions.ManagedJobReachedMaxRetriesError(
362
- 'Resources unavailable: failed to launch clusters '
363
- f'after {max_retry} retries.')
364
- else:
296
+ raise exceptions.ProvisionPrechecksError(reasons=[e])
365
297
  return None
298
+ except exceptions.ResourcesUnavailableError as e:
299
+ # This is raised when the launch fails due to prechecks or
300
+ # after failing over through all the candidates.
301
+ # Please refer to the docstring of `sky.launch` for more
302
+ # details of how the exception will be structured.
303
+ if not any(
304
+ isinstance(err,
305
+ exceptions.ResourcesUnavailableError)
306
+ for err in e.failover_history):
307
+ # _launch() (this function) should fail/exit directly,
308
+ # if none of the failover reasons were because of
309
+ # resource unavailability or no failover was attempted
310
+ # (the optimizer cannot find feasible resources for
311
+ # requested resources), i.e., e.failover_history is
312
+ # empty. Failing directly avoids the infinite loop of
313
+ # retrying the launch when, e.g., an invalid cluster
314
+ # name is used and --retry-until-up is specified.
315
+ reasons = (e.failover_history
316
+ if e.failover_history else [e])
317
+ reasons_str = '; '.join(
318
+ common_utils.format_exception(err)
319
+ for err in reasons)
320
+ logger.error(
321
+ 'Failure happened before provisioning. Failover '
322
+ f'reasons: {reasons_str}')
323
+ if raise_on_failure:
324
+ raise exceptions.ProvisionPrechecksError(reasons)
325
+ return None
326
+ logger.info('Failed to launch a cluster with error: '
327
+ f'{common_utils.format_exception(e)})')
328
+ except Exception as e: # pylint: disable=broad-except
329
+ # If the launch fails, it will be recovered by the following
330
+ # code.
331
+ logger.info('Failed to launch a cluster with error: '
332
+ f'{common_utils.format_exception(e)})')
333
+ with ux_utils.enable_traceback():
334
+ logger.info(f' Traceback: {traceback.format_exc()}')
335
+ else: # No exception, the launch succeeds.
336
+ # At this point, a sky.launch() has succeeded. Cluster may
337
+ # be UP (no preemption since) or DOWN (newly preempted).
338
+ job_submitted_at = self._wait_until_job_starts_on_cluster()
339
+ if job_submitted_at is not None:
340
+ return job_submitted_at
341
+ # The job fails to start on the cluster, retry the launch.
342
+ # TODO(zhwu): log the unexpected error to usage collection
343
+ # for future debugging.
344
+ logger.info(
345
+ 'Failed to successfully submit the job to the '
346
+ 'launched cluster, due to unexpected submission errors '
347
+ 'or the cluster being preempted during job submission.')
348
+
349
+ # If we get here, the launch did not succeed. Tear down the
350
+ # cluster and retry.
351
+ managed_job_utils.terminate_cluster(self.cluster_name)
352
+ if max_retry is not None and retry_cnt >= max_retry:
353
+ # Retry forever if max_retry is None.
354
+ if raise_on_failure:
355
+ with ux_utils.print_exception_no_traceback():
356
+ raise exceptions.ManagedJobReachedMaxRetriesError(
357
+ 'Resources unavailable: failed to launch '
358
+ f'clusters after {max_retry} retries.')
359
+ else:
360
+ return None
361
+ # Exit the scheduled_launch context so that the schedule state is
362
+ # ALIVE during the backoff. This allows other jobs to launch.
366
363
  gap_seconds = backoff.current_backoff()
367
364
  logger.info('Retrying to launch the cluster in '
368
365
  f'{gap_seconds:.1f} seconds.')
369
366
  time.sleep(gap_seconds)
370
367
 
368
+ def should_restart_on_failure(self) -> bool:
369
+ """Increments counter & checks if job should be restarted on a failure.
371
370
 
372
- class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
373
- default=False):
371
+ Returns:
372
+ True if the job should be restarted, otherwise False.
373
+ """
374
+ self.restart_cnt_on_failure += 1
375
+ if self.restart_cnt_on_failure > self.max_restarts_on_errors:
376
+ return False
377
+ return True
378
+
379
+
380
+ @registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(name='FAILOVER',
381
+ default=False)
382
+ class FailoverStrategyExecutor(StrategyExecutor):
374
383
  """Failover strategy: wait in same region and failover after timeout."""
375
384
 
376
385
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
377
386
 
378
387
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
379
- task: 'task_lib.Task', retry_until_up: bool) -> None:
380
- super().__init__(cluster_name, backend, task, retry_until_up)
388
+ task: 'task_lib.Task', max_restarts_on_errors: int,
389
+ job_id: int) -> None:
390
+ super().__init__(cluster_name, backend, task, max_restarts_on_errors,
391
+ job_id)
381
392
  # Note down the cloud/region of the launched cluster, so that we can
382
393
  # first retry in the same cloud/region. (Inside recover() we may not
383
394
  # rely on cluster handle, as it can be None if the cluster is
384
395
  # preempted.)
385
- self._launched_resources: Optional['sky.resources.Resources'] = None
396
+ self._launched_resources: Optional['resources.Resources'] = None
386
397
 
387
398
  def _launch(self,
388
399
  max_retry: Optional[int] = 3,
@@ -431,7 +442,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
431
442
  # Step 2
432
443
  logger.debug('Terminating unhealthy cluster and reset cloud '
433
444
  'region.')
434
- terminate_cluster(self.cluster_name)
445
+ managed_job_utils.terminate_cluster(self.cluster_name)
435
446
 
436
447
  # Step 3
437
448
  logger.debug('Relaunch the cluster without constraining to prior '
@@ -441,23 +452,18 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
441
452
  raise_on_failure=False)
442
453
  if job_submitted_at is None:
443
454
  # Failed to launch the cluster.
444
- if self.retry_until_up:
445
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
446
- logger.info('Retrying to recover the cluster in '
447
- f'{gap_seconds:.1f} seconds.')
448
- time.sleep(gap_seconds)
449
- continue
450
- with ux_utils.print_exception_no_traceback():
451
- raise exceptions.ResourcesUnavailableError(
452
- f'Failed to recover the cluster after retrying '
453
- f'{self._MAX_RETRY_CNT} times.')
455
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
456
+ logger.info('Retrying to recover the cluster in '
457
+ f'{gap_seconds:.1f} seconds.')
458
+ time.sleep(gap_seconds)
459
+ continue
454
460
 
455
461
  return job_submitted_at
456
462
 
457
463
 
458
- class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
459
- name='EAGER_NEXT_REGION',
460
- default=True):
464
+ @registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(
465
+ name='EAGER_NEXT_REGION', default=True)
466
+ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
461
467
  """Eager failover strategy.
462
468
 
463
469
  This strategy is an extension of the FAILOVER strategy. Instead of waiting
@@ -494,7 +500,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
494
500
 
495
501
  # Step 1
496
502
  logger.debug('Terminating unhealthy cluster and reset cloud region.')
497
- terminate_cluster(self.cluster_name)
503
+ managed_job_utils.terminate_cluster(self.cluster_name)
498
504
 
499
505
  # Step 2
500
506
  logger.debug('Relaunch the cluster skipping the previously launched '
@@ -529,15 +535,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
529
535
  raise_on_failure=False)
530
536
  if job_submitted_at is None:
531
537
  # Failed to launch the cluster.
532
- if self.retry_until_up:
533
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
534
- logger.info('Retrying to recover the cluster in '
535
- f'{gap_seconds:.1f} seconds.')
536
- time.sleep(gap_seconds)
537
- continue
538
- with ux_utils.print_exception_no_traceback():
539
- raise exceptions.ResourcesUnavailableError(
540
- f'Failed to recover the cluster after retrying '
541
- f'{self._MAX_RETRY_CNT} times.')
538
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
539
+ logger.info('Retrying to recover the cluster in '
540
+ f'{gap_seconds:.1f} seconds.')
541
+ time.sleep(gap_seconds)
542
+ continue
542
543
 
543
544
  return job_submitted_at