skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -60
- sky/client/common.py +12 -9
- sky/clouds/nebius.py +1 -1
- sky/clouds/utils/gcp_utils.py +1 -1
- sky/clouds/vast.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
- sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
- sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
- sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
- sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +25 -0
- sky/data/storage.py +1219 -1775
- sky/global_user_state.py +18 -8
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +164 -31
- sky/jobs/utils.py +144 -68
- sky/logs/aws.py +4 -2
- sky/provision/kubernetes/utils.py +6 -4
- sky/provision/nebius/constants.py +3 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vast/utils.py +9 -6
- sky/py.typed +0 -0
- sky/resources.py +24 -14
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +6 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
- sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
- sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
- sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
- sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
- sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
- /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:

     RETRY_INIT_GAP_SECONDS = 60

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.

         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None

     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""

         resource_list = list(task.resources)
@@ -103,7 +110,8 @@ class StrategyExecutor:
                                          from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)

     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError

-    def
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel

+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-
+            self._cleanup_cluster()

     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
         The timestamp of when the job is submitted, or None if failed to
         submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@ class StrategyExecutor:

             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend,
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@ class StrategyExecutor:
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend,
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@ class StrategyExecutor:
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None

+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@ class StrategyExecutor:
                 recovery)
             try:
                 usage_lib.messages.usage.set_internal()
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if self.pool is None:
+                    assert self.cluster_name is not None
+                    # Detach setup, so that the setup failure can be
+                    # detected by the controller process (job_status ->
+                    # FAILED_SETUP).
+                    execution.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as
+                        # the job is finished. However, in case the
+                        # controller dies, set autodown to try and avoid
+                        # a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        _is_launched_by_jobs_controller=True)
+                else:
+                    self.cluster_name = (
+                        serve_utils.get_next_cluster_name(
+                            self.pool, self.job_id))
+                    if self.cluster_name is None:
+                        raise exceptions.NoClusterLaunchedError(
+                            'No cluster name found in the pool.')
+                    job_id_on_pool_cluster, _ = execution.exec(
+                        self.dag, cluster_name=self.cluster_name)
+                    assert job_id_on_pool_cluster is not None, (
+                        self.cluster_name, self.job_id)
+                    self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                    state.set_job_id_on_pool_cluster(
+                        self.job_id, job_id_on_pool_cluster)
                 logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
@@ -373,7 +416,7 @@ class StrategyExecutor:

             # If we get here, the launch did not succeed. Tear down the
             # cluster and retry.
-
+            self._cleanup_cluster()
             if max_retry is not None and retry_cnt >= max_retry:
                 # Retry forever if max_retry is None.
                 if raise_on_failure:
@@ -398,7 +441,10 @@ class StrategyExecutor:
             # Update the status to PENDING during backoff.
             state.set_backoff_pending(self.job_id, self.task_id)
             # Calculate the backoff time and sleep.
-
+            # We retry immediately for worker pool, since no sky.launch()
+            # is called and the overhead is minimal.
+            gap_seconds = (backoff.current_backoff()
+                           if self.pool is None else 0)
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):

    _MAX_RETRY_CNT = 240  # Retry for 4 hours.

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
        # original user specification.

        # Step 1
-        self.
+        self._try_cancel_jobs()

        while True:
            # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
            # Step 2
            logger.debug('Terminating unhealthy cluster and reset cloud '
                         'region.')
-
+            self._cleanup_cluster()

            # Step 3
            logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

        # Step 1
        logger.debug('Terminating unhealthy cluster and reset cloud region.')
-
+        self._cleanup_cluster()

        # Step 2
        logger.debug('Relaunch the cluster skipping the previously launched '
sky/jobs/scheduler.py
CHANGED
@@ -9,9 +9,11 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
 be called from any code running on the managed jobs controller instance to
 trigger scheduling of new jobs if possible. This function should be called
 immediately after any state change that could result in jobs newly being able to
-be scheduled.
+be scheduled. If the job is running in a pool, the scheduler will only schedule
+jobs for the same pool, because the resources limitations are per-pool (see the
+following section for more details).

-The scheduling logic limits
+The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
    once, based on the number of CPUs. (See _get_launch_parallelism.) This the
    most compute-intensive part of the job lifecycle, which is why we have an
@@ -20,6 +22,8 @@ The scheduling logic limits the number of running jobs according to two limits:
    of memory. (See _get_job_parallelism.) Since the job controller is doing very
    little once a job starts (just checking its status periodically), the most
    significant resource it consumes is memory.
+3. The number of jobs that can be running in a pool at any given time, based on
+   the number of ready workers in the pool. (See _can_start_new_job.)

 The state of the scheduler is entirely determined by the schedule_state column
 of all the jobs in the job_info table. This column should only be modified via
@@ -43,6 +47,7 @@ import os
 import sys
 import time
 import typing
+from typing import Optional

 import filelock

@@ -51,6 +56,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
@@ -80,18 +86,21 @@ LAUNCHES_PER_CPU = 4

 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
+    # TODO(tian): Per pool lock.
     path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     return path


-def _start_controller(job_id: int, dag_yaml_path: str,
-
+def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
+                      pool: Optional[str]) -> None:
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-
-
+    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
+    run_controller_cmd = (
+        f'{sys.executable} -u -m sky.jobs.controller '
+        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')

     # If the command line here is changed, please also update
     # utils._controller_process_alive. The substring `--job-id X`
@@ -111,7 +120,7 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')


-def maybe_schedule_next_jobs() -> None:
+def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.

     Here, "schedule" means to select job that is waiting, and allow it to
@@ -141,6 +150,13 @@ def maybe_schedule_next_jobs() -> None:
     the jobs controller instance. New job controller processes will be detached
     from the current process and there will not be a parent/child relationship.
     See launch_new_process_tree for more.
+
+    After adding the pool support, this function will be called in a per-pool
+    basis. We employ resources limitation for each pool given the number of
+    ready workers in the pool. Each pool will have its own scheduler queue,
+    indicating by the argument `pool`. Finished job in pool 1 will only trigger
+    another jobs in pool 1, but the job in pool 2 will still be waiting. When
+    the `pool` argument is None, it schedules a job regardless of the pool.
     """
     try:
         # We must use a global lock rather than a per-job lock to ensure correct
@@ -149,10 +165,11 @@ def maybe_schedule_next_jobs() -> None:
         # releasing the lock.
         with filelock.FileLock(_get_lock_path(), blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job()
+                maybe_next_job = state.get_waiting_job(pool)
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
+                actual_pool = maybe_next_job['pool']

                 current_state = maybe_next_job['schedule_state']

@@ -171,7 +188,17 @@ def maybe_schedule_next_jobs() -> None:
                     # Can't schedule anything, break from scheduling loop.
                     break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job():
+                    if not _can_start_new_job(actual_pool):
+                        # If there is no job can be scheduled in the pool, we
+                        # try to schedule another job regardless of the pool.
+                        # This is to avoid the case where the pool is scaled
+                        # down at the same time as a job is done. In this case,
+                        # we won't have any job to schedule in the pool, but
+                        # other jobs in other pool (or no pool) can still be
+                        # scheduled.
+                        if pool is not None:
+                            pool = None
+                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break

@@ -187,7 +214,8 @@ def maybe_schedule_next_jobs() -> None:
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
                 env_file_path = maybe_next_job['env_file_path']

-                _start_controller(job_id, dag_yaml_path, env_file_path
+                _start_controller(job_id, dag_yaml_path, env_file_path,
+                                  actual_pool)

     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -196,7 +224,7 @@ def maybe_schedule_next_jobs() -> None:


 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int) -> None:
+               env_file_path: str, priority: int, pool: Optional[str]) -> None:
     """Submit an existing job to the scheduler.

     This should be called after a job is created in the `spot` table as
@@ -213,9 +241,9 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                          common_utils.get_user_hash(),
                          priority)
     if is_resume:
-        _start_controller(job_id, dag_yaml_path, env_file_path)
+        _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)


 @contextlib.contextmanager
@@ -251,6 +279,7 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    pool = state.get_pool_from_job_id(job_id)

     try:
         yield
@@ -264,7 +293,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(_get_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)


 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -279,17 +308,19 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
+    pool = state.get_pool_from_job_id(job_id)

     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)


 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
+        pool = state.get_pool_from_job_id(job_id)
+        maybe_schedule_next_jobs(pool)


 def _get_job_parallelism() -> int:
@@ -305,11 +336,23 @@ def _get_launch_parallelism() -> int:
     return cpus * LAUNCHES_PER_CPU if cpus is not None else 1


-def _can_start_new_job() -> bool:
+def _can_start_new_job(pool: Optional[str]) -> bool:
     launching_jobs = state.get_num_launching_jobs()
     alive_jobs = state.get_num_alive_jobs()
-
-
+
+    # Check basic resource limits
+    if not (launching_jobs < _get_launch_parallelism() and
+            alive_jobs < _get_job_parallelism()):
+        return False
+
+    # Check if there are available replicas in the pool
+    if pool is not None:
+        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
+        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
+            logger.debug(f'No replicas available in pool {pool}')
+            return False
+
+    return True


 def _can_lauch_in_alive_job() -> bool:
@@ -332,6 +375,11 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument('--pool',
+                        type=str,
+                        required=False,
+                        default=None,
+                        help='The pool to use for the controller job.')
     parser.add_argument(
         '--priority',
         type=int,
@@ -341,4 +389,4 @@ if __name__ == '__main__':
         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority)
+               args.priority, args.pool)
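The scheduler change layers a per-pool admission check (one alive job per ready worker) on top of the existing CPU and memory limits, and when a pool has no free worker it falls back to scheduling across all pools. Below is a rough, standalone sketch of that decision flow using an in-memory job table and a hard-coded worker count in place of the real state and serve_utils calls; all names and numbers here are illustrative assumptions, not the package's actual implementation.

from typing import Dict, List, Optional

# In-memory stand-ins for the state/serve_utils queries used by the scheduler.
POOL_READY_WORKERS: Dict[str, int] = {'pool-a': 1}
JOBS: List[dict] = [
    {'job_id': 1, 'pool': 'pool-a', 'state': 'WAITING'},
    {'job_id': 2, 'pool': None, 'state': 'WAITING'},
]
JOB_LIMIT = 8  # stands in for the memory-based job parallelism limit

def num_alive_jobs(pool: Optional[str] = None) -> int:
    return sum(1 for j in JOBS
               if j['state'] == 'ALIVE' and (pool is None or j['pool'] == pool))

def can_start_new_job(pool: Optional[str]) -> bool:
    if num_alive_jobs() >= JOB_LIMIT:  # launching-jobs limit omitted for brevity
        return False
    # Per-pool limit: at most one alive job per ready worker in the pool.
    if pool is not None and num_alive_jobs(pool) >= POOL_READY_WORKERS.get(pool, 0):
        return False
    return True

def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
    while True:
        waiting = [j for j in JOBS if j['state'] == 'WAITING' and
                   (pool is None or j['pool'] == pool)]
        if not waiting:
            break
        job = waiting[0]
        if not can_start_new_job(job['pool']):
            if pool is not None:
                pool = None  # fall back to scheduling regardless of pool
                continue
            break
        job['state'] = 'ALIVE'
        print(f"started job {job['job_id']} (pool={job['pool']})")

maybe_schedule_next_jobs('pool-a')  # starts job 1; job 2 stays WAITING
maybe_schedule_next_jobs(None)      # a pool-agnostic pass starts job 2

This mirrors why job_done, scheduled_launch, and _set_alive_waiting in the diff now look up the job's pool before calling maybe_schedule_next_jobs: finishing a pooled job frees a worker in that pool, so the scheduler first looks for waiting jobs in the same pool before considering the rest.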