skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
```diff
@@ -9,9 +9,11 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
 be called from any code running on the managed jobs controller instance to
 trigger scheduling of new jobs if possible. This function should be called
 immediately after any state change that could result in jobs newly being able to
-be scheduled.
+be scheduled. If the job is running in a pool, the scheduler will only schedule
+jobs for the same pool, because the resources limitations are per-pool (see the
+following section for more details).
 
-The scheduling logic limits the number of running jobs according to two limits:
+The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
    once, based on the number of CPUs. (See _get_launch_parallelism.) This the
    most compute-intensive part of the job lifecycle, which is why we have an
@@ -20,6 +22,8 @@ The scheduling logic limits the number of running jobs according to two limits:
    of memory. (See _get_job_parallelism.) Since the job controller is doing very
    little once a job starts (just checking its status periodically), the most
    significant resource it consumes is memory.
+3. The number of jobs that can be running in a pool at any given time, based on
+   the number of ready workers in the pool. (See _can_start_new_job.)
 
 The state of the scheduler is entirely determined by the schedule_state column
 of all the jobs in the job_info table. This column should only be modified via
```
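The docstring now describes a three-way admission check. A minimal sketch restating it as standalone Python: `LAUNCHES_PER_CPU = 4` is the real constant from this file (it appears in a later hunk), but the CPU count and the memory-derived job cap below are illustrative stand-ins, not SkyPilot's actual values.

```python
LAUNCHES_PER_CPU = 4  # real constant from this file; all numbers below are illustrative

def can_admit(launching: int, alive: int, alive_in_pool: int,
              ready_pool_workers: int, cpus: int = 8,
              memory_job_cap: int = 64) -> bool:
    # Limit 1: launches (STARTING/RECOVERING jobs) are CPU-bound.
    if launching >= cpus * LAUNCHES_PER_CPU:
        return False
    # Limit 2: alive jobs are memory-bound (one controller process each).
    if alive >= memory_job_cap:
        return False
    # Limit 3 (new): at most one running job per ready pool worker.
    return alive_in_pool < ready_pool_workers
```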
```diff
@@ -43,6 +47,7 @@ import os
 import sys
 import time
 import typing
+from typing import Optional
 
 import filelock
 
@@ -51,6 +56,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
@@ -80,18 +86,21 @@ LAUNCHES_PER_CPU = 4
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
+    # TODO(tian): Per pool lock.
     path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     return path
 
 
-def _start_controller(job_id: int, dag_yaml_path: str,
-                      env_file_path: str) -> None:
+def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
+                      pool: Optional[str]) -> None:
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
-                          f'{dag_yaml_path} --job-id {job_id};')
+    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
+    run_controller_cmd = (
+        f'{sys.executable} -u -m sky.jobs.controller '
+        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
 
     # If the command line here is changed, please also update
     # utils._controller_process_alive. The substring `--job-id X`
```
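To make the new launch command concrete, here is a standalone reconstruction of the string `_start_controller` now assembles. `build_controller_cmd` is a hypothetical name for this sketch; the real code inlines the logic into `run_controller_cmd`.

```python
import sys
from typing import Optional

def build_controller_cmd(job_id: int, dag_yaml_path: str,
                         pool: Optional[str]) -> str:
    # Mirrors the f-strings added in the hunk above.
    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
    return (f'{sys.executable} -u -m sky.jobs.controller '
            f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')

print(build_controller_cmd(42, 'dag.yaml', 'my-pool'))
# e.g. /usr/bin/python3 -u -m sky.jobs.controller dag.yaml --job-id 42 --pool my-pool;
print(build_controller_cmd(42, 'dag.yaml', None))
# e.g. /usr/bin/python3 -u -m sky.jobs.controller dag.yaml --job-id 42 ;
```

As the comment in the hunk notes, the `--job-id X` substring is load-bearing: `utils._controller_process_alive` relies on it, so the flag ordering cannot change freely.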
```diff
@@ -111,7 +120,7 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs() -> None:
+def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -141,6 +150,13 @@ def maybe_schedule_next_jobs() -> None:
     the jobs controller instance. New job controller processes will be detached
     from the current process and there will not be a parent/child relationship.
     See launch_new_process_tree for more.
+
+    After adding the pool support, this function will be called in a per-pool
+    basis. We employ resources limitation for each pool given the number of
+    ready workers in the pool. Each pool will have its own scheduler queue,
+    indicating by the argument `pool`. Finished job in pool 1 will only trigger
+    another jobs in pool 1, but the job in pool 2 will still be waiting. When
+    the `pool` argument is None, it schedules a job regardless of the pool.
     """
     try:
         # We must use a global lock rather than a per-job lock to ensure correct
```
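In practice the contract is: pass the pool whose resources just changed, or `None` to consider every waiting job. A hedged usage sketch follows; the `maybe_schedule_next_jobs` call is real per this diff, but the wrapper function is hypothetical.

```python
from typing import Optional

from sky.jobs import scheduler

def on_pool_event(pool: Optional[str]) -> None:
    # A job finishing in pool 'a' frees a worker only in pool 'a', so
    # scheduling is re-run scoped to that pool; jobs waiting in pool 'b'
    # stay queued. pool=None considers every waiting job, pooled or not.
    scheduler.maybe_schedule_next_jobs(pool)
```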
```diff
@@ -149,10 +165,11 @@ def maybe_schedule_next_jobs() -> None:
         # releasing the lock.
         with filelock.FileLock(_get_lock_path(), blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job()
+                maybe_next_job = state.get_waiting_job(pool)
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
+                actual_pool = maybe_next_job['pool']
 
                 current_state = maybe_next_job['schedule_state']
 
@@ -171,7 +188,17 @@ def maybe_schedule_next_jobs() -> None:
                     # Can't schedule anything, break from scheduling loop.
                     break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job():
+                    if not _can_start_new_job(actual_pool):
+                        # If there is no job can be scheduled in the pool, we
+                        # try to schedule another job regardless of the pool.
+                        # This is to avoid the case where the pool is scaled
+                        # down at the same time as a job is done. In this case,
+                        # we won't have any job to schedule in the pool, but
+                        # other jobs in other pool (or no pool) can still be
+                        # scheduled.
+                        if pool is not None:
+                            pool = None
+                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break
 
```
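The fallback in this hunk is easy to miss: when the scoped queue has a candidate but its pool has no capacity, the scheduler widens its scope to all pools instead of stalling. Below is a self-contained sketch of just that control flow, with the state lookups stubbed out (`get_waiting_job` and `can_start` stand in for `state.get_waiting_job` and `_can_start_new_job`).

```python
from typing import Callable, Optional

def schedule_loop(pool: Optional[str],
                  get_waiting_job: Callable[[Optional[str]], Optional[dict]],
                  can_start: Callable[[Optional[str]], bool]) -> None:
    while True:
        job = get_waiting_job(pool)
        if job is None:
            break  # nothing left to start
        if not can_start(job['pool']):
            if pool is not None:
                # The pool may have scaled down just as a job finished;
                # retry across all pools rather than stalling.
                pool = None
                continue
            break
        ...  # transition the job to LAUNCHING and start its controller
```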
```diff
@@ -187,7 +214,8 @@ def maybe_schedule_next_jobs() -> None:
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
                 env_file_path = maybe_next_job['env_file_path']
 
-                _start_controller(job_id, dag_yaml_path, env_file_path)
+                _start_controller(job_id, dag_yaml_path, env_file_path,
+                                  actual_pool)
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -196,7 +224,7 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int) -> None:
+               env_file_path: str, priority: int, pool: Optional[str]) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -213,9 +241,9 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                                             common_utils.get_user_hash(),
                                             priority)
     if is_resume:
-        _start_controller(job_id, dag_yaml_path, env_file_path)
+        _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 @contextlib.contextmanager
```
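A hypothetical call site for the widened `submit_job` signature; the paths and priority value are illustrative only, and `pool=None` preserves the pre-pool behavior.

```python
from sky.jobs.scheduler import submit_job

submit_job(job_id=7,
           dag_yaml_path='/path/to/managed-job.yaml',
           original_user_yaml_path='/path/to/user-task.yaml',
           env_file_path='/path/to/controller.env',
           priority=500,
           pool='my-pool')  # or pool=None for a job outside any pool
```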
```diff
@@ -251,6 +279,7 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
@@ -264,7 +293,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(_get_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -279,17 +308,19 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
+    pool = state.get_pool_from_job_id(job_id)
 
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-    maybe_schedule_next_jobs()
+    pool = state.get_pool_from_job_id(job_id)
+    maybe_schedule_next_jobs(pool)
 
 
 def _get_job_parallelism() -> int:
```
```diff
@@ -305,11 +336,23 @@ def _get_launch_parallelism() -> int:
     return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
-def _can_start_new_job() -> bool:
+def _can_start_new_job(pool: Optional[str]) -> bool:
     launching_jobs = state.get_num_launching_jobs()
     alive_jobs = state.get_num_alive_jobs()
-    return (launching_jobs < _get_launch_parallelism() and
-            alive_jobs < _get_job_parallelism())
+
+    # Check basic resource limits
+    if not (launching_jobs < _get_launch_parallelism() and
+            alive_jobs < _get_job_parallelism()):
+        return False
+
+    # Check if there are available replicas in the pool
+    if pool is not None:
+        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
+        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
+            logger.debug(f'No replicas available in pool {pool}')
+            return False
+
+    return True
 
 
 def _can_lauch_in_alive_job() -> bool:
```
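The per-pool gate reduces to one comparison: at most one running job per ready worker. A tiny worked illustration (`pool_has_capacity` is a hypothetical helper mirroring the check above):

```python
def pool_has_capacity(alive_jobs_in_pool: int, ready_workers: int) -> bool:
    # Mirrors: reject when alive_jobs_in_pool >= serve_utils.num_replicas(pool).
    return alive_jobs_in_pool < ready_workers

assert pool_has_capacity(2, 3)      # a third job may start
assert not pool_has_capacity(3, 3)  # pool is full; the job stays WAITING
```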
```diff
@@ -332,6 +375,11 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument('--pool',
+                        type=str,
+                        required=False,
+                        default=None,
+                        help='The pool to use for the controller job.')
     parser.add_argument(
         '--priority',
         type=int,
@@ -341,4 +389,4 @@ if __name__ == '__main__':
         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority)
+               args.priority, args.pool)
```