skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (72) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/jobs/__init__.py +3 -0
  25. sky/jobs/client/sdk.py +80 -3
  26. sky/jobs/controller.py +76 -25
  27. sky/jobs/recovery_strategy.py +80 -34
  28. sky/jobs/scheduler.py +68 -20
  29. sky/jobs/server/core.py +228 -136
  30. sky/jobs/server/server.py +40 -0
  31. sky/jobs/state.py +129 -24
  32. sky/jobs/utils.py +109 -51
  33. sky/provision/nebius/constants.py +3 -0
  34. sky/py.typed +0 -0
  35. sky/resources.py +16 -12
  36. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  37. sky/serve/autoscalers.py +8 -0
  38. sky/serve/client/impl.py +188 -0
  39. sky/serve/client/sdk.py +12 -82
  40. sky/serve/constants.py +5 -1
  41. sky/serve/controller.py +5 -0
  42. sky/serve/replica_managers.py +112 -37
  43. sky/serve/serve_state.py +16 -6
  44. sky/serve/serve_utils.py +274 -77
  45. sky/serve/server/core.py +8 -525
  46. sky/serve/server/impl.py +709 -0
  47. sky/serve/service.py +13 -9
  48. sky/serve/service_spec.py +74 -4
  49. sky/server/constants.py +1 -1
  50. sky/server/requests/payloads.py +33 -0
  51. sky/server/requests/requests.py +18 -1
  52. sky/server/requests/serializers/decoders.py +12 -3
  53. sky/server/requests/serializers/encoders.py +13 -2
  54. sky/skylet/events.py +9 -0
  55. sky/skypilot_config.py +24 -21
  56. sky/task.py +41 -11
  57. sky/templates/jobs-controller.yaml.j2 +3 -0
  58. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  59. sky/users/server.py +1 -1
  60. sky/utils/command_runner.py +4 -2
  61. sky/utils/controller_utils.py +14 -10
  62. sky/utils/dag_utils.py +4 -2
  63. sky/utils/db/migration_utils.py +2 -4
  64. sky/utils/schemas.py +24 -19
  65. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  66. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
  67. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
  68. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  69. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  70. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  71. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  72. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -9,9 +9,11 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
9
9
  be called from any code running on the managed jobs controller instance to
10
10
  trigger scheduling of new jobs if possible. This function should be called
11
11
  immediately after any state change that could result in jobs newly being able to
12
- be scheduled.
12
+ be scheduled. If the job is running in a pool, the scheduler will only schedule
13
+ jobs for the same pool, because the resource limitations are per-pool (see the
14
+ following section for more details).
13
15
 
14
- The scheduling logic limits the number of running jobs according to two limits:
16
+ The scheduling logic limits the number of running jobs according to three limits:
15
17
  1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
16
18
  once, based on the number of CPUs. (See _get_launch_parallelism.) This the
17
19
  most compute-intensive part of the job lifecycle, which is why we have an
@@ -20,6 +22,8 @@ The scheduling logic limits the number of running jobs according to two limits:
20
22
  of memory. (See _get_job_parallelism.) Since the job controller is doing very
21
23
  little once a job starts (just checking its status periodically), the most
22
24
  significant resource it consumes is memory.
25
+ 3. The number of jobs that can be running in a pool at any given time, based on
26
+ the number of ready workers in the pool. (See _can_start_new_job.)
23
27
 
24
28
  The state of the scheduler is entirely determined by the schedule_state column
25
29
  of all the jobs in the job_info table. This column should only be modified via
@@ -43,6 +47,7 @@ import os
43
47
  import sys
44
48
  import time
45
49
  import typing
50
+ from typing import Optional
46
51
 
47
52
  import filelock
48
53
 
@@ -51,6 +56,7 @@ from sky import sky_logging
51
56
  from sky.adaptors import common as adaptors_common
52
57
  from sky.jobs import constants as managed_job_constants
53
58
  from sky.jobs import state
59
+ from sky.serve import serve_utils
54
60
  from sky.skylet import constants
55
61
  from sky.utils import common_utils
56
62
  from sky.utils import subprocess_utils
@@ -80,18 +86,21 @@ LAUNCHES_PER_CPU = 4
80
86
 
81
87
  @lru_cache(maxsize=1)
82
88
  def _get_lock_path() -> str:
89
+ # TODO(tian): Per pool lock.
83
90
  path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
84
91
  os.makedirs(os.path.dirname(path), exist_ok=True)
85
92
  return path
86
93
 
87
94
 
88
- def _start_controller(job_id: int, dag_yaml_path: str,
89
- env_file_path: str) -> None:
95
+ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
96
+ pool: Optional[str]) -> None:
90
97
  activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
91
98
  source_environment_cmd = (f'source {env_file_path};'
92
99
  if env_file_path else '')
93
- run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
94
- f'{dag_yaml_path} --job-id {job_id};')
100
+ maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
101
+ run_controller_cmd = (
102
+ f'{sys.executable} -u -m sky.jobs.controller '
103
+ f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
95
104
 
96
105
  # If the command line here is changed, please also update
97
106
  # utils._controller_process_alive. The substring `--job-id X`
@@ -111,7 +120,7 @@ def _start_controller(job_id: int, dag_yaml_path: str,
111
120
  logger.debug(f'Job {job_id} started with pid {pid}')
112
121
 
113
122
 
114
- def maybe_schedule_next_jobs() -> None:
123
+ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
115
124
  """Determine if any managed jobs can be scheduled, and if so, schedule them.
116
125
 
117
126
  Here, "schedule" means to select job that is waiting, and allow it to
@@ -141,6 +150,13 @@ def maybe_schedule_next_jobs() -> None:
141
150
  the jobs controller instance. New job controller processes will be detached
142
151
  from the current process and there will not be a parent/child relationship.
143
152
  See launch_new_process_tree for more.
153
+
154
+ After adding pool support, this function will be called on a per-pool
155
+ basis. We enforce resource limitations for each pool based on the number of
156
+ ready workers in the pool. Each pool will have its own scheduler queue,
157
+ indicated by the argument `pool`. A finished job in pool 1 will only trigger
158
+ another job in pool 1, while jobs in pool 2 will still be waiting. When
159
+ the `pool` argument is None, it schedules a job regardless of the pool.
144
160
  """
145
161
  try:
146
162
  # We must use a global lock rather than a per-job lock to ensure correct
@@ -149,10 +165,11 @@ def maybe_schedule_next_jobs() -> None:
149
165
  # releasing the lock.
150
166
  with filelock.FileLock(_get_lock_path(), blocking=False):
151
167
  while True:
152
- maybe_next_job = state.get_waiting_job()
168
+ maybe_next_job = state.get_waiting_job(pool)
153
169
  if maybe_next_job is None:
154
170
  # Nothing left to start, break from scheduling loop
155
171
  break
172
+ actual_pool = maybe_next_job['pool']
156
173
 
157
174
  current_state = maybe_next_job['schedule_state']
158
175
 
@@ -171,7 +188,17 @@ def maybe_schedule_next_jobs() -> None:
171
188
  # Can't schedule anything, break from scheduling loop.
172
189
  break
173
190
  elif current_state == state.ManagedJobScheduleState.WAITING:
174
- if not _can_start_new_job():
191
+ if not _can_start_new_job(actual_pool):
192
+ # If no job can be scheduled in the pool, we
193
+ # try to schedule another job regardless of the pool.
194
+ # This is to avoid the case where the pool is scaled
195
+ # down at the same time as a job is done. In this case,
196
+ # we won't have any job to schedule in the pool, but
197
+ # other jobs in other pools (or no pool) can still be
198
+ # scheduled.
199
+ if pool is not None:
200
+ pool = None
201
+ continue
175
202
  # Can't schedule anything, break from scheduling loop.
176
203
  break
177
204
 
@@ -187,7 +214,8 @@ def maybe_schedule_next_jobs() -> None:
187
214
  dag_yaml_path = maybe_next_job['dag_yaml_path']
188
215
  env_file_path = maybe_next_job['env_file_path']
189
216
 
190
- _start_controller(job_id, dag_yaml_path, env_file_path)
217
+ _start_controller(job_id, dag_yaml_path, env_file_path,
218
+ actual_pool)
191
219
 
192
220
  except filelock.Timeout:
193
221
  # If we can't get the lock, just exit. The process holding the lock
@@ -196,7 +224,7 @@ def maybe_schedule_next_jobs() -> None:
196
224
 
197
225
 
198
226
  def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
199
- env_file_path: str, priority: int) -> None:
227
+ env_file_path: str, priority: int, pool: Optional[str]) -> None:
200
228
  """Submit an existing job to the scheduler.
201
229
 
202
230
  This should be called after a job is created in the `spot` table as
@@ -213,9 +241,9 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
213
241
  common_utils.get_user_hash(),
214
242
  priority)
215
243
  if is_resume:
216
- _start_controller(job_id, dag_yaml_path, env_file_path)
244
+ _start_controller(job_id, dag_yaml_path, env_file_path, pool)
217
245
  else:
218
- maybe_schedule_next_jobs()
246
+ maybe_schedule_next_jobs(pool)
219
247
 
220
248
 
221
249
  @contextlib.contextmanager
@@ -251,6 +279,7 @@ def scheduled_launch(job_id: int):
251
279
  while (state.get_job_schedule_state(job_id) !=
252
280
  state.ManagedJobScheduleState.LAUNCHING):
253
281
  time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
282
+ pool = state.get_pool_from_job_id(job_id)
254
283
 
255
284
  try:
256
285
  yield
@@ -264,7 +293,7 @@ def scheduled_launch(job_id: int):
264
293
  with filelock.FileLock(_get_lock_path()):
265
294
  state.scheduler_set_alive(job_id)
266
295
  finally:
267
- maybe_schedule_next_jobs()
296
+ maybe_schedule_next_jobs(pool)
268
297
 
269
298
 
270
299
  def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -279,17 +308,19 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
279
308
  if idempotent and (state.get_job_schedule_state(job_id)
280
309
  == state.ManagedJobScheduleState.DONE):
281
310
  return
311
+ pool = state.get_pool_from_job_id(job_id)
282
312
 
283
313
  with filelock.FileLock(_get_lock_path()):
284
314
  state.scheduler_set_done(job_id, idempotent)
285
- maybe_schedule_next_jobs()
315
+ maybe_schedule_next_jobs(pool)
286
316
 
287
317
 
288
318
  def _set_alive_waiting(job_id: int) -> None:
289
319
  """Should use wait_until_launch_okay() to transition to this state."""
290
320
  with filelock.FileLock(_get_lock_path()):
291
321
  state.scheduler_set_alive_waiting(job_id)
292
- maybe_schedule_next_jobs()
322
+ pool = state.get_pool_from_job_id(job_id)
323
+ maybe_schedule_next_jobs(pool)
293
324
 
294
325
 
295
326
  def _get_job_parallelism() -> int:
@@ -305,11 +336,23 @@ def _get_launch_parallelism() -> int:
305
336
  return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
306
337
 
307
338
 
308
- def _can_start_new_job() -> bool:
339
+ def _can_start_new_job(pool: Optional[str]) -> bool:
309
340
  launching_jobs = state.get_num_launching_jobs()
310
341
  alive_jobs = state.get_num_alive_jobs()
311
- return launching_jobs < _get_launch_parallelism(
312
- ) and alive_jobs < _get_job_parallelism()
342
+
343
+ # Check basic resource limits
344
+ if not (launching_jobs < _get_launch_parallelism() and
345
+ alive_jobs < _get_job_parallelism()):
346
+ return False
347
+
348
+ # Check if there are available replicas in the pool
349
+ if pool is not None:
350
+ alive_jobs_in_pool = state.get_num_alive_jobs(pool)
351
+ if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
352
+ logger.debug(f'No replicas available in pool {pool}')
353
+ return False
354
+
355
+ return True
313
356
 
314
357
 
315
358
  def _can_lauch_in_alive_job() -> bool:
@@ -332,6 +375,11 @@ if __name__ == '__main__':
332
375
  parser.add_argument('--env-file',
333
376
  type=str,
334
377
  help='The path to the controller env file.')
378
+ parser.add_argument('--pool',
379
+ type=str,
380
+ required=False,
381
+ default=None,
382
+ help='The pool to use for the controller job.')
335
383
  parser.add_argument(
336
384
  '--priority',
337
385
  type=int,
@@ -341,4 +389,4 @@ if __name__ == '__main__':
341
389
  f' Default: {constants.DEFAULT_PRIORITY}.')
342
390
  args = parser.parse_args()
343
391
  submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
344
- args.priority)
392
+ args.priority, args.pool)