skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs. (See _get_launch_parallelism.) This the
-   most compute-intensive part of the job lifecycle, which is why we have an
-   additional limit.
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory. (See _get_job_parallelism.) Since the job controller is doing very
-   little once a job starts (just checking its status periodically), the most
-   significant resource it consumes is memory.
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -163,7 +136,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(_get_lock_path(), blocking=False):
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
             maybe_next_job = state.get_waiting_job(pool)
             if maybe_next_job is None:
@@ -184,7 +158,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
             # job.
             if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not _can_lauch_in_alive_job():
+                if not (controller_utils.can_provision() or
+                        actual_pool is not None):
                     # Can't schedule anything, break from scheduling loop.
                     break
             elif current_state == state.ManagedJobScheduleState.WAITING:
@@ -234,7 +209,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -286,11 +261,11 @@ def scheduled_launch(job_id: int):
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
        raise
    else:
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive(job_id)
    finally:
        maybe_schedule_next_jobs(pool)
@@ -310,56 +285,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
        return
    pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_done(job_id, idempotent)
        maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_alive_waiting(job_id)
        pool = state.get_pool_from_job_id(job_id)
        maybe_schedule_next_jobs(pool)
 
 
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
    # Check basic resource limits
-    if not (launching_jobs < _get_launch_parallelism() and
-            alive_jobs < _get_job_parallelism()):
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
        return False
 
-    # Check if there are available replicas in the pool
+    # Check if there are available workers in the pool
    if pool is not None:
        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
-            logger.debug(f'No replicas available in pool {pool}')
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
            return False
 
    return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('dag_yaml',
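Note on the refactor above: the CPU- and memory-based limits removed from scheduler.py now live, per the updated docstring, in sky/utils/controller_utils.py, which is not included in this excerpt. A minimal sketch of the equivalent computation, assuming the removed constants keep the values shown above; the function names below are illustrative and not the actual controller_utils API:

import os

import psutil  # SkyPilot loads this lazily via its adaptors; imported directly here.

# Values taken from the constants removed from sky/jobs/scheduler.py above.
JOB_MEMORY_MB = 350      # assumed memory footprint of one running job
MAX_JOB_LIMIT = 2000     # stability cap on simultaneous jobs
LAUNCHES_PER_CPU = 4     # concurrent launches allowed per CPU


def job_parallelism() -> int:
    """Max number of simultaneously running jobs, bounded by total memory."""
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    return max(min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT), 1)


def launch_parallelism() -> int:
    """Max number of simultaneous launches, bounded by CPU count."""
    cpus = os.cpu_count()
    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1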
sky/jobs/server/core.py CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
-                              num_jobs: Optional[int]) -> Optional[List[int]]:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs if num_jobs is not None else 1):
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag, pool, num_jobs)
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
     controller_task._metadata = metadata
 
     job_identity = ''
-    if consolidation_mode_job_id is not None:
-        job_identity = f' (Job ID: {consolidation_mode_job_id})'
+    if job_rank is not None:
+        job_identity = f' (rank: {job_rank})'
     logger.info(f'{colorama.Fore.YELLOW}'
                 f'Launching managed job {dag.name!r}{job_identity} '
                 f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
         backend.run_on_head(local_handle, run_script)
         return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
@@ -547,6 +553,10 @@ def _maybe_restart_controller(
             'controller'))
     with skypilot_config.local_active_workspace_ctx(
             skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+        global_user_state.add_cluster_event(
+            jobs_controller_type.value.cluster_name,
+            status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
         handle = core.start(
             cluster_name=jobs_controller_type.value.cluster_name)
 
sky/jobs/state.py CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 name=name,
                 schedule_state=ManagedJobScheduleState.INACTIVE.value,
                 workspace=workspace,
-                entrypoint=entrypoint)
+                entrypoint=entrypoint,
+                pool=pool,
+                pool_hash=pool_hash,
+            )
             session.execute(insert_stmt)
             session.commit()
 
sky/jobs/utils.py CHANGED
@@ -141,7 +141,7 @@ def _validate_consolidation_mode_config(
     if global_user_state.get_cluster_from_name(controller_cn) is not None:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.InconsistentConsolidationModeError(
-                f'{colorama.Fore.RED}Consolidation mode is '
+                f'{colorama.Fore.RED}Consolidation mode for jobs is '
                 f'enabled, but the controller cluster '
                 f'{controller_cn} is still running. Please '
                 'terminate the controller cluster first.'
@@ -179,7 +179,11 @@ def _validate_consolidation_mode_config(
 def is_consolidation_mode() -> bool:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
-    _validate_consolidation_mode_config(consolidation_mode)
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode)
     return consolidation_mode
 
 
@@ -333,6 +337,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if handle is not None:
             try:
                 if pool is None:
+                    global_user_state.add_cluster_event(
+                        cluster_name, None, 'Cluster was cleaned up.',
+                        global_user_state.ClusterEventType.STATUS_CHANGE)
                     terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
@@ -1683,6 +1690,7 @@ class ManagedJobCodeGen:
     def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
                     workspace: str, entrypoint: str) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
             set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1690,6 +1698,13 @@ class ManagedJobCodeGen:
             set_job_info_kwargs = {{}}
             if managed_job_version >= 5:
                 set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+            if managed_job_version >= 8:
+                from sky.serve import serve_state
+                pool_hash = None
+                if {pool!r} != None:
+                    pool_hash = serve_state.get_service_hash({pool!r})
+                set_job_info_kwargs['pool'] = {pool!r}
+                set_job_info_kwargs['pool_hash'] = pool_hash
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
sky/provision/__init__.py CHANGED
@@ -73,13 +73,15 @@ def _route_to_cloud_impl(func):
 @_route_to_cloud_impl
 def query_instances(
     provider_name: str,
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
-    Returns a dictionary of instance IDs and status.
+    Returns a dictionary of instance IDs and a tuple of (status, reason for
+    being in status if any).
 
     A None status means the instance is marked as "terminated"
     or "terminating".
sky/provision/aws/config.py CHANGED
@@ -19,6 +19,7 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
+from sky.clouds import aws as aws_cloud
 from sky.provision import common
 from sky.provision.aws import utils
 from sky.utils import annotations
@@ -103,6 +104,14 @@ def bootstrap_instances(
         security_group_ids = _configure_security_group(ec2, vpc_id,
                                                         expected_sg_name,
                                                         extended_ip_rules)
+        if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+            # Ensure the default security group is created. This is needed
+            # to enable us to use the default security group to quickly
+            # delete the cluster. If the default security group is not created,
+            # we will need to block on instance termination to delete the
+            # security group.
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
sky/provision/aws/instance.py CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import pool
 import re
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
 
 from sky import sky_logging
 from sky.adaptors import aws
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             to_start_count,
             associate_public_ip_address=(
                 not config.provider_config['use_internal_ips']))
+
         created_instances.extend(created_remaining_instances)
     created_instances.sort(key=lambda x: x.id)
 
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
 # stop() and terminate() for example already implicitly assume non-terminated.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     region = provider_config['region']
     ec2 = _default_ec2_resource(region)
@@ -608,12 +611,13 @@ def query_instances(
         'shutting-down': None,
         'terminated': None,
     }
-    statuses = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst in instances:
         status = status_map[inst.state['Name']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst.id] = status
+        statuses[inst.id] = (status, None)
     return statuses
 
 
@@ -681,19 +685,39 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-    instances_list = list(instances)
-    instances.terminate()
-    if (sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME or
-            not managed_by_skypilot):
-        # Using default AWS SG or user specified security group. We don't need
-        # to wait for the termination of the instances, as we do not need to
-        # delete the SG.
-        return
-    # If ports are specified, we need to delete the newly created Security
-    # Group. Here we wait for all instances to be terminated, since the
-    # Security Group dependent on them.
-    for instance in instances_list:
-        instance.wait_until_terminated()
+    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        # Case 1: The default SG is used, we don't need to ensure instance are
+        # terminated.
+        instances.terminate()
+    elif not managed_by_skypilot:
+        # Case 2: We are not managing the non-default sg. We don't need to
+        # ensure instances are terminated.
+        instances.terminate()
+    elif (managed_by_skypilot and default_sg is not None):
+        # Case 3: We are managing the non-default sg. The default SG exists
+        # so we can move the instances to the default SG and terminate them
+        # without blocking.
+
+        # Make this multithreaded: modify all instances' SGs in parallel.
+        def modify_instance_sg(instance):
+            instance.modify_attribute(Groups=[default_sg.id])
+            logger.debug(f'Instance {instance.id} modified to use default SG:'
+                         f'{default_sg.id} for quick deletion.')
+
+        with pool.ThreadPool() as thread_pool:
+            thread_pool.map(modify_instance_sg, instances)
+            thread_pool.close()
+            thread_pool.join()
+
+        instances.terminate()
+    else:
+        # Case 4: We are managing the non-default sg. The default SG does not
+        # exist. We must block on instance termination.
+        instances.terminate()
+        for instance in instances:
+            instance.wait_until_terminated()
+
     # TODO(suquark): Currently, the implementation of GCP and Azure will
     # wait util the cluster is fully terminated, while other clouds just
     # trigger the termination process (via http call) and then return.
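The terminate_instances change above decides, based on the security group configuration, whether termination can return immediately or must block so that the SkyPilot-managed security group can be deleted afterwards. A standalone restatement of that four-way decision, useful for reasoning about the cases; the function name, return strings, and the default-SG constant below are illustrative stand-ins, not SkyPilot APIs:

# Stand-in for aws_cloud.DEFAULT_SECURITY_GROUP_NAME (actual value not shown here).
DEFAULT_SG_NAME = 'default-sg'


def termination_plan(sg_name: str, managed_by_skypilot: bool,
                     default_sg_exists: bool) -> str:
    """Describe how terminate_instances should behave for a cluster's SG setup."""
    if sg_name == DEFAULT_SG_NAME:
        # Case 1: the default SG is in use; nothing to delete, terminate and return.
        return 'terminate-and-return'
    if not managed_by_skypilot:
        # Case 2: user-supplied SG; SkyPilot will not delete it, so no waiting.
        return 'terminate-and-return'
    if default_sg_exists:
        # Case 3: detach instances from the managed SG by moving them to the
        # default SG, then terminate without waiting.
        return 'reassign-to-default-sg-then-terminate'
    # Case 4: no default SG to fall back on; wait for termination so the
    # managed SG can be deleted afterwards.
    return 'terminate-and-wait'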
sky/provision/azure/instance.py CHANGED
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
 
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
 
     subscription_id = provider_config['subscription_id']
@@ -964,7 +966,8 @@ def query_instances(
     filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
     compute_client = azure.get_client('compute', subscription_id)
     nodes = _filter_instances(compute_client, resource_group, filters)
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
 
     def _fetch_and_map_status(node, resource_group: str) -> None:
         compute_client = azure.get_client('compute', subscription_id)
@@ -972,8 +975,8 @@ def query_instances(
 
         if status is None and non_terminated_only:
             return
-        statuses[node.name] = (None if status is None else
-                               status.to_cluster_status())
+        statuses[node.name] = ((None if status is None else
+                                status.to_cluster_status()), None)
 
     with pool.ThreadPool() as p:
         p.starmap(_fetch_and_map_status,
sky/provision/cudo/cudo_wrapper.py CHANGED
@@ -4,7 +4,7 @@ from typing import Dict
 
 from sky import sky_logging
 from sky.adaptors import cudo
-import sky.provision.cudo.cudo_utils as utils
+from sky.provision.cudo import cudo_utils as utils
 
 logger = sky_logging.init_logger(__name__)
 
sky/provision/cudo/instance.py CHANGED
@@ -1,7 +1,7 @@
 """Cudo Compute instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -191,11 +191,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -210,12 +212,13 @@ def query_instances(
         'done': status_lib.ClusterStatus.STOPPED,
         'poff': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['status']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses
 
 
sky/provision/do/instance.py CHANGED
@@ -1,7 +1,7 @@
 """DigitalOcean instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 import uuid
 
 from sky import sky_logging
@@ -242,11 +242,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     # terminated instances are not retrieved by the
     # API making `non_terminated_only` argument moot.
     del non_terminated_only
@@ -260,10 +262,11 @@ def query_instances(
         'active': status_lib.ClusterStatus.UP,
         'off': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for instance_meta in instances.values():
         status = status_map[instance_meta['status']]
-        statuses[instance_meta['name']] = status
+        statuses[instance_meta['name']] = (status, None)
     return statuses
 
 
sky/provision/fluidstack/instance.py CHANGED
@@ -1,7 +1,7 @@
 """FluidStack instance provisioning."""
 import os
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import authentication as auth
 from sky import exceptions
@@ -287,11 +287,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     instances = _filter_instances(cluster_name_on_cloud, None)
@@ -302,7 +304,8 @@ def query_instances(
         'failed': status_lib.ClusterStatus.INIT,
         'terminated': None,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         if inst['status'] not in status_map:
             with ux_utils.print_exception_no_traceback():
@@ -311,7 +314,7 @@ def query_instances(
         status = status_map.get(inst['status'], None)
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses
 
 