skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +102 -8
- sky/backends/cloud_vm_ray_backend.py +197 -31
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +60 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/core.py +5 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +15 -0
- sky/global_user_state.py +160 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +6 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +22 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +17 -2
- sky/provision/__init__.py +4 -2
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +41 -17
- sky/provision/azure/instance.py +7 -4
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +7 -4
- sky/provision/do/instance.py +7 -4
- sky/provision/fluidstack/instance.py +7 -4
- sky/provision/gcp/instance.py +7 -4
- sky/provision/hyperbolic/instance.py +7 -5
- sky/provision/kubernetes/instance.py +169 -6
- sky/provision/lambda_cloud/instance.py +7 -4
- sky/provision/nebius/instance.py +7 -4
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +7 -5
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +7 -4
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +7 -5
- sky/provision/vast/instance.py +7 -5
- sky/provision/vsphere/instance.py +7 -4
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +1 -1
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +58 -23
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/skypilot_config.py +4 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +9 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +39 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -15,13 +15,14 @@ following section for more details).

The scheduling logic limits #running jobs according to three limits:
1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
- once, based on the number of CPUs.
-
-
+ once, based on the number of CPUs. This the most compute-intensive part of
+ the job lifecycle, which is why we have an additional limit.
+ See sky/utils/controller_utils.py::_get_launch_parallelism.
2. The number of jobs that can be running at any given time, based on the amount
- of memory.
-
-
+ of memory. Since the job controller is doing very little once a job starts
+ (just checking its status periodically), the most significant resource it
+ consumes is memory.
+ See sky/utils/controller_utils.py::_get_job_parallelism.
3. The number of jobs that can be running in a pool at any given time, based on
the number of ready workers in the pool. (See _can_start_new_job.)

@@ -42,55 +43,27 @@ Nomenclature:

from argparse import ArgumentParser
import contextlib
- from functools import lru_cache
import os
import sys
import time
- import typing
from typing import Optional

import filelock

from sky import exceptions
from sky import sky_logging
- from sky.adaptors import common as adaptors_common
from sky.jobs import constants as managed_job_constants
from sky.jobs import state
from sky.serve import serve_utils
from sky.skylet import constants
from sky.utils import common_utils
+ from sky.utils import controller_utils
from sky.utils import subprocess_utils

- if typing.TYPE_CHECKING:
- import psutil
- else:
- psutil = adaptors_common.LazyImport('psutil')
-
logger = sky_logging.init_logger('sky.jobs.controller')

- # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
- # parallelism control or updating the schedule_state of any job.
- # Any code that takes this lock must conclude by calling
- # maybe_schedule_next_jobs.
- _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5

- # Based on testing, assume a running job uses 350MB memory.
- JOB_MEMORY_MB = 350
- # Past 2000 simultaneous jobs, we become unstable.
- # See https://github.com/skypilot-org/skypilot/issues/4649.
- MAX_JOB_LIMIT = 2000
- # Number of ongoing launches launches allowed per CPU.
- LAUNCHES_PER_CPU = 4
-
-
- @lru_cache(maxsize=1)
- def _get_lock_path() -> str:
- # TODO(tian): Per pool lock.
- path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
- os.makedirs(os.path.dirname(path), exist_ok=True)
- return path
-

def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
pool: Optional[str]) -> None:
@@ -163,7 +136,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
# parallelism control. If we cannot obtain the lock, exit immediately.
# The current lock holder is expected to launch any jobs it can before
# releasing the lock.
- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path(),
+ blocking=False):
while True:
maybe_next_job = state.get_waiting_job(pool)
if maybe_next_job is None:
@@ -184,7 +158,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
# an ALIVE_WAITING job, but we would be able to launch a WAITING
# job.
if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
- if not
+ if not (controller_utils.can_provision() or
+ actual_pool is not None):
# Can't schedule anything, break from scheduling loop.
break
elif current_state == state.ManagedJobScheduleState.WAITING:
@@ -234,7 +209,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,

The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
"""
- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
original_user_yaml_path,
env_file_path,
@@ -286,11 +261,11 @@ def scheduled_launch(job_id: int):
except exceptions.NoClusterLaunchedError:
# NoClusterLaunchedError is indicates that the job is in retry backoff.
# We should transition to ALIVE_BACKOFF instead of ALIVE.
- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
state.scheduler_set_alive_backoff(job_id)
raise
else:
- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
state.scheduler_set_alive(job_id)
finally:
maybe_schedule_next_jobs(pool)
@@ -310,56 +285,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
return
pool = state.get_pool_from_job_id(job_id)

- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
state.scheduler_set_done(job_id, idempotent)
maybe_schedule_next_jobs(pool)


def _set_alive_waiting(job_id: int) -> None:
"""Should use wait_until_launch_okay() to transition to this state."""
- with filelock.FileLock(
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
state.scheduler_set_alive_waiting(job_id)
pool = state.get_pool_from_job_id(job_id)
maybe_schedule_next_jobs(pool)


- def _get_job_parallelism() -> int:
- job_memory = JOB_MEMORY_MB * 1024 * 1024
-
- job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
- return max(job_limit, 1)
-
-
- def _get_launch_parallelism() -> int:
- cpus = os.cpu_count()
- return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
def _can_start_new_job(pool: Optional[str]) -> bool:
- launching_jobs = state.get_num_launching_jobs()
- alive_jobs = state.get_num_alive_jobs()
-
# Check basic resource limits
-
-
+ # Pool jobs don't need to provision resources, so we skip the check.
+ if not ((controller_utils.can_provision() or pool is not None) and
+ controller_utils.can_start_new_process()):
return False

- # Check if there are available
+ # Check if there are available workers in the pool
if pool is not None:
alive_jobs_in_pool = state.get_num_alive_jobs(pool)
- if alive_jobs_in_pool >= serve_utils.
- logger.debug(f'No
+ if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+ logger.debug(f'No READY workers available in pool {pool}')
return False

return True


- def _can_lauch_in_alive_job() -> bool:
- launching_jobs = state.get_num_launching_jobs()
- return launching_jobs < _get_launch_parallelism()
-
-
if __name__ == '__main__':
parser = ArgumentParser()
parser.add_argument('dag_yaml',
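The limits named in the module docstring used to be computed inline (see the removed _get_job_parallelism and _get_launch_parallelism above) and now sit behind sky/utils/controller_utils.py. The following is a minimal editorial sketch of that arithmetic, assuming the constants from the removed block (JOB_MEMORY_MB, MAX_JOB_LIMIT, LAUNCHES_PER_CPU) carry over unchanged; the actual controller_utils helpers may be implemented differently.

import os

import psutil

# Constants taken from the removed scheduler code; assumed unchanged after the move.
JOB_MEMORY_MB = 350       # assumed per-job memory footprint, based on testing
MAX_JOB_LIMIT = 2000      # stability cap (see skypilot issue #4649)
LAUNCHES_PER_CPU = 4      # concurrent launches allowed per CPU


def get_launch_parallelism() -> int:
    # Launching (STARTING/RECOVERING) is the CPU-heavy phase, so cap it by CPU count.
    cpus = os.cpu_count()
    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1


def get_job_parallelism() -> int:
    # A running job mostly polls status, so memory is the binding resource.
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    return max(min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT), 1)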
sky/jobs/server/core.py
CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
return local_to_controller_file_mounts


- def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
- num_jobs:
+ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+ num_jobs: int) -> Optional[List[int]]:
"""Submit the managed job locally if in consolidation mode.

In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
# Create local directory for the managed job.
pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
job_ids = []
+ pool = dag.pool
pool_hash = None
if pool is not None:
pool_hash = serve_state.get_service_hash(pool)
# Already checked in the sdk.
assert pool_hash is not None, f'Pool {pool} not found'
- for _ in range(num_jobs
+ for _ in range(num_jobs):
# TODO(tian): We should have a separate name for each job when
# submitting multiple jobs. Current blocker is that we are sharing
# the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
handle: Optional[backends.ResourceHandle]; handle to the controller VM.
None if dryrun.
"""
- if pool is not None and not managed_job_utils.is_consolidation_mode():
- with ux_utils.print_exception_no_traceback():
- raise ValueError('pool is only supported in consolidation mode.')
entrypoint = task
# using hasattr instead of isinstance to avoid importing sky
if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
controller=controller,
task_resources=sum([list(t.resources) for t in dag.tasks], []))

+ num_jobs = num_jobs if num_jobs is not None else 1
+ # We do this assignment after applying the admin policy, so that we don't
+ # need to serialize the pool name in the dag. The dag object will be
+ # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+ dag.pool = pool
consolidation_mode_job_ids = _maybe_submit_job_locally(
- prefix, dag,
+ prefix, dag, num_jobs)

# This is only needed for non-consolidation mode. For consolidation
# mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
controller_task._metadata = metadata

job_identity = ''
- if
- job_identity = f' (
+ if job_rank is not None:
+ job_identity = f' (rank: {job_rank})'
logger.info(f'{colorama.Fore.YELLOW}'
f'Launching managed job {dag.name!r}{job_identity} '
f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
backend.run_on_head(local_handle, run_script)
return consolidation_mode_job_id, local_handle

- if consolidation_mode_job_ids is None:
- return _submit_one()
if pool is None:
+ if consolidation_mode_job_ids is None:
+ return _submit_one()
assert len(consolidation_mode_job_ids) == 1
return _submit_one(consolidation_mode_job_ids[0])
+
ids = []
all_handle = None
- for job_rank
+ for job_rank in range(num_jobs):
+ job_id = (consolidation_mode_job_ids[job_rank]
+ if consolidation_mode_job_ids is not None else None)
jid, handle = _submit_one(job_id, job_rank)
assert jid is not None, (job_id, handle)
ids.append(jid)
@@ -547,6 +553,10 @@ def _maybe_restart_controller(
'controller'))
with skypilot_config.local_active_workspace_ctx(
skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+ global_user_state.add_cluster_event(
+ jobs_controller_type.value.cluster_name,
+ status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
+ global_user_state.ClusterEventType.STATUS_CHANGE)
handle = core.start(
cluster_name=jobs_controller_type.value.cluster_name)
sky/jobs/state.py
CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):

# === Status transition functions ===
@_init_db
- def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str
+ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+ pool: Optional[str], pool_hash: Optional[str]):
assert _SQLALCHEMY_ENGINE is not None
with orm.Session(_SQLALCHEMY_ENGINE) as session:
if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
name=name,
schedule_state=ManagedJobScheduleState.INACTIVE.value,
workspace=workspace,
- entrypoint=entrypoint
+ entrypoint=entrypoint,
+ pool=pool,
+ pool_hash=pool_hash,
+ )
session.execute(insert_stmt)
session.commit()
sky/jobs/utils.py
CHANGED
@@ -141,7 +141,7 @@ def _validate_consolidation_mode_config(
if global_user_state.get_cluster_from_name(controller_cn) is not None:
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
- f'{colorama.Fore.RED}Consolidation mode is '
+ f'{colorama.Fore.RED}Consolidation mode for jobs is '
f'enabled, but the controller cluster '
f'{controller_cn} is still running. Please '
'terminate the controller cluster first.'
@@ -179,7 +179,11 @@ def _validate_consolidation_mode_config(
def is_consolidation_mode() -> bool:
consolidation_mode = skypilot_config.get_nested(
('jobs', 'controller', 'consolidation_mode'), default_value=False)
-
+ # We should only do this check on API server, as the controller will not
+ # have related config and will always seemingly disabled for consolidation
+ # mode. Check #6611 for more details.
+ if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+ _validate_consolidation_mode_config(consolidation_mode)
return consolidation_mode


@@ -333,6 +337,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
if handle is not None:
try:
if pool is None:
+ global_user_state.add_cluster_event(
+ cluster_name, None, 'Cluster was cleaned up.',
+ global_user_state.ClusterEventType.STATUS_CHANGE)
terminate_cluster(cluster_name)
except Exception as e: # pylint: disable=broad-except
error_msg = (
@@ -1683,6 +1690,7 @@ class ManagedJobCodeGen:
def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
workspace: str, entrypoint: str) -> str:
dag_name = managed_job_dag.name
+ pool = managed_job_dag.pool
# Add the managed job to queue table.
code = textwrap.dedent(f"""\
set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1690,6 +1698,13 @@ class ManagedJobCodeGen:
set_job_info_kwargs = {{}}
if managed_job_version >= 5:
set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+ if managed_job_version >= 8:
+ from sky.serve import serve_state
+ pool_hash = None
+ if {pool!r} != None:
+ pool_hash = serve_state.get_service_hash({pool!r})
+ set_job_info_kwargs['pool'] = {pool!r}
+ set_job_info_kwargs['pool_hash'] = pool_hash
managed_job_state.set_job_info(
{job_id}, {dag_name!r}, **set_job_info_kwargs)
""")
sky/provision/__init__.py
CHANGED
@@ -73,13 +73,15 @@ def _route_to_cloud_impl(func):
@_route_to_cloud_impl
def query_instances(
provider_name: str,
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""Query instances.

- Returns a dictionary of instance IDs and status
+ Returns a dictionary of instance IDs and a tuple of (status, reason for
+ being in status if any).

A None status means the instance is marked as "terminated"
or "terminating".
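For callers, the signature change above means each instance ID now maps to a (status, reason) pair instead of a bare status. A hypothetical consumer-side sketch follows; the summarize helper is illustrative only and not part of SkyPilot.

def summarize(statuses):
    # statuses: Dict[instance_id, (ClusterStatus or None, Optional[str] reason)]
    for instance_id, (status, reason) in statuses.items():
        if status is None:
            # None still means the instance is terminated or terminating.
            print(f'{instance_id}: terminated/terminating')
        else:
            note = f' ({reason})' if reason else ''
            print(f'{instance_id}: {status}{note}')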
sky/provision/aws/config.py
CHANGED
@@ -19,6 +19,7 @@ import colorama
from sky import exceptions
from sky import sky_logging
from sky.adaptors import aws
+ from sky.clouds import aws as aws_cloud
from sky.provision import common
from sky.provision.aws import utils
from sky.utils import annotations
@@ -103,6 +104,14 @@ def bootstrap_instances(
security_group_ids = _configure_security_group(ec2, vpc_id,
expected_sg_name,
extended_ip_rules)
+ if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+ # Ensure the default security group is created. This is needed
+ # to enable us to use the default security group to quickly
+ # delete the cluster. If the default security group is not created,
+ # we will need to block on instance termination to delete the
+ # security group.
+ _configure_security_group(ec2, vpc_id,
+ aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
end_time = time.time()
elapsed = end_time - start_time
logger.info(
sky/provision/aws/instance.py
CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import pool
import re
import time
import typing
- from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar

from sky import sky_logging
from sky.adaptors import aws
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
to_start_count,
associate_public_ip_address=(
not config.provider_config['use_internal_ips']))
+
created_instances.extend(created_remaining_instances)
created_instances.sort(key=lambda x: x.id)

@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
# stop() and terminate() for example already implicitly assume non-terminated.
@common_utils.retry
def query_instances(
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""See sky/provision/__init__.py"""
+ del cluster_name # unused
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
region = provider_config['region']
ec2 = _default_ec2_resource(region)
@@ -608,12 +611,13 @@ def query_instances(
'shutting-down': None,
'terminated': None,
}
- statuses
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
for inst in instances:
status = status_map[inst.state['Name']]
if non_terminated_only and status is None:
continue
- statuses[inst.id] = status
+ statuses[inst.id] = (status, None)
return statuses


@@ -681,19 +685,39 @@ def terminate_instances(
filters,
included_instances=None,
excluded_instances=None)
-
-
-
-
-
-
- #
-
-
-
-
-
-
+ default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+ if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+ # Case 1: The default SG is used, we don't need to ensure instance are
+ # terminated.
+ instances.terminate()
+ elif not managed_by_skypilot:
+ # Case 2: We are not managing the non-default sg. We don't need to
+ # ensure instances are terminated.
+ instances.terminate()
+ elif (managed_by_skypilot and default_sg is not None):
+ # Case 3: We are managing the non-default sg. The default SG exists
+ # so we can move the instances to the default SG and terminate them
+ # without blocking.
+
+ # Make this multithreaded: modify all instances' SGs in parallel.
+ def modify_instance_sg(instance):
+ instance.modify_attribute(Groups=[default_sg.id])
+ logger.debug(f'Instance {instance.id} modified to use default SG:'
+ f'{default_sg.id} for quick deletion.')
+
+ with pool.ThreadPool() as thread_pool:
+ thread_pool.map(modify_instance_sg, instances)
+ thread_pool.close()
+ thread_pool.join()
+
+ instances.terminate()
+ else:
+ # Case 4: We are managing the non-default sg. The default SG does not
+ # exist. We must block on instance termination.
+ instances.terminate()
+ for instance in instances:
+ instance.wait_until_terminated()
+
# TODO(suquark): Currently, the implementation of GCP and Azure will
# wait util the cluster is fully terminated, while other clouds just
# trigger the termination process (via http call) and then return.
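The four cases in the terminate_instances hunk above reduce to one question: does termination have to be waited on before the security group can be deleted? Below is a condensed, editorial sketch of that decision, assuming the same inputs the hunk works with (the security group name, whether SkyPilot manages it, and whether the default security group exists); the real code additionally rewrites instance security groups in a thread pool before terminating.

def needs_blocking_termination(sg_name: str, default_sg_name: str,
                               managed_by_skypilot: bool,
                               default_sg_exists: bool) -> bool:
    # Cases 1 and 2: the default SG is in use, or the SG is not SkyPilot-managed,
    # so nothing waits on the instances; terminate and return immediately.
    if sg_name == default_sg_name or not managed_by_skypilot:
        return False
    # Case 3: managed non-default SG and the default SG exists; instances are
    # first moved onto the default SG, so no blocking wait is needed.
    if default_sg_exists:
        return False
    # Case 4: managed non-default SG with no default SG; must wait until the
    # instances are terminated before the SG can be deleted.
    return True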
sky/provision/azure/instance.py
CHANGED
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,

@common_utils.retry
def query_instances(
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""See sky/provision/__init__.py"""
+ del cluster_name # unused
assert provider_config is not None, cluster_name_on_cloud

subscription_id = provider_config['subscription_id']
@@ -964,7 +966,8 @@ def query_instances(
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
compute_client = azure.get_client('compute', subscription_id)
nodes = _filter_instances(compute_client, resource_group, filters)
- statuses: Dict[str, Optional[status_lib.ClusterStatus]
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}

def _fetch_and_map_status(node, resource_group: str) -> None:
compute_client = azure.get_client('compute', subscription_id)
@@ -972,8 +975,8 @@ def query_instances(

if status is None and non_terminated_only:
return
- statuses[node.name] = (None if status is None else
-
+ statuses[node.name] = ((None if status is None else
+ status.to_cluster_status()), None)

with pool.ThreadPool() as p:
p.starmap(_fetch_and_map_status,
sky/provision/cudo/instance.py
CHANGED
@@ -1,7 +1,7 @@
"""Cudo Compute instance provisioning."""

import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

from sky import sky_logging
from sky.provision import common
@@ -191,11 +191,13 @@ def get_cluster_info(


def query_instances(
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""See sky/provision/__init__.py"""
+ del cluster_name # unused
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
instances = _filter_instances(cluster_name_on_cloud, None)

@@ -210,12 +212,13 @@ def query_instances(
'done': status_lib.ClusterStatus.STOPPED,
'poff': status_lib.ClusterStatus.STOPPED,
}
- statuses: Dict[str, Optional[status_lib.ClusterStatus]
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
for inst_id, inst in instances.items():
status = status_map[inst['status']]
if non_terminated_only and status is None:
continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
return statuses
sky/provision/do/instance.py
CHANGED
@@ -1,7 +1,7 @@
"""DigitalOcean instance provisioning."""

import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple
import uuid

from sky import sky_logging
@@ -242,11 +242,13 @@ def get_cluster_info(


def query_instances(
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""See sky/provision/__init__.py"""
+ del cluster_name # unused
# terminated instances are not retrieved by the
# API making `non_terminated_only` argument moot.
del non_terminated_only
@@ -260,10 +262,11 @@ def query_instances(
'active': status_lib.ClusterStatus.UP,
'off': status_lib.ClusterStatus.STOPPED,
}
- statuses: Dict[str, Optional[status_lib.ClusterStatus]
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
for instance_meta in instances.values():
status = status_map[instance_meta['status']]
- statuses[instance_meta['name']] = status
+ statuses[instance_meta['name']] = (status, None)
return statuses
sky/provision/fluidstack/instance.py
CHANGED
@@ -1,7 +1,7 @@
"""FluidStack instance provisioning."""
import os
import time
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple

from sky import authentication as auth
from sky import exceptions
@@ -287,11 +287,13 @@ def get_cluster_info(


def query_instances(
+ cluster_name: str,
cluster_name_on_cloud: str,
provider_config: Optional[Dict[str, Any]] = None,
non_terminated_only: bool = True,
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
"""See sky/provision/__init__.py"""
+ del cluster_name # unused
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
instances = _filter_instances(cluster_name_on_cloud, None)
instances = _filter_instances(cluster_name_on_cloud, None)
@@ -302,7 +304,8 @@ def query_instances(
'failed': status_lib.ClusterStatus.INIT,
'terminated': None,
}
- statuses: Dict[str, Optional[status_lib.ClusterStatus]
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+ Optional[str]]] = {}
for inst_id, inst in instances.items():
if inst['status'] not in status_map:
with ux_utils.print_exception_no_traceback():
@@ -311,7 +314,7 @@ def query_instances(
status = status_map.get(inst['status'], None)
if non_terminated_only and status is None:
continue
- statuses[inst_id] = status
+ statuses[inst_id] = (status, None)
return statuses