skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs.
-
-
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory.
-
-
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -120,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs(
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -163,9 +136,10 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
-            maybe_next_job = state.get_waiting_job(
+            maybe_next_job = state.get_waiting_job()
             if maybe_next_job is None:
                 # Nothing left to start, break from scheduling loop
                 break
@@ -184,21 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
            # job.
             if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not
+                if not controller_utils.can_provision():
                     # Can't schedule anything, break from scheduling loop.
                     break
             elif current_state == state.ManagedJobScheduleState.WAITING:
                 if not _can_start_new_job(actual_pool):
-                    # If there is no job can be scheduled in the pool, we
-                    # try to schedule another job regardless of the pool.
-                    # This is to avoid the case where the pool is scaled
-                    # down at the same time as a job is done. In this case,
-                    # we won't have any job to schedule in the pool, but
-                    # other jobs in other pool (or no pool) can still be
-                    # scheduled.
-                    if pool is not None:
-                        pool = None
-                        continue
                     # Can't schedule anything, break from scheduling loop.
                     break
 
@@ -234,7 +198,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -243,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
@@ -268,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return
 
     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -279,21 +250,20 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
         raise
     else:
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -308,58 +278,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
-        maybe_schedule_next_jobs(pool)
-
-
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
+        maybe_schedule_next_jobs()
 
 
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
     # Check basic resource limits
-
-
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
        return False
 
-    # Check if there are available
+    # Check if there are available workers in the pool
     if pool is not None:
         alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.
-            logger.debug(f'No
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
            return False
 
     return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
     parser.add_argument('dag_yaml',
sky/jobs/server/core.py
CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
-                              num_jobs:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag,
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
         controller_task._metadata = metadata
 
         job_identity = ''
-        if
-            job_identity = f' (
+        if job_rank is not None:
+            job_identity = f' (rank: {job_rank})'
         logger.info(f'{colorama.Fore.YELLOW}'
                     f'Launching managed job {dag.name!r}{job_identity} '
                     f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
         backend.run_on_head(local_handle, run_script)
         return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
@@ -491,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -507,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -562,10 +576,18 @@ def _maybe_restart_controller(
 
 
 @usage_lib.entrypoint
-def queue(
-
-
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
@@ -595,6 +617,17 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -603,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -616,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
 
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:
 
         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -630,7 +684,6 @@ def queue(refresh: bool,
 
         jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -649,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)
 
 
 @usage_lib.entrypoint
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
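
The call sites above (core.py and utils.py) now unpack a three-element result from managed_job_utils.load_managed_job_queue(): (jobs, total, result_type). The sketch below shows how a caller can branch on the result type; only ManagedJobQueueResultType.DICT is confirmed by this diff, while the LEGACY_LIST member name and the helper itself are assumptions for illustration.

import enum
from typing import Any, Callable, Dict, List, Tuple


class ManagedJobQueueResultType(enum.Enum):
    DICT = 'dict'          # New controllers: server-side filtering done.
    LEGACY_LIST = 'list'   # Assumed: old controllers, filter client-side.


def consume_queue_result(
    jobs: List[Dict[str, Any]],
    total: int,
    result_type: ManagedJobQueueResultType,
    client_side_filter: Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]],
) -> Tuple[List[Dict[str, Any]], int]:
    if result_type == ManagedJobQueueResultType.DICT:
        # Jobs and total are already scoped (user/workspace/page) on the
        # controller; return them as-is.
        return jobs, total
    # Backward-compatibility path (to be removed after 0.12.0 per the TODOs
    # in the diff): apply the same filters locally, as queue() does.
    filtered = client_side_filter(jobs)
    return filtered, len(filtered)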
sky/jobs/state.py
CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 name=name,
                 schedule_state=ManagedJobScheduleState.INACTIVE.value,
                 workspace=workspace,
-                entrypoint=entrypoint
+                entrypoint=entrypoint,
+                pool=pool,
+                pool_hash=pool_hash,
+            )
            session.execute(insert_stmt)
            session.commit()
 
@@ -1524,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,
 
 
 @_init_db
-def get_waiting_job(
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.
 
     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1555,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,