skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -42,145 +42,213 @@ Nomenclature:
 """
 
 from argparse import ArgumentParser
+import asyncio
 import contextlib
 import os
+import pathlib
+import shutil
 import sys
-import
-from typing import
+import typing
+from typing import Set
+import uuid
 
 import filelock
 
-from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
-from sky.
+from sky.jobs import utils as managed_job_utils
+from sky.server import config as server_config
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
+if typing.TYPE_CHECKING:
+    import logging
+
+    import psutil
+else:
+    psutil = adaptors_common.LazyImport('psutil')
+
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
 
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
 
- (16 removed lines not shown)
+# Based on testing, each worker takes around 200-300MB memory. Keeping it
+# higher to be safe.
+JOB_MEMORY_MB = 400
+# Number of ongoing launches allowed per worker. Can probably be
+# increased a bit to around 16 but keeping it lower just to be safe.
+LAUNCHES_PER_WORKER = 8
+# This can probably be increased to around 300-400 but keeping it lower
+# just to be safe.
+JOBS_PER_WORKER = 200
+
+# keep 1GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+# Maximum values for above constants. There will start to be lagging issues
+# at these numbers already.
+# JOB_MEMORY_MB = 200
+# LAUNCHES_PER_WORKER = 16
+# JOBS_PER_WORKER = 400
+
+
+def get_number_of_controllers() -> int:
+    """Returns the number of controllers that should be running.
+
+    This is the number of controllers that should be running to maximize
+    resource utilization.
 
+    In consolidation mode, we use the existing API server so our resource
+    requirements are just for the job controllers. We try taking up as
+    much memory as possible left over from the API server.
+
+    In non-consolidation mode, we have to take into account the memory of the
+    API server workers. We limit to only 8 launches per worker, so our logic is
+    each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
+    leave some leftover room for ssh codegen and ray status overhead.
+    """
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    total_memory_mb = common_utils.get_mem_size_gb() * 1024
+    if consolidation_mode:
+        config = server_config.compute_server_config(deploy=True, quiet=True)
+
+        used = 0.0
+        used += MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
+        used += (config.long_worker_config.garanteed_parallelism +
+                 config.long_worker_config.burstable_parallelism) * \
+                server_config.LONG_WORKER_MEM_GB * 1024
+        used += (config.short_worker_config.garanteed_parallelism +
+                 config.short_worker_config.burstable_parallelism) * \
+                server_config.SHORT_WORKER_MEM_GB * 1024
+
+        return max(1, int((total_memory_mb - used) // JOB_MEMORY_MB))
+    else:
+        return max(
+            1,
+            int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) * 1024
+                 + JOB_MEMORY_MB)))
+
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-    log_path = os.path.join(logs_dir, f'{
+    log_path = os.path.join(logs_dir, f'controller_{uuid.uuid4()}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          'sky.jobs.controller')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
 
     pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
- (40 removed lines not shown)
-    another jobs in pool 1, but the job in pool 2 will still be waiting. When
-    the `pool` argument is None, it schedules a job regardless of the pool.
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(str(pid) + '\n')
+
+
+def get_alive_controllers() -> typing.Optional[int]:
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # if the file doesn't exist, it means the controller server is not
+        # running, so we return 0
+        return 0
+
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            pids = f.read().split('\n')[:-1]
+    except OSError:
+        # if the file is corrupted, or any issues with reading it, we just
+        # return None to be safe and not over start
+        return None
+
+    alive = 0
+    for pid in pids:
+        try:
+            # TODO(luca) there is a chance that the process that is alive is
+            # not the same controller process. a better solution is to also
+            # include a random UUID with each controller and store that in the
+            # db as well/in the command that spawns it.
+            if subprocess_utils.is_process_alive(int(pid.strip())):
+                alive += 1
+        except ValueError:
+            # if the pid is not an integer, let's assume it's alive to not
+            # over start new processes
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
     try:
- (42 removed lines not shown)
-        job_id = maybe_next_job['job_id']
-        dag_yaml_path = maybe_next_job['dag_yaml_path']
-        env_file_path = maybe_next_job['env_file_path']
-
-        _start_controller(job_id, dag_yaml_path, env_file_path,
-                          actual_pool)
-
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                        else:
+                            shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = get_number_of_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
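
To sanity-check the sizing arithmetic in get_number_of_controllers() above, here is a minimal sketch of the non-consolidation branch. LONG_WORKER_MEM_GB lives in sky/server/config.py and is not shown in this diff, so the 0.5 GB value below is an assumption for illustration only, as is the 16 GB machine size.

# Back-of-the-envelope for the non-consolidation branch of
# get_number_of_controllers(). LONG_WORKER_MEM_GB is assumed (0.5 GB);
# the real constant is defined in sky/server/config.py, not in this diff.
JOB_MEMORY_MB = 400
LAUNCHES_PER_WORKER = 8
MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
LONG_WORKER_MEM_GB = 0.5  # assumption for illustration

total_memory_mb = 16 * 1024  # hypothetical 16 GB controller machine
per_controller_mb = (LAUNCHES_PER_WORKER * LONG_WORKER_MEM_GB * 1024 +
                     JOB_MEMORY_MB)  # 4096 + 400 = 4496
num_controllers = max(
    1,
    int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
        per_controller_mb))
print(num_controllers)  # (16384 - 2048) / 4496 -> 3

Each controller is budgeted for its own memory plus its LAUNCHES_PER_WORKER concurrent launches, which is why the divisor bundles both terms.
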
@@ -188,30 +256,46 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available.
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
- (14 removed lines not shown)
+    controller_pid = state.get_job_controller_pid(job_id)
+    if controller_pid is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_pid, job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    state.scheduler_set_waiting(job_id, dag_yaml_path,
+                                original_user_yaml_path, env_file_path,
+                                common_utils.get_user_hash(), priority)
+    if state.get_ha_recovery_script(job_id) is None:
+        # the run command is just the command that called scheduler
+        run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+               f'--job-id {job_id} --env-file {env_file_path} '
+               f'--user-yaml-path {original_user_yaml_path} '
+               f'--priority {priority}')
+        state.set_ha_recovery_script(job_id, run)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+        job_id: int,
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+        job_logger: 'logging.Logger',
+):
     """Launch as part of an ongoing job.
 
     A newly started job will already be LAUNCHING, and this will immediately
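
Both get_alive_controllers() above and the controller_process_alive() guard in submit_job() ultimately reduce to "is this pid still a live process?", which is why the module now lazy-imports psutil. is_process_alive() is SkyPilot's own helper and its body is not part of this diff; the sketch below is an assumed approximation of such a check, not the shipped implementation.

import psutil

def is_process_alive_sketch(pid: int) -> bool:
    # Treat a pid as alive unless the process no longer exists or has
    # become a zombie. This approximates, but is not, SkyPilot's
    # subprocess_utils.is_process_alive().
    try:
        return psutil.Process(pid).status() != psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        return False

As the TODO in get_alive_controllers() notes, pid reuse means a live pid is not necessarily the same controller process, hence the idea of tagging each controller with a UUID.
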
@@ -240,30 +324,34 @@ def scheduled_launch(job_id: int):
         yield
         return
 
-
-
-    if (state.get_job_schedule_state(job_id) !=
-            state.ManagedJobScheduleState.LAUNCHING):
-        # Since we aren't LAUNCHING, we need to wait to be scheduled.
-        _set_alive_waiting(job_id)
+    assert starting_lock == starting_signal._lock, (  # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')
 
-
-
-
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < LAUNCHES_PER_WORKER:
+                break
+            job_logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()
+
+    job_logger.info(f'Starting job {job_id}')
+
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
 
     try:
         yield
-    except
-
-        # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive_backoff(job_id)
-        raise
+    except Exception as e:
+        raise e
     else:
-
-        state.scheduler_set_alive(job_id)
+        await state.scheduler_set_alive_async(job_id)
     finally:
-
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
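
The rewritten scheduled_launch() caps concurrent launches per worker with an asyncio.Condition over a shared `starting` set rather than a plain semaphore. The self-contained sketch below reproduces that gating pattern with simplified names; unlike the real code, it uses the Condition's own lock instead of a separately passed starting_lock.

import asyncio

LAUNCHES_PER_WORKER = 8  # mirrors the constant introduced above

async def gated_launch(job_id: int, starting: set,
                       cond: asyncio.Condition) -> None:
    # Block until there is a free launch slot, then claim it.
    async with cond:
        while len(starting) >= LAUNCHES_PER_WORKER:
            await cond.wait()  # releases the lock while waiting
        starting.add(job_id)
    try:
        await asyncio.sleep(0.01)  # stand-in for the real launch work
    finally:
        # Release the slot and wake one waiter.
        async with cond:
            starting.remove(job_id)
            cond.notify()

async def main() -> None:
    starting: set = set()
    cond = asyncio.Condition()
    await asyncio.gather(*(gated_launch(i, starting, cond)
                           for i in range(20)))

asyncio.run(main())

A Condition fits better than a Semaphore here because the real code also inspects and mutates the shared `starting` set under the same lock.
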
@@ -274,38 +362,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
 
-
-    state.scheduler_set_done(job_id, idempotent)
-    maybe_schedule_next_jobs()
-
+    state.scheduler_set_done(job_id, idempotent)
 
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-    maybe_schedule_next_jobs()
 
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
 
-
-    # Check basic resource limits
-    # Pool jobs don't need to provision resources, so we skip the check.
-    if not ((controller_utils.can_provision() or pool is not None) and
-            controller_utils.can_start_new_process()):
-        return False
-
-    # Check if there are available workers in the pool
-    if pool is not None:
-        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
-            logger.debug(f'No READY workers available in pool {pool}')
-            return False
-
-    return True
+    await state.scheduler_set_done_async(job_id, idempotent)
 
 
 if __name__ == '__main__':
@@ -337,4 +410,4 @@ if __name__ == '__main__':
                         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority
+               args.priority)
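
The --job-id/--env-file/--user-yaml-path/--priority flags parsed in the __main__ block above are exactly what submit_job() writes into the HA recovery script, so a failed-over controller can replay the original submission. A sketch of that round trip, with hypothetical paths:

import sys

# Hypothetical values; in the real flow these come from `sky jobs launch`.
job_id, priority = 42, 500
dag_yaml_path = '/tmp/example-dag.yaml'
env_file_path = '/tmp/example.env'
original_user_yaml_path = '/tmp/example-user.yaml'

# Mirrors the `run` string stored via state.set_ha_recovery_script().
run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
       f'--job-id {job_id} --env-file {env_file_path} '
       f'--user-yaml-path {original_user_yaml_path} '
       f'--priority {priority}')
print(run)  # replaying this command re-enters submit_job()
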
sky/jobs/server/core.py
CHANGED
@@ -1,9 +1,11 @@
 """SDK functions for managed jobs."""
+import ipaddress
 import os
 import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -188,6 +190,7 @@ def launch(
 
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -202,6 +205,21 @@ def launch(
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()
 
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
     user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
         dag, use_user_specified_yaml=True)
 
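
One sharp edge worth noting in the new check: ipaddress.ip_address() raises ValueError when its argument is not an IP literal, so a db URL with a DNS hostname (say, db.internal) or with no hostname at all would make this branch raise rather than fall through. A defensive sketch of the same loopback test, as an illustration rather than the shipped code:

import ipaddress
from urllib import parse as urlparse

def is_local_db(db_url: str) -> bool:
    # Sketch of a defensive variant of the check above: the shipped code
    # calls ipaddress.ip_address() directly, which raises ValueError for
    # non-IP hostnames such as 'db.internal'.
    hostname = urlparse.urlparse(db_url).hostname
    if hostname is None:
        return False
    if hostname == 'localhost':
        return True
    try:
        return ipaddress.ip_address(hostname).is_loopback
    except ValueError:  # not an IP literal, e.g. a DNS name
        return False

assert is_local_db('postgresql://user@127.0.0.1:5432/sky')
assert not is_local_db('postgresql://user@db.internal:5432/sky')
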
@@ -424,10 +442,8 @@ def launch(
     ]
     run_script = '\n'.join(env_cmds + [run_script])
     # Dump script for high availability recovery.
-
-
-        managed_job_state.set_ha_recovery_script(
-            consolidation_mode_job_id, run_script)
+    managed_job_state.set_ha_recovery_script(
+        consolidation_mode_job_id, run_script)
     backend.run_on_head(local_handle, run_script)
     return consolidation_mode_job_id, local_handle
 
sky/jobs/server/utils.py
CHANGED
@@ -11,7 +11,6 @@ logger = sky_logging.init_logger(__name__)
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
-
     Raises:
         ValueError: If there's a version mismatch and non-terminal jobs exist.
         sky.exceptions.ClusterNotUpError: If the controller is not accessible.
@@ -59,7 +58,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     job_table_payload = output_parts[1]
 
     # Process locally: check version match and filter non-terminal jobs
-    version_matches = controller_version == local_version
+    version_matches = (controller_version == local_version or
+                       int(controller_version) > 17)
 
     # Load and filter jobs locally using existing method
     jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
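
The relaxed check means any controller reporting a numeric version above 17 is treated as compatible even when it differs from the local version. A sketch of the semantics; note that controller_version is a string here, and int() will raise ValueError for non-numeric version strings, just as in the shipped check:

def version_matches(controller_version: str, local_version: str) -> bool:
    # Exact match, or any controller version newer than 17, passes.
    return (controller_version == local_version or
            int(controller_version) > 17)

assert version_matches('18', '20')   # newer-than-17 controllers pass
assert version_matches('17', '17')   # exact match still passes
assert not version_matches('16', '17')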