skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.


Files changed (67)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/requests/serializers/encoders.py +1 -1
  48. sky/server/server.py +8 -1
  49. sky/setup_files/dependencies.py +4 -2
  50. sky/skylet/attempt_skylet.py +1 -0
  51. sky/skylet/constants.py +3 -1
  52. sky/skylet/events.py +2 -10
  53. sky/utils/command_runner.pyi +3 -3
  54. sky/utils/common_utils.py +11 -1
  55. sky/utils/controller_utils.py +5 -0
  56. sky/utils/db/db_utils.py +31 -2
  57. sky/utils/rich_utils.py +3 -1
  58. sky/utils/subprocess_utils.py +9 -0
  59. sky/volumes/volume.py +2 -0
  60. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
  61. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
  62. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  63. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  65. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -42,145 +42,213 @@ Nomenclature:
 """
 
 from argparse import ArgumentParser
+import asyncio
 import contextlib
 import os
+import pathlib
+import shutil
 import sys
-import time
-from typing import Optional
+import typing
+from typing import Set
+import uuid
 
 import filelock
 
-from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
-from sky.serve import serve_utils
+from sky.jobs import utils as managed_job_utils
+from sky.server import config as server_config
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
+if typing.TYPE_CHECKING:
+    import logging
+
+    import psutil
+else:
+    psutil = adaptors_common.LazyImport('psutil')
+
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
 
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
 
-def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
-                      pool: Optional[str]) -> None:
-    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
-    source_environment_cmd = (f'source {env_file_path};'
-                              if env_file_path else '')
-    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
-    run_controller_cmd = (
-        f'{sys.executable} -u -m sky.jobs.controller '
-        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
-
-    # If the command line here is changed, please also update
-    # utils._controller_process_alive. The substring `--job-id X`
-    # should be in the command.
-    run_cmd = (f'{activate_python_env_cmd}'
-               f'{source_environment_cmd}'
-               f'{run_controller_cmd}')
+# Based on testing, each worker takes around 200-300MB memory. Keeping it
+# higher to be safe.
+JOB_MEMORY_MB = 400
+# Number of ongoing launches launches allowed per worker. Can probably be
+# increased a bit to around 16 but keeping it lower to just to be safe
+LAUNCHES_PER_WORKER = 8
+# this can probably be increased to around 300-400 but keeping it lower to just
+# to be safe
+JOBS_PER_WORKER = 200
+
+# keep 1GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+# Maximum values for above constants. There will start to be lagging issues
+# at these numbers already.
+# JOB_MEMORY_MB = 200
+# LAUNCHES_PER_WORKER = 16
+# JOBS_PER_WORKER = 400
+
+
+def get_number_of_controllers() -> int:
+    """Returns the number of controllers that should be running.
+
+    This is the number of controllers that should be running to maximize
+    resource utilization.
 
+    In consolidation mode, we use the existing API server so our resource
+    requirements are just for the job controllers. We try taking up as much
+    much memory as possible left over from the API server.
+
+    In non-consolidation mode, we have to take into account the memory of the
+    API server workers. We limit to only 8 launches per worker, so our logic is
+    each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
+    leave some leftover room for ssh codegen and ray status overhead.
+    """
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    total_memory_mb = common_utils.get_mem_size_gb() * 1024
+    if consolidation_mode:
+        config = server_config.compute_server_config(deploy=True, quiet=True)
+
+        used = 0.0
+        used += MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
+        used += (config.long_worker_config.garanteed_parallelism +
+                 config.long_worker_config.burstable_parallelism) * \
+            server_config.LONG_WORKER_MEM_GB * 1024
+        used += (config.short_worker_config.garanteed_parallelism +
+                 config.short_worker_config.burstable_parallelism) * \
+            server_config.SHORT_WORKER_MEM_GB * 1024
+
+        return max(1, int((total_memory_mb - used) // JOB_MEMORY_MB))
+    else:
+        return max(
+            1,
+            int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) * 1024
+                 + JOB_MEMORY_MB)))
+
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
     logs_dir = os.path.expanduser(
         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
     os.makedirs(logs_dir, exist_ok=True)
-    log_path = os.path.join(logs_dir, f'{job_id}.log')
+    log_path = os.path.join(logs_dir, f'controller_{uuid.uuid4()}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          'sky.jobs.controller')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
 
     pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
-    state.set_job_controller_pid(job_id, pid)
-
-    logger.debug(f'Job {job_id} started with pid {pid}')
-
-
-def maybe_schedule_next_jobs() -> None:
-    """Determine if any managed jobs can be scheduled, and if so, schedule them.
-
-    Here, "schedule" means to select job that is waiting, and allow it to
-    proceed. It does NOT mean to submit a job to the scheduler.
-
-    For newly submitted jobs, scheduling means updating the state of the jobs,
-    and starting the job controller process. For jobs that are already alive but
-    are waiting to launch a new task or recover, just update the state of the
-    job to indicate that the launch can proceed.
-
-    This function transitions jobs into LAUNCHING on a best-effort basis. That
-    is, if we can start any jobs, we will, but if not, we will exit (almost)
-    immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
-    be started now (either because the lock is held, or because there are not
-    enough resources), another call to this function will be made whenever that
-    situation is resolved. (If the lock is held, the lock holder should start
-    the jobs. If there aren't enough resources, the next controller to exit and
-    free up resources should start the jobs.)
-
-    If this function obtains the lock, it will launch as many jobs as possible
-    before releasing the lock. This is what allows other calls to exit
-    immediately if the lock is held, while ensuring that all jobs are started as
-    soon as possible.
-
-    This uses subprocess_utils.launch_new_process_tree() to start the controller
-    processes, which should be safe to call from pretty much any code running on
-    the jobs controller instance. New job controller processes will be detached
-    from the current process and there will not be a parent/child relationship.
-    See launch_new_process_tree for more.
-
-    After adding the pool support, this function will be called in a per-pool
-    basis. We employ resources limitation for each pool given the number of
-    ready workers in the pool. Each pool will have its own scheduler queue,
-    indicating by the argument `pool`. Finished job in pool 1 will only trigger
-    another jobs in pool 1, but the job in pool 2 will still be waiting. When
-    the `pool` argument is None, it schedules a job regardless of the pool.
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(str(pid) + '\n')
+
+
+def get_alive_controllers() -> typing.Optional[int]:
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # if the file doesn't exist, it means the controller server is not
+        # running, so we return 0
+        return 0
+
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            pids = f.read().split('\n')[:-1]
+    except OSError:
+        # if the file is corrupted, or any issues with reading it, we just
+        # return None to be safe and not over start
+        return None
+
+    alive = 0
+    for pid in pids:
+        try:
+            # TODO(luca) there is a chance that the process that is alive is
+            # not the same controller process. a better solution is to also
+            # include a random UUID with each controller and store that in the
+            # db as well/in the command that spawns it.
+            if subprocess_utils.is_process_alive(int(pid.strip())):
+                alive += 1
+        except ValueError:
+            # if the pid is not an integer, let's assume it's alive to not
+            # over start new processes
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
     try:
-        # We must use a global lock rather than a per-job lock to ensure correct
-        # parallelism control. If we cannot obtain the lock, exit immediately.
-        # The current lock holder is expected to launch any jobs it can before
-        # releasing the lock.
-        with filelock.FileLock(controller_utils.get_resources_lock_path(),
-                               blocking=False):
-            while True:
-                maybe_next_job = state.get_waiting_job()
-                if maybe_next_job is None:
-                    # Nothing left to start, break from scheduling loop
-                    break
-                actual_pool = maybe_next_job['pool']
-
-                current_state = maybe_next_job['schedule_state']
-
-                assert current_state in (
-                    state.ManagedJobScheduleState.ALIVE_WAITING,
-                    state.ManagedJobScheduleState.WAITING), maybe_next_job
-
-                # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
-                # since they will have been submitted and therefore started
-                # first. The requirements to launch in an alive job are more
-                # lenient, so there is no way that we wouldn't be able to launch
-                # an ALIVE_WAITING job, but we would be able to launch a WAITING
-                # job.
-                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not controller_utils.can_provision():
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-                elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job(actual_pool):
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-
-                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
-                state.scheduler_set_launching(maybe_next_job['job_id'],
-                                              current_state)
-
-                if current_state == state.ManagedJobScheduleState.WAITING:
-                    # The job controller has not been started yet. We must start
-                    # it.
-
-                    job_id = maybe_next_job['job_id']
-                    dag_yaml_path = maybe_next_job['dag_yaml_path']
-                    env_file_path = maybe_next_job['env_file_path']
-
-                    _start_controller(job_id, dag_yaml_path, env_file_path,
-                                      actual_pool)
-
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                    else:
+                        shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = get_number_of_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
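
The non-consolidation branch of get_number_of_controllers() above reduces to a single formula: subtract the reserved headroom from total memory, then divide by the footprint of one controller plus the API-server workers it can keep busy. A minimal standalone sketch of that arithmetic follows; estimate_controllers and the 0.4 GB long-worker size are illustrative placeholders, not values taken from server_config.

    # Sketch of the non-consolidation formula above (assumed values, not the
    # real server_config constants).
    def estimate_controllers(total_memory_mb: float,
                             long_worker_mem_gb: float = 0.4) -> int:
        job_memory_mb = 400          # JOB_MEMORY_MB in this diff
        launches_per_worker = 8      # LAUNCHES_PER_WORKER in this diff
        reserved_mb = 2048           # MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
        per_controller_mb = (launches_per_worker * long_worker_mem_gb * 1024 +
                             job_memory_mb)
        return max(1, int((total_memory_mb - reserved_mb) / per_controller_mb))

    # A 16 GB controller VM: (16384 - 2048) / (8 * 0.4 * 1024 + 400) ~= 3.9 -> 3
    print(estimate_controllers(16 * 1024))
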
@@ -188,30 +256,46 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int, pool: Optional[str]) -> None:
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available. It may block to acquire the lock, so it
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
-                                                original_user_yaml_path,
-                                                env_file_path,
-                                                common_utils.get_user_hash(),
-                                                priority)
-        if is_resume:
-            _start_controller(job_id, dag_yaml_path, env_file_path, pool)
-        else:
-            maybe_schedule_next_jobs()
-
-
-@contextlib.contextmanager
-def scheduled_launch(job_id: int):
+    controller_pid = state.get_job_controller_pid(job_id)
+    if controller_pid is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_pid, job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    state.scheduler_set_waiting(job_id, dag_yaml_path,
+                                original_user_yaml_path, env_file_path,
+                                common_utils.get_user_hash(), priority)
+    if state.get_ha_recovery_script(job_id) is None:
+        # the run command is just the command that called scheduler
+        run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+               f'--job-id {job_id} --env-file {env_file_path} '
+               f'--user-yaml-path {original_user_yaml_path} '
+               f'--priority {priority}')
+        state.set_ha_recovery_script(job_id, run)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+    job_id: int,
+    starting: Set[int],
+    starting_lock: asyncio.Lock,
+    starting_signal: asyncio.Condition,
+    job_logger: 'logging.Logger',
+):
     """Launch as part of an ongoing job.
 
     A newly started job will already be LAUNCHING, and this will immediately
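
The HA recovery script stored by submit_job() above is simply the command line that re-invokes this module with the same arguments, matching the argparse entrypoint at the bottom of the file. With purely illustrative paths and IDs, the stored string looks roughly like this:

    import sys

    # Illustrative values only; the real ones come from the submitted job.
    job_id = 42
    dag_yaml_path = '~/.sky/managed_jobs/42/dag.yaml'
    env_file_path = '~/.sky/managed_jobs/42/env'
    original_user_yaml_path = '~/.sky/managed_jobs/42/user.yaml'
    priority = 500

    run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
           f'--job-id {job_id} --env-file {env_file_path} '
           f'--user-yaml-path {original_user_yaml_path} '
           f'--priority {priority}')
    # e.g. '/usr/bin/python3 -m sky.jobs.scheduler ~/.sky/managed_jobs/42/dag.yaml
    #       --job-id 42 --env-file ... --user-yaml-path ... --priority 500'
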
@@ -240,30 +324,34 @@ def scheduled_launch(job_id: int):
         yield
         return
 
-    # If we're already in LAUNCHING schedule_state, we don't need to wait.
-    # This may be the case for the first launch of a job.
-    if (state.get_job_schedule_state(job_id) !=
-            state.ManagedJobScheduleState.LAUNCHING):
-        # Since we aren't LAUNCHING, we need to wait to be scheduled.
-        _set_alive_waiting(job_id)
+    assert starting_lock == starting_signal._lock, (  # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')
 
-    while (state.get_job_schedule_state(job_id) !=
-           state.ManagedJobScheduleState.LAUNCHING):
-        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < LAUNCHES_PER_WORKER:
+                break
+            job_logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()
+
+    job_logger.info(f'Starting job {job_id}')
+
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
 
     try:
         yield
-    except exceptions.NoClusterLaunchedError:
-        # NoClusterLaunchedError is indicates that the job is in retry backoff.
-        # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive_backoff(job_id)
-        raise
+    except Exception as e:
+        raise e
     else:
-        with filelock.FileLock(controller_utils.get_resources_lock_path()):
-            state.scheduler_set_alive(job_id)
+        await state.scheduler_set_alive_async(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
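
The new scheduled_launch() body is a standard asyncio.Condition pattern: take the shared lock, check how many jobs are mid-launch, and either claim a slot or wait until a finishing launch calls notify(). A self-contained sketch of the same pattern, independent of the SkyPilot state machinery (launch_with_slot and the sleep stand-in are illustrative):

    import asyncio
    from typing import Set

    LAUNCHES_PER_WORKER = 8  # mirrors the constant in this diff

    async def launch_with_slot(job_id: int, starting: Set[int],
                               starting_lock: asyncio.Lock,
                               starting_signal: asyncio.Condition) -> None:
        # Wait for a free launch slot, as the while-True loop above does.
        async with starting_lock:
            while len(starting) >= LAUNCHES_PER_WORKER:
                await starting_signal.wait()
            starting.add(job_id)
        try:
            await asyncio.sleep(0.1)  # stand-in for the real launch work
        finally:
            # Release the slot and wake one waiter, like the finally block above.
            async with starting_lock:
                starting.remove(job_id)
                starting_signal.notify()

    async def main() -> None:
        starting: Set[int] = set()
        lock = asyncio.Lock()
        signal = asyncio.Condition(lock)  # condition built on the same lock
        await asyncio.gather(
            *(launch_with_slot(i, starting, lock, signal) for i in range(20)))

    asyncio.run(main())
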
@@ -274,38 +362,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
 
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
-
+    state.scheduler_set_done(job_id, idempotent)
 
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(controller_utils.get_resources_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-        maybe_schedule_next_jobs()
 
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return
 
-def _can_start_new_job(pool: Optional[str]) -> bool:
-    # Check basic resource limits
-    # Pool jobs don't need to provision resources, so we skip the check.
-    if not ((controller_utils.can_provision() or pool is not None) and
-            controller_utils.can_start_new_process()):
-        return False
-
-    # Check if there are available workers in the pool
-    if pool is not None:
-        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
-            logger.debug(f'No READY workers available in pool {pool}')
-            return False
-
-    return True
+    await state.scheduler_set_done_async(job_id, idempotent)
 
 
 if __name__ == '__main__':
@@ -337,4 +410,4 @@ if __name__ == '__main__':
                         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority, args.pool)
+               args.priority)
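
The controller accounting in this file trusts ~/.sky/job_controller_pid plus a liveness check, and the TODO(luca) in get_alive_controllers() notes that a recycled PID could be miscounted as a live controller. One possible stricter variant, sketched with psutil (which this module already lazy-imports); count_alive_controllers and its error handling are illustrative, not the wheel's implementation:

    import psutil

    def count_alive_controllers(pid_file: str) -> int:
        # Count only PIDs whose command line still looks like a job controller,
        # guarding against PID reuse.
        try:
            with open(pid_file, 'r', encoding='utf-8') as f:
                pids = [line.strip() for line in f if line.strip()]
        except OSError:
            return 0
        alive = 0
        for pid in pids:
            try:
                proc = psutil.Process(int(pid))
                if 'sky.jobs.controller' in ' '.join(proc.cmdline()):
                    alive += 1
            except (ValueError, psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        return alive
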
sky/jobs/server/core.py CHANGED
@@ -1,9 +1,11 @@
 """SDK functions for managed jobs."""
+import ipaddress
 import os
 import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -188,6 +190,7 @@ def launch(
 
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -202,6 +205,21 @@
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()
 
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
     user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
         dag, use_user_specified_yaml=True)
 
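
The loopback check added above keys off urlparse plus ipaddress.ip_address, which accepts only IP literals and raises ValueError for DNS hostnames other than 'localhost'. A small helper of the same shape that treats such hostnames as remote; is_local_db_url is a sketch for illustration, not code from this wheel:

    import ipaddress
    from urllib import parse as urlparse

    def is_local_db_url(db_url: str) -> bool:
        # True when the DB URL points at the local machine, mirroring the check
        # above; non-IP hostnames (e.g. a DNS name) are treated as remote
        # instead of letting ipaddress.ip_address raise ValueError.
        hostname = urlparse.urlparse(db_url).hostname
        if hostname is None:
            return False
        if hostname == 'localhost':
            return True
        try:
            return ipaddress.ip_address(hostname).is_loopback
        except ValueError:
            return False

    # is_local_db_url('postgresql://user:pw@127.0.0.1:5432/sky')  -> True
    # is_local_db_url('postgresql://user:pw@10.0.0.12:5432/sky')  -> False
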
@@ -424,10 +442,8 @@ def launch(
             ]
             run_script = '\n'.join(env_cmds + [run_script])
             # Dump script for high availability recovery.
-            if controller_utils.high_availability_specified(
-                    controller_name):
-                managed_job_state.set_ha_recovery_script(
-                    consolidation_mode_job_id, run_script)
+            managed_job_state.set_ha_recovery_script(
+                consolidation_mode_job_id, run_script)
             backend.run_on_head(local_handle, run_script)
             return consolidation_mode_job_id, local_handle
 
sky/jobs/server/utils.py CHANGED
@@ -11,7 +11,6 @@ logger = sky_logging.init_logger(__name__)
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
-
     Raises:
         ValueError: If there's a version mismatch and non-terminal jobs exist.
         sky.exceptions.ClusterNotUpError: If the controller is not accessible.
@@ -59,7 +58,8 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     job_table_payload = output_parts[1]
 
     # Process locally: check version match and filter non-terminal jobs
-    version_matches = controller_version == local_version
+    version_matches = (controller_version == local_version or
+                       int(controller_version) > 17)
 
     # Load and filter jobs locally using existing method
     jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(