skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs. (See _get_launch_parallelism.) This the
-   most compute-intensive part of the job lifecycle, which is why we have an
-   additional limit.
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory. (See _get_job_parallelism.) Since the job controller is doing very
-   little once a job starts (just checking its status periodically), the most
-   significant resource it consumes is memory.
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -120,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -163,9 +136,10 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(_get_lock_path(), blocking=False):
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
-            maybe_next_job = state.get_waiting_job(pool)
+            maybe_next_job = state.get_waiting_job()
             if maybe_next_job is None:
                 # Nothing left to start, break from scheduling loop
                 break
@@ -184,21 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
            # job.
            if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not _can_lauch_in_alive_job():
+                if not controller_utils.can_provision():
                    # Can't schedule anything, break from scheduling loop.
                    break
            elif current_state == state.ManagedJobScheduleState.WAITING:
                if not _can_start_new_job(actual_pool):
-                    # If there is no job can be scheduled in the pool, we
-                    # try to schedule another job regardless of the pool.
-                    # This is to avoid the case where the pool is scaled
-                    # down at the same time as a job is done. In this case,
-                    # we won't have any job to schedule in the pool, but
-                    # other jobs in other pool (or no pool) can still be
-                    # scheduled.
-                    if pool is not None:
-                        pool = None
-                        continue
                    # Can't schedule anything, break from scheduling loop.
                    break
 
@@ -234,7 +198,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -243,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
@@ -268,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return
 
     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -279,21 +250,20 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
        raise
    else:
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive(job_id)
    finally:
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -308,58 +278,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-        pool = state.get_pool_from_job_id(job_id)
-        maybe_schedule_next_jobs(pool)
-
-
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
+        maybe_schedule_next_jobs()
 
 
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
     # Check basic resource limits
-    if not (launching_jobs < _get_launch_parallelism() and
-            alive_jobs < _get_job_parallelism()):
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
         return False
 
-    # Check if there are available replicas in the pool
+    # Check if there are available workers in the pool
    if pool is not None:
        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
-            logger.debug(f'No replicas available in pool {pool}')
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
            return False
 
    return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
     parser.add_argument('dag_yaml',
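
The scheduler no longer computes its own parallelism limits or lock path; the call sites above now go through controller_utils.get_resources_lock_path(), controller_utils.can_provision(), and controller_utils.can_start_new_process() in sky/utils/controller_utils.py (+86 lines in this release, not shown in this diff). The following is only a hedged sketch of what those relocated helpers plausibly look like, reconstructed from the code removed above: the constants are copied from the deleted lines, while the function bodies, the lock path, and the reliance on sky.jobs.state counters are assumptions for illustration.

# Hypothetical sketch of the relocated helpers, reconstructed from the code
# removed above. Names follow the new call sites; bodies are assumptions.
from functools import lru_cache
import os

import psutil

from sky.jobs import state  # get_num_launching_jobs / get_num_alive_jobs

JOB_MEMORY_MB = 350      # from the removed code: a running job uses ~350 MB
MAX_JOB_LIMIT = 2000     # from the removed code: unstable past 2000 jobs
LAUNCHES_PER_CPU = 4     # from the removed code: ongoing launches per CPU

# Assumed path; the old scheduler used ~/.sky/locks/managed_job_scheduler.lock.
_RESOURCES_LOCK_PATH = '~/.sky/locks/managed_job_scheduler.lock'


@lru_cache(maxsize=1)
def get_resources_lock_path() -> str:
    # Expand and create the lock directory once, as the removed
    # _get_lock_path() did.
    path = os.path.expanduser(_RESOURCES_LOCK_PATH)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path


def can_provision() -> bool:
    # Mirrors the removed _can_lauch_in_alive_job(): cap concurrent launches
    # by CPU count.
    cpus = os.cpu_count()
    launch_limit = cpus * LAUNCHES_PER_CPU if cpus is not None else 1
    return state.get_num_launching_jobs() < launch_limit


def can_start_new_process() -> bool:
    # Mirrors the removed memory-based _get_job_parallelism() check.
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
    return state.get_num_alive_jobs() < max(job_limit, 1)

The actual controller_utils implementation may count launches and processes differently; only the constants and the overall shape are taken from the deleted scheduler code.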
sky/jobs/server/core.py CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
-                              num_jobs: Optional[int]) -> Optional[List[int]]:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs if num_jobs is not None else 1):
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag, pool, num_jobs)
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
         controller_task._metadata = metadata
 
         job_identity = ''
-        if consolidation_mode_job_id is not None:
-            job_identity = f' (Job ID: {consolidation_mode_job_id})'
+        if job_rank is not None:
+            job_identity = f' (rank: {job_rank})'
         logger.info(f'{colorama.Fore.YELLOW}'
                     f'Launching managed job {dag.name!r}{job_identity} '
                     f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
             backend.run_on_head(local_handle, run_script)
             return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
@@ -491,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -507,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -562,10 +576,18 @@ def _maybe_restart_controller(
 
 
 @usage_lib.entrypoint
-def queue(refresh: bool,
-          skip_finished: bool = False,
-          all_users: bool = False,
-          job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
    """Gets statuses of managed jobs.
 
@@ -595,6 +617,17 @@ def queue(refresh: bool,
            does not exist.
        RuntimeError: if failed to get the managed jobs with ssh.
    """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
    handle = _maybe_restart_controller(refresh,
                                       stopped_message='No in-progress '
                                       'managed jobs.',
@@ -603,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -616,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
 
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:
 
         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -630,7 +684,6 @@ def queue(refresh: bool,
 
         jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -649,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)
 
 
 @usage_lib.entrypoint
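
queue() now accepts user/workspace/name/pool match filters plus page and limit, and returns (jobs, total) instead of a bare list. The validation added above fixes the contract: limit must be at least 1, page defaults to 1 when only limit is given, and page without limit is rejected. The slicing itself is delegated to managed_job_utils.filter_jobs, which this diff does not show, so the standalone helper below is only a hedged illustration of how page and limit are expected to map onto the result set.

# Illustrative only: mirrors the page/limit contract of queue() above. The
# real pagination lives in managed_job_utils.filter_jobs (not shown here).
from typing import Any, Dict, List, Optional, Tuple


def paginate_jobs(jobs: List[Dict[str, Any]],
                  page: Optional[int] = None,
                  limit: Optional[int] = None
                 ) -> Tuple[List[Dict[str, Any]], int]:
    """Return (rows for the requested page, total row count)."""
    if limit is not None:
        if limit < 1:
            raise ValueError(f'Limit must be at least 1, got {limit}')
        if page is None:
            page = 1  # queue() defaults page to 1 when only limit is given
        if page < 1:
            raise ValueError(f'Page must be at least 1, got {page}')
    elif page is not None:
        raise ValueError('Limit must be specified when page is specified')

    total = len(jobs)
    if limit is None:
        return jobs, total
    start = (page - 1) * limit
    return jobs[start:start + limit], total

For example, the second page of 50 jobs would be rows, total = paginate_jobs(jobs, page=2, limit=50).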
sky/jobs/server/utils.py CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
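
load_managed_job_queue now returns a (jobs, total, result_type) tuple at every call site in this release; this one only needs the job list, so it discards the other two values. A hedged sketch of the caller-side pattern used across these diffs, with names taken from the call sites above:

# Sketch of the new return contract, inferred from the call sites in this
# release. ManagedJobQueueResultType.DICT marks payloads from new controllers
# that are already filtered/paginated server-side; anything else is a legacy
# payload that callers still filter locally (slated for removal after 0.12.0).
from typing import Any, Dict, List, Tuple

from sky.jobs import utils as managed_job_utils


def parse_queue_payload(job_table_payload: str
                       ) -> Tuple[List[Dict[str, Any]], int]:
    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
        job_table_payload)
    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
        return jobs, total      # server-side filtered result
    return jobs, len(jobs)      # legacy controller: filter client-side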
sky/jobs/state.py CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 name=name,
                 schedule_state=ManagedJobScheduleState.INACTIVE.value,
                 workspace=workspace,
-                entrypoint=entrypoint)
+                entrypoint=entrypoint,
+                pool=pool,
+                pool_hash=pool_hash,
+            )
         session.execute(insert_stmt)
         session.commit()
 
@@ -1524,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,
 
 
 @_init_db
-def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.
 
     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1555,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,
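
set_job_info gains pool and pool_hash columns so a job's pool membership is recorded at submission time, while get_waiting_job drops its pool filter because the scheduler (sky/jobs/scheduler.py above) no longer schedules per pool. A minimal, hypothetical call-site sketch; the real caller lives in the jobs controller code, which this diff does not include. Argument sources follow the core.py diff above, where pool_hash comes from serve_state.get_service_hash.

# Hypothetical call site for the widened set_job_info signature.
from typing import Optional

from sky.jobs import state
from sky.serve import serve_state


def record_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
                    pool: Optional[str]) -> None:
    pool_hash = None
    if pool is not None:
        # Resolve the pool's service hash, as _maybe_submit_job_locally does;
        # both columns stay None for jobs not submitted into a pool.
        pool_hash = serve_state.get_service_hash(pool)
    state.set_job_info(job_id, name, workspace, entrypoint, pool, pool_hash)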