skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.
Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None
 
     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-             task_id: int) -> 'StrategyExecutor':
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
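The signature changes above are the heart of the pool feature: `cluster_name` becomes `Optional[str]` because a pool-backed job has no dedicated cluster until `_launch` picks a worker. A minimal sketch of the two resulting call patterns, assuming hypothetical `backend`/`task` objects and made-up names that are not part of this diff:

```python
# Hypothetical usage sketch; `backend` and `task` stand in for real
# CloudVmRayBackend / sky.Task instances.

# Classic managed job: a dedicated cluster name is known up front.
executor = StrategyExecutor.make(
    cluster_name='sky-managed-42', backend=backend, task=task,
    job_id=42, task_id=0, pool=None)

# Pool-backed job: no cluster yet; _launch() later picks a pool worker
# and records the worker-side job id in job_id_on_pool_cluster.
pool_executor = StrategyExecutor.make(
    cluster_name=None, backend=backend, task=task,
    job_id=43, task_id=0, pool='my-pool')
```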
@@ -103,7 +110,8 @@ class StrategyExecutor:
                                              from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id)
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def _try_cancel_all_jobs(self):
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel
 
+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-                        all=True,
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@ class StrategyExecutor:
 
             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend, self.cluster_name)
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@ class StrategyExecutor:
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend, self.cluster_name, get_end_time=False)
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@ class StrategyExecutor:
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@ class StrategyExecutor:
                         recovery)
                 try:
                     usage_lib.messages.usage.set_internal()
-                    # Detach setup, so that the setup failure can be
-                    # detected by the controller process (job_status ->
-                    # FAILED_SETUP).
-                    execution.launch(
-                        self.dag,
-                        cluster_name=self.cluster_name,
-                        # We expect to tear down the cluster as soon as the
-                        # job is finished. However, in case the controller
-                        # dies, set autodown to try and avoid a resource
-                        # leak.
-                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                        down=True,
-                        _is_launched_by_jobs_controller=True)
+                    if self.pool is None:
+                        assert self.cluster_name is not None
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as
+                            # the job is finished. However, in case the
+                            # controller dies, set autodown to try and avoid
+                            # a resource leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                    else:
+                        self.cluster_name = (
+                            serve_utils.get_next_cluster_name(
+                                self.pool, self.job_id))
+                        if self.cluster_name is None:
+                            raise exceptions.NoClusterLaunchedError(
+                                'No cluster name found in the pool.')
+                        job_id_on_pool_cluster, _ = execution.exec(
+                            self.dag, cluster_name=self.cluster_name)
+                        assert job_id_on_pool_cluster is not None, (
+                            self.cluster_name, self.job_id)
+                        self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                        state.set_job_id_on_pool_cluster(
+                            self.job_id, job_id_on_pool_cluster)
                     logger.info('Managed job cluster launched.')
                 except (exceptions.InvalidClusterNameError,
                         exceptions.NoCloudAccessError,
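The hunk above is the pivotal change of this release: jobs without a pool still provision a dedicated cluster via execution.launch, while pool jobs are submitted to an existing worker via execution.exec. A self-contained toy of that control flow, with stub functions standing in for sky's execution module and a made-up worker name:

```python
from typing import Optional

# Toy restatement of the branch above; launch/exec are stubs, and
# 'my-pool-worker-0' is an invented worker cluster name.
def launch_cluster(name: str) -> None:
    print(f'sky.launch on dedicated cluster {name} (autodown on)')

def exec_on_worker(name: str) -> int:
    print(f'sky.exec on pool worker {name}')
    return 1  # job id on the worker cluster

def do_launch(pool: Optional[str], cluster_name: Optional[str]) -> None:
    if pool is None:
        assert cluster_name is not None
        launch_cluster(cluster_name)       # provisions a fresh cluster
    else:
        cluster_name = 'my-pool-worker-0'  # serve_utils picks a ready worker
        job_id_on_pool_cluster = exec_on_worker(cluster_name)
        # Recorded so status checks and cancellation can target this
        # exact job on the shared cluster later.
        print(f'job {job_id_on_pool_cluster} on {cluster_name}')

do_launch(None, 'sky-managed-42')
do_launch('my-pool', None)
```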
@@ -373,7 +416,7 @@ class StrategyExecutor:
 
             # If we get here, the launch did not succeed. Tear down the
             # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
             if max_retry is not None and retry_cnt >= max_retry:
                 # Retry forever if max_retry is None.
                 if raise_on_failure:
@@ -398,7 +441,10 @@ class StrategyExecutor:
             # Update the status to PENDING during backoff.
             state.set_backoff_pending(self.job_id, self.task_id)
             # Calculate the backoff time and sleep.
-            gap_seconds = backoff.current_backoff()
+            # We retry immediately for worker pool, since no sky.launch()
+            # is called and the overhead is minimal.
+            gap_seconds = (backoff.current_backoff()
+                           if self.pool is None else 0)
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
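The zero backoff for pool jobs is worth a second look: a failed pool submission costs only a sky.exec attempt, so retrying immediately is cheap. A self-contained toy of the rule above, with a made-up `Backoff` stand-in for sky's internal backoff helper:

```python
from typing import Optional

class Backoff:
    """Stand-in for sky's internal exponential backoff helper."""
    def __init__(self, initial: float = 5.0) -> None:
        self._gap = initial
    def current_backoff(self) -> float:
        self._gap *= 2
        return self._gap

backoff = Backoff()
for pool in (None, 'my-pool'):
    # Mirrors: gap_seconds = backoff.current_backoff() if self.pool is None else 0
    gap_seconds = backoff.current_backoff() if pool is None else 0
    print(f'pool={pool!r}: retrying in {gap_seconds:.1f} seconds')
```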
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.
 
         # Step 1
-        self._try_cancel_all_jobs()
+        self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        managed_job_utils.terminate_cluster(self.cluster_name)
+        self._cleanup_cluster()
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
sky/jobs/scheduler.py CHANGED
@@ -9,9 +9,11 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
 be called from any code running on the managed jobs controller instance to
 trigger scheduling of new jobs if possible. This function should be called
 immediately after any state change that could result in jobs newly being able to
-be scheduled.
+be scheduled. If the job is running in a pool, the scheduler will only schedule
+jobs for the same pool, because the resources limitations are per-pool (see the
+following section for more details).
 
-The scheduling logic limits the number of running jobs according to two limits:
+The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
    once, based on the number of CPUs. (See _get_launch_parallelism.) This the
    most compute-intensive part of the job lifecycle, which is why we have an
@@ -20,6 +22,8 @@ The scheduling logic limits the number of running jobs according to two limits:
    of memory. (See _get_job_parallelism.) Since the job controller is doing very
    little once a job starts (just checking its status periodically), the most
    significant resource it consumes is memory.
+3. The number of jobs that can be running in a pool at any given time, based on
+   the number of ready workers in the pool. (See _can_start_new_job.)
 
 The state of the scheduler is entirely determined by the schedule_state column
 of all the jobs in the job_info table. This column should only be modified via
@@ -43,6 +47,7 @@ import os
 import sys
 import time
 import typing
+from typing import Optional
 
 import filelock
 
@@ -51,6 +56,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
@@ -80,18 +86,21 @@ LAUNCHES_PER_CPU = 4
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
+    # TODO(tian): Per pool lock.
    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path
 
 
-def _start_controller(job_id: int, dag_yaml_path: str,
-                      env_file_path: str) -> None:
+def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
+                      pool: Optional[str]) -> None:
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
-                          f'{dag_yaml_path} --job-id {job_id};')
+    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
+    run_controller_cmd = (
+        f'{sys.executable} -u -m sky.jobs.controller '
+        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
 
     # If the command line here is changed, please also update
     # utils._controller_process_alive. The substring `--job-id X`
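For concreteness, a standalone reproduction of the command construction above, with made-up values; with pool=None the --pool flag disappears entirely, which is why maybe_pool_arg defaults to an empty string:

```python
import sys

# Reproduces run_controller_cmd from _start_controller (values made up).
job_id, dag_yaml_path, pool = 7, '/tmp/managed-job.yaml', 'my-pool'
maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                      f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
print(run_controller_cmd)
# e.g. /usr/bin/python3 -u -m sky.jobs.controller /tmp/managed-job.yaml --job-id 7 --pool my-pool;
```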
@@ -111,7 +120,7 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs() -> None:
+def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -141,6 +150,13 @@ def maybe_schedule_next_jobs() -> None:
     the jobs controller instance. New job controller processes will be detached
     from the current process and there will not be a parent/child relationship.
     See launch_new_process_tree for more.
+
+    After adding the pool support, this function will be called in a per-pool
+    basis. We employ resources limitation for each pool given the number of
+    ready workers in the pool. Each pool will have its own scheduler queue,
+    indicating by the argument `pool`. Finished job in pool 1 will only trigger
+    another jobs in pool 1, but the job in pool 2 will still be waiting. When
+    the `pool` argument is None, it schedules a job regardless of the pool.
     """
     try:
         # We must use a global lock rather than a per-job lock to ensure correct
@@ -149,10 +165,11 @@ def maybe_schedule_next_jobs() -> None:
         # releasing the lock.
         with filelock.FileLock(_get_lock_path(), blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job()
+                maybe_next_job = state.get_waiting_job(pool)
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
+                actual_pool = maybe_next_job['pool']
 
                 current_state = maybe_next_job['schedule_state']
 
@@ -171,7 +188,17 @@ def maybe_schedule_next_jobs() -> None:
                     # Can't schedule anything, break from scheduling loop.
                     break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job():
+                    if not _can_start_new_job(actual_pool):
+                        # If there is no job can be scheduled in the pool, we
+                        # try to schedule another job regardless of the pool.
+                        # This is to avoid the case where the pool is scaled
+                        # down at the same time as a job is done. In this case,
+                        # we won't have any job to schedule in the pool, but
+                        # other jobs in other pool (or no pool) can still be
+                        # scheduled.
+                        if pool is not None:
+                            pool = None
+                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break
 
@@ -187,7 +214,8 @@ def maybe_schedule_next_jobs() -> None:
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
                 env_file_path = maybe_next_job['env_file_path']
 
-                _start_controller(job_id, dag_yaml_path, env_file_path)
+                _start_controller(job_id, dag_yaml_path, env_file_path,
+                                  actual_pool)
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -196,7 +224,7 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int) -> None:
+               env_file_path: str, priority: int, pool: Optional[str]) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -213,9 +241,9 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                          common_utils.get_user_hash(),
                          priority)
     if is_resume:
-        _start_controller(job_id, dag_yaml_path, env_file_path)
+        _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 @contextlib.contextmanager
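A hypothetical direct call matching the new submit_job signature (all paths and values invented for illustration); passing pool=None preserves the pre-pool behavior:

```python
# Assumes the sky.jobs.scheduler module context; values are made up.
submit_job(job_id=7,
           dag_yaml_path='/tmp/managed-job.yaml',
           original_user_yaml_path='/tmp/user-task.yaml',
           env_file_path='/tmp/controller.env',
           priority=500,
           pool='my-pool')  # None -> schedule without any pool filter
```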
@@ -251,6 +279,7 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
@@ -264,7 +293,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(_get_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -279,17 +308,19 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
+    pool = state.get_pool_from_job_id(job_id)
 
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-        maybe_schedule_next_jobs()
+        pool = state.get_pool_from_job_id(job_id)
+        maybe_schedule_next_jobs(pool)
 
 
 def _get_job_parallelism() -> int:
@@ -305,11 +336,23 @@ def _get_launch_parallelism() -> int:
     return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
-def _can_start_new_job() -> bool:
+def _can_start_new_job(pool: Optional[str]) -> bool:
     launching_jobs = state.get_num_launching_jobs()
     alive_jobs = state.get_num_alive_jobs()
-    return launching_jobs < _get_launch_parallelism(
-    ) and alive_jobs < _get_job_parallelism()
+
+    # Check basic resource limits
+    if not (launching_jobs < _get_launch_parallelism() and
+            alive_jobs < _get_job_parallelism()):
+        return False
+
+    # Check if there are available replicas in the pool
+    if pool is not None:
+        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
+        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
+            logger.debug(f'No replicas available in pool {pool}')
+            return False
+
+    return True
 
 
 def _can_lauch_in_alive_job() -> bool:
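The pool check above implements the third limit from the module docstring: at most one alive job per ready pool worker. A self-contained toy of the admission logic, with hard-coded counts standing in for the real state/serve queries:

```python
from typing import Dict, Optional

LAUNCH_LIMIT, JOB_LIMIT = 16, 64          # stand-ins for the CPU/memory limits
launching_jobs, alive_jobs = 3, 40        # made-up current counts
alive_in_pool: Dict[str, int] = {'my-pool': 2}
ready_workers: Dict[str, int] = {'my-pool': 2}  # serve_utils.num_replicas

def can_start_new_job(pool: Optional[str]) -> bool:
    # Limits 1 and 2: global launching/alive caps.
    if not (launching_jobs < LAUNCH_LIMIT and alive_jobs < JOB_LIMIT):
        return False
    # Limit 3: one alive job per ready worker in the pool.
    if pool is not None and alive_in_pool[pool] >= ready_workers[pool]:
        return False
    return True

print(can_start_new_job(None))       # True: global limits not exceeded
print(can_start_new_job('my-pool'))  # False: both pool workers are busy
```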
@@ -332,6 +375,11 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument('--pool',
+                        type=str,
+                        required=False,
+                        default=None,
+                        help='The pool to use for the controller job.')
     parser.add_argument(
         '--priority',
         type=int,
@@ -341,4 +389,4 @@ if __name__ == '__main__':
         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority)
+               args.priority, args.pool)