skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 Base = declarative.declarative_base()
 
@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
                          migration_utils.GLOBAL_USER_STATE_VERSION)
 
 
+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
 
     if _SQLALCHEMY_ENGINE is not None:
        return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')
 
-    # get an engine to the db
-    engine = migration_utils.get_engine('state')
+        # run migrations if needed
+        create_table(engine)
 
-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
 
 
 def _init_db(func):
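
The hunk above converts initialize_and_get_db() to double-checked locking: a lock-free fast path returns the cached engine, and a second check under _SQLALCHEMY_ENGINE_LOCK ensures only one thread ever creates and caches it. A minimal standalone sketch of the same pattern, for illustration only (the resource name and factory function below are placeholders, not SkyPilot code):

import threading
from typing import Optional

_RESOURCE: Optional[object] = None
_RESOURCE_LOCK = threading.Lock()


def _create_resource() -> object:
    # Stand-in for an expensive one-time setup, e.g. building a DB engine
    # and running migrations.
    return object()


def get_resource() -> object:
    global _RESOURCE
    # Fast path: once initialized, no lock is taken.
    if _RESOURCE is not None:
        return _RESOURCE
    with _RESOURCE_LOCK:
        # Re-check under the lock: another thread may have initialized the
        # resource between our first check and acquiring the lock.
        if _RESOURCE is None:
            _RESOURCE = _create_resource()
        return _RESOURCE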
sky/jobs/__init__.py CHANGED
@@ -5,6 +5,9 @@ from sky.jobs.client.sdk import cancel
 from sky.jobs.client.sdk import dashboard
 from sky.jobs.client.sdk import download_logs
 from sky.jobs.client.sdk import launch
+from sky.jobs.client.sdk import pool_apply
+from sky.jobs.client.sdk import pool_down
+from sky.jobs.client.sdk import pool_status
 from sky.jobs.client.sdk import queue
 from sky.jobs.client.sdk import tail_logs
 from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
sky/jobs/client/sdk.py CHANGED
@@ -9,8 +9,10 @@ import click
 from sky import sky_logging
 from sky.client import common as client_common
 from sky.client import sdk
+from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
+from sky.server import versions
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -23,6 +25,7 @@ if typing.TYPE_CHECKING:
     import io
 
     import sky
+    from sky.serve import serve_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -33,6 +36,8 @@ logger = sky_logging.init_logger(__name__)
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     # Internal only:
     # pylint: disable=invalid-name
     _need_confirmation: bool = False,
@@ -61,15 +66,35 @@ def launch(
            chain dag.
        sky.exceptions.NotSupportedError: the feature is not supported.
    """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
+    if pool is None and num_jobs is not None:
+        raise click.UsageError('Cannot specify num_jobs without pool.')
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
     with admin_policy_utils.apply_and_use_config_in_current_request(
             dag, at_client_side=True) as dag:
         sdk.validate(dag)
         if _need_confirmation:
-            request_id = sdk.optimize(dag)
-            sdk.stream_and_get(request_id)
-            prompt = f'Launching a managed job {dag.name!r}. Proceed?'
+            job_identity = 'a managed job'
+            if pool is None:
+                request_id = sdk.optimize(dag)
+                sdk.stream_and_get(request_id)
+            else:
+                request_id = pool_status(pool)
+                pool_statuses = sdk.get(request_id)
+                if not pool_statuses:
+                    raise click.UsageError(f'Pool {pool!r} not found.')
+                resources = pool_statuses[0]['requested_resources_str']
+                click.secho(f'Use resources from pool {pool!r}: {resources}.',
+                            fg='green')
+            if num_jobs is not None:
+                job_identity = f'{num_jobs} managed jobs'
+            prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
             if prompt is not None:
                 click.confirm(prompt,
                               default=True,
@@ -81,6 +106,8 @@ def launch(
         body = payloads.JobsLaunchBody(
             task=dag_str,
             name=name,
+            pool=pool,
+            num_jobs=num_jobs,
         )
         response = server_common.make_authenticated_request(
             'POST',
@@ -158,6 +185,7 @@ def cancel(
     job_ids: Optional[List[int]] = None,
     all: bool = False,  # pylint: disable=redefined-builtin
     all_users: bool = False,
+    pool: Optional[str] = None,
 ) -> server_common.RequestId:
     """Cancels managed jobs.
 
@@ -168,6 +196,7 @@ def cancel(
        job_ids: IDs of the managed jobs to cancel.
        all: Whether to cancel all managed jobs.
        all_users: Whether to cancel all managed jobs from all users.
+       pool: Pool name to cancel.
 
    Returns:
        The request ID of the cancel request.
@@ -176,11 +205,18 @@ def cancel(
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
        RuntimeError: failed to cancel the job.
    """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
     body = payloads.JobsCancelBody(
         name=name,
         job_ids=job_ids,
         all=all,
         all_users=all_users,
+        pool=pool,
     )
     response = server_common.make_authenticated_request(
         'POST',
@@ -327,3 +363,44 @@ def dashboard() -> None:
     url = f'{api_server_url}/jobs/dashboard?{params}'
     logger.info(f'Opening dashboard in browser: {url}')
     webbrowser.open(url)
+
+
+@context.contextual
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_apply(
+    task: Union['sky.Task', 'sky.Dag'],
+    pool_name: str,
+    mode: 'serve_utils.UpdateMode',
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    """Apply a config to a pool."""
+    return impl.apply(task,
+                      pool_name,
+                      mode,
+                      pool=True,
+                      _need_confirmation=_need_confirmation)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+) -> server_common.RequestId:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_status(
+    pool_names: Optional[Union[str, List[str]]],) -> server_common.RequestId:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
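
For orientation, a hedged sketch of how these new pool entrypoints could be exercised from the Python SDK. The pool name, YAML path, UpdateMode value, and the use of sky.get() / sky.stream_and_get() to resolve the returned request IDs are assumptions modeled on the existing jobs SDK calls above, not documented guarantees:

import sky
from sky import jobs
from sky.serve import serve_utils

# Hypothetical task and pool name, for illustration only.
pool_task = sky.Task.from_yaml('pool.yaml')

# Create or update the pool; like the other SDK calls, this returns a
# request ID that can be streamed or resolved.
request_id = jobs.pool_apply(pool_task, 'my-pool',
                             mode=serve_utils.UpdateMode.ROLLING)
sky.stream_and_get(request_id)

# Inspect the pool, then launch several managed jobs into it.
statuses = sky.get(jobs.pool_status('my-pool'))
print(statuses)
job_task = sky.Task(run='echo hello from the pool')
sky.stream_and_get(jobs.launch(job_task, pool='my-pool', num_jobs=4))

# Tear the pool down when finished.
sky.get(jobs.pool_down('my-pool'))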
sky/jobs/controller.py CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""
 
-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool
 
         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)
 
     def _download_log_and_stream(
-        self, task_id: Optional[int],
-        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.
 
@@ -113,9 +117,14 @@ class JobsController:
                 'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs')
-        log_file = controller_utils.download_and_stream_latest_job_log(
-            self._backend, handle, managed_job_logs_dir)
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.
 
@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id)
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+        assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)
 
         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend, cluster_name)
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@ class JobsController:
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@ class JobsController:
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@ class JobsController:
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle)
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@ class JobsController:
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
+                self._cleanup_cluster(cluster_name)
                 return True
 
             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@ class JobsController:
                     job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
-                self._download_log_and_stream(task_id, handle)
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)
 
                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@ class JobsController:
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-            managed_job_utils.terminate_cluster(cluster_name)
+            self._cleanup_cluster(cluster_name)
 
             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@ class JobsController:
                 force_transit_to_recovering=force_transit_to_recovering,
                 callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
                                             recovered_time=recovered_time,
@@ -541,11 +577,11 @@ class JobsController:
                     task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()
 
 
@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')
 
 
-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.
 
     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, job_id)
-        managed_job_utils.terminate_cluster(cluster_name)
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)
 
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                 f'Failed to clean up file mount {file_mount}: {e}')
 
 
-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml))
+                                                     args=(job_id, dag_yaml,
+                                                           pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
         # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
         # But anyway, a clean solution is killing the controller process
        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
         if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)
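
The only wire-level change to how the controller process is started is the optional --pool flag parsed above; the jobs-controller template (sky/templates/jobs-controller.yaml.j2, also touched in this release) is what would pass it through. A hedged sketch of composing such an invocation, with the module path, --job-id flag, and file names assumed purely for illustration:

import sys
from typing import List, Optional


def controller_argv(job_id: int, dag_yaml: str,
                    pool: Optional[str] = None) -> List[str]:
    # Assumed module path and --job-id flag; only the trailing --pool flag
    # is new in this release.
    argv = [
        sys.executable, '-u', '-m', 'sky.jobs.controller',
        '--job-id', str(job_id), dag_yaml
    ]
    if pool is not None:
        # With a pool, the controller submits the job to an existing cluster
        # in the pool instead of provisioning a dedicated cluster.
        argv += ['--pool', pool]
    return argv


print(controller_argv(42, 'dag-42.yaml'))
print(controller_argv(42, 'dag-42.yaml', pool='my-pool'))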