skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registry.

This version of skypilot-nightly has been flagged as a potentially problematic release.

Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -24,6 +24,8 @@ from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
+from sky.serve import serve_utils
+from sky.serve.server import impl
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -90,7 +92,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
+                              num_jobs: Optional[int]) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -104,18 +107,29 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
 
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
-    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
-        dag.name,
-        workspace=skypilot_config.get_active_workspace(
-            force_user_workspace=True),
-        entrypoint=common_utils.get_current_command())
-    for task_id, task in enumerate(dag.tasks):
-        resources_str = backend_utils.get_task_resources_str(
-            task, is_managed_job=True)
-        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
-                                      task.name, resources_str,
-                                      task.metadata_json)
-    return consolidation_mode_job_id
+    job_ids = []
+    for _ in range(num_jobs if num_jobs is not None else 1):
+        # TODO(tian): We should have a separate name for each job when
+        # submitting multiple jobs. Current blocker is that we are sharing
+        # the same dag object for all jobs. Maybe we can do copy.copy() for
+        # each job and then give it a unique name (e.g. append job id after
+        # the task name). The name of the dag also needs to be aligned with
+        # the task name.
+        consolidation_mode_job_id = (
+            managed_job_state.set_job_info_without_job_id(
+                dag.name,
+                workspace=skypilot_config.get_active_workspace(
+                    force_user_workspace=True),
+                entrypoint=common_utils.get_current_command(),
+                pool=pool))
+        for task_id, task in enumerate(dag.tasks):
+            resources_str = backend_utils.get_task_resources_str(
+                task, is_managed_job=True)
+            managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                          task.name, resources_str,
+                                          task.metadata_json)
+        job_ids.append(consolidation_mode_job_id)
+    return job_ids
 
 
 @timeline.event
@@ -123,8 +137,10 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     stream_logs: bool = True,
-) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+) -> Tuple[Optional[Union[int, List[int]]], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Launches a managed job.
 
@@ -149,6 +165,9 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
+    if pool is not None and not managed_job_utils.is_consolidation_mode():
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -178,8 +197,8 @@ def launch(
     # pre-mount operations when submitting jobs.
     dag.pre_mount_volumes()
 
-    user_dag_str_redacted = dag_utils.dump_chain_dag_to_yaml_str(
-        dag, redact_secrets=True)
+    user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
+        dag, use_user_specified_yaml=True)
 
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
 
@@ -262,122 +281,159 @@ def launch(
             f'Reason: {common_utils.format_exception(e)}')
 
     local_to_controller_file_mounts = _upload_files_to_controller(dag)
-
-    # Has to use `\` to avoid yapf issue.
-    with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
-                                     mode='w') as f, \
-            tempfile.NamedTemporaryFile(prefix=f'managed-user-dag-{dag.name}-',
-                                        mode='w') as original_user_yaml_path:
-        original_user_yaml_path.write(user_dag_str_redacted)
-        original_user_yaml_path.flush()
-
-        dag_utils.dump_chain_dag_to_yaml(dag, f.name)
-        controller = controller_utils.Controllers.JOBS_CONTROLLER
-        controller_name = controller.value.cluster_name
-        prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
+    controller = controller_utils.Controllers.JOBS_CONTROLLER
+    controller_name = controller.value.cluster_name
+    prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
+    controller_resources = controller_utils.get_controller_resources(
+        controller=controller,
+        task_resources=sum([list(t.resources) for t in dag.tasks], []))
+
+    consolidation_mode_job_ids = _maybe_submit_job_locally(
+        prefix, dag, pool, num_jobs)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_ids is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
+    def _submit_one(
+        consolidation_mode_job_id: Optional[int] = None,
+        job_rank: Optional[int] = None,
+    ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+        rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
-            f'{prefix}/{dag.name}-{dag_uuid}.original_user_yaml')
-        remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
-        remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
-        remote_env_file_path = f'{prefix}/{dag.name}-{dag_uuid}.env'
-        controller_resources = controller_utils.get_controller_resources(
-            controller=controller,
-            task_resources=sum([list(t.resources) for t in dag.tasks], []))
-
-        consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
-
-        # This is only needed for non-consolidation mode. For consolidation
-        # mode, the controller uses the same catalog as API server.
-        modified_catalogs = {} if consolidation_mode_job_id is not None else (
-            service_catalog_common.get_modified_catalog_file_mounts())
-
-        vars_to_fill = {
-            'remote_original_user_yaml_path': remote_original_user_yaml_path,
-            'original_user_dag_path': original_user_yaml_path.name,
-            'remote_user_yaml_path': remote_user_yaml_path,
-            'user_yaml_path': f.name,
-            'local_to_controller_file_mounts': local_to_controller_file_mounts,
-            'jobs_controller': controller_name,
-            # Note: actual cluster name will be <task.name>-<managed job ID>
-            'dag_name': dag.name,
-            'remote_user_config_path': remote_user_config_path,
-            'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs': modified_catalogs,
-            'priority': priority,
-            'consolidation_mode_job_id': consolidation_mode_job_id,
-            **controller_utils.shared_controller_vars_to_fill(
-                controller,
-                remote_user_config_path=remote_user_config_path,
-                # TODO(aylei): the mutated config will not be updated
-                # afterwards without recreate the controller. Need to
-                # revisit this.
-                local_user_config=mutated_user_config,
-            ),
-        }
-
-        yaml_path = os.path.join(
-            managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-            f'{name}-{dag_uuid}.yaml')
-        common_utils.fill_template(
-            managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
-            vars_to_fill,
-            output_path=yaml_path)
-        controller_task = task_lib.Task.from_yaml(yaml_path)
-        controller_task.set_resources(controller_resources)
-
-        controller_task.managed_job_dag = dag
-        # pylint: disable=protected-access
-        controller_task._metadata = metadata
-
-        logger.info(
-            f'{colorama.Fore.YELLOW}'
-            f'Launching managed job {dag.name!r} from jobs controller...'
-            f'{colorama.Style.RESET_ALL}')
-
-        # Launch with the api server's user hash, so that sky status does not
-        # show the owner of the controller as whatever user launched it first.
-        with common.with_server_user():
-            # Always launch the controller in the default workspace.
-            with skypilot_config.local_active_workspace_ctx(
-                    skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
-                # TODO(zhwu): the buckets need to be correctly handled for
-                # a specific workspace. For example, if a job is launched in
-                # workspace A, but the controller is in workspace B, the
-                # intermediate bucket and newly created bucket should be in
-                # workspace A.
-                if consolidation_mode_job_id is None:
-                    return execution.launch(task=controller_task,
-                                            cluster_name=controller_name,
-                                            stream_logs=stream_logs,
-                                            retry_until_up=True,
-                                            fast=True,
-                                            _disable_controller_check=True)
-                # Manually launch the scheduler process in consolidation mode.
-                local_handle = backend_utils.is_controller_accessible(
-                    controller=controller, stopped_message='')
-                backend = backend_utils.get_backend_from_handle(local_handle)
-                assert isinstance(backend, backends.CloudVmRayBackend)
-                backend.sync_file_mounts(
-                    handle=local_handle,
-                    all_file_mounts=controller_task.file_mounts,
-                    storage_mounts=controller_task.storage_mounts)
-                run_script = controller_task.run
-                assert isinstance(run_script, str)
-                # Manually add the env variables to the run script. Originally
-                # this is done in ray jobs submission but now we have to do it
-                # manually because there is no ray runtime on the API server.
-                env_cmds = [
-                    f'export {k}={v!r}'
-                    for k, v in controller_task.envs.items()
-                ]
-                run_script = '\n'.join(env_cmds + [run_script])
-                # Dump script for high availability recovery.
-                if controller_utils.high_availability_specified(
-                        controller_name):
-                    managed_job_state.set_ha_recovery_script(
-                        consolidation_mode_job_id, run_script)
-                backend.run_on_head(local_handle, run_script)
-                return consolidation_mode_job_id, local_handle
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.original_user_yaml')
+        remote_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.yaml')
+        remote_user_config_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.config_yaml')
+        remote_env_file_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.env')
+        with tempfile.NamedTemporaryFile(
+                prefix=f'managed-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as f, tempfile.NamedTemporaryFile(
+                prefix=f'managed-user-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as original_user_yaml_path:
+            original_user_yaml_path.write(user_dag_str_user_specified)
+            original_user_yaml_path.flush()
+            for task_ in dag.tasks:
+                if job_rank is not None:
+                    task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+
+            dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+
+            vars_to_fill = {
+                'remote_original_user_yaml_path':
+                    (remote_original_user_yaml_path),
+                'original_user_dag_path': original_user_yaml_path.name,
+                'remote_user_yaml_path': remote_user_yaml_path,
+                'user_yaml_path': f.name,
+                'local_to_controller_file_mounts':
+                    (local_to_controller_file_mounts),
+                'jobs_controller': controller_name,
+                # Note: actual cluster name will be <task.name>-<managed job ID>
+                'dag_name': dag.name,
+                'remote_user_config_path': remote_user_config_path,
+                'remote_env_file_path': remote_env_file_path,
+                'modified_catalogs': modified_catalogs,
+                'priority': priority,
+                'consolidation_mode_job_id': consolidation_mode_job_id,
+                'pool': pool,
+                **controller_utils.shared_controller_vars_to_fill(
+                    controller,
+                    remote_user_config_path=remote_user_config_path,
+                    # TODO(aylei): the mutated config will not be updated
+                    # afterwards without recreate the controller. Need to
+                    # revisit this.
+                    local_user_config=mutated_user_config,
+                ),
+            }
+
+            yaml_path = os.path.join(
+                managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+            common_utils.fill_template(
+                managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
+                vars_to_fill,
+                output_path=yaml_path)
+            controller_task = task_lib.Task.from_yaml(yaml_path)
+            controller_task.set_resources(controller_resources)
+
+            controller_task.managed_job_dag = dag
+            # pylint: disable=protected-access
+            controller_task._metadata = metadata
+
+            job_identity = ''
+            if consolidation_mode_job_id is not None:
+                job_identity = f' (Job ID: {consolidation_mode_job_id})'
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        f'Launching managed job {dag.name!r}{job_identity} '
+                        f'from jobs controller...{colorama.Style.RESET_ALL}')
+
+            # Launch with the api server's user hash, so that sky status does
+            # not show the owner of the controller as whatever user launched
+            # it first.
+            with common.with_server_user():
+                # Always launch the controller in the default workspace.
+                with skypilot_config.local_active_workspace_ctx(
+                        skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+                    # TODO(zhwu): the buckets need to be correctly handled for
+                    # a specific workspace. For example, if a job is launched in
+                    # workspace A, but the controller is in workspace B, the
+                    # intermediate bucket and newly created bucket should be in
+                    # workspace A.
+                    if consolidation_mode_job_id is None:
+                        return execution.launch(task=controller_task,
                                                cluster_name=controller_name,
                                                stream_logs=stream_logs,
                                                retry_until_up=True,
                                                fast=True,
                                                _disable_controller_check=True)
+                    # Manually launch the scheduler in consolidation mode.
+                    local_handle = backend_utils.is_controller_accessible(
+                        controller=controller, stopped_message='')
+                    backend = backend_utils.get_backend_from_handle(
+                        local_handle)
+                    assert isinstance(backend, backends.CloudVmRayBackend)
+                    with sky_logging.silent():
+                        backend.sync_file_mounts(
+                            handle=local_handle,
+                            all_file_mounts=controller_task.file_mounts,
+                            storage_mounts=controller_task.storage_mounts)
+                    run_script = controller_task.run
+                    assert isinstance(run_script, str)
+                    # Manually add the env variables to the run script.
+                    # Originally this is done in ray jobs submission but now we
+                    # have to do it manually because there is no ray runtime on
+                    # the API server.
+                    env_cmds = [
+                        f'export {k}={v!r}'
+                        for k, v in controller_task.envs.items()
+                    ]
+                    run_script = '\n'.join(env_cmds + [run_script])
+                    # Dump script for high availability recovery.
+                    if controller_utils.high_availability_specified(
+                            controller_name):
+                        managed_job_state.set_ha_recovery_script(
+                            consolidation_mode_job_id, run_script)
+                    backend.run_on_head(local_handle, run_script)
+                    return consolidation_mode_job_id, local_handle
+
+    if consolidation_mode_job_ids is None:
+        return _submit_one()
+    if pool is None:
+        assert len(consolidation_mode_job_ids) == 1
+        return _submit_one(consolidation_mode_job_ids[0])
+    ids = []
+    all_handle = None
+    for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+        jid, handle = _submit_one(job_id, job_rank)
+        assert jid is not None, (job_id, handle)
+        ids.append(jid)
+        all_handle = handle
+    return ids, all_handle
 
 
 def queue_from_kubernetes_pod(
@@ -590,7 +646,8 @@ def queue(refresh: bool,
 def cancel(name: Optional[str] = None,
            job_ids: Optional[List[int]] = None,
            all: bool = False,
-           all_users: bool = False) -> None:
+           all_users: bool = False,
+           pool: Optional[str] = None) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancels managed jobs.
 
@@ -608,15 +665,19 @@ def cancel(name: Optional[str] = None,
         stopped_message='All managed jobs should have finished.')
 
     job_id_str = ','.join(map(str, job_ids))
-    if sum([bool(job_ids), name is not None, all or all_users]) != 1:
+    if sum([
+            bool(job_ids), name is not None, pool is not None, all or
+            all_users
+    ]) != 1:
         arguments = []
         arguments += [f'job_ids={job_id_str}'] if job_ids else []
         arguments += [f'name={name}'] if name is not None else []
+        arguments += [f'pool={pool}'] if pool is not None else []
         arguments += ['all'] if all else []
         arguments += ['all_users'] if all_users else []
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
-                'Can only specify one of JOB_IDS, name, or all/'
+                'Can only specify one of JOB_IDS, name, pool, or all/'
                 f'all_users. Provided {" ".join(arguments)!r}.')
 
     backend = backend_utils.get_backend_from_handle(handle)
@@ -629,9 +690,11 @@ def cancel(name: Optional[str] = None,
     elif job_ids:
         code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
             job_ids)
-    else:
-        assert name is not None, (job_ids, name, all)
+    elif name is not None:
         code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
+    else:
+        assert pool is not None, (job_ids, name, pool, all)
+        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(pool)
     # The stderr is redirected to stdout
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -751,3 +814,32 @@ def download_logs(
         job_name=name,
         controller=controller,
         local_dir=local_dir)
+
+
+@usage_lib.entrypoint
+def pool_apply(
+    task: 'sky.Task',
+    pool_name: str,
+    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+) -> None:
+    """Apply a config to a pool."""
+    return impl.apply(task, pool_name, mode, pool=True)
+
+
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]] = None,
+    all: bool = False,
+    purge: bool = False,
+) -> None:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+def pool_status(
+    pool_names: Optional[Union[str,
+                               List[str]]] = None,) -> List[Dict[str, Any]]:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
sky/jobs/server/server.py CHANGED
@@ -106,3 +106,43 @@ async def download_logs(
         if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+
+
+@router.post('/pool_apply')
+async def pool_apply(request: fastapi.Request,
+                     jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_apply',
+        request_body=jobs_pool_apply_body,
+        func=core.pool_apply,
+        schedule_type=api_requests.ScheduleType.LONG,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )
+
+
+@router.post('/pool_down')
+async def pool_down(request: fastapi.Request,
+                    jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_down',
+        request_body=jobs_pool_down_body,
+        func=core.pool_down,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )
+
+
+@router.post('/pool_status')
+async def pool_status(
+        request: fastapi.Request,
+        jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='jobs.pool_status',
+        request_body=jobs_pool_status_body,
+        func=core.pool_status,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )