skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -1,5 +1,6 @@
 """SDK functions for managed jobs."""
 import os
+import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -20,6 +21,7 @@ from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
 from sky.skylet import constants as skylet_constants
@@ -43,6 +45,72 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
+def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Maybe upload files to the controller.
+
+    In consolidation mode, we don't need to upload files to the controller as
+    the API server and the controller are colocated.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    if managed_job_utils.is_consolidation_mode():
+        return local_to_controller_file_mounts
+
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job
+    submission. For consolidation mode, we need to manually submit it. Check
+    the following function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
+        dag.name,
+        workspace=skypilot_config.get_active_workspace(
+            force_user_workspace=True),
+        entrypoint=common_utils.get_current_command())
+    for task_id, task in enumerate(dag.tasks):
+        resources_str = backend_utils.get_task_resources_str(
+            task, is_managed_job=True)
+        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                      task.name, resources_str)
+    return consolidation_mode_job_id
+
+
 @timeline.event
 @usage_lib.entrypoint
 def launch(
@@ -77,6 +145,7 @@ def launch(
     entrypoint = task
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -86,6 +155,9 @@ def launch(
            raise ValueError('Only single-task or chain DAG is '
                             f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()
 
     user_dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
 
@@ -103,7 +175,7 @@ def launch(
                    'will be auto-generated) .')
        task_names.add(task_.name)
 
-        # Check for priority in resources first, then fall back to job priority
+        # Check for priority in resources
        task_priority = None
        if task_.resources:
            # Convert set to list to access elements by index
@@ -121,20 +193,6 @@ def launch(
                        f'{resource.priority} but expected {task_priority}.'
                    )
 
-        # Check for conflict between resources priority and job
-        # priority
-        if task_.job_priority is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    f'Task {task_.name!r}: Cannot specify both '
-                    f'resources.priority ({task_priority}) and '
-                    f'job.priority ({task_.job_priority}). Please use only '
-                    'one priority specification method.')
-
-        # Fall back to job priority if no resources priority found
-        if task_priority is None:
-            task_priority = task_.job_priority
-
        if task_priority is not None:
            if (priority is not None and priority != task_priority):
                with ux_utils.print_exception_no_traceback():
@@ -183,34 +241,7 @@ def launch(
                f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                f'Reason: {common_utils.format_exception(e)}')
 
-    local_to_controller_file_mounts = {}
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
-
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
+    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
 
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
@@ -233,6 +264,13 @@ def launch(
            controller=controller,
            task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+        consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
+
+        # This is only needed for non-consolidation mode. For consolidation
+        # mode, the controller uses the same catalog as API server.
+        modified_catalogs = {} if consolidation_mode_job_id is not None else (
+            service_catalog_common.get_modified_catalog_file_mounts())
+
        vars_to_fill = {
            'remote_original_user_yaml_path': remote_original_user_yaml_path,
            'original_user_dag_path': original_user_yaml_path.name,
@@ -244,9 +282,9 @@ def launch(
            'dag_name': dag.name,
            'remote_user_config_path': remote_user_config_path,
            'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
+            'modified_catalogs': modified_catalogs,
            'priority': priority,
+            'consolidation_mode_job_id': consolidation_mode_job_id,
            **controller_utils.shared_controller_vars_to_fill(
                controller,
                remote_user_config_path=remote_user_config_path,
@@ -285,12 +323,44 @@ def launch(
            # workspace A, but the controller is in workspace B, the
            # intermediate bucket and newly created bucket should be in
            # workspace A.
-            return execution.launch(task=controller_task,
-                                    cluster_name=controller_name,
-                                    stream_logs=stream_logs,
-                                    retry_until_up=True,
-                                    fast=True,
-                                    _disable_controller_check=True)
+            if consolidation_mode_job_id is None:
+                return execution.launch(task=controller_task,
+                                        cluster_name=controller_name,
+                                        stream_logs=stream_logs,
+                                        retry_until_up=True,
+                                        fast=True,
+                                        _disable_controller_check=True)
+            # Manually launch the scheduler process in consolidation mode.
+            local_handle = backend_utils.is_controller_accessible(
+                controller=controller, stopped_message='')
+            backend = backend_utils.get_backend_from_handle(local_handle)
+            assert isinstance(backend, backends.CloudVmRayBackend)
+            backend.sync_file_mounts(
+                handle=local_handle,
+                all_file_mounts=controller_task.file_mounts,
+                storage_mounts=controller_task.storage_mounts)
+            run_script = controller_task.run
+            assert isinstance(run_script, str)
+            # Manually add the env variables to the run script. Originally
+            # this is done in ray jobs submission but now we have to do it
+            # manually because there is no ray runtime on the API server.
+            env_cmds = [
+                f'export {k}={v!r}'
+                for k, v in controller_task.envs.items()
+            ]
+            run_script = '\n'.join(env_cmds + [run_script])
+            # Dump script for high availability recovery.
+            if controller_utils.high_availability_specified(
+                    controller_name):
+                dump_script_path = (
+                    managed_job_utils.get_ha_dump_script_path(
+                        consolidation_mode_job_id))
+                dump_script_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(dump_script_path, 'w',
+                          encoding='utf-8') as script_f:
+                    script_f.write(run_script)
+            backend.run_on_head(local_handle, run_script)
+            return consolidation_mode_job_id, local_handle
 
 
 def queue_from_kubernetes_pod(
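In the consolidation-mode branch above, environment variables are prepended to the controller's run script by hand, since there is no Ray runtime on the API server to inject them. A minimal sketch of that string assembly (build_run_script and the sample values are illustrative, not SkyPilot APIs):

# Hypothetical helper mirroring the inline env_cmds logic in launch() above.
def build_run_script(envs: dict, run: str) -> str:
    # Prepend each env var as an `export` line, then append the run script.
    env_cmds = [f'export {k}={v!r}' for k, v in envs.items()]
    return '\n'.join(env_cmds + [run])

# Illustrative values only.
print(build_run_script({'SKYPILOT_USER_ID': 'abc123'}, 'echo "start controller"'))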
sky/jobs/state.py CHANGED
@@ -463,6 +463,21 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
             entrypoint))
 
 
+@_init_db
+def set_job_info_without_job_id(name: str, workspace: str,
+                                entrypoint: str) -> int:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            """\
+            INSERT INTO job_info
+            (name, schedule_state, workspace, entrypoint)
+            VALUES (?, ?, ?, ?)""",
+            (name, ManagedJobScheduleState.INACTIVE.value, workspace,
+             entrypoint))
+        return cursor.lastrowid
+
+
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""
sky/jobs/utils.py CHANGED
@@ -5,6 +5,7 @@ jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
 import collections
+import datetime
 import enum
 import os
 import pathlib
@@ -33,7 +34,10 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -124,6 +128,114 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             time.sleep(backoff.current_backoff())
 
 
+def _check_consolidation_mode_consistency(
+        current_is_consolidation_mode: bool) -> None:
+    """Check the consistency of the consolidation mode."""
+    # Check whether the consolidation mode config is changed.
+    if current_is_consolidation_mode:
+        controller_cn = (
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode is '
+                    f'enabled, but the controller cluster '
+                    f'{controller_cn} is still running. Please '
+                    'terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        all_jobs = managed_job_state.get_managed_jobs()
+        if all_jobs:
+            nonterminal_jobs = (
+                managed_job_state.get_nonterminal_job_ids_by_name(
+                    None, all_users=True))
+            if nonterminal_jobs:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.InconsistentConsolidationModeError(
+                        f'{colorama.Fore.RED}Consolidation mode '
+                        'is disabled, but there are still '
+                        f'{len(nonterminal_jobs)} managed jobs '
+                        'running. Please terminate those jobs '
+                        f'first.{colorama.Style.RESET_ALL}')
+            else:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are {len(all_jobs)} jobs from previous '
+                    'consolidation mode. Reset the `jobs.controller.'
+                    'consolidation_mode` to `true` and run `sky jobs queue` '
+                    'to see those jobs. Switching to normal mode will '
+                    f'lose the job history.{colorama.Style.RESET_ALL}')
+
+
+# Whether to use consolidation mode or not. When this is enabled, the managed
+# jobs controller will not be running on a separate cluster, but locally on the
+# API Server. Under the hood, we submit the job monitoring logic as processes
+# directly in the API Server.
+# Use LRU Cache so that the check is only done once.
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode() -> bool:
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+    _check_consolidation_mode_consistency(consolidation_mode)
+    return consolidation_mode
+
+
+def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
+    """Get the path to the HA dump script for a job."""
+    return pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser(
+    ).resolve() / f'sky_job_{job_id}'
+
+
+def ha_recovery_for_consolidation_mode():
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH, 'w',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for job in managed_job_state.get_managed_jobs():
+            job_id = job['job_id']
+            controller_pid = job['controller_pid']
+
+            # In consolidation mode, it is possible that only the API server
+            # process is restarted, and the controller process is not. In such
+            # case, we don't need to do anything and the controller process will
+            # just keep running.
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, job_id):
+                        f.write(f'Controller pid {controller_pid} for '
+                                f'job {job_id} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for job {job_id}\n')
+
+            if job['schedule_state'] not in [
+                    managed_job_state.ManagedJobScheduleState.DONE,
+                    managed_job_state.ManagedJobScheduleState.WAITING
+            ]:
+                dump_script_path = get_ha_dump_script_path(job_id)
+                if not dump_script_path.exists():
+                    f.write(f'Job {job_id}\'s recovery file ({dump_script_path}'
                            ') does not exist. Skipping recovery. Job '
                            f'schedule state: {job["schedule_state"]}\n')
+                    continue
+                with open(dump_script_path, 'r', encoding='utf-8') as script_f:
+                    script = script_f.read()
+                runner.run(script)
+                f.write(f'Job {job_id} (file: {dump_script_path}) completed '
+                        f'recovery at {datetime.datetime.now()}\n')
        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
        f.write(f'Total recovery time: {time.time() - start} seconds\n')
 
 
 def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
@@ -157,9 +269,8 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
         process = psutil.Process(pid)
-        # The last two args of the command line should be --job-id <id>
-        job_args = process.cmdline()[-2:]
-        return process.is_running() and job_args == ['--job-id', str(job_id)]
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running() and f'--job-id {job_id}' in cmd_str
     except psutil.NoSuchProcess:
         return False
 
@@ -1136,7 +1247,6 @@ def format_job_table(
        'TASK',
        *(['WORKSPACE'] if show_workspace else []),
        'NAME',
-        'PRIORITY',
        *user_cols,
        'REQUESTED',
        'SUBMITTED',
@@ -1208,7 +1318,6 @@ def format_job_table(
        submitted_at = None
        end_at: Optional[int] = 0
        recovery_cnt = 0
-        priority = job_tasks[0].get('priority', '-')
        managed_job_status, current_task_id = _get_job_status_from_tasks(
            job_tasks)
        for task in job_tasks:
@@ -1244,7 +1353,6 @@ def format_job_table(
            '',
            *([''] if show_workspace else []),
            job_name,
-            str(priority),
            *user_values,
            '-',
            submitted,
@@ -1275,13 +1383,11 @@ def format_job_table(
            submitted = log_utils.readable_time_duration(task['submitted_at'])
            user_values = get_user_column_values(task)
            task_workspace = '-' if len(job_tasks) > 1 else workspace
-            priority = task.get('priority', '-')
            values = [
                task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                task['task_id'] if len(job_tasks) > 1 else '-',
                *([task_workspace] if show_workspace else []),
                task['task_name'],
-                str(priority),
                *user_values,
                task['resources'],
                # SUBMITTED
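The _controller_process_alive change above relaxes the liveness check from matching the last two command-line arguments exactly to a substring match over the whole joined cmdline, presumably because `--job-id <id>` is no longer guaranteed to be the final arguments. A small self-contained comparison of the two checks (the cmdline is a made-up example):

# Made-up cmdline; with extra args after --job-id, only the new check matches.
cmdline = ['python', '-u', 'controller.py', '--job-id', '42', '--dag-yaml', 'dag.yaml']
job_id = 42

old_match = cmdline[-2:] == ['--job-id', str(job_id)]   # False: extra args follow
new_match = f'--job-id {job_id}' in ' '.join(cmdline)   # True
print(old_match, new_match)

The substring form is more permissive than exact argument matching.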
sky/models.py CHANGED
@@ -6,6 +6,8 @@ import getpass
 import os
 from typing import Any, Dict, Optional
 
+import pydantic
+
 from sky.skylet import constants
 from sky.utils import common_utils
 
@@ -48,6 +50,8 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+    # IP address of the node (external IP preferred, fallback to internal IP)
+    ip_address: Optional[str] = None
 
 
 @dataclasses.dataclass
@@ -76,3 +80,15 @@ class KubernetesNodesInfo:
             },
             hint=data['hint'],
         )
+
+
+class VolumeConfig(pydantic.BaseModel):
+    """Configuration for creating a volume."""
+    name: str
+    type: str
+    cloud: str
+    region: Optional[str]
+    zone: Optional[str]
+    name_on_cloud: str
+    size: Optional[str]
+    config: Dict[str, Any] = {}
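Since VolumeConfig is a plain pydantic model, constructing one directly is straightforward. A hedged usage sketch (the field values, the 'k8s-pvc' type string, and the config keys are illustrative, not confirmed SkyPilot defaults):

from sky import models

vol = models.VolumeConfig(
    name='training-data',
    type='k8s-pvc',                       # illustrative volume type
    cloud='kubernetes',
    region=None,
    zone=None,
    name_on_cloud='training-data-abc123',
    size='100Gi',                         # illustrative size string
    config={'storage_class_name': 'standard'},  # illustrative extra config
)
# Works on both pydantic v1 and v2.
print(vol.model_dump() if hasattr(vol, 'model_dump') else vol.dict())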
sky/provision/__init__.py CHANGED
@@ -8,6 +8,7 @@ import inspect
 import typing
 from typing import Any, Dict, List, Optional, Type
 
+from sky import models
 from sky import sky_logging
 # These provision.<cloud> modules should never fail even if underlying cloud SDK
 # dependencies are not installed. This is ensured by using sky.adaptors inside
@@ -103,6 +104,31 @@ def bootstrap_instances(
     raise NotImplementedError
 
 
+@_route_to_cloud_impl
+def apply_volume(provider_name: str,
+                 config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or register a volume.
+
+    This function creates or registers a volume with the provided configuration,
+    and returns a VolumeConfig object with updated configuration.
+    """
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def delete_volume(provider_name: str,
+                  config: models.VolumeConfig) -> models.VolumeConfig:
+    """Delete a volume."""
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def get_volume_usedby(provider_name: str,
+                      config: models.VolumeConfig) -> List[str]:
+    """Get the usedby of a volume."""
+    raise NotImplementedError
+
+
 @_route_to_cloud_impl
 def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
sky/provision/kubernetes/__init__.py CHANGED
@@ -11,3 +11,6 @@ from sky.provision.kubernetes.instance import wait_instances
 from sky.provision.kubernetes.network import cleanup_ports
 from sky.provision.kubernetes.network import open_ports
 from sky.provision.kubernetes.network import query_ports
+from sky.provision.kubernetes.volume import apply_volume
+from sky.provision.kubernetes.volume import delete_volume
+from sky.provision.kubernetes.volume import get_volume_usedby
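The three new provision-level entry points are routed to per-cloud implementations by _route_to_cloud_impl, and the Kubernetes implementations are re-exported above, so a caller would be expected to dispatch by provider name. A sketch under that assumption (the VolumeConfig values are illustrative, and actually running it requires a reachable Kubernetes cluster):

from sky import models
from sky import provision

vol = models.VolumeConfig(name='training-data', type='k8s-pvc',
                          cloud='kubernetes', region=None, zone=None,
                          name_on_cloud='training-data-abc123', size='100Gi',
                          config={})

vol = provision.apply_volume('kubernetes', vol)        # create or register
print(provision.get_volume_usedby('kubernetes', vol))  # e.g. [] when unused
# provision.delete_volume('kubernetes', vol)           # clean up when done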