skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -1,5 +1,6 @@
 """SDK functions for managed jobs."""
 import os
+import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -20,6 +21,7 @@ from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
 from sky.skylet import constants as skylet_constants
@@ -43,6 +45,72 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
+def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Maybe upload files to the controller.
+
+    In consolidation mode, we don't need to upload files to the controller as
+    the API server and the controller are colocated.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    if managed_job_utils.is_consolidation_mode():
+        return local_to_controller_file_mounts
+
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
+        dag.name,
+        workspace=skypilot_config.get_active_workspace(
+            force_user_workspace=True),
+        entrypoint=common_utils.get_current_command())
+    for task_id, task in enumerate(dag.tasks):
+        resources_str = backend_utils.get_task_resources_str(
+            task, is_managed_job=True)
+        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                      task.name, resources_str)
+    return consolidation_mode_job_id
+
+
 @timeline.event
 @usage_lib.entrypoint
 def launch(
@@ -77,6 +145,7 @@ def launch(
     entrypoint = task
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -86,6 +155,9 @@ def launch(
             raise ValueError('Only single-task or chain DAG is '
                              f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()
 
     user_dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
 
@@ -103,7 +175,7 @@ def launch(
                     'will be auto-generated) .')
             task_names.add(task_.name)
 
-        # Check for priority in resources
+        # Check for priority in resources
         task_priority = None
         if task_.resources:
             # Convert set to list to access elements by index
@@ -121,20 +193,6 @@ def launch(
                             f'{resource.priority} but expected {task_priority}.'
                         )
 
-            # Check for conflict between resources priority and job
-            # priority
-            if task_.job_priority is not None:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Task {task_.name!r}: Cannot specify both '
-                        f'resources.priority ({task_priority}) and '
-                        f'job.priority ({task_.job_priority}). Please use only '
-                        'one priority specification method.')
-
-            # Fall back to job priority if no resources priority found
-            if task_priority is None:
-                task_priority = task_.job_priority
-
         if task_priority is not None:
             if (priority is not None and priority != task_priority):
                 with ux_utils.print_exception_no_traceback():
@@ -183,34 +241,7 @@ def launch(
                     f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                     f'Reason: {common_utils.format_exception(e)}')
 
-
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
-
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
+    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
 
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
@@ -233,6 +264,13 @@ def launch(
             controller=controller,
             task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+        consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
+
+        # This is only needed for non-consolidation mode. For consolidation
+        # mode, the controller uses the same catalog as API server.
+        modified_catalogs = {} if consolidation_mode_job_id is not None else (
+            service_catalog_common.get_modified_catalog_file_mounts())
+
         vars_to_fill = {
             'remote_original_user_yaml_path': remote_original_user_yaml_path,
             'original_user_dag_path': original_user_yaml_path.name,
@@ -244,9 +282,9 @@ def launch(
             'dag_name': dag.name,
             'remote_user_config_path': remote_user_config_path,
             'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
+            'modified_catalogs': modified_catalogs,
             'priority': priority,
+            'consolidation_mode_job_id': consolidation_mode_job_id,
             **controller_utils.shared_controller_vars_to_fill(
                 controller,
                 remote_user_config_path=remote_user_config_path,
@@ -285,12 +323,44 @@ def launch(
         # workspace A, but the controller is in workspace B, the
        # intermediate bucket and newly created bucket should be in
        # workspace A.
-        return execution.launch(task=controller_task,
-                                cluster_name=controller_name,
-                                stream_logs=stream_logs,
-                                retry_until_up=True,
-                                fast=True,
-                                _disable_controller_check=True)
+        if consolidation_mode_job_id is None:
+            return execution.launch(task=controller_task,
+                                    cluster_name=controller_name,
+                                    stream_logs=stream_logs,
+                                    retry_until_up=True,
+                                    fast=True,
+                                    _disable_controller_check=True)
+        # Manually launch the scheduler process in consolidation mode.
+        local_handle = backend_utils.is_controller_accessible(
+            controller=controller, stopped_message='')
+        backend = backend_utils.get_backend_from_handle(local_handle)
+        assert isinstance(backend, backends.CloudVmRayBackend)
+        backend.sync_file_mounts(
+            handle=local_handle,
+            all_file_mounts=controller_task.file_mounts,
+            storage_mounts=controller_task.storage_mounts)
+        run_script = controller_task.run
+        assert isinstance(run_script, str)
+        # Manually add the env variables to the run script. Originally
+        # this is done in ray jobs submission but now we have to do it
+        # manually because there is no ray runtime on the API server.
+        env_cmds = [
+            f'export {k}={v!r}'
+            for k, v in controller_task.envs.items()
+        ]
+        run_script = '\n'.join(env_cmds + [run_script])
+        # Dump script for high availability recovery.
+        if controller_utils.high_availability_specified(
+                controller_name):
+            dump_script_path = (
+                managed_job_utils.get_ha_dump_script_path(
+                    consolidation_mode_job_id))
+            dump_script_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(dump_script_path, 'w',
+                      encoding='utf-8') as script_f:
+                script_f.write(run_script)
+        backend.run_on_head(local_handle, run_script)
+        return consolidation_mode_job_id, local_handle
 
 
 def queue_from_kubernetes_pod(
sky/jobs/state.py
CHANGED
@@ -463,6 +463,21 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
             entrypoint))
 
 
+@_init_db
+def set_job_info_without_job_id(name: str, workspace: str,
+                                entrypoint: str) -> int:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            """\
+            INSERT INTO job_info
+            (name, schedule_state, workspace, entrypoint)
+            VALUES (?, ?, ?, ?)""",
+            (name, ManagedJobScheduleState.INACTIVE.value, workspace,
+             entrypoint))
+        return cursor.lastrowid
+
+
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""
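The new `set_job_info_without_job_id` lets SQLite assign the primary key and returns it via `cursor.lastrowid`. A self-contained illustration of that pattern, using a minimal stand-in schema rather than the real `job_info` table:

```python
# Minimal stand-in for the job_info insert: let SQLite pick the job id and
# hand it back via cursor.lastrowid. Schema and values are illustrative.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE job_info ('
             'job_id INTEGER PRIMARY KEY AUTOINCREMENT, '
             'name TEXT, schedule_state TEXT, workspace TEXT, '
             'entrypoint TEXT)')
cursor = conn.execute(
    'INSERT INTO job_info (name, schedule_state, workspace, entrypoint) '
    'VALUES (?, ?, ?, ?)',
    ('demo', 'INACTIVE', 'default', 'sky jobs launch demo.yaml'))
print(cursor.lastrowid)  # -> 1, the id SQLite just assigned
```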
sky/jobs/utils.py
CHANGED
@@ -5,6 +5,7 @@ jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
 import collections
+import datetime
 import enum
 import os
 import pathlib
@@ -33,7 +34,10 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -124,6 +128,114 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             time.sleep(backoff.current_backoff())
 
 
+def _check_consolidation_mode_consistency(
+        current_is_consolidation_mode: bool) -> None:
+    """Check the consistency of the consolidation mode."""
+    # Check whether the consolidation mode config is changed.
+    if current_is_consolidation_mode:
+        controller_cn = (
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode is '
+                    f'enabled, but the controller cluster '
+                    f'{controller_cn} is still running. Please '
+                    'terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        all_jobs = managed_job_state.get_managed_jobs()
+        if all_jobs:
+            nonterminal_jobs = (
+                managed_job_state.get_nonterminal_job_ids_by_name(
+                    None, all_users=True))
+            if nonterminal_jobs:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.InconsistentConsolidationModeError(
+                        f'{colorama.Fore.RED}Consolidation mode '
+                        'is disabled, but there are still '
+                        f'{len(nonterminal_jobs)} managed jobs '
+                        'running. Please terminate those jobs '
+                        f'first.{colorama.Style.RESET_ALL}')
+            else:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are {len(all_jobs)} jobs from previous '
+                    'consolidation mode. Reset the `jobs.controller.'
+                    'consolidation_mode` to `true` and run `sky jobs queue` '
+                    'to see those jobs. Switching to normal mode will '
+                    f'lose the job history.{colorama.Style.RESET_ALL}')
+
+
+# Whether to use consolidation mode or not. When this is enabled, the managed
+# jobs controller will not be running on a separate cluster, but locally on the
+# API Server. Under the hood, we submit the job monitoring logic as processes
+# directly in the API Server.
+# Use LRU Cache so that the check is only done once.
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode() -> bool:
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+    _check_consolidation_mode_consistency(consolidation_mode)
+    return consolidation_mode
+
+
+def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
+    """Get the path to the HA dump script for a job."""
+    return pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser(
+    ).resolve() / f'sky_job_{job_id}'
+
+
+def ha_recovery_for_consolidation_mode():
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH, 'w',
              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for job in managed_job_state.get_managed_jobs():
+            job_id = job['job_id']
+            controller_pid = job['controller_pid']
+
+            # In consolidation mode, it is possible that only the API server
+            # process is restarted, and the controller process is not. In such
+            # case, we don't need to do anything and the controller process will
+            # just keep running.
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, job_id):
+                        f.write(f'Controller pid {controller_pid} for '
+                                f'job {job_id} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for job {job_id}\n')
+
+            if job['schedule_state'] not in [
+                    managed_job_state.ManagedJobScheduleState.DONE,
+                    managed_job_state.ManagedJobScheduleState.WAITING
+            ]:
+                dump_script_path = get_ha_dump_script_path(job_id)
+                if not dump_script_path.exists():
+                    f.write(f'Job {job_id}\'s recovery file ({dump_script_path}'
+                            ') does not exist. Skipping recovery. Job '
+                            f'schedule state: {job["schedule_state"]}\n')
+                    continue
+                with open(dump_script_path, 'r', encoding='utf-8') as script_f:
+                    script = script_f.read()
+                runner.run(script)
+                f.write(f'Job {job_id} (file: {dump_script_path}) completed '
+                        f'recovery at {datetime.datetime.now()}\n')
+        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
 def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
@@ -157,9 +269,8 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
         process = psutil.Process(pid)
-        # The last two args of the command line should be --job-id <job_id>
-        job_args = process.cmdline()[-2:]
-        return process.is_running() and job_args == ['--job-id', str(job_id)]
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running() and f'--job-id {job_id}' in cmd_str
     except psutil.NoSuchProcess:
         return False
 
@@ -1136,7 +1247,6 @@ def format_job_table(
         'TASK',
         *(['WORKSPACE'] if show_workspace else []),
         'NAME',
-        'PRIORITY',
         *user_cols,
         'REQUESTED',
         'SUBMITTED',
@@ -1208,7 +1318,6 @@ def format_job_table(
             submitted_at = None
             end_at: Optional[int] = 0
             recovery_cnt = 0
-            priority = job_tasks[0].get('priority', '-')
             managed_job_status, current_task_id = _get_job_status_from_tasks(
                 job_tasks)
             for task in job_tasks:
@@ -1244,7 +1353,6 @@ def format_job_table(
                 '',
                 *([''] if show_workspace else []),
                 job_name,
-                str(priority),
                 *user_values,
                 '-',
                 submitted,
@@ -1275,13 +1383,11 @@ def format_job_table(
             submitted = log_utils.readable_time_duration(task['submitted_at'])
             user_values = get_user_column_values(task)
             task_workspace = '-' if len(job_tasks) > 1 else workspace
-            priority = task.get('priority', '-')
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
                 *([task_workspace] if show_workspace else []),
                 task['task_name'],
-                str(priority),
                 *user_values,
                 task['resources'],
                 # SUBMITTED
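The `_controller_process_alive` change above swaps an exact match on the last two argv entries for a substring match over the joined command line, which tolerates extra trailing arguments. A runnable sketch of the new check, assuming `psutil` is installed:

```python
# The relaxed liveness check from the diff, wrapped so it can be run
# standalone. Assumes psutil is installed.
import os

import psutil


def controller_process_alive(pid: int, job_id: int) -> bool:
    """Return True iff `pid` is alive and its cmdline mentions the job id."""
    try:
        process = psutil.Process(pid)
        cmd_str = ' '.join(process.cmdline())
        return process.is_running() and f'--job-id {job_id}' in cmd_str
    except psutil.NoSuchProcess:
        return False


# This interpreter was not started with `--job-id 42`, so this prints False.
print(controller_process_alive(os.getpid(), 42))
```

One trade-off to note: a plain substring test also matches prefixes (`--job-id 4` is contained in `--job-id 42`), which the old exact match on the final two argv entries did not.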
sky/models.py
CHANGED
@@ -6,6 +6,8 @@ import getpass
 import os
 from typing import Any, Dict, Optional
 
+import pydantic
+
 from sky.skylet import constants
 from sky.utils import common_utils
 
@@ -48,6 +50,8 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+    # IP address of the node (external IP preferred, fallback to internal IP)
+    ip_address: Optional[str] = None
 
 
 @dataclasses.dataclass
@@ -76,3 +80,15 @@ class KubernetesNodesInfo:
             },
             hint=data['hint'],
         )
+
+
+class VolumeConfig(pydantic.BaseModel):
+    """Configuration for creating a volume."""
+    name: str
+    type: str
+    cloud: str
+    region: Optional[str]
+    zone: Optional[str]
+    name_on_cloud: str
+    size: Optional[str]
+    config: Dict[str, Any] = {}
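Since `VolumeConfig` is a plain pydantic model, it validates and serializes like any other. A quick usage sketch; the instance values are made up, and `model_dump` assumes pydantic v2:

```python
# Re-declaring the model from the diff so this snippet is self-contained;
# the field values below are illustrative only.
from typing import Any, Dict, Optional

import pydantic


class VolumeConfig(pydantic.BaseModel):
    """Configuration for creating a volume."""
    name: str
    type: str
    cloud: str
    region: Optional[str]
    zone: Optional[str]
    name_on_cloud: str
    size: Optional[str]
    config: Dict[str, Any] = {}


vol = VolumeConfig(name='training-data', type='pvc', cloud='kubernetes',
                   region=None, zone=None,
                   name_on_cloud='training-data-2ea4', size='100Gi',
                   config={'storage_class_name': 'standard'})
print(vol.model_dump())  # pydantic v2; use vol.dict() on v1
```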
sky/provision/__init__.py
CHANGED
@@ -8,6 +8,7 @@ import inspect
 import typing
 from typing import Any, Dict, List, Optional, Type
 
+from sky import models
 from sky import sky_logging
 # These provision.<cloud> modules should never fail even if underlying cloud SDK
 # dependencies are not installed. This is ensured by using sky.adaptors inside
@@ -103,6 +104,31 @@ def bootstrap_instances(
     raise NotImplementedError
 
 
+@_route_to_cloud_impl
+def apply_volume(provider_name: str,
+                 config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or register a volume.
+
+    This function creates or registers a volume with the provided configuration,
+    and returns a VolumeConfig object with updated configuration.
+    """
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def delete_volume(provider_name: str,
+                  config: models.VolumeConfig) -> models.VolumeConfig:
+    """Delete a volume."""
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def get_volume_usedby(provider_name: str,
+                      config: models.VolumeConfig) -> List[str]:
+    """Get the usedby of a volume."""
+    raise NotImplementedError
+
+
 @_route_to_cloud_impl
 def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
sky/provision/kubernetes/__init__.py
CHANGED
@@ -11,3 +11,6 @@ from sky.provision.kubernetes.instance import wait_instances
 from sky.provision.kubernetes.network import cleanup_ports
 from sky.provision.kubernetes.network import open_ports
 from sky.provision.kubernetes.network import query_ports
+from sky.provision.kubernetes.volume import apply_volume
+from sky.provision.kubernetes.volume import delete_volume
+from sky.provision.kubernetes.volume import get_volume_usedby