skypilot-nightly 1.0.0.dev20250621__py3-none-any.whl → 1.0.0.dev20250624__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +12 -0
- sky/backends/cloud_vm_ray_backend.py +36 -13
- sky/client/cli/command.py +42 -21
- sky/client/sdk.py +12 -6
- sky/clouds/kubernetes.py +1 -0
- sky/core.py +88 -15
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-ce7991c156584b06.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-ce31493da9747ef4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-cf490d1fa38f3740.js → [job]-171c27f4ca94861c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-ecc5a7003776cfa7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +1 -0
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +3 -0
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -0
- sky/global_user_state.py +134 -20
- sky/jobs/client/sdk.py +0 -1
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +117 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/resources.py +1 -1
- sky/server/requests/payloads.py +6 -3
- sky/server/requests/requests.py +24 -1
- sky/server/server.py +4 -3
- sky/skylet/constants.py +5 -11
- sky/task.py +1 -26
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/schemas.py +34 -35
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/RECORD +78 -77
- sky/dashboard/out/_next/static/PZWXta2b3IpViuIKI97hg/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-bde186946d353355.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-56412c7976b4655b.js} +0 -0
- /sky/dashboard/out/_next/static/{PZWXta2b3IpViuIKI97hg → zsALxITkbP8J8NVwSDwMo}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250621.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -5,6 +5,7 @@ jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
 import collections
+import datetime
 import enum
 import os
 import pathlib
@@ -33,7 +34,10 @@ from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -124,6 +128,114 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             time.sleep(backoff.current_backoff())
 
 
+def _check_consolidation_mode_consistency(
+        current_is_consolidation_mode: bool) -> None:
+    """Check the consistency of the consolidation mode."""
+    # Check whether the consolidation mode config is changed.
+    if current_is_consolidation_mode:
+        controller_cn = (
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode is '
+                    f'enabled, but the controller cluster '
+                    f'{controller_cn} is still running. Please '
+                    'terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        all_jobs = managed_job_state.get_managed_jobs()
+        if all_jobs:
+            nonterminal_jobs = (
+                managed_job_state.get_nonterminal_job_ids_by_name(
+                    None, all_users=True))
+            if nonterminal_jobs:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.InconsistentConsolidationModeError(
+                        f'{colorama.Fore.RED}Consolidation mode '
+                        'is disabled, but there are still '
+                        f'{len(nonterminal_jobs)} managed jobs '
+                        'running. Please terminate those jobs '
+                        f'first.{colorama.Style.RESET_ALL}')
+            else:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are {len(all_jobs)} jobs from previous '
+                    'consolidation mode. Reset the `jobs.controller.'
+                    'consolidation_mode` to `true` and run `sky jobs queue` '
+                    'to see those jobs. Switching to normal mode will '
+                    f'lose the job history.{colorama.Style.RESET_ALL}')
+
+
+# Whether to use consolidation mode or not. When this is enabled, the managed
+# jobs controller will not be running on a separate cluster, but locally on the
+# API Server. Under the hood, we submit the job monitoring logic as processes
+# directly in the API Server.
+# Use LRU Cache so that the check is only done once.
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode() -> bool:
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+    _check_consolidation_mode_consistency(consolidation_mode)
+    return consolidation_mode
+
+
+def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
+    """Get the path to the HA dump script for a job."""
+    return pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser(
+    ).resolve() / f'sky_job_{job_id}'
+
+
+def ha_recovery_for_consolidation_mode():
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH, 'w',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for job in managed_job_state.get_managed_jobs():
+            job_id = job['job_id']
+            controller_pid = job['controller_pid']
+
+            # In consolidation mode, it is possible that only the API server
+            # process is restarted, and the controller process is not. In such
+            # case, we don't need to do anything and the controller process will
+            # just keep running.
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, job_id):
+                        f.write(f'Controller pid {controller_pid} for '
                                f'job {job_id} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for job {job_id}\n')
+
+            if job['schedule_state'] not in [
+                    managed_job_state.ManagedJobScheduleState.DONE,
+                    managed_job_state.ManagedJobScheduleState.WAITING
+            ]:
+                dump_script_path = get_ha_dump_script_path(job_id)
+                if not dump_script_path.exists():
+                    f.write(f'Job {job_id}\'s recovery file ({dump_script_path}'
+                            ') does not exist. Skipping recovery. Job '
+                            f'schedule state: {job["schedule_state"]}\n')
+                    continue
+                with open(dump_script_path, 'r', encoding='utf-8') as script_f:
+                    script = script_f.read()
+                    runner.run(script)
+                f.write(f'Job {job_id} (file: {dump_script_path}) completed '
+                        f'recovery at {datetime.datetime.now()}\n')
+        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
 def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
@@ -157,9 +269,8 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
     """Check if the controller process is alive."""
     try:
         process = psutil.Process(pid)
-
-
-        return process.is_running() and job_args == ['--job-id', str(job_id)]
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running() and f'--job-id {job_id}' in cmd_str
     except psutil.NoSuchProcess:
         return False
 
@@ -1136,7 +1247,6 @@ def format_job_table(
         'TASK',
         *(['WORKSPACE'] if show_workspace else []),
         'NAME',
-        'PRIORITY',
         *user_cols,
         'REQUESTED',
         'SUBMITTED',
@@ -1208,7 +1318,6 @@
             submitted_at = None
             end_at: Optional[int] = 0
             recovery_cnt = 0
-            priority = job_tasks[0].get('priority', '-')
             managed_job_status, current_task_id = _get_job_status_from_tasks(
                 job_tasks)
             for task in job_tasks:
@@ -1244,7 +1353,6 @@
                 '',
                 *([''] if show_workspace else []),
                 job_name,
-                str(priority),
                 *user_values,
                 '-',
                 submitted,
@@ -1275,13 +1383,11 @@
             submitted = log_utils.readable_time_duration(task['submitted_at'])
             user_values = get_user_column_values(task)
             task_workspace = '-' if len(job_tasks) > 1 else workspace
-            priority = task.get('priority', '-')
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
                 *([task_workspace] if show_workspace else []),
                 task['task_name'],
-                str(priority),
                 *user_values,
                 task['resources'],
                 # SUBMITTED
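The rewritten `_controller_process_alive` matches on the controller's full command line rather than a parsed argument list. A minimal standalone sketch of that check, assuming only that `psutil` is installed and that the controller was launched with a literal `--job-id <id>` argument:

import psutil


def controller_process_alive(pid: int, job_id: int) -> bool:
    """Return True if pid looks like the live controller for job_id."""
    try:
        process = psutil.Process(pid)
        # PIDs can be reused after a restart, so also require that the
        # process command line mentions the expected job id.
        cmd_str = ' '.join(process.cmdline())
        return process.is_running() and f'--job-id {job_id}' in cmd_str
    except psutil.NoSuchProcess:
        return False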
sky/resources.py
CHANGED
@@ -2296,7 +2296,7 @@ def parse_time_minutes(time: str) -> int:
 def parse_memory_resource(resource_qty_str: Union[str, int, float],
                           field_name: str,
                           ret_type: type = int,
-                          unit: str = '
+                          unit: str = 'gb',
                           allow_plus: bool = False,
                           allow_x: bool = False,
                           allow_rounding: bool = False) -> str:
sky/server/requests/payloads.py
CHANGED
@@ -5,7 +5,6 @@ kwargs for the payloads, otherwise, we have to keep the default values the sync
 with the backend functions. The benefit of having the default values in the
 payloads is that a user can find the default values in the Restful API docs.
 """
-import getpass
 import os
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -58,8 +57,7 @@ def request_body_env_vars() -> dict:
         if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
             env_vars[env_var] = os.environ[env_var]
     env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
-    env_vars[constants.USER_ENV_VAR] = (
-        getpass.getuser())
+    env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
     env_vars[
         usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
     # Remove the path to config file, as the config content is included in the
@@ -613,3 +611,8 @@ class UpdateConfigBody(RequestBody):
 class GetConfigBody(RequestBody):
     """The request body for getting the entire SkyPilot configuration."""
     pass
+
+
+class CostReportBody(RequestBody):
+    """The request body for the cost report endpoint."""
+    days: Optional[int] = 30
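Since `CostReportBody` follows the same pydantic `RequestBody` pattern as the other payloads, the new field is set like any other model field; a minimal sketch, assuming the inherited `RequestBody` fields all have defaults:

from sky.server.requests import payloads

body = payloads.CostReportBody()           # days defaults to 30
last_week = payloads.CostReportBody(days=7)
print(body.days, last_week.days)           # 30 7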
sky/server/requests/requests.py
CHANGED
@@ -327,6 +327,26 @@ def refresh_cluster_status_event():
         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+def managed_job_status_refresh_event():
+    """Refresh the managed job status for controller consolidation mode."""
+    # pylint: disable=import-outside-toplevel
+    from sky.jobs import utils as managed_job_utils
+    if not managed_job_utils.is_consolidation_mode():
+        return
+    # We run the recovery logic before starting the event loop as those two are
+    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
+    from sky.utils import controller_utils
+    if controller_utils.high_availability_specified(
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
+        managed_job_utils.ha_recovery_for_consolidation_mode()
+    # After recovery, we start the event loop.
+    from sky.skylet import events
+    event = events.ManagedJobEvent()
+    while True:
+        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
+        event.run()
+
+
 @dataclasses.dataclass
 class InternalRequestDaemon:
     id: str
@@ -341,7 +361,10 @@ INTERNAL_REQUEST_DAEMONS = [
     # cluster being stopped or down when `sky status -r` is called.
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                           name='status',
-                          event_fn=refresh_cluster_status_event)
+                          event_fn=refresh_cluster_status_event),
+    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
+                          name='managed-job-status',
+                          event_fn=managed_job_status_refresh_event),
 ]
 
 
sky/server/server.py
CHANGED
@@ -1044,13 +1044,14 @@ async def download(download_body: payloads.DownloadBody) -> None:
             detail=f'Error creating zip file: {str(e)}')
 
 
-@app.
-async def cost_report(request: fastapi.Request
+@app.post('/cost_report')
+async def cost_report(request: fastapi.Request,
+                      cost_report_body: payloads.CostReportBody) -> None:
     """Gets the cost report of a cluster."""
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='cost_report',
-        request_body=
+        request_body=cost_report_body,
         func=core.cost_report,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
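The handler now receives a typed body that FastAPI validates before the request is scheduled. A self-contained sketch of the same FastAPI + pydantic pattern, using illustrative names rather than SkyPilot's actual app object or executor:

from typing import Optional

import fastapi
import pydantic

app = fastapi.FastAPI()


class CostReportBody(pydantic.BaseModel):
    days: Optional[int] = 30


@app.post('/cost_report')
async def cost_report(body: CostReportBody) -> None:
    # `days` is already defaulted and validated by pydantic when we get here.
    print(f'Generating cost report for the last {body.days} days')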
sky/skylet/constants.py
CHANGED
@@ -401,6 +401,8 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
 PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
     '~/.sky/.controller_recovery_restarting_signal')
 
+HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
+
 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
 LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
@@ -436,30 +438,20 @@ LOGGING_CONFIG_DIR = '~/.sky/logging'
 
 # Resources constants
 TIME_UNITS = {
-    's': 1 / 60,
-    'sec': 1 / 60,
     'm': 1,
-    'min': 1,
     'h': 60,
-    'hr': 60,
     'd': 24 * 60,
-    '
+    'w': 7 * 24 * 60,
 }
 
 TIME_PATTERN: str = (
     f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
 
 MEMORY_SIZE_UNITS = {
-    'b': 1,
-    'k': 2**10,
     'kb': 2**10,
-    'm': 2**20,
     'mb': 2**20,
-    'g': 2**30,
     'gb': 2**30,
-    't': 2**40,
     'tb': 2**40,
-    'p': 2**50,
     'pb': 2**50,
 }
 
@@ -472,3 +464,5 @@ MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
 MIN_PRIORITY = -1000
 MAX_PRIORITY = 1000
 DEFAULT_PRIORITY = 0
+
+COST_REPORT_DEFAULT_DAYS = 30
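With only the longer unit spellings left in the tables, a conversion stays a single dictionary lookup. An illustrative sketch using the values above; the helper names are not SkyPilot's:

TIME_UNITS = {'m': 1, 'h': 60, 'd': 24 * 60, 'w': 7 * 24 * 60}
MEMORY_SIZE_UNITS = {'kb': 2**10, 'mb': 2**20, 'gb': 2**30, 'tb': 2**40, 'pb': 2**50}


def time_to_minutes(qty: int, unit: str = 'm') -> int:
    # TIME_UNITS values are minutes per unit.
    return qty * TIME_UNITS[unit]


def memory_to_bytes(qty: float, unit: str = 'gb') -> int:
    # MEMORY_SIZE_UNITS values are bytes per unit.
    return int(qty * MEMORY_SIZE_UNITS[unit])


assert time_to_minutes(2, 'h') == 120
assert memory_to_bytes(1, 'gb') == 2**30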
sky/task.py
CHANGED
@@ -342,8 +342,7 @@ class Task:
         self.resources: Union[List[sky.Resources],
                               Set[sky.Resources]] = {sky.Resources()}
         self._service: Optional[service_spec.SkyServiceSpec] = None
-
-        self._job_priority: Optional[int] = None
+
         # Resources that this task cannot run on.
         self.blocked_resources = blocked_resources
 
@@ -712,10 +711,6 @@
             service = service_spec.SkyServiceSpec.from_yaml_config(service)
             task.set_service(service)
 
-        job = config.pop('job', None)
-        if job is not None and 'priority' in job:
-            task.set_job_priority(job['priority'])
-
         assert not config, f'Invalid task args: {config.keys()}'
         return task
 
@@ -976,23 +971,6 @@
         self._service = service
         return self
 
-    @property
-    def job_priority(self) -> Optional[int]:
-        """The priority of the managed job running this task."""
-        return self._job_priority
-
-    def set_job_priority(self, priority: int) -> 'Task':
-        """Sets the job priority for this task.
-
-        Args:
-            priority: an integer between 0 and 1000.
-
-        Returns:
-            self: The current task, with job priority set.
-        """
-        self._job_priority = priority
-        return self
-
     def set_time_estimator(self, func: Callable[['sky.Resources'],
                                                 int]) -> 'Task':
         """Sets a func mapping resources to estimated time (secs).
@@ -1436,9 +1414,6 @@
         if self.service is not None:
             add_if_not_none('service', self.service.to_yaml_config())
 
-        if self.job_priority is not None:
-            add_if_not_none('job', {'priority': self.job_priority})
-
         add_if_not_none('num_nodes', self.num_nodes)
 
         if self.inputs is not None:
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -31,7 +31,9 @@ setup: |
   {% endif %}
 
 run: |
+  {%- if consolidation_mode_job_id is none %}
   {{ sky_activate_python_env }}
+  {%- endif %}
 
   # Write env vars to a file
   {%- for env_name, env_value in controller_envs.items() %}
@@ -42,9 +44,18 @@ run: |
   # Note: The job is already in the `spot` table, marked as PENDING.
   # CloudVmRayBackend._exec_code_on_head() calls
   # managed_job_codegen.set_pending() before we get here.
-
+  {%- if consolidation_mode_job_id is not none %}
+  {{sky_python_cmd}} \
+  {%- else %}
+  python \
+  {%- endif %}
+    -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
     --user-yaml-path {{remote_original_user_yaml_path}} \
+    {%- if consolidation_mode_job_id is not none %}
+    --job-id {{consolidation_mode_job_id}} \
+    {%- else %}
     --job-id $SKYPILOT_INTERNAL_JOB_ID \
+    {%- endif %}
     --env-file {{remote_env_file_path}} \
     --priority {{priority}}
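A small sketch of the branching this template change introduces: in consolidation mode the scheduler is launched with the API server's Python and an explicit job id, otherwise with the controller's `python` and the `$SKYPILOT_INTERNAL_JOB_ID` environment variable. The snippet below only mirrors that branch with illustrative values; it is not the real template rendering call:

import jinja2

snippet = jinja2.Template(
    '{%- if consolidation_mode_job_id is not none %}'
    '{{ sky_python_cmd }} -u -m sky.jobs.scheduler --job-id {{ consolidation_mode_job_id }}'
    '{%- else %}'
    'python -u -m sky.jobs.scheduler --job-id $SKYPILOT_INTERNAL_JOB_ID'
    '{%- endif %}')

# Consolidation mode: uses the API server's interpreter and a concrete job id.
print(snippet.render(consolidation_mode_job_id=42, sky_python_cmd='python3'))
# Normal mode: falls back to the controller VM's python and the env variable.
print(snippet.render(consolidation_mode_job_id=None))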
|
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -641,7 +641,7 @@ available_node_types:
           {% if high_availability %}
           mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
           if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
-            SKYPILOT_HA_RECOVERY_LOG="
+            SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
            echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
            start_time=$SECONDS
            retry_count=0
sky/utils/admin_policy_utils.py
CHANGED
@@ -140,13 +140,17 @@ def apply(
         at_client_side)
     try:
         mutated_user_request = policy.apply(user_request)
+    # Avoid duplicate exception wrapping.
+    except exceptions.UserRequestRejectedByPolicy as e:
+        with ux_utils.print_exception_no_traceback():
+            raise e
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.UserRequestRejectedByPolicy(
                 f'{colorama.Fore.RED}User request rejected by policy '
                 f'{policy!r}{colorama.Fore.RESET}: '
                 f'{common_utils.format_exception(e, use_bracket=True)}'
-            ) from
+            ) from None
     if mutated_config is None:
         mutated_config = mutated_user_request.skypilot_config
     else:
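The intent of the added branch is that a policy's own rejection propagates unchanged, while only unexpected errors get wrapped. A minimal sketch of that exception-handling pattern, with stand-in names rather than the real SkyPilot classes:

class UserRequestRejectedByPolicy(Exception):
    pass


def apply_policy(policy, user_request):
    try:
        return policy.apply(user_request)
    except UserRequestRejectedByPolicy:
        # Already the right exception type: re-raise as-is so the rejection
        # message is not wrapped (and shown to the user) twice.
        raise
    except Exception as e:  # unexpected failure inside the policy
        raise UserRequestRejectedByPolicy(
            f'User request rejected by policy {policy!r}: {e}') from None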
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -7,7 +7,6 @@ import colorama
 
 from sky import backends
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
@@ -137,7 +136,8 @@ def get_total_cost_of_displayed_records(
 
 def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
                            show_all: bool,
-                           controller_name: Optional[str] = None
+                           controller_name: Optional[str] = None,
+                           days: Optional[int] = None):
     """Compute cluster table values and display for cost report.
 
     For each cluster, this shows: cluster name, resources, launched time,
@@ -200,23 +200,21 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
             cluster_table.add_row(row)
 
     if cluster_records:
+        controller_record = cluster_records[0]
         if controller_name is not None:
-
-
-
-            controller_handle: backends.CloudVmRayResourceHandle = (
-                cluster_records[0]['handle'])
-            autostop_config = (
-                controller_handle.launched_resources.autostop_config)
-            if autostop_config is not None:
+            autostop = controller_record.get('autostop', None)
+            autostop_str = ''
+            if autostop is not None:
                 autostop_str = (f'{colorama.Style.DIM} (will be autostopped if '
-                                f'idle for {
+                                f'idle for {autostop}min)'
                                 f'{colorama.Style.RESET_ALL}')
             click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                        f'{controller_name}{colorama.Style.RESET_ALL}'
                        f'{autostop_str}')
         else:
-
+            days_str = '' if days is None else f' (last {days} days)'
+            click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Clusters{days_str}'
                        f'{colorama.Style.RESET_ALL}')
         click.echo(cluster_table)
 
@@ -345,7 +343,9 @@ def _get_infra(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
 
 
 def _get_status_value_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> int:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return -1
@@ -353,7 +353,9 @@ def _get_status_value_for_cost_report(
 
 
 def _get_status_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return f'{colorama.Style.DIM}TERMINATED{colorama.Style.RESET_ALL}'
@@ -361,7 +363,9 @@ def _get_status_for_cost_report(
 
 
 def _get_resources_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']
 
@@ -373,7 +377,9 @@ def _get_resources_for_cost_report(
 
 
 def _get_price_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']
 
@@ -383,7 +389,9 @@ def _get_price_for_cost_report(
 
 
 def _get_estimated_cost_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     cost = cluster_cost_report_record['total_cost']
 
     if not cost:
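The repeated `del truncate` additions suggest the table code now calls every column getter with the same `(record, truncate)` signature, whether or not a given column uses it. A small illustrative sketch of that pattern; the names and record contents are made up:

def get_name(record, truncate: bool = True) -> str:
    del truncate  # this column never truncates
    return record['name']


def get_resources(record, truncate: bool = True) -> str:
    resources = record['resources']
    # Only this column actually honors the flag.
    return resources[:12] + '...' if truncate and len(resources) > 12 else resources


record = {'name': 'my-cluster', 'resources': '1x Kubernetes(4CPU--16GB)'}
row = [getter(record, truncate=True) for getter in (get_name, get_resources)]
print(row)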