skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py
CHANGED
@@ -12,7 +12,6 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import log_utils
-from sky.utils.cli_utils import status_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -22,6 +21,8 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
     'to the cloud storage for {path!r}'
     'due to the following error: {error_msg!r}')
 
+_LAST_USE_TRUNC_LENGTH = 25
+
 
 def format_storage_table(storages: List[Dict[str, Any]],
                          show_all: bool = False) -> str:
@@ -46,8 +47,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
         if show_all:
             command = row['last_use']
         else:
-            command =
-
+            command = common_utils.truncate_long_string(row['last_use'],
+                                                        _LAST_USE_TRUNC_LENGTH)
         storage_table.add_row([
             # NAME
             row['name'],
@@ -212,9 +213,13 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
     if os.path.exists(skyignore_path):
-        logger.info(f'
-                    f'
+        logger.info(f' {colorama.Style.DIM}'
+                    f'Excluded files to sync to cluster based on '
+                    f'{constants.SKY_IGNORE_FILE}.'
+                    f'{colorama.Style.RESET_ALL}')
         return get_excluded_files_from_skyignore(src_dir_path)
-    logger.info(f'
-                f'
+    logger.info(f' {colorama.Style.DIM}'
+                f'Excluded files to sync to cluster based on '
+                f'{constants.GIT_IGNORE_FILE}.'
+                f'{colorama.Style.RESET_ALL}')
     return get_excluded_files_from_gitignore(src_dir_path)
sky/exceptions.py
CHANGED
@@ -291,3 +291,8 @@ class PortDoesNotExistError(Exception):
 class UserRequestRejectedByPolicy(Exception):
     """Raised when a user request is rejected by an admin policy."""
     pass
+
+
+class NoClusterLaunchedError(Exception):
+    """No cluster launched, so cleanup can be skipped during failover."""
+    pass
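Context for the new exception: the `sky/provision/aws/config.py` hunk further down raises it when the security group cannot even be created, i.e. before any instance exists. A minimal sketch of the intended failover pattern follows; `launch_on_cloud` and `cleanup` are hypothetical placeholders for illustration, not SkyPilot APIs:

```python
from sky import exceptions


def launch_with_failover(clouds, launch_on_cloud, cleanup):
    """Try each cloud in turn; skip cleanup when nothing was launched.

    `clouds`, `launch_on_cloud`, and `cleanup` are placeholders used only
    for this sketch -- they are not SkyPilot APIs.
    """
    for cloud in clouds:
        try:
            return launch_on_cloud(cloud)
        except exceptions.NoClusterLaunchedError:
            # Provisioning failed before any instance was created (e.g. the
            # security group could not be created), so there is nothing to
            # tear down on this cloud; move straight to the next one.
            continue
        except Exception:  # pylint: disable=broad-except
            # Something may have been partially created; clean it up before
            # failing over to the next cloud.
            cleanup(cloud)
            continue
    raise RuntimeError('All clouds failed to launch the cluster.')
```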
sky/execution.py
CHANGED
@@ -3,7 +3,6 @@
 See `Stage` for a Task's life cycle.
 """
 import enum
-import os
 from typing import List, Optional, Tuple, Union
 
 import colorama
@@ -20,10 +19,8 @@ from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
-from sky.utils import env_options
 from sky.utils import resources_utils
 from sky.utils import rich_utils
-from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
@@ -293,11 +290,17 @@ def _execute(
         logger.info('Dryrun finished.')
         return None, None
 
- (old lines 296-298 not shown in this diff view)
+    do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
+                  task.workdir is not None)
+    do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
+                      task.file_mounts is not None)
+    if do_workdir or do_file_mounts:
+        logger.info(ux_utils.starting_message('Mounting files.'))
 
-    if
+    if do_workdir:
+        backend.sync_workdir(handle, task.workdir)
+
+    if do_file_mounts:
         backend.sync_file_mounts(handle, task.file_mounts,
                                  task.storage_mounts)
 
@@ -330,23 +333,6 @@ def _execute(
             backend.teardown_ephemeral_storage(task)
             backend.teardown(handle, terminate=True)
     finally:
-        controller = controller_utils.Controllers.from_name(cluster_name)
-        if controller is None and not _is_launched_by_sky_serve_controller:
-            # UX: print live clusters to make users aware (to save costs).
-            #
-            # Don't print if this job is launched by the jobs controller,
-            # because managed jobs are serverless, there can be many of them,
-            # and users tend to continuously monitor managed jobs using `sky
-            # job queue`. Also don't print if this job is a skyserve controller
-            # job or launched by a skyserve controller job, because the
-            # redirect for this subprocess.run won't success and it will
-            # pollute the controller logs.
-            #
-            # Disable the usage collection for this status command.
-            env = dict(os.environ,
-                       **{env_options.Options.DISABLE_LOGGING.value: '1'})
-            subprocess_utils.run(
-                'sky status --no-show-managed-jobs --no-show-services', env=env)
         print()
         print('\x1b[?25h', end='')  # Show cursor.
     return job_id, handle
sky/jobs/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
 from sky.jobs.core import cancel
 from sky.jobs.core import launch
 from sky.jobs.core import queue
+from sky.jobs.core import queue_from_kubernetes_pod
 from sky.jobs.core import tail_logs
 from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
 from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
@@ -34,6 +35,7 @@ __all__ = [
     'cancel',
     'launch',
     'queue',
+    'queue_from_kubernetes_pod',
     'tail_logs',
     # utils
     'ManagedJobCodeGen',
sky/jobs/core.py
CHANGED
@@ -9,6 +9,7 @@ import colorama
 import sky
 from sky import backends
 from sky import exceptions
+from sky import provision as provision_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -16,6 +17,7 @@ from sky.backends import backend_utils
 from sky.clouds.service_catalog import common as service_catalog_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import utils as managed_job_utils
+from sky.provision import common
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -77,9 +79,11 @@ def launch(
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
- (old lines 80-82 not shown in this diff view)
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing managed job')):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, path='jobs')
 
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
                                      mode='w') as f:
@@ -127,7 +131,6 @@ def launch(
             f'{colorama.Fore.YELLOW}'
             f'Launching managed job {dag.name!r} from jobs controller...'
             f'{colorama.Style.RESET_ALL}')
-        sky_logging.print('Launching jobs controller...')
         sky.launch(task=controller_task,
                    stream_logs=stream_logs,
                    cluster_name=controller_name,
@@ -138,6 +141,82 @@ def launch(
                    _disable_controller_check=True)
 
 
+def queue_from_kubernetes_pod(
+        pod_name: str,
+        context: Optional[str] = None,
+        skip_finished: bool = False) -> List[Dict[str, Any]]:
+    """Gets the jobs queue from a specific controller pod.
+
+    Args:
+        pod_name (str): The name of the controller pod to query for jobs.
+        context (Optional[str]): The Kubernetes context to use. If None, the
+            current context is used.
+        skip_finished (bool): If True, does not return finished jobs.
+
+    Returns:
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+            }
+        ]
+
+    Raises:
+        RuntimeError: If there's an error fetching the managed jobs.
+    """
+    # Create dummy cluster info to get the command runner.
+    provider_config = {'context': context}
+    instances = {
+        pod_name: [
+            common.InstanceInfo(instance_id=pod_name,
+                                internal_ip='',
+                                external_ip='',
+                                tags={})
+        ]
+    }  # Internal IP is not required for Kubernetes
+    cluster_info = common.ClusterInfo(provider_name='kubernetes',
+                                      head_instance_id=pod_name,
+                                      provider_config=provider_config,
+                                      instances=instances)
+    managed_jobs_runner = provision_lib.get_command_runners(
+        'kubernetes', cluster_info)[0]
+
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    returncode, job_table_payload, stderr = managed_jobs_runner.run(
+        code,
+        require_outputs=True,
+        separate_stderr=True,
+        stream_logs=False,
+    )
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch managed jobs',
+                                           job_table_payload + stderr,
+                                           stream_logs=False)
+    except exceptions.CommandError as e:
+        raise RuntimeError(str(e)) from e
+
+    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(lambda job: not job['status'].is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    return jobs
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -184,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             f'{colorama.Style.RESET_ALL}')
 
         rich_utils.force_update_status(
-            '
-
+            ux_utils.spinner_message('Checking managed jobs - restarting '
+                                     'controller'))
         handle = sky.start(jobs_controller_type.value.cluster_name)
         controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Checking managed jobs'))
 
     assert handle is not None, (controller_status, refresh)
 
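Because `queue_from_kubernetes_pod` is also re-exported from `sky.jobs` (see the `sky/jobs/__init__.py` hunk above), it can be called directly against a jobs-controller pod. A usage sketch; the pod name and kubeconfig context below are placeholders:

```python
from sky import jobs as managed_jobs

# Placeholder pod/context names -- substitute the actual jobs-controller
# pod running in your Kubernetes cluster.
job_rows = managed_jobs.queue_from_kubernetes_pod(
    pod_name='sky-jobs-controller-abcd1234-head',
    context='my-k8s-context',   # None -> use the current kubeconfig context
    skip_finished=True)         # drop jobs whose status is terminal

for job in job_rows:
    # Field names follow the docstring in the diff above.
    print(job['job_id'], job['job_name'], job['status'])
```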
sky/jobs/utils.py
CHANGED
@@ -34,6 +34,7 @@ from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -57,11 +58,13 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_WAITING_STATUS_MESSAGE = (
-
+_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
+    'Waiting for task to start[/]'
+    '{status_str}. It may take a few minutes.\n'
+    ' [dim]View controller logs: sky jobs logs --controller {job_id}')
 _JOB_CANCELLED_MESSAGE = (
-    '
-    '
+    ux_utils.spinner_message('Waiting for task status to be updated.') +
+    ' It may take a minute.')
 
 # The maximum time to wait for the managed job status to transition to terminal
 # state, after the job finished. This is a safeguard to avoid the case where
@@ -290,8 +293,8 @@ def cancel_job_by_name(job_name: str) -> str:
 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
     controller_status = job_lib.get_status(job_id)
-    status_msg = (
-
+    status_msg = ux_utils.spinner_message(
+        'Waiting for controller process to be RUNNING') + '{status_str}'
     status_display = rich_utils.safe_status(status_msg.format(status_str=''))
     num_tasks = managed_job_state.get_num_tasks(job_id)
 
@@ -310,7 +313,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
             controller_status = job_lib.get_status(job_id)
 
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
+        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
         status_display.update(msg)
         prev_msg = msg
         managed_job_status = managed_job_state.get_status(job_id)
@@ -356,7 +359,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 logger.debug(
                     f'INFO: The log is not ready yet{status_str}. '
                     f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
-                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str
+                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
+                                                         job_id=job_id)
                 if msg != prev_msg:
                     status_display.update(msg)
                     prev_msg = msg
@@ -444,8 +448,9 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         managed_job_status = managed_job_state.get_status(job_id)
         assert managed_job_status is not None, job_id
 
-    logger.info(
-
+    logger.info(
+        ux_utils.finishing_message(f'Managed job finished: {job_id} '
+                                   f'(status: {managed_job_status.value}).'))
     return ''
 
 
@@ -599,11 +604,20 @@ def format_job_table(
         a list of "rows" (each of which is a list of str).
     """
     jobs = collections.defaultdict(list)
+    # Check if the tasks have user information.
+    tasks_have_user = any([task.get('user') for task in tasks])
+    if max_jobs and tasks_have_user:
+        raise ValueError('max_jobs is not supported when tasks have user info.')
+
+    def get_hash(task):
+        if tasks_have_user:
+            return (task['user'], task['job_id'])
+        return task['job_id']
+
     for task in tasks:
         # The tasks within the same job_id are already sorted
         # by the task_id.
-        jobs[task
-    jobs = dict(jobs)
+        jobs[get_hash(task)].append(task)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
     for job_tasks in jobs.values():
@@ -611,17 +625,14 @@
         if not managed_job_status.is_terminal():
             status_counts[managed_job_status.value] += 1
 
-    if max_jobs is not None:
-        job_ids = sorted(jobs.keys(), reverse=True)
-        job_ids = job_ids[:max_jobs]
-        jobs = {job_id: jobs[job_id] for job_id in job_ids}
-
     columns = [
         'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
         'JOB DURATION', '#RECOVERIES', 'STATUS'
    ]
     if show_all:
         columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
+    if tasks_have_user:
+        columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
@@ -636,9 +647,9 @@
         for task in all_tasks:
             # The tasks within the same job_id are already sorted
             # by the task_id.
-            jobs[task
+            jobs[get_hash(task)].append(task)
 
-    for
+    for job_hash, job_tasks in jobs.items():
         if len(job_tasks) > 1:
             # Aggregate the tasks into a new row in the table.
             job_name = job_tasks[0]['job_name']
@@ -674,6 +685,7 @@
             if not managed_job_status.is_terminal():
                 status_str += f' (task: {current_task_id})'
 
+            job_id = job_hash[1] if tasks_have_user else job_hash
             job_values = [
                 job_id,
                 '',
@@ -692,6 +704,8 @@
                 '-',
                 failure_reason if failure_reason is not None else '-',
             ])
+            if tasks_have_user:
+                job_values.insert(0, job_tasks[0].get('user', '-'))
            job_table.add_row(job_values)
 
         for task in job_tasks:
@@ -724,6 +738,8 @@
                     task['failure_reason']
                     if task['failure_reason'] is not None else '-',
                 ])
+            if tasks_have_user:
+                values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
 
         if len(job_tasks) > 1:
sky/optimizer.py
CHANGED
@@ -123,22 +123,23 @@ class Optimizer:
                 for a task.
             exceptions.NoCloudAccessError: if no public clouds are enabled.
         """
- (old lines 126-141 not shown in this diff view)
+        with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
+            _check_specified_clouds(dag)
+
+            # This function is effectful: mutates every node in 'dag' by setting
+            # node.best_resources if it is None.
+            Optimizer._add_dummy_source_sink_nodes(dag)
+            try:
+                unused_best_plan = Optimizer._optimize_dag(
+                    dag=dag,
+                    minimize_cost=minimize == OptimizeTarget.COST,
+                    blocked_resources=blocked_resources,
+                    quiet=quiet)
+            finally:
+                # Make sure to remove the dummy source/sink nodes, even if the
+                # optimization fails.
+                Optimizer._remove_dummy_source_sink_nodes(dag)
+            return dag
 
     @staticmethod
     def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -259,6 +260,9 @@
             launchable_resources: Dict[resources_lib.Resources,
                                        List[resources_lib.Resources]]
     ) -> Dict[resources_lib.Resources, int]:
+        if not resources_utils.need_to_query_reservations():
+            return {}
+
         num_available_reserved_nodes_per_resource = {}
 
         def get_reservations_available_resources(
@@ -269,7 +273,7 @@
         launchable_resources_list: List[resources_lib.Resources] = sum(
             launchable_resources.values(), [])
         with rich_utils.safe_status(
-            '
+                ux_utils.spinner_message('Checking reserved resources')):
             subprocess_utils.run_in_parallel(
                 get_reservations_available_resources,
                 launchable_resources_list)
@@ -337,8 +341,8 @@
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
-                        num_available_reserved_nodes_per_resource
-
+                        num_available_reserved_nodes_per_resource.get(
+                            resources, 0))
 
                     # We consider the cost of the unused reservation
                     # resources to be 0 since we are already paying for
@@ -384,10 +388,14 @@
                     fuzzy_candidates_str = (
                         f'\nTry one of these offered accelerators: {cyan}'
                         f'{fuzzy_candidates}{reset}')
+                node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
+                                                 r.repr_with_region_zone
+                                                 for r in node.resources)
                 error_msg = (
                     f'{source_hint.capitalize()} does not contain any '
-                    f'instances satisfying the request
-                    f'
+                    f'instances satisfying the request: '
+                    f'{node_resources_reprs}.'
+                    f'\nTo fix: relax or change the '
                    f'resource requirements.{fuzzy_candidates_str}\n\n'
                     f'Hint: {bold}sky show-gpus{reset} '
                     'to list available accelerators.\n'
@@ -716,7 +724,6 @@
             node_to_cost_map: _TaskToCostMap,
             minimize_cost: bool,
     ):
-        logger.info('== Optimizer ==')
         ordered_node_to_cost_map = collections.OrderedDict()
         ordered_best_plan = collections.OrderedDict()
         for node in topo_order:
@@ -738,15 +745,18 @@
                 node.get_inputs() is None and node.get_outputs() is None):
             print_hourly_cost = True
 
-        if
- (old lines 742-749 not shown in this diff view)
+        if not env_options.Options.MINIMIZE_LOGGING.get():
+            if print_hourly_cost:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
+            else:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated total runtime: '
+                    f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
+                    'hours\n'
+                    f'{colorama.Style.BRIGHT}Estimated total cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
 
     def _get_resources_element_list(
             resources: 'resources_lib.Resources') -> List[str]:
@@ -845,7 +855,7 @@
         best_plan_table = _create_table(['TASK', '#NODES'] +
                                         resource_fields)
         best_plan_table.add_rows(best_plan_rows)
-        logger.info(f'{best_plan_table}
+        logger.info(f'{best_plan_table}')
 
         # Print the egress plan if any data egress is scheduled.
         Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -864,6 +874,10 @@
         }
         task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
         plural = 's' if task.num_nodes > 1 else ''
+        if num_tasks > 1:
+            # Add a new line for better readability, when there are multiple
+            # tasks.
+            logger.info('')
         logger.info(
             f'{colorama.Style.BRIGHT}Considered resources {task_str}'
             f'({task.num_nodes} node{plural}):'
@@ -934,7 +948,7 @@
 
         table = _create_table(field_names)
         table.add_rows(rows)
-        logger.info(f'{table}
+        logger.info(f'{table}')
 
         # Warning message for using disk_tier=ultra
         # TODO(yi): Consider price of disks in optimizer and
@@ -965,10 +979,10 @@
                     f'Multiple {cloud} instances satisfy '
                     f'{acc_name}:{int(acc_count)}. '
                     f'The cheapest {candidate_list[0]!r} is considered '
-                    f'among:\n{instance_list}
+                    f'among:\n{instance_list}.')
             if is_multi_instances:
                 logger.info(
-                    f'To list more details, run
+                    f'To list more details, run: sky show-gpus {acc_name}\n')
 
     @staticmethod
     def _optimize_dag(
@@ -1101,8 +1115,7 @@
         Optimizer.print_optimized_plan(graph, topo_order, best_plan,
                                        total_time, total_cost,
                                        node_to_cost_map, minimize_cost)
-
-            Optimizer._print_candidates(local_node_to_candidate_map)
+        Optimizer._print_candidates(local_node_to_candidate_map)
         return best_plan
 
 
sky/provision/aws/config.py
CHANGED
@@ -16,10 +16,12 @@ from typing import Any, Dict, List, Optional, Set, Tuple
 
 import colorama
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
 from sky.provision import common
 from sky.provision.aws import utils
+from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -535,12 +537,19 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
     if vpc_id in vpc_to_existing_sg:
         return vpc_to_existing_sg[vpc_id]
 
- (old lines 538-543 not shown in this diff view)
+    try:
+        # create a new security group
+        ec2.meta.client.create_security_group(
+            Description='Auto-created security group for Ray workers',
+            GroupName=expected_sg_name,
+            VpcId=vpc_id,
+        )
+    except ec2.meta.client.exceptions.ClientError as e:
+        message = ('Failed to create security group. Error: '
+                   f'{common_utils.format_exception(e)}')
+        logger.warning(message)
+        raise exceptions.NoClusterLaunchedError(message) from e
+
     security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
                                                        [expected_sg_name])
 