skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,16 @@
|
|
1
1
|
"""Utilities for sky status."""
|
2
|
-
from typing import Any, Callable, Dict, List, Optional
|
2
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
3
3
|
|
4
4
|
import click
|
5
5
|
import colorama
|
6
6
|
|
7
7
|
from sky import backends
|
8
|
+
from sky import clouds as sky_clouds
|
9
|
+
from sky import resources as resources_lib
|
8
10
|
from sky import status_lib
|
11
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
9
12
|
from sky.skylet import constants
|
13
|
+
from sky.utils import common_utils
|
10
14
|
from sky.utils import log_utils
|
11
15
|
from sky.utils import resources_utils
|
12
16
|
|
@@ -19,25 +23,6 @@ _ClusterRecord = Dict[str, Any]
|
|
19
23
|
_ClusterCostReportRecord = Dict[str, Any]
|
20
24
|
|
21
25
|
|
22
|
-
def truncate_long_string(s: str, max_length: int = 35) -> str:
|
23
|
-
if len(s) <= max_length:
|
24
|
-
return s
|
25
|
-
splits = s.split(' ')
|
26
|
-
if len(splits[0]) > max_length:
|
27
|
-
return splits[0][:max_length] + '...' # Use '…'?
|
28
|
-
# Truncate on word boundary.
|
29
|
-
i = 0
|
30
|
-
total = 0
|
31
|
-
for i, part in enumerate(splits):
|
32
|
-
total += len(part)
|
33
|
-
if total >= max_length:
|
34
|
-
break
|
35
|
-
prefix = ' '.join(splits[:i])
|
36
|
-
if len(prefix) < max_length:
|
37
|
-
prefix += s[len(prefix):max_length]
|
38
|
-
return prefix + '...'
|
39
|
-
|
40
|
-
|
41
26
|
class StatusColumn:
|
42
27
|
"""One column of the displayed cluster table"""
|
43
28
|
|
@@ -54,7 +39,7 @@ class StatusColumn:
|
|
54
39
|
def calc(self, record):
|
55
40
|
val = self.calc_func(record)
|
56
41
|
if self.trunc_length != 0:
|
57
|
-
val = truncate_long_string(str(val), self.trunc_length)
|
42
|
+
val = common_utils.truncate_long_string(str(val), self.trunc_length)
|
58
43
|
return val
|
59
44
|
|
60
45
|
|
@@ -316,3 +301,165 @@ def _get_estimated_cost_for_cost_report(
|
|
316
301
|
return '-'
|
317
302
|
|
318
303
|
return f'$ {cost:.2f}'
|
304
|
+
|
305
|
+
|
306
|
+
def show_kubernetes_cluster_status_table(clusters: List[Any],
|
307
|
+
show_all: bool) -> None:
|
308
|
+
"""Compute cluster table values and display for Kubernetes clusters."""
|
309
|
+
status_columns = [
|
310
|
+
StatusColumn('USER', lambda c: c['user']),
|
311
|
+
StatusColumn('NAME', lambda c: c['cluster_name']),
|
312
|
+
StatusColumn(
|
313
|
+
'LAUNCHED',
|
314
|
+
lambda c: log_utils.readable_time_duration(c['launched_at'])),
|
315
|
+
StatusColumn('RESOURCES',
|
316
|
+
lambda c: c['resources_str'],
|
317
|
+
trunc_length=70 if not show_all else 0),
|
318
|
+
StatusColumn('STATUS', lambda c: c['status'].colored_str()),
|
319
|
+
# TODO(romilb): We should consider adding POD_NAME field here when --all
|
320
|
+
# is passed to help users fetch pod name programmatically.
|
321
|
+
]
|
322
|
+
|
323
|
+
columns = [
|
324
|
+
col.name for col in status_columns if col.show_by_default or show_all
|
325
|
+
]
|
326
|
+
cluster_table = log_utils.create_table(columns)
|
327
|
+
|
328
|
+
# Sort table by user, then by cluster name
|
329
|
+
sorted_clusters = sorted(clusters,
|
330
|
+
key=lambda c: (c['user'], c['cluster_name']))
|
331
|
+
|
332
|
+
for cluster in sorted_clusters:
|
333
|
+
row = []
|
334
|
+
for status_column in status_columns:
|
335
|
+
if status_column.show_by_default or show_all:
|
336
|
+
row.append(status_column.calc(cluster))
|
337
|
+
cluster_table.add_row(row)
|
338
|
+
|
339
|
+
if clusters:
|
340
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
341
|
+
f'SkyPilot clusters'
|
342
|
+
f'{colorama.Style.RESET_ALL}')
|
343
|
+
click.echo(cluster_table)
|
344
|
+
else:
|
345
|
+
click.echo('No SkyPilot resources found in the '
|
346
|
+
'active Kubernetes context.')
|
347
|
+
|
348
|
+
|
349
|
+
def process_skypilot_pods(
|
350
|
+
pods: List[Any],
|
351
|
+
context: Optional[str] = None
|
352
|
+
) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
|
353
|
+
"""Process SkyPilot pods on k8s to extract cluster and controller info.
|
354
|
+
|
355
|
+
Args:
|
356
|
+
pods: List of Kubernetes pod objects.
|
357
|
+
context: Kubernetes context name, used to detect GPU label formatter.
|
358
|
+
|
359
|
+
Returns:
|
360
|
+
A tuple containing:
|
361
|
+
- List of dictionaries with cluster information.
|
362
|
+
- Dictionary of job controller information.
|
363
|
+
- Dictionary of serve controller information.
|
364
|
+
|
365
|
+
Each dictionary contains the following keys:
|
366
|
+
'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
|
367
|
+
'cluster_name': The cluster name without the user hash
|
368
|
+
'user': The user who created the cluster. Fetched from pod label
|
369
|
+
'status': The cluster status (assumed UP if pod exists)
|
370
|
+
'pods': List of pod objects in the cluster
|
371
|
+
'launched_at': Timestamp of when the cluster was launched
|
372
|
+
'resources': sky.Resources object for the cluster
|
373
|
+
"""
|
374
|
+
clusters: Dict[str, Dict] = {}
|
375
|
+
jobs_controllers: Dict[str, Dict] = {}
|
376
|
+
serve_controllers: Dict[str, Dict] = {}
|
377
|
+
|
378
|
+
for pod in pods:
|
379
|
+
cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
|
380
|
+
cluster_name = cluster_name_on_cloud.rsplit(
|
381
|
+
'-', 1
|
382
|
+
)[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
|
383
|
+
|
384
|
+
# Check if cluster name is name of a controller
|
385
|
+
# Can't use controller_utils.Controllers.from_name(cluster_name)
|
386
|
+
# because hash is different across users
|
387
|
+
if 'controller' in cluster_name_on_cloud:
|
388
|
+
start_time = pod.status.start_time.timestamp()
|
389
|
+
controller_info = {
|
390
|
+
'cluster_name_on_cloud': cluster_name_on_cloud,
|
391
|
+
'cluster_name': cluster_name,
|
392
|
+
'user': pod.metadata.labels.get('skypilot-user'),
|
393
|
+
'status': status_lib.ClusterStatus.UP,
|
394
|
+
# Assuming UP if pod exists
|
395
|
+
'pods': [pod],
|
396
|
+
'launched_at': start_time
|
397
|
+
}
|
398
|
+
if 'sky-jobs-controller' in cluster_name_on_cloud:
|
399
|
+
jobs_controllers[cluster_name_on_cloud] = controller_info
|
400
|
+
elif 'sky-serve-controller' in cluster_name_on_cloud:
|
401
|
+
serve_controllers[cluster_name_on_cloud] = controller_info
|
402
|
+
|
403
|
+
if cluster_name_on_cloud not in clusters:
|
404
|
+
# Parse the start time for the cluster
|
405
|
+
start_time = pod.status.start_time
|
406
|
+
if start_time is not None:
|
407
|
+
start_time = pod.status.start_time.timestamp()
|
408
|
+
|
409
|
+
# Parse resources
|
410
|
+
cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
|
411
|
+
pod.spec.containers[0].resources.requests.get('cpu', '0'))
|
412
|
+
memory_request = kubernetes_utils.parse_memory_resource(
|
413
|
+
pod.spec.containers[0].resources.requests.get('memory', '0'),
|
414
|
+
unit='G')
|
415
|
+
gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
|
416
|
+
pod.spec.containers[0].resources.requests.get(
|
417
|
+
'nvidia.com/gpu', '0'))
|
418
|
+
if gpu_count > 0:
|
419
|
+
label_formatter, _ = (
|
420
|
+
kubernetes_utils.detect_gpu_label_formatter(context))
|
421
|
+
assert label_formatter is not None, (
|
422
|
+
'GPU label formatter cannot be None if there are pods '
|
423
|
+
f'requesting GPUs: {pod.metadata.name}')
|
424
|
+
gpu_label = label_formatter.get_label_key()
|
425
|
+
# Get GPU name from pod node selector
|
426
|
+
if pod.spec.node_selector is not None:
|
427
|
+
gpu_name = label_formatter.get_accelerator_from_label_value(
|
428
|
+
pod.spec.node_selector.get(gpu_label))
|
429
|
+
|
430
|
+
resources = resources_lib.Resources(
|
431
|
+
cloud=sky_clouds.Kubernetes(),
|
432
|
+
cpus=int(cpu_request),
|
433
|
+
memory=int(memory_request),
|
434
|
+
accelerators=(f'{gpu_name}:{gpu_count}'
|
435
|
+
if gpu_count > 0 else None))
|
436
|
+
if pod.status.phase == 'Pending':
|
437
|
+
# If pod is pending, do not show it in the status
|
438
|
+
continue
|
439
|
+
|
440
|
+
clusters[cluster_name_on_cloud] = {
|
441
|
+
'cluster_name_on_cloud': cluster_name_on_cloud,
|
442
|
+
'cluster_name': cluster_name,
|
443
|
+
'user': pod.metadata.labels.get('skypilot-user'),
|
444
|
+
'status': status_lib.ClusterStatus.UP,
|
445
|
+
'pods': [],
|
446
|
+
'launched_at': start_time,
|
447
|
+
'resources': resources,
|
448
|
+
}
|
449
|
+
else:
|
450
|
+
# Update start_time if this pod started earlier
|
451
|
+
pod_start_time = pod.status.start_time
|
452
|
+
if pod_start_time is not None:
|
453
|
+
pod_start_time = pod_start_time.timestamp()
|
454
|
+
if pod_start_time < clusters[cluster_name_on_cloud][
|
455
|
+
'launched_at']:
|
456
|
+
clusters[cluster_name_on_cloud][
|
457
|
+
'launched_at'] = pod_start_time
|
458
|
+
clusters[cluster_name_on_cloud]['pods'].append(pod)
|
459
|
+
# Update resources_str in clusters:
|
460
|
+
for cluster_name, cluster in clusters.items():
|
461
|
+
resources = cluster['resources']
|
462
|
+
num_pods = len(cluster['pods'])
|
463
|
+
resources_str = f'{num_pods}x {resources}'
|
464
|
+
cluster['resources_str'] = resources_str
|
465
|
+
return list(clusters.values()), jobs_controllers, serve_controllers
|
sky/utils/command_runner.py
CHANGED
@@ -171,7 +171,7 @@ class CommandRunner:
|
|
171
171
|
cmd: Union[str, List[str]],
|
172
172
|
process_stream: bool,
|
173
173
|
separate_stderr: bool,
|
174
|
-
|
174
|
+
skip_num_lines: int,
|
175
175
|
source_bashrc: bool = False,
|
176
176
|
) -> str:
|
177
177
|
"""Returns the command to run."""
|
@@ -203,12 +203,12 @@ class CommandRunner:
|
|
203
203
|
]
|
204
204
|
if not separate_stderr:
|
205
205
|
command.append('2>&1')
|
206
|
-
if not process_stream and
|
206
|
+
if not process_stream and skip_num_lines:
|
207
207
|
command += [
|
208
208
|
# A hack to remove the following bash warnings (twice):
|
209
209
|
# bash: cannot set terminal process group
|
210
210
|
# bash: no job control in this shell
|
211
|
-
f'| stdbuf -o0 tail -n +{
|
211
|
+
f'| stdbuf -o0 tail -n +{skip_num_lines}',
|
212
212
|
# This is required to make sure the executor of command can get
|
213
213
|
# correct returncode, since linux pipe is used.
|
214
214
|
'; exit ${PIPESTATUS[0]}'
|
@@ -320,7 +320,7 @@ class CommandRunner:
|
|
320
320
|
separate_stderr: bool = False,
|
321
321
|
connect_timeout: Optional[int] = None,
|
322
322
|
source_bashrc: bool = False,
|
323
|
-
|
323
|
+
skip_num_lines: int = 0,
|
324
324
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
325
325
|
"""Runs the command on the cluster.
|
326
326
|
|
@@ -335,7 +335,7 @@ class CommandRunner:
|
|
335
335
|
connect_timeout: timeout in seconds for the ssh connection.
|
336
336
|
source_bashrc: Whether to source the ~/.bashrc before running the
|
337
337
|
command.
|
338
|
-
|
338
|
+
skip_num_lines: The number of lines to skip at the beginning of the
|
339
339
|
output. This is used when the output is not processed by
|
340
340
|
SkyPilot but we still want to get rid of some warning messages,
|
341
341
|
such as SSH warnings.
|
@@ -529,7 +529,7 @@ class SSHCommandRunner(CommandRunner):
|
|
529
529
|
separate_stderr: bool = False,
|
530
530
|
connect_timeout: Optional[int] = None,
|
531
531
|
source_bashrc: bool = False,
|
532
|
-
|
532
|
+
skip_num_lines: int = 0,
|
533
533
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
534
534
|
"""Uses 'ssh' to run 'cmd' on a node with ip.
|
535
535
|
|
@@ -550,7 +550,7 @@ class SSHCommandRunner(CommandRunner):
|
|
550
550
|
connect_timeout: timeout in seconds for the ssh connection.
|
551
551
|
source_bashrc: Whether to source the bashrc before running the
|
552
552
|
command.
|
553
|
-
|
553
|
+
skip_num_lines: The number of lines to skip at the beginning of the
|
554
554
|
output. This is used when the output is not processed by
|
555
555
|
SkyPilot but we still want to get rid of some warning messages,
|
556
556
|
such as SSH warnings.
|
@@ -573,7 +573,7 @@ class SSHCommandRunner(CommandRunner):
|
|
573
573
|
command_str = self._get_command_to_run(cmd,
|
574
574
|
process_stream,
|
575
575
|
separate_stderr,
|
576
|
-
|
576
|
+
skip_num_lines=skip_num_lines,
|
577
577
|
source_bashrc=source_bashrc)
|
578
578
|
command = base_ssh_command + [shlex.quote(command_str)]
|
579
579
|
|
@@ -693,7 +693,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
693
693
|
separate_stderr: bool = False,
|
694
694
|
connect_timeout: Optional[int] = None,
|
695
695
|
source_bashrc: bool = False,
|
696
|
-
|
696
|
+
skip_num_lines: int = 0,
|
697
697
|
**kwargs) -> Union[int, Tuple[int, str, str]]:
|
698
698
|
"""Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.
|
699
699
|
|
@@ -713,7 +713,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
713
713
|
connect_timeout: timeout in seconds for the pod connection.
|
714
714
|
source_bashrc: Whether to source the bashrc before running the
|
715
715
|
command.
|
716
|
-
|
716
|
+
skip_num_lines: The number of lines to skip at the beginning of the
|
717
717
|
output. This is used when the output is not processed by
|
718
718
|
SkyPilot but we still want to get rid of some warning messages,
|
719
719
|
such as SSH warnings.
|
@@ -751,7 +751,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
751
751
|
command_str = self._get_command_to_run(cmd,
|
752
752
|
process_stream,
|
753
753
|
separate_stderr,
|
754
|
-
|
754
|
+
skip_num_lines=skip_num_lines,
|
755
755
|
source_bashrc=source_bashrc)
|
756
756
|
command = kubectl_base_command + [
|
757
757
|
# It is important to use /bin/bash -c here to make sure we quote the
|
sky/utils/common_utils.py
CHANGED
@@ -16,7 +16,6 @@ import time
|
|
16
16
|
from typing import Any, Callable, Dict, List, Optional, Union
|
17
17
|
import uuid
|
18
18
|
|
19
|
-
import colorama
|
20
19
|
import jinja2
|
21
20
|
import jsonschema
|
22
21
|
import yaml
|
@@ -479,11 +478,9 @@ def format_exception(e: Union[Exception, SystemExit, KeyboardInterrupt],
|
|
479
478
|
Returns:
|
480
479
|
A string that represents the exception.
|
481
480
|
"""
|
482
|
-
bright = colorama.Style.BRIGHT
|
483
|
-
reset = colorama.Style.RESET_ALL
|
484
481
|
if use_bracket:
|
485
|
-
return f'
|
486
|
-
return f'{
|
482
|
+
return f'[{class_fullname(e.__class__)}] {e}'
|
483
|
+
return f'{class_fullname(e.__class__)}: {e}'
|
487
484
|
|
488
485
|
|
489
486
|
def remove_color(s: str):
|
@@ -679,3 +676,23 @@ def deprecated_function(
|
|
679
676
|
return func(*args, **kwargs)
|
680
677
|
|
681
678
|
return new_func
|
679
|
+
|
680
|
+
|
681
|
+
def truncate_long_string(s: str, max_length: int = 35) -> str:
|
682
|
+
"""Truncate a string to a maximum length, preserving whole words."""
|
683
|
+
if len(s) <= max_length:
|
684
|
+
return s
|
685
|
+
splits = s.split(' ')
|
686
|
+
if len(splits[0]) > max_length:
|
687
|
+
return splits[0][:max_length] + '...' # Use '…'?
|
688
|
+
# Truncate on word boundary.
|
689
|
+
i = 0
|
690
|
+
total = 0
|
691
|
+
for i, part in enumerate(splits):
|
692
|
+
total += len(part)
|
693
|
+
if total >= max_length:
|
694
|
+
break
|
695
|
+
prefix = ' '.join(splits[:i])
|
696
|
+
if len(prefix) < max_length:
|
697
|
+
prefix += s[len(prefix):max_length]
|
698
|
+
return prefix + '...'
|
sky/utils/controller_utils.py
CHANGED
@@ -28,6 +28,7 @@ from sky.serve import serve_utils
|
|
28
28
|
from sky.skylet import constants
|
29
29
|
from sky.utils import common_utils
|
30
30
|
from sky.utils import env_options
|
31
|
+
from sky.utils import rich_utils
|
31
32
|
from sky.utils import ux_utils
|
32
33
|
|
33
34
|
if typing.TYPE_CHECKING:
|
@@ -192,7 +193,11 @@ def _get_cloud_dependencies_installation_commands(
|
|
192
193
|
# TODO(tian): Make dependency installation command a method of cloud
|
193
194
|
# class and get all installation command for enabled clouds.
|
194
195
|
commands = []
|
195
|
-
|
196
|
+
# We use <step>/<total> instead of string formatting, as we need to update
|
197
|
+
# the <total> at the end of the for loop, and python does not support
|
198
|
+
# partial string formatting.
|
199
|
+
prefix_str = ('[<step>/<total>] Check & install cloud dependencies '
|
200
|
+
'on controller: ')
|
196
201
|
# This is to make sure the shorter checking message does not have junk
|
197
202
|
# characters from the previous message.
|
198
203
|
empty_str = ' ' * 10
|
@@ -203,6 +208,7 @@ def _get_cloud_dependencies_installation_commands(
|
|
203
208
|
# other clouds will install boto3 but not awscli.
|
204
209
|
'pip list | grep awscli> /dev/null 2>&1 || pip install "urllib3<2" '
|
205
210
|
'awscli>=1.27.10 "colorama<0.4.5" > /dev/null 2>&1')
|
211
|
+
setup_clouds: List[str] = []
|
206
212
|
for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
|
207
213
|
if isinstance(
|
208
214
|
clouds,
|
@@ -211,11 +217,16 @@ def _get_cloud_dependencies_installation_commands(
|
|
211
217
|
# fluidstack and paperspace
|
212
218
|
continue
|
213
219
|
if isinstance(cloud, clouds.AWS):
|
214
|
-
|
220
|
+
step_prefix = prefix_str.replace('<step>',
|
221
|
+
str(len(setup_clouds) + 1))
|
222
|
+
commands.append(f'echo -en "\\r{step_prefix}AWS{empty_str}" && ' +
|
215
223
|
aws_dependencies_installation)
|
224
|
+
setup_clouds.append(str(cloud))
|
216
225
|
elif isinstance(cloud, clouds.Azure):
|
226
|
+
step_prefix = prefix_str.replace('<step>',
|
227
|
+
str(len(setup_clouds) + 1))
|
217
228
|
commands.append(
|
218
|
-
f'echo -en "\\r{
|
229
|
+
f'echo -en "\\r{step_prefix}Azure{empty_str}" && '
|
219
230
|
'pip list | grep azure-cli > /dev/null 2>&1 || '
|
220
231
|
'pip install "azure-cli>=2.31.0" azure-core '
|
221
232
|
'"azure-identity>=1.13.0" azure-mgmt-network > /dev/null 2>&1')
|
@@ -225,9 +236,12 @@ def _get_cloud_dependencies_installation_commands(
|
|
225
236
|
commands.append(
|
226
237
|
'pip list | grep azure-storage-blob > /dev/null 2>&1 || '
|
227
238
|
'pip install azure-storage-blob msgraph-sdk > /dev/null 2>&1')
|
239
|
+
setup_clouds.append(str(cloud))
|
228
240
|
elif isinstance(cloud, clouds.GCP):
|
241
|
+
step_prefix = prefix_str.replace('<step>',
|
242
|
+
str(len(setup_clouds) + 1))
|
229
243
|
commands.append(
|
230
|
-
f'echo -en "\\r{
|
244
|
+
f'echo -en "\\r{step_prefix}GCP{empty_str}" && '
|
231
245
|
'pip list | grep google-api-python-client > /dev/null 2>&1 || '
|
232
246
|
'pip install "google-api-python-client>=2.69.0" '
|
233
247
|
'> /dev/null 2>&1')
|
@@ -238,9 +252,12 @@ def _get_cloud_dependencies_installation_commands(
|
|
238
252
|
'pip list | grep google-cloud-storage > /dev/null 2>&1 || '
|
239
253
|
'pip install google-cloud-storage > /dev/null 2>&1')
|
240
254
|
commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
|
255
|
+
setup_clouds.append(str(cloud))
|
241
256
|
elif isinstance(cloud, clouds.Kubernetes):
|
257
|
+
step_prefix = prefix_str.replace('<step>',
|
258
|
+
str(len(setup_clouds) + 1))
|
242
259
|
commands.append(
|
243
|
-
f'echo -en "\\r{
|
260
|
+
f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
|
244
261
|
'pip list | grep kubernetes > /dev/null 2>&1 || '
|
245
262
|
'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&'
|
246
263
|
# Install k8s + skypilot dependencies
|
@@ -248,8 +265,8 @@ def _get_cloud_dependencies_installation_commands(
|
|
248
265
|
'! command -v curl &> /dev/null || '
|
249
266
|
'! command -v socat &> /dev/null || '
|
250
267
|
'! command -v netcat &> /dev/null; '
|
251
|
-
'then apt update
|
252
|
-
'&> /dev/null; '
|
268
|
+
'then apt update &> /dev/null && '
|
269
|
+
'apt install curl socat netcat -y &> /dev/null; '
|
253
270
|
'fi" && '
|
254
271
|
# Install kubectl
|
255
272
|
'(command -v kubectl &>/dev/null || '
|
@@ -258,34 +275,55 @@ def _get_cloud_dependencies_installation_commands(
|
|
258
275
|
'/bin/linux/amd64/kubectl" && '
|
259
276
|
'sudo install -o root -g root -m 0755 '
|
260
277
|
'kubectl /usr/local/bin/kubectl))')
|
278
|
+
setup_clouds.append(str(cloud))
|
261
279
|
elif isinstance(cloud, clouds.Cudo):
|
280
|
+
step_prefix = prefix_str.replace('<step>',
|
281
|
+
str(len(setup_clouds) + 1))
|
262
282
|
commands.append(
|
263
|
-
f'echo -en "\\r{
|
283
|
+
f'echo -en "\\r{step_prefix}Cudo{empty_str}" && '
|
264
284
|
'pip list | grep cudo-compute > /dev/null 2>&1 || '
|
265
285
|
'pip install "cudo-compute>=0.1.10" > /dev/null 2>&1 && '
|
266
286
|
'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long
|
267
287
|
'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
|
288
|
+
setup_clouds.append(str(cloud))
|
268
289
|
elif isinstance(cloud, clouds.RunPod):
|
269
|
-
|
290
|
+
step_prefix = prefix_str.replace('<step>',
|
291
|
+
str(len(setup_clouds) + 1))
|
292
|
+
commands.append(f'echo -en "\\r{step_prefix}RunPod{empty_str}" && '
|
270
293
|
'pip list | grep runpod > /dev/null 2>&1 || '
|
271
294
|
'pip install "runpod>=1.5.1" > /dev/null 2>&1')
|
295
|
+
setup_clouds.append(str(cloud))
|
272
296
|
if controller == Controllers.JOBS_CONTROLLER:
|
273
297
|
if isinstance(cloud, clouds.IBM):
|
298
|
+
step_prefix = prefix_str.replace('<step>',
|
299
|
+
str(len(setup_clouds) + 1))
|
274
300
|
commands.append(
|
275
|
-
f'echo -en "\\r{
|
301
|
+
f'echo -en "\\r{step_prefix}IBM{empty_str}" '
|
276
302
|
'&& pip list | grep ibm-cloud-sdk-core > /dev/null 2>&1 || '
|
277
303
|
'pip install ibm-cloud-sdk-core ibm-vpc '
|
278
304
|
'ibm-platform-services ibm-cos-sdk > /dev/null 2>&1')
|
305
|
+
setup_clouds.append(str(cloud))
|
279
306
|
elif isinstance(cloud, clouds.OCI):
|
307
|
+
step_prefix = prefix_str.replace('<step>',
|
308
|
+
str(len(setup_clouds) + 1))
|
280
309
|
commands.append(f'echo -en "\\r{prefix_str}OCI{empty_str}" && '
|
281
310
|
'pip list | grep oci > /dev/null 2>&1 || '
|
282
311
|
'pip install oci > /dev/null 2>&1')
|
312
|
+
setup_clouds.append(str(cloud))
|
283
313
|
if (cloudflare.NAME
|
284
314
|
in storage_lib.get_cached_enabled_storage_clouds_or_refresh()):
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
315
|
+
step_prefix = prefix_str.replace('<step>', str(len(setup_clouds) + 1))
|
316
|
+
commands.append(
|
317
|
+
f'echo -en "\\r{step_prefix}Cloudflare{empty_str}" && ' +
|
318
|
+
aws_dependencies_installation)
|
319
|
+
setup_clouds.append(cloudflare.NAME)
|
320
|
+
|
321
|
+
finish_prefix = prefix_str.replace('[<step>/<total>] ', ' ')
|
322
|
+
commands.append(f'echo -e "\\r{finish_prefix}done.{empty_str}"')
|
323
|
+
commands = [
|
324
|
+
command.replace('<total>', str(len(setup_clouds)))
|
325
|
+
for command in commands
|
326
|
+
]
|
289
327
|
return commands
|
290
328
|
|
291
329
|
|
@@ -388,7 +426,7 @@ def shared_controller_vars_to_fill(
|
|
388
426
|
'local_user_config_path': local_user_config_path,
|
389
427
|
}
|
390
428
|
env_vars: Dict[str, str] = {
|
391
|
-
env.
|
429
|
+
env.env_key: str(int(env.get())) for env in env_options.Options
|
392
430
|
}
|
393
431
|
env_vars.update({
|
394
432
|
# Should not use $USER here, as that env var can be empty when
|
@@ -396,7 +434,9 @@ def shared_controller_vars_to_fill(
|
|
396
434
|
constants.USER_ENV_VAR: getpass.getuser(),
|
397
435
|
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
|
398
436
|
# Skip cloud identity check to avoid the overhead.
|
399
|
-
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.
|
437
|
+
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
|
438
|
+
# Disable minimize logging to get more details on the controller.
|
439
|
+
env_options.Options.MINIMIZE_LOGGING.env_key: '0',
|
400
440
|
})
|
401
441
|
if skypilot_config.loaded():
|
402
442
|
# Only set the SKYPILOT_CONFIG env var if the user has a config file.
|
@@ -599,6 +639,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
599
639
|
# ================================================================
|
600
640
|
# Translate the workdir and local file mounts to cloud file mounts.
|
601
641
|
# ================================================================
|
642
|
+
|
602
643
|
run_id = common_utils.get_usage_run_id()[:8]
|
603
644
|
original_file_mounts = task.file_mounts if task.file_mounts else {}
|
604
645
|
original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
|
@@ -618,8 +659,12 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
618
659
|
elif has_local_source_paths_workdir:
|
619
660
|
msg = 'workdir'
|
620
661
|
if msg:
|
621
|
-
logger.info(
|
622
|
-
|
662
|
+
logger.info(
|
663
|
+
ux_utils.starting_message(f'Translating {msg} to '
|
664
|
+
'SkyPilot Storage...'))
|
665
|
+
rich_utils.force_update_status(
|
666
|
+
ux_utils.spinner_message(
|
667
|
+
f'Translating {msg} to SkyPilot Storage...'))
|
623
668
|
|
624
669
|
# Step 1: Translate the workdir to SkyPilot storage.
|
625
670
|
new_storage_mounts = {}
|
@@ -643,8 +688,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
643
688
|
})
|
644
689
|
# Check of the existence of the workdir in file_mounts is done in
|
645
690
|
# the task construction.
|
646
|
-
logger.info(f'Workdir {workdir!r}
|
647
|
-
f'{bucket_name!r}.')
|
691
|
+
logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
|
692
|
+
f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
|
648
693
|
|
649
694
|
# Step 2: Translate the local file mounts with folder in src to SkyPilot
|
650
695
|
# storage.
|
@@ -668,9 +713,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
668
713
|
'persistent': False,
|
669
714
|
'mode': 'COPY',
|
670
715
|
})
|
671
|
-
logger.info(
|
672
|
-
|
673
|
-
f'storage {bucket_name}.')
|
716
|
+
logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
|
717
|
+
f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
|
674
718
|
|
675
719
|
# Step 3: Translate local file mounts with file in src to SkyPilot storage.
|
676
720
|
# Hard link the files in src to a temporary directory, and upload folder.
|
@@ -703,10 +747,12 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
703
747
|
f'destination {file_mount_remote_tmp_dir} '
|
704
748
|
'being taken.')
|
705
749
|
sources = list(src_to_file_id.keys())
|
706
|
-
sources_str = '\n
|
707
|
-
logger.info('
|
708
|
-
f'
|
709
|
-
f'\n
|
750
|
+
sources_str = '\n '.join(sources)
|
751
|
+
logger.info(f' {colorama.Style.DIM}Files (listed below) '
|
752
|
+
f' -> storage: {file_bucket_name}:'
|
753
|
+
f'\n {sources_str}{colorama.Style.RESET_ALL}')
|
754
|
+
rich_utils.force_update_status(
|
755
|
+
ux_utils.spinner_message('Uploading translated local files/folders'))
|
710
756
|
task.update_storage_mounts(new_storage_mounts)
|
711
757
|
|
712
758
|
# Step 4: Upload storage from sources
|
@@ -716,8 +762,9 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
716
762
|
if task.storage_mounts:
|
717
763
|
# There may be existing (non-translated) storage mounts, so log this
|
718
764
|
# whenever task.storage_mounts is non-empty.
|
719
|
-
|
720
|
-
|
765
|
+
rich_utils.force_update_status(
|
766
|
+
ux_utils.spinner_message('Uploading local sources to storage[/] '
|
767
|
+
'[dim]View storages: sky storage ls'))
|
721
768
|
try:
|
722
769
|
task.sync_storage_mounts()
|
723
770
|
except ValueError as e:
|
@@ -800,3 +847,5 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
|
|
800
847
|
})
|
801
848
|
updated_mount_storages[storage_path] = new_storage
|
802
849
|
task.update_storage_mounts(updated_mount_storages)
|
850
|
+
if msg:
|
851
|
+
logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
|
sky/utils/env_options.py
CHANGED
@@ -5,17 +5,32 @@ import os
|
|
5
5
|
|
6
6
|
class Options(enum.Enum):
|
7
7
|
"""Environment variables for SkyPilot."""
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
|
9
|
+
# (env var name, default value)
|
10
|
+
IS_DEVELOPER = ('SKYPILOT_DEV', False)
|
11
|
+
SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)
|
12
|
+
DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False)
|
13
|
+
MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
|
12
14
|
# Internal: this is used to skip the cloud user identity check, which is
|
13
15
|
# used to protect cluster operations in a multi-identity scenario.
|
14
16
|
# Currently, this is only used in the job and serve controller, as there
|
15
17
|
# will not be multiple identities, and skipping the check can increase
|
16
18
|
# robustness.
|
17
|
-
SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK'
|
19
|
+
SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False)
|
20
|
+
|
21
|
+
def __init__(self, env_var: str, default: bool) -> None:
|
22
|
+
self.env_var = env_var
|
23
|
+
self.default = default
|
18
24
|
|
19
|
-
def
|
25
|
+
def __repr__(self) -> str:
|
26
|
+
return self.env_var
|
27
|
+
|
28
|
+
def get(self) -> bool:
|
20
29
|
"""Check if an environment variable is set to True."""
|
21
|
-
return os.getenv(self.
|
30
|
+
return os.getenv(self.env_var,
|
31
|
+
str(self.default)).lower() in ('true', '1')
|
32
|
+
|
33
|
+
@property
|
34
|
+
def env_key(self) -> str:
|
35
|
+
"""The environment variable key name."""
|
36
|
+
return self.value[0]
|