skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/cli.py
CHANGED
@@ -998,8 +998,10 @@ def cli():
|
|
998
998
|
@click.option('--docker',
|
999
999
|
'backend_name',
|
1000
1000
|
flag_value=backends.LocalDockerBackend.NAME,
|
1001
|
-
|
1002
|
-
help='
|
1001
|
+
hidden=True,
|
1002
|
+
help=('(Deprecated) Local docker support is deprecated. '
|
1003
|
+
'To run locally, create a local Kubernetes cluster with '
|
1004
|
+
'``sky local up``.'))
|
1003
1005
|
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
|
1004
1006
|
@click.option(
|
1005
1007
|
'--idle-minutes-to-autostop',
|
@@ -1142,6 +1144,11 @@ def launch(
|
|
1142
1144
|
backend: backends.Backend
|
1143
1145
|
if backend_name == backends.LocalDockerBackend.NAME:
|
1144
1146
|
backend = backends.LocalDockerBackend()
|
1147
|
+
click.secho(
|
1148
|
+
'WARNING: LocalDockerBackend is deprecated and will be '
|
1149
|
+
'removed in a future release. To run locally, create a local '
|
1150
|
+
'Kubernetes cluster with `sky local up`.',
|
1151
|
+
fg='yellow')
|
1145
1152
|
elif backend_name == backends.CloudVmRayBackend.NAME:
|
1146
1153
|
backend = backends.CloudVmRayBackend()
|
1147
1154
|
else:
|
@@ -3523,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3523
3530
|
if sum([bool(names), all]) != 1:
|
3524
3531
|
raise click.UsageError('Either --all or a name must be specified.')
|
3525
3532
|
if all:
|
3526
|
-
|
3527
|
-
|
3533
|
+
# Use '*' to get all storages.
|
3534
|
+
names = global_user_state.get_glob_storage_name(storage_name='*')
|
3535
|
+
if not names:
|
3528
3536
|
click.echo('No storage(s) to delete.')
|
3529
3537
|
return
|
3530
|
-
names = [s['name'] for s in storages]
|
3531
3538
|
else:
|
3532
3539
|
names = _get_glob_storages(names)
|
3533
3540
|
if names:
|
@@ -3541,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3541
3548
|
abort=True,
|
3542
3549
|
show_default=True)
|
3543
3550
|
|
3544
|
-
|
3551
|
+
def delete_storage(name: str) -> None:
|
3552
|
+
try:
|
3553
|
+
sky.storage_delete(name)
|
3554
|
+
except Exception as e: # pylint: disable=broad-except
|
3555
|
+
click.secho(f'Error deleting storage {name}: {e}', fg='red')
|
3556
|
+
|
3557
|
+
subprocess_utils.run_in_parallel(delete_storage, names)
|
3545
3558
|
|
3546
3559
|
|
3547
3560
|
@cli.group(cls=_NaturalOrderGroup)
|
@@ -3581,18 +3594,6 @@ def jobs():
|
|
3581
3594
|
is_flag=True,
|
3582
3595
|
help=('If True, as soon as a job is submitted, return from this call '
|
3583
3596
|
'and do not stream execution logs.'))
|
3584
|
-
@click.option(
|
3585
|
-
'--retry-until-up/--no-retry-until-up',
|
3586
|
-
'-r/-no-r',
|
3587
|
-
default=None,
|
3588
|
-
is_flag=True,
|
3589
|
-
required=False,
|
3590
|
-
help=(
|
3591
|
-
'(Default: True; this flag is deprecated and will be removed in a '
|
3592
|
-
'future release.) Whether to retry provisioning infinitely until the '
|
3593
|
-
'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
|
3594
|
-
'applies to launching all managed jobs (both the initial and '
|
3595
|
-
'any recovery attempts), not the jobs controller.'))
|
3596
3597
|
@click.option('--yes',
|
3597
3598
|
'-y',
|
3598
3599
|
is_flag=True,
|
@@ -3629,7 +3630,6 @@ def jobs_launch(
|
|
3629
3630
|
disk_tier: Optional[str],
|
3630
3631
|
ports: Tuple[str],
|
3631
3632
|
detach_run: bool,
|
3632
|
-
retry_until_up: Optional[bool],
|
3633
3633
|
yes: bool,
|
3634
3634
|
fast: bool,
|
3635
3635
|
):
|
@@ -3673,19 +3673,6 @@ def jobs_launch(
|
|
3673
3673
|
ports=ports,
|
3674
3674
|
job_recovery=job_recovery,
|
3675
3675
|
)
|
3676
|
-
# Deprecation. We set the default behavior to be retry until up, and the
|
3677
|
-
# flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
|
3678
|
-
if retry_until_up is not None:
|
3679
|
-
flag_str = '--retry-until-up'
|
3680
|
-
if not retry_until_up:
|
3681
|
-
flag_str = '--no-retry-until-up'
|
3682
|
-
click.secho(
|
3683
|
-
f'Flag {flag_str} is deprecated and will be removed in a '
|
3684
|
-
'future release (managed jobs will always be retried). '
|
3685
|
-
'Please file an issue if this does not work for you.',
|
3686
|
-
fg='yellow')
|
3687
|
-
else:
|
3688
|
-
retry_until_up = True
|
3689
3676
|
|
3690
3677
|
# Deprecation. The default behavior is fast, and the flag will be removed.
|
3691
3678
|
# The flag was not present in 0.7.x (only nightly), so we will remove before
|
@@ -3735,10 +3722,7 @@ def jobs_launch(
|
|
3735
3722
|
|
3736
3723
|
common_utils.check_cluster_name_is_valid(name)
|
3737
3724
|
|
3738
|
-
managed_jobs.launch(dag,
|
3739
|
-
name,
|
3740
|
-
detach_run=detach_run,
|
3741
|
-
retry_until_up=retry_until_up)
|
3725
|
+
managed_jobs.launch(dag, name, detach_run=detach_run)
|
3742
3726
|
|
3743
3727
|
|
3744
3728
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
@@ -3926,17 +3910,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3926
3910
|
required=False,
|
3927
3911
|
help='Query the latest job logs, restarting the jobs controller if stopped.'
|
3928
3912
|
)
|
3913
|
+
@click.option('--sync-down',
|
3914
|
+
'-s',
|
3915
|
+
default=False,
|
3916
|
+
is_flag=True,
|
3917
|
+
required=False,
|
3918
|
+
help='Download logs for all jobs shown in the queue.')
|
3929
3919
|
@click.argument('job_id', required=False, type=int)
|
3930
3920
|
@usage_lib.entrypoint
|
3931
3921
|
def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
3932
|
-
controller: bool, refresh: bool):
|
3933
|
-
"""Tail the log of a managed job."""
|
3922
|
+
controller: bool, refresh: bool, sync_down: bool):
|
3923
|
+
"""Tail or sync down the log of a managed job."""
|
3934
3924
|
try:
|
3935
|
-
|
3936
|
-
|
3937
|
-
|
3938
|
-
|
3939
|
-
|
3925
|
+
if sync_down:
|
3926
|
+
managed_jobs.sync_down_logs(name=name,
|
3927
|
+
job_id=job_id,
|
3928
|
+
controller=controller,
|
3929
|
+
refresh=refresh)
|
3930
|
+
else:
|
3931
|
+
managed_jobs.tail_logs(name=name,
|
3932
|
+
job_id=job_id,
|
3933
|
+
follow=follow,
|
3934
|
+
controller=controller,
|
3935
|
+
refresh=refresh)
|
3940
3936
|
except exceptions.ClusterNotUpError:
|
3941
3937
|
with ux_utils.print_exception_no_traceback():
|
3942
3938
|
raise
|
sky/cloud_stores.py
CHANGED
@@ -7,6 +7,7 @@ TODO:
|
|
7
7
|
* Better interface.
|
8
8
|
* Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
|
9
9
|
"""
|
10
|
+
import os
|
10
11
|
import shlex
|
11
12
|
import subprocess
|
12
13
|
import time
|
@@ -18,6 +19,7 @@ from sky.adaptors import aws
|
|
18
19
|
from sky.adaptors import azure
|
19
20
|
from sky.adaptors import cloudflare
|
20
21
|
from sky.adaptors import ibm
|
22
|
+
from sky.adaptors import oci
|
21
23
|
from sky.clouds import gcp
|
22
24
|
from sky.data import data_utils
|
23
25
|
from sky.data.data_utils import Rclone
|
@@ -111,8 +113,16 @@ class GcsCloudStorage(CloudStorage):
|
|
111
113
|
@property
|
112
114
|
def _gsutil_command(self):
|
113
115
|
gsutil_alias, alias_gen = data_utils.get_gsutil_command()
|
114
|
-
return (
|
115
|
-
|
116
|
+
return (
|
117
|
+
f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
|
118
|
+
f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
|
119
|
+
# Explicitly activate service account. Unlike the gcp packages
|
120
|
+
# and other GCP commands, gsutil does not automatically pick up
|
121
|
+
# the default credential keys when it is a service account.
|
122
|
+
'gcloud auth activate-service-account '
|
123
|
+
'--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
|
124
|
+
'2> /dev/null || true; '
|
125
|
+
f'{gsutil_alias}')
|
116
126
|
|
117
127
|
def is_directory(self, url: str) -> bool:
|
118
128
|
"""Returns whether 'url' is a directory.
|
@@ -470,6 +480,64 @@ class IBMCosCloudStorage(CloudStorage):
|
|
470
480
|
return self.make_sync_dir_command(source, destination)
|
471
481
|
|
472
482
|
|
483
|
+
class OciCloudStorage(CloudStorage):
|
484
|
+
"""OCI Cloud Storage."""
|
485
|
+
|
486
|
+
def is_directory(self, url: str) -> bool:
|
487
|
+
"""Returns whether OCI 'url' is a directory.
|
488
|
+
In cloud object stores, a "directory" refers to a regular object whose
|
489
|
+
name is a prefix of other objects.
|
490
|
+
"""
|
491
|
+
bucket_name, path = data_utils.split_oci_path(url)
|
492
|
+
|
493
|
+
client = oci.get_object_storage_client()
|
494
|
+
namespace = client.get_namespace(
|
495
|
+
compartment_id=oci.get_oci_config()['tenancy']).data
|
496
|
+
|
497
|
+
objects = client.list_objects(namespace_name=namespace,
|
498
|
+
bucket_name=bucket_name,
|
499
|
+
prefix=path).data.objects
|
500
|
+
|
501
|
+
if len(objects) == 0:
|
502
|
+
# A directory with few or no items
|
503
|
+
return True
|
504
|
+
|
505
|
+
if len(objects) > 1:
|
506
|
+
# A directory with more than 1 items
|
507
|
+
return True
|
508
|
+
|
509
|
+
object_name = objects[0].name
|
510
|
+
if path.endswith(object_name):
|
511
|
+
# An object path
|
512
|
+
return False
|
513
|
+
|
514
|
+
# A directory with only 1 item
|
515
|
+
return True
|
516
|
+
|
517
|
+
@oci.with_oci_env
|
518
|
+
def make_sync_dir_command(self, source: str, destination: str) -> str:
|
519
|
+
"""Downloads using OCI CLI."""
|
520
|
+
bucket_name, path = data_utils.split_oci_path(source)
|
521
|
+
|
522
|
+
download_via_ocicli = (f'oci os object sync --no-follow-symlinks '
|
523
|
+
f'--bucket-name {bucket_name} '
|
524
|
+
f'--prefix "{path}" --dest-dir "{destination}"')
|
525
|
+
|
526
|
+
return download_via_ocicli
|
527
|
+
|
528
|
+
@oci.with_oci_env
|
529
|
+
def make_sync_file_command(self, source: str, destination: str) -> str:
|
530
|
+
"""Downloads a file using OCI CLI."""
|
531
|
+
bucket_name, path = data_utils.split_oci_path(source)
|
532
|
+
filename = os.path.basename(path)
|
533
|
+
destination = os.path.join(destination, filename)
|
534
|
+
|
535
|
+
download_via_ocicli = (f'oci os object get --bucket-name {bucket_name} '
|
536
|
+
f'--name "{path}" --file "{destination}"')
|
537
|
+
|
538
|
+
return download_via_ocicli
|
539
|
+
|
540
|
+
|
473
541
|
def get_storage_from_path(url: str) -> CloudStorage:
|
474
542
|
"""Returns a CloudStorage by identifying the scheme:// in a URL."""
|
475
543
|
result = urllib.parse.urlsplit(url)
|
@@ -485,6 +553,7 @@ _REGISTRY = {
|
|
485
553
|
's3': S3CloudStorage(),
|
486
554
|
'r2': R2CloudStorage(),
|
487
555
|
'cos': IBMCosCloudStorage(),
|
556
|
+
'oci': OciCloudStorage(),
|
488
557
|
# TODO: This is a hack, as Azure URL starts with https://, we should
|
489
558
|
# refactor the registry to be able to take regex, so that Azure blob can
|
490
559
|
# be identified with `https://(.*?)\.blob\.core\.windows\.net`
|
sky/clouds/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from sky.clouds.cloud_registry import CLOUD_REGISTRY
|
|
15
15
|
from sky.clouds.aws import AWS
|
16
16
|
from sky.clouds.azure import Azure
|
17
17
|
from sky.clouds.cudo import Cudo
|
18
|
+
from sky.clouds.do import DO
|
18
19
|
from sky.clouds.fluidstack import Fluidstack
|
19
20
|
from sky.clouds.gcp import GCP
|
20
21
|
from sky.clouds.ibm import IBM
|
@@ -34,6 +35,7 @@ __all__ = [
|
|
34
35
|
'Cudo',
|
35
36
|
'GCP',
|
36
37
|
'Lambda',
|
38
|
+
'DO',
|
37
39
|
'Paperspace',
|
38
40
|
'SCP',
|
39
41
|
'RunPod',
|
sky/clouds/aws.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
import enum
|
3
3
|
import fnmatch
|
4
4
|
import functools
|
5
|
+
import hashlib
|
6
|
+
import json
|
5
7
|
import os
|
6
8
|
import re
|
7
9
|
import subprocess
|
@@ -16,6 +18,7 @@ from sky import sky_logging
|
|
16
18
|
from sky import skypilot_config
|
17
19
|
from sky.adaptors import aws
|
18
20
|
from sky.clouds import service_catalog
|
21
|
+
from sky.clouds.service_catalog import common as catalog_common
|
19
22
|
from sky.clouds.utils import aws_utils
|
20
23
|
from sky.skylet import constants
|
21
24
|
from sky.utils import common_utils
|
@@ -92,6 +95,10 @@ class AWSIdentityType(enum.Enum):
|
|
92
95
|
|
93
96
|
CONTAINER_ROLE = 'container-role'
|
94
97
|
|
98
|
+
CUSTOM_PROCESS = 'custom-process'
|
99
|
+
|
100
|
+
ASSUME_ROLE = 'assume-role'
|
101
|
+
|
95
102
|
# Name Value Type Location
|
96
103
|
# ---- ----- ---- --------
|
97
104
|
# profile <not set> None None
|
@@ -100,6 +107,24 @@ class AWSIdentityType(enum.Enum):
|
|
100
107
|
# region us-east-1 config-file ~/.aws/config
|
101
108
|
SHARED_CREDENTIALS_FILE = 'shared-credentials-file'
|
102
109
|
|
110
|
+
def can_credential_expire(self) -> bool:
|
111
|
+
"""Check if the AWS identity type can expire.
|
112
|
+
|
113
|
+
SSO,IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed
|
114
|
+
automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived
|
115
|
+
credentials without refresh.
|
116
|
+
IAM ROLE:
|
117
|
+
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
|
118
|
+
SSO/Container-role refresh token:
|
119
|
+
https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
|
120
|
+
"""
|
121
|
+
# TODO(hong): Add a CLI based check for the expiration of the temporary
|
122
|
+
# credentials
|
123
|
+
expirable_types = {
|
124
|
+
AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE
|
125
|
+
}
|
126
|
+
return self in expirable_types
|
127
|
+
|
103
128
|
|
104
129
|
@clouds.CLOUD_REGISTRY.register
|
105
130
|
class AWS(clouds.Cloud):
|
@@ -593,10 +618,27 @@ class AWS(clouds.Cloud):
|
|
593
618
|
hints = f'AWS IAM role is set.{single_cloud_hint}'
|
594
619
|
elif identity_type == AWSIdentityType.CONTAINER_ROLE:
|
595
620
|
# Similar to the IAM ROLE, an ECS container may not store credentials
|
596
|
-
# in the~/.aws/credentials file. So we don't check for the existence of
|
621
|
+
# in the ~/.aws/credentials file. So we don't check for the existence of
|
597
622
|
# the file. i.e. the container will be assigned the IAM role of the
|
598
623
|
# task: skypilot-v1.
|
599
624
|
hints = f'AWS container-role is set.{single_cloud_hint}'
|
625
|
+
elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
|
626
|
+
# Similar to the IAM ROLE, a custom process may not store credentials
|
627
|
+
# in the ~/.aws/credentials file. So we don't check for the existence of
|
628
|
+
# the file. i.e. the custom process will be assigned the IAM role of the
|
629
|
+
# task: skypilot-v1.
|
630
|
+
hints = f'AWS custom-process is set.{single_cloud_hint}'
|
631
|
+
elif identity_type == AWSIdentityType.ASSUME_ROLE:
|
632
|
+
# When using ASSUME ROLE, the credentials are coming from a different
|
633
|
+
# source profile. So we don't check for the existence of ~/.aws/credentials.
|
634
|
+
# i.e. the assumed role will be assigned the IAM role of the
|
635
|
+
# task: skypilot-v1.
|
636
|
+
hints = f'AWS assume-role is set.{single_cloud_hint}'
|
637
|
+
elif identity_type == AWSIdentityType.ENV:
|
638
|
+
# When using ENV vars, the credentials are coming from the environment
|
639
|
+
# variables. So we don't check for the existence of ~/.aws/credentials.
|
640
|
+
# i.e. the identity is not determined by the file.
|
641
|
+
hints = f'AWS env is set.{single_cloud_hint}'
|
600
642
|
else:
|
601
643
|
# This file is required because it is required by the VMs launched on
|
602
644
|
# other clouds to access private s3 buckets and resources like EC2.
|
@@ -624,14 +666,10 @@ class AWS(clouds.Cloud):
|
|
624
666
|
|
625
667
|
@classmethod
|
626
668
|
def _current_identity_type(cls) -> Optional[AWSIdentityType]:
|
627
|
-
|
628
|
-
|
629
|
-
check=False,
|
630
|
-
stdout=subprocess.PIPE,
|
631
|
-
stderr=subprocess.PIPE)
|
632
|
-
if proc.returncode != 0:
|
669
|
+
stdout = cls._aws_configure_list()
|
670
|
+
if stdout is None:
|
633
671
|
return None
|
634
|
-
|
672
|
+
output = stdout.decode()
|
635
673
|
|
636
674
|
# We determine the identity type by looking at the output of
|
637
675
|
# `aws configure list`. The output looks like:
|
@@ -646,55 +684,32 @@ class AWS(clouds.Cloud):
|
|
646
684
|
|
647
685
|
def _is_access_key_of_type(type_str: str) -> bool:
|
648
686
|
# The dot (.) does not match line separators.
|
649
|
-
results = re.findall(fr'access_key.*{type_str}',
|
687
|
+
results = re.findall(fr'access_key.*{type_str}', output)
|
650
688
|
if len(results) > 1:
|
651
689
|
raise RuntimeError(
|
652
|
-
f'Unexpected `aws configure list` output:\n{
|
690
|
+
f'Unexpected `aws configure list` output:\n{output}')
|
653
691
|
return len(results) == 1
|
654
692
|
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
|
660
|
-
return AWSIdentityType.CONTAINER_ROLE
|
661
|
-
elif _is_access_key_of_type(AWSIdentityType.ENV.value):
|
662
|
-
return AWSIdentityType.ENV
|
663
|
-
else:
|
664
|
-
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
693
|
+
for identity_type in AWSIdentityType:
|
694
|
+
if _is_access_key_of_type(identity_type.value):
|
695
|
+
return identity_type
|
696
|
+
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
665
697
|
|
666
698
|
@classmethod
|
667
|
-
@functools.lru_cache(maxsize=1)
|
668
|
-
def
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
- within the same root account, switch between different IAM
|
679
|
-
users, and treat [user_id=1234, account=A] and
|
680
|
-
[user_id=4567, account=A] to be the *same*. Namely, switching
|
681
|
-
between these IAM roles within the same root account will cause
|
682
|
-
the first element of the returned list to differ, and will allow
|
683
|
-
the same actual user to continue to interact with their clusters.
|
684
|
-
Note: this is not 100% safe, since the IAM users can have very
|
685
|
-
specific permissions, that disallow them to access the clusters
|
686
|
-
but it is a reasonable compromise as that could be rare.
|
687
|
-
|
688
|
-
Returns:
|
689
|
-
A list of strings that uniquely identifies the user on this cloud.
|
690
|
-
For identity check, we will fallback through the list of strings
|
691
|
-
until we find a match, and print a warning if we fail for the
|
692
|
-
first string.
|
699
|
+
@functools.lru_cache(maxsize=1)
|
700
|
+
def _aws_configure_list(cls) -> Optional[bytes]:
|
701
|
+
proc = subprocess.run('aws configure list',
|
702
|
+
shell=True,
|
703
|
+
check=False,
|
704
|
+
stdout=subprocess.PIPE,
|
705
|
+
stderr=subprocess.PIPE)
|
706
|
+
if proc.returncode != 0:
|
707
|
+
return None
|
708
|
+
return proc.stdout
|
693
709
|
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
"""
|
710
|
+
@classmethod
|
711
|
+
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
|
712
|
+
def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
|
698
713
|
try:
|
699
714
|
sts = aws.client('sts')
|
700
715
|
# The caller identity contains 3 fields: UserId, Account, Arn.
|
@@ -773,6 +788,72 @@ class AWS(clouds.Cloud):
|
|
773
788
|
# automatic switching for AWS. Currently we only support one identity.
|
774
789
|
return [user_ids]
|
775
790
|
|
791
|
+
@classmethod
|
792
|
+
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
|
793
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
794
|
+
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
795
|
+
|
796
|
+
These fields come from `aws sts get-caller-identity` and are cached
|
797
|
+
locally by `aws configure list` output. The identities are assumed to
|
798
|
+
be stable for the duration of the `sky` process. Modifying the
|
799
|
+
credentials while the `sky` process is running will not affect the
|
800
|
+
identity returned by this function.
|
801
|
+
|
802
|
+
We permit the same actual user to:
|
803
|
+
|
804
|
+
- switch between different root accounts (after which both elements
|
805
|
+
of the list will be different) and have their clusters owned by
|
806
|
+
each account be protected; or
|
807
|
+
|
808
|
+
- within the same root account, switch between different IAM
|
809
|
+
users, and treat [user_id=1234, account=A] and
|
810
|
+
[user_id=4567, account=A] to be the *same*. Namely, switching
|
811
|
+
between these IAM roles within the same root account will cause
|
812
|
+
the first element of the returned list to differ, and will allow
|
813
|
+
the same actual user to continue to interact with their clusters.
|
814
|
+
Note: this is not 100% safe, since the IAM users can have very
|
815
|
+
specific permissions, that disallow them to access the clusters
|
816
|
+
but it is a reasonable compromise as that could be rare.
|
817
|
+
|
818
|
+
Returns:
|
819
|
+
A list of strings that uniquely identifies the user on this cloud.
|
820
|
+
For identity check, we will fallback through the list of strings
|
821
|
+
until we find a match, and print a warning if we fail for the
|
822
|
+
first string.
|
823
|
+
|
824
|
+
Raises:
|
825
|
+
exceptions.CloudUserIdentityError: if the user identity cannot be
|
826
|
+
retrieved.
|
827
|
+
"""
|
828
|
+
stdout = cls._aws_configure_list()
|
829
|
+
if stdout is None:
|
830
|
+
# `aws configure list` is not available, possible reasons:
|
831
|
+
# - awscli is not installed but credentials are valid, e.g. run from
|
832
|
+
# an EC2 instance with IAM role
|
833
|
+
# - aws credentials are not set, proceed anyway to get unified error
|
834
|
+
# message for users
|
835
|
+
return cls._sts_get_caller_identity()
|
836
|
+
config_hash = hashlib.md5(stdout).hexdigest()[:8]
|
837
|
+
# Getting aws identity cost ~1s, so we cache the result with the output of
|
838
|
+
# `aws configure list` as cache key. Different `aws configure list` output
|
839
|
+
# can have same aws identity, our assumption is the output would be stable
|
840
|
+
# in real world, so the number of cache files would be limited.
|
841
|
+
# TODO(aylei): consider using a more stable cache key and evalute eviction.
|
842
|
+
cache_path = catalog_common.get_catalog_path(
|
843
|
+
f'aws/.cache/user-identity-{config_hash}.txt')
|
844
|
+
if os.path.exists(cache_path):
|
845
|
+
try:
|
846
|
+
with open(cache_path, 'r', encoding='utf-8') as f:
|
847
|
+
return json.loads(f.read())
|
848
|
+
except json.JSONDecodeError:
|
849
|
+
# cache is invalid, ignore it and fetch identity again
|
850
|
+
pass
|
851
|
+
|
852
|
+
result = cls._sts_get_caller_identity()
|
853
|
+
with open(cache_path, 'w', encoding='utf-8') as f:
|
854
|
+
f.write(json.dumps(result))
|
855
|
+
return result
|
856
|
+
|
776
857
|
@classmethod
|
777
858
|
def get_active_user_identity_str(cls) -> Optional[str]:
|
778
859
|
user_identity = cls.get_active_user_identity()
|
@@ -812,6 +893,12 @@ class AWS(clouds.Cloud):
|
|
812
893
|
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
|
813
894
|
}
|
814
895
|
|
896
|
+
@functools.lru_cache(maxsize=1)
|
897
|
+
def can_credential_expire(self) -> bool:
|
898
|
+
identity_type = self._current_identity_type()
|
899
|
+
return identity_type is not None and identity_type.can_credential_expire(
|
900
|
+
)
|
901
|
+
|
815
902
|
def instance_type_exists(self, instance_type):
|
816
903
|
return service_catalog.instance_type_exists(instance_type, clouds='aws')
|
817
904
|
|
sky/clouds/cloud.py
CHANGED
@@ -536,6 +536,10 @@ class Cloud:
|
|
536
536
|
"""
|
537
537
|
raise NotImplementedError
|
538
538
|
|
539
|
+
def can_credential_expire(self) -> bool:
|
540
|
+
"""Returns whether the cloud credential can expire."""
|
541
|
+
return False
|
542
|
+
|
539
543
|
@classmethod
|
540
544
|
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
541
545
|
"""Check the image size from the cloud.
|