skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/cli.py CHANGED
@@ -998,8 +998,10 @@ def cli():
998
998
  @click.option('--docker',
999
999
  'backend_name',
1000
1000
  flag_value=backends.LocalDockerBackend.NAME,
1001
- default=False,
1002
- help='If used, runs locally inside a docker container.')
1001
+ hidden=True,
1002
+ help=('(Deprecated) Local docker support is deprecated. '
1003
+ 'To run locally, create a local Kubernetes cluster with '
1004
+ '``sky local up``.'))
1003
1005
  @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
1004
1006
  @click.option(
1005
1007
  '--idle-minutes-to-autostop',
@@ -1142,6 +1144,11 @@ def launch(
1142
1144
  backend: backends.Backend
1143
1145
  if backend_name == backends.LocalDockerBackend.NAME:
1144
1146
  backend = backends.LocalDockerBackend()
1147
+ click.secho(
1148
+ 'WARNING: LocalDockerBackend is deprecated and will be '
1149
+ 'removed in a future release. To run locally, create a local '
1150
+ 'Kubernetes cluster with `sky local up`.',
1151
+ fg='yellow')
1145
1152
  elif backend_name == backends.CloudVmRayBackend.NAME:
1146
1153
  backend = backends.CloudVmRayBackend()
1147
1154
  else:
@@ -3523,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3523
3530
  if sum([bool(names), all]) != 1:
3524
3531
  raise click.UsageError('Either --all or a name must be specified.')
3525
3532
  if all:
3526
- storages = sky.storage_ls()
3527
- if not storages:
3533
+ # Use '*' to get all storages.
3534
+ names = global_user_state.get_glob_storage_name(storage_name='*')
3535
+ if not names:
3528
3536
  click.echo('No storage(s) to delete.')
3529
3537
  return
3530
- names = [s['name'] for s in storages]
3531
3538
  else:
3532
3539
  names = _get_glob_storages(names)
3533
3540
  if names:
@@ -3541,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3541
3548
  abort=True,
3542
3549
  show_default=True)
3543
3550
 
3544
- subprocess_utils.run_in_parallel(sky.storage_delete, names)
3551
+ def delete_storage(name: str) -> None:
3552
+ try:
3553
+ sky.storage_delete(name)
3554
+ except Exception as e: # pylint: disable=broad-except
3555
+ click.secho(f'Error deleting storage {name}: {e}', fg='red')
3556
+
3557
+ subprocess_utils.run_in_parallel(delete_storage, names)
3545
3558
 
3546
3559
 
3547
3560
  @cli.group(cls=_NaturalOrderGroup)
@@ -3581,18 +3594,6 @@ def jobs():
3581
3594
  is_flag=True,
3582
3595
  help=('If True, as soon as a job is submitted, return from this call '
3583
3596
  'and do not stream execution logs.'))
3584
- @click.option(
3585
- '--retry-until-up/--no-retry-until-up',
3586
- '-r/-no-r',
3587
- default=None,
3588
- is_flag=True,
3589
- required=False,
3590
- help=(
3591
- '(Default: True; this flag is deprecated and will be removed in a '
3592
- 'future release.) Whether to retry provisioning infinitely until the '
3593
- 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
3594
- 'applies to launching all managed jobs (both the initial and '
3595
- 'any recovery attempts), not the jobs controller.'))
3596
3597
  @click.option('--yes',
3597
3598
  '-y',
3598
3599
  is_flag=True,
@@ -3629,7 +3630,6 @@ def jobs_launch(
3629
3630
  disk_tier: Optional[str],
3630
3631
  ports: Tuple[str],
3631
3632
  detach_run: bool,
3632
- retry_until_up: Optional[bool],
3633
3633
  yes: bool,
3634
3634
  fast: bool,
3635
3635
  ):
@@ -3673,19 +3673,6 @@ def jobs_launch(
3673
3673
  ports=ports,
3674
3674
  job_recovery=job_recovery,
3675
3675
  )
3676
- # Deprecation. We set the default behavior to be retry until up, and the
3677
- # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
3678
- if retry_until_up is not None:
3679
- flag_str = '--retry-until-up'
3680
- if not retry_until_up:
3681
- flag_str = '--no-retry-until-up'
3682
- click.secho(
3683
- f'Flag {flag_str} is deprecated and will be removed in a '
3684
- 'future release (managed jobs will always be retried). '
3685
- 'Please file an issue if this does not work for you.',
3686
- fg='yellow')
3687
- else:
3688
- retry_until_up = True
3689
3676
 
3690
3677
  # Deprecation. The default behavior is fast, and the flag will be removed.
3691
3678
  # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3735,10 +3722,7 @@ def jobs_launch(
3735
3722
 
3736
3723
  common_utils.check_cluster_name_is_valid(name)
3737
3724
 
3738
- managed_jobs.launch(dag,
3739
- name,
3740
- detach_run=detach_run,
3741
- retry_until_up=retry_until_up)
3725
+ managed_jobs.launch(dag, name, detach_run=detach_run)
3742
3726
 
3743
3727
 
3744
3728
  @jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -3926,17 +3910,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3926
3910
  required=False,
3927
3911
  help='Query the latest job logs, restarting the jobs controller if stopped.'
3928
3912
  )
3913
+ @click.option('--sync-down',
3914
+ '-s',
3915
+ default=False,
3916
+ is_flag=True,
3917
+ required=False,
3918
+ help='Download logs for all jobs shown in the queue.')
3929
3919
  @click.argument('job_id', required=False, type=int)
3930
3920
  @usage_lib.entrypoint
3931
3921
  def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
3932
- controller: bool, refresh: bool):
3933
- """Tail the log of a managed job."""
3922
+ controller: bool, refresh: bool, sync_down: bool):
3923
+ """Tail or sync down the log of a managed job."""
3934
3924
  try:
3935
- managed_jobs.tail_logs(name=name,
3936
- job_id=job_id,
3937
- follow=follow,
3938
- controller=controller,
3939
- refresh=refresh)
3925
+ if sync_down:
3926
+ managed_jobs.sync_down_logs(name=name,
3927
+ job_id=job_id,
3928
+ controller=controller,
3929
+ refresh=refresh)
3930
+ else:
3931
+ managed_jobs.tail_logs(name=name,
3932
+ job_id=job_id,
3933
+ follow=follow,
3934
+ controller=controller,
3935
+ refresh=refresh)
3940
3936
  except exceptions.ClusterNotUpError:
3941
3937
  with ux_utils.print_exception_no_traceback():
3942
3938
  raise
sky/cloud_stores.py CHANGED
@@ -7,6 +7,7 @@ TODO:
7
7
  * Better interface.
8
8
  * Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
9
9
  """
10
+ import os
10
11
  import shlex
11
12
  import subprocess
12
13
  import time
@@ -18,6 +19,7 @@ from sky.adaptors import aws
18
19
  from sky.adaptors import azure
19
20
  from sky.adaptors import cloudflare
20
21
  from sky.adaptors import ibm
22
+ from sky.adaptors import oci
21
23
  from sky.clouds import gcp
22
24
  from sky.data import data_utils
23
25
  from sky.data.data_utils import Rclone
@@ -111,8 +113,16 @@ class GcsCloudStorage(CloudStorage):
111
113
  @property
112
114
  def _gsutil_command(self):
113
115
  gsutil_alias, alias_gen = data_utils.get_gsutil_command()
114
- return (f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
115
- f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH} {gsutil_alias}')
116
+ return (
117
+ f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
118
+ f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
119
+ # Explicitly activate service account. Unlike the gcp packages
120
+ # and other GCP commands, gsutil does not automatically pick up
121
+ # the default credential keys when it is a service account.
122
+ 'gcloud auth activate-service-account '
123
+ '--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
124
+ '2> /dev/null || true; '
125
+ f'{gsutil_alias}')
116
126
 
117
127
  def is_directory(self, url: str) -> bool:
118
128
  """Returns whether 'url' is a directory.
@@ -470,6 +480,64 @@ class IBMCosCloudStorage(CloudStorage):
470
480
  return self.make_sync_dir_command(source, destination)
471
481
 
472
482
 
483
+ class OciCloudStorage(CloudStorage):
484
+ """OCI Cloud Storage."""
485
+
486
+ def is_directory(self, url: str) -> bool:
487
+ """Returns whether OCI 'url' is a directory.
488
+ In cloud object stores, a "directory" refers to a regular object whose
489
+ name is a prefix of other objects.
490
+ """
491
+ bucket_name, path = data_utils.split_oci_path(url)
492
+
493
+ client = oci.get_object_storage_client()
494
+ namespace = client.get_namespace(
495
+ compartment_id=oci.get_oci_config()['tenancy']).data
496
+
497
+ objects = client.list_objects(namespace_name=namespace,
498
+ bucket_name=bucket_name,
499
+ prefix=path).data.objects
500
+
501
+ if len(objects) == 0:
502
+ # No objects match the prefix; treat the path as an (empty) directory
503
+ return True
504
+
505
+ if len(objects) > 1:
506
+ # A directory with more than one item
507
+ return True
508
+
509
+ object_name = objects[0].name
510
+ if path.endswith(object_name):
511
+ # An object path
512
+ return False
513
+
514
+ # A directory with only 1 item
515
+ return True
516
+
517
+ @oci.with_oci_env
518
+ def make_sync_dir_command(self, source: str, destination: str) -> str:
519
+ """Downloads using OCI CLI."""
520
+ bucket_name, path = data_utils.split_oci_path(source)
521
+
522
+ download_via_ocicli = (f'oci os object sync --no-follow-symlinks '
523
+ f'--bucket-name {bucket_name} '
524
+ f'--prefix "{path}" --dest-dir "{destination}"')
525
+
526
+ return download_via_ocicli
527
+
528
+ @oci.with_oci_env
529
+ def make_sync_file_command(self, source: str, destination: str) -> str:
530
+ """Downloads a file using OCI CLI."""
531
+ bucket_name, path = data_utils.split_oci_path(source)
532
+ filename = os.path.basename(path)
533
+ destination = os.path.join(destination, filename)
534
+
535
+ download_via_ocicli = (f'oci os object get --bucket-name {bucket_name} '
536
+ f'--name "{path}" --file "{destination}"')
537
+
538
+ return download_via_ocicli
539
+
540
+
473
541
  def get_storage_from_path(url: str) -> CloudStorage:
474
542
  """Returns a CloudStorage by identifying the scheme:// in a URL."""
475
543
  result = urllib.parse.urlsplit(url)
@@ -485,6 +553,7 @@ _REGISTRY = {
485
553
  's3': S3CloudStorage(),
486
554
  'r2': R2CloudStorage(),
487
555
  'cos': IBMCosCloudStorage(),
556
+ 'oci': OciCloudStorage(),
488
557
  # TODO: This is a hack, as Azure URL starts with https://, we should
489
558
  # refactor the registry to be able to take regex, so that Azure blob can
490
559
  # be identified with `https://(.*?)\.blob\.core\.windows\.net`
sky/clouds/__init__.py CHANGED
@@ -15,6 +15,7 @@ from sky.clouds.cloud_registry import CLOUD_REGISTRY
15
15
  from sky.clouds.aws import AWS
16
16
  from sky.clouds.azure import Azure
17
17
  from sky.clouds.cudo import Cudo
18
+ from sky.clouds.do import DO
18
19
  from sky.clouds.fluidstack import Fluidstack
19
20
  from sky.clouds.gcp import GCP
20
21
  from sky.clouds.ibm import IBM
@@ -34,6 +35,7 @@ __all__ = [
34
35
  'Cudo',
35
36
  'GCP',
36
37
  'Lambda',
38
+ 'DO',
37
39
  'Paperspace',
38
40
  'SCP',
39
41
  'RunPod',
sky/clouds/aws.py CHANGED
@@ -2,6 +2,8 @@
2
2
  import enum
3
3
  import fnmatch
4
4
  import functools
5
+ import hashlib
6
+ import json
5
7
  import os
6
8
  import re
7
9
  import subprocess
@@ -16,6 +18,7 @@ from sky import sky_logging
16
18
  from sky import skypilot_config
17
19
  from sky.adaptors import aws
18
20
  from sky.clouds import service_catalog
21
+ from sky.clouds.service_catalog import common as catalog_common
19
22
  from sky.clouds.utils import aws_utils
20
23
  from sky.skylet import constants
21
24
  from sky.utils import common_utils
@@ -92,6 +95,10 @@ class AWSIdentityType(enum.Enum):
92
95
 
93
96
  CONTAINER_ROLE = 'container-role'
94
97
 
98
+ CUSTOM_PROCESS = 'custom-process'
99
+
100
+ ASSUME_ROLE = 'assume-role'
101
+
95
102
  # Name Value Type Location
96
103
  # ---- ----- ---- --------
97
104
  # profile <not set> None None
@@ -100,6 +107,24 @@ class AWSIdentityType(enum.Enum):
100
107
  # region us-east-1 config-file ~/.aws/config
101
108
  SHARED_CREDENTIALS_FILE = 'shared-credentials-file'
102
109
 
110
+ def can_credential_expire(self) -> bool:
111
+ """Check if the AWS identity type can expire.
112
+
113
+ SSO, IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed
114
+ automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived
115
+ credentials without refresh.
116
+ IAM ROLE:
117
+ https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
118
+ SSO/Container-role refresh token:
119
+ https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
120
+ """
121
+ # TODO(hong): Add a CLI based check for the expiration of the temporary
122
+ # credentials
123
+ expirable_types = {
124
+ AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE
125
+ }
126
+ return self in expirable_types
127
+
103
128
 
104
129
  @clouds.CLOUD_REGISTRY.register
105
130
  class AWS(clouds.Cloud):
@@ -593,10 +618,27 @@ class AWS(clouds.Cloud):
593
618
  hints = f'AWS IAM role is set.{single_cloud_hint}'
594
619
  elif identity_type == AWSIdentityType.CONTAINER_ROLE:
595
620
  # Similar to the IAM ROLE, an ECS container may not store credentials
596
- # in the~/.aws/credentials file. So we don't check for the existence of
621
+ # in the ~/.aws/credentials file. So we don't check for the existence of
597
622
  # the file. i.e. the container will be assigned the IAM role of the
598
623
  # task: skypilot-v1.
599
624
  hints = f'AWS container-role is set.{single_cloud_hint}'
625
+ elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
626
+ # Similar to the IAM ROLE, a custom process may not store credentials
627
+ # in the ~/.aws/credentials file. So we don't check for the existence of
628
+ # the file. i.e. the custom process will be assigned the IAM role of the
629
+ # task: skypilot-v1.
630
+ hints = f'AWS custom-process is set.{single_cloud_hint}'
631
+ elif identity_type == AWSIdentityType.ASSUME_ROLE:
632
+ # When using ASSUME ROLE, the credentials are coming from a different
633
+ # source profile. So we don't check for the existence of ~/.aws/credentials.
634
+ # i.e. the assumed role will be assigned the IAM role of the
635
+ # task: skypilot-v1.
636
+ hints = f'AWS assume-role is set.{single_cloud_hint}'
637
+ elif identity_type == AWSIdentityType.ENV:
638
+ # When using ENV vars, the credentials are coming from the environment
639
+ # variables. So we don't check for the existence of ~/.aws/credentials.
640
+ # i.e. the identity is not determined by the file.
641
+ hints = f'AWS env is set.{single_cloud_hint}'
600
642
  else:
601
643
  # This file is required because it is required by the VMs launched on
602
644
  # other clouds to access private s3 buckets and resources like EC2.
@@ -624,14 +666,10 @@ class AWS(clouds.Cloud):
624
666
 
625
667
  @classmethod
626
668
  def _current_identity_type(cls) -> Optional[AWSIdentityType]:
627
- proc = subprocess.run('aws configure list',
628
- shell=True,
629
- check=False,
630
- stdout=subprocess.PIPE,
631
- stderr=subprocess.PIPE)
632
- if proc.returncode != 0:
669
+ stdout = cls._aws_configure_list()
670
+ if stdout is None:
633
671
  return None
634
- stdout = proc.stdout.decode()
672
+ output = stdout.decode()
635
673
 
636
674
  # We determine the identity type by looking at the output of
637
675
  # `aws configure list`. The output looks like:
@@ -646,55 +684,32 @@ class AWS(clouds.Cloud):
646
684
 
647
685
  def _is_access_key_of_type(type_str: str) -> bool:
648
686
  # The dot (.) does not match line separators.
649
- results = re.findall(fr'access_key.*{type_str}', stdout)
687
+ results = re.findall(fr'access_key.*{type_str}', output)
650
688
  if len(results) > 1:
651
689
  raise RuntimeError(
652
- f'Unexpected `aws configure list` output:\n{stdout}')
690
+ f'Unexpected `aws configure list` output:\n{output}')
653
691
  return len(results) == 1
654
692
 
655
- if _is_access_key_of_type(AWSIdentityType.SSO.value):
656
- return AWSIdentityType.SSO
657
- elif _is_access_key_of_type(AWSIdentityType.IAM_ROLE.value):
658
- return AWSIdentityType.IAM_ROLE
659
- elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
660
- return AWSIdentityType.CONTAINER_ROLE
661
- elif _is_access_key_of_type(AWSIdentityType.ENV.value):
662
- return AWSIdentityType.ENV
663
- else:
664
- return AWSIdentityType.SHARED_CREDENTIALS_FILE
693
+ for identity_type in AWSIdentityType:
694
+ if _is_access_key_of_type(identity_type.value):
695
+ return identity_type
696
+ return AWSIdentityType.SHARED_CREDENTIALS_FILE
665
697
 
666
698
  @classmethod
667
- @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
668
- def get_user_identities(cls) -> Optional[List[List[str]]]:
669
- """Returns a [UserId, Account] list that uniquely identifies the user.
670
-
671
- These fields come from `aws sts get-caller-identity`. We permit the same
672
- actual user to:
673
-
674
- - switch between different root accounts (after which both elements
675
- of the list will be different) and have their clusters owned by
676
- each account be protected; or
677
-
678
- - within the same root account, switch between different IAM
679
- users, and treat [user_id=1234, account=A] and
680
- [user_id=4567, account=A] to be the *same*. Namely, switching
681
- between these IAM roles within the same root account will cause
682
- the first element of the returned list to differ, and will allow
683
- the same actual user to continue to interact with their clusters.
684
- Note: this is not 100% safe, since the IAM users can have very
685
- specific permissions, that disallow them to access the clusters
686
- but it is a reasonable compromise as that could be rare.
687
-
688
- Returns:
689
- A list of strings that uniquely identifies the user on this cloud.
690
- For identity check, we will fallback through the list of strings
691
- until we find a match, and print a warning if we fail for the
692
- first string.
699
+ @functools.lru_cache(maxsize=1)
700
+ def _aws_configure_list(cls) -> Optional[bytes]:
701
+ proc = subprocess.run('aws configure list',
702
+ shell=True,
703
+ check=False,
704
+ stdout=subprocess.PIPE,
705
+ stderr=subprocess.PIPE)
706
+ if proc.returncode != 0:
707
+ return None
708
+ return proc.stdout
693
709
 
694
- Raises:
695
- exceptions.CloudUserIdentityError: if the user identity cannot be
696
- retrieved.
697
- """
710
+ @classmethod
711
+ @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
712
+ def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
698
713
  try:
699
714
  sts = aws.client('sts')
700
715
  # The caller identity contains 3 fields: UserId, Account, Arn.
@@ -773,6 +788,72 @@ class AWS(clouds.Cloud):
773
788
  # automatic switching for AWS. Currently we only support one identity.
774
789
  return [user_ids]
775
790
 
791
+ @classmethod
792
+ @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
793
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
794
+ """Returns a [UserId, Account] list that uniquely identifies the user.
795
+
796
+ These fields come from `aws sts get-caller-identity` and are cached
797
+ locally, keyed by the `aws configure list` output. The identities are assumed to
798
+ be stable for the duration of the `sky` process. Modifying the
799
+ credentials while the `sky` process is running will not affect the
800
+ identity returned by this function.
801
+
802
+ We permit the same actual user to:
803
+
804
+ - switch between different root accounts (after which both elements
805
+ of the list will be different) and have their clusters owned by
806
+ each account be protected; or
807
+
808
+ - within the same root account, switch between different IAM
809
+ users, and treat [user_id=1234, account=A] and
810
+ [user_id=4567, account=A] to be the *same*. Namely, switching
811
+ between these IAM roles within the same root account will cause
812
+ the first element of the returned list to differ, and will allow
813
+ the same actual user to continue to interact with their clusters.
814
+ Note: this is not 100% safe, since the IAM users can have very
815
+ specific permissions, that disallow them to access the clusters
816
+ but it is a reasonable compromise as that could be rare.
817
+
818
+ Returns:
819
+ A list of strings that uniquely identifies the user on this cloud.
820
+ For identity check, we will fallback through the list of strings
821
+ until we find a match, and print a warning if we fail for the
822
+ first string.
823
+
824
+ Raises:
825
+ exceptions.CloudUserIdentityError: if the user identity cannot be
826
+ retrieved.
827
+ """
828
+ stdout = cls._aws_configure_list()
829
+ if stdout is None:
830
+ # `aws configure list` is not available, possible reasons:
831
+ # - awscli is not installed but credentials are valid, e.g. run from
832
+ # an EC2 instance with IAM role
833
+ # - aws credentials are not set, proceed anyway to get unified error
834
+ # message for users
835
+ return cls._sts_get_caller_identity()
836
+ config_hash = hashlib.md5(stdout).hexdigest()[:8]
837
+ # Getting the AWS identity costs ~1s, so we cache the result with the output of
838
+ # `aws configure list` as cache key. Different `aws configure list` output
839
+ # can have same aws identity, our assumption is the output would be stable
840
+ # in real world, so the number of cache files would be limited.
841
+ # TODO(aylei): consider using a more stable cache key and evaluate eviction.
842
+ cache_path = catalog_common.get_catalog_path(
843
+ f'aws/.cache/user-identity-{config_hash}.txt')
844
+ if os.path.exists(cache_path):
845
+ try:
846
+ with open(cache_path, 'r', encoding='utf-8') as f:
847
+ return json.loads(f.read())
848
+ except json.JSONDecodeError:
849
+ # cache is invalid, ignore it and fetch identity again
850
+ pass
851
+
852
+ result = cls._sts_get_caller_identity()
853
+ with open(cache_path, 'w', encoding='utf-8') as f:
854
+ f.write(json.dumps(result))
855
+ return result
856
+
776
857
  @classmethod
777
858
  def get_active_user_identity_str(cls) -> Optional[str]:
778
859
  user_identity = cls.get_active_user_identity()
@@ -812,6 +893,12 @@ class AWS(clouds.Cloud):
812
893
  if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
813
894
  }
814
895
 
896
+ @functools.lru_cache(maxsize=1)
897
+ def can_credential_expire(self) -> bool:
898
+ identity_type = self._current_identity_type()
899
+ return identity_type is not None and identity_type.can_credential_expire(
900
+ )
901
+
815
902
  def instance_type_exists(self, instance_type):
816
903
  return service_catalog.instance_type_exists(instance_type, clouds='aws')
817
904
 
sky/clouds/cloud.py CHANGED
@@ -536,6 +536,10 @@ class Cloud:
536
536
  """
537
537
  raise NotImplementedError
538
538
 
539
+ def can_credential_expire(self) -> bool:
540
+ """Returns whether the cloud credential can expire."""
541
+ return False
542
+
539
543
  @classmethod
540
544
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
541
545
  """Check the image size from the cloud.