skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +142 -74
  3. sky/backends/cloud_vm_ray_backend.py +15 -11
  4. sky/cli.py +15 -4
  5. sky/clouds/aws.py +1 -0
  6. sky/clouds/oci.py +0 -2
  7. sky/clouds/service_catalog/aws_catalog.py +2 -0
  8. sky/clouds/utils/oci_utils.py +5 -0
  9. sky/execution.py +43 -22
  10. sky/global_user_state.py +36 -16
  11. sky/jobs/core.py +0 -1
  12. sky/jobs/utils.py +4 -3
  13. sky/provision/kubernetes/utils.py +2 -0
  14. sky/provision/oci/instance.py +12 -11
  15. sky/provision/oci/query_utils.py +212 -6
  16. sky/serve/core.py +1 -0
  17. sky/serve/serve_utils.py +35 -30
  18. sky/skylet/constants.py +1 -1
  19. sky/skylet/job_lib.py +249 -138
  20. sky/skylet/log_lib.py +1 -34
  21. sky/skylet/subprocess_daemon.py +33 -13
  22. sky/utils/controller_utils.py +10 -9
  23. sky/utils/schemas.py +1 -0
  24. sky/utils/subprocess_utils.py +50 -0
  25. sky/utils/timeline.py +2 -4
  26. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
  27. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +31 -31
  28. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'a2e670d347b666f40edb7f675af87c86faec3971'
+ _SKYPILOT_COMMIT_SHA = '95e2f281a0441b2043ee1bea7d7cddb4e2e69782'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241114'
+ __version__ = '1.0.0.dev20241116'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))


sky/backends/backend_utils.py CHANGED
@@ -100,6 +100,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
  CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
  CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20

+ # Time that must elapse since the last status check before we should re-check if
+ # the cluster has been terminated or autostopped.
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
+
  # Filelocks for updating cluster's file_mounts.
  CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
  '~/.sky/.{}_file_mounts.lock')
@@ -686,26 +690,56 @@ def write_cluster_config(
  skypilot_config.get_nested(
  (str(to_provision.cloud).lower(), 'specific_reservations'), set()))

+ # Remote identity handling can have 4 cases:
+ # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
+ # 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
+ # 3. Custom service account: Use specified service account
+ # 4. NO_UPLOAD: Do not upload any credentials
+ #
+ # We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
+ # other cases, we exclude the cloud from credential file uploads after
+ # running required checks.
  assert cluster_name is not None
- excluded_clouds = []
+ excluded_clouds = set()
  remote_identity_config = skypilot_config.get_nested(
  (str(cloud).lower(), 'remote_identity'), None)
  remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
  if isinstance(remote_identity_config, str):
  remote_identity = remote_identity_config
  if isinstance(remote_identity_config, list):
+ # Some clouds (e.g., AWS) support specifying multiple service accounts
+ # chosen based on the cluster name. Do the matching here to pick the
+ # correct one.
  for profile in remote_identity_config:
  if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
  remote_identity = list(profile.values())[0]
  break
  if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
- if not cloud.supports_service_account_on_remote():
+ # If LOCAL_CREDENTIALS is not specified, we add the cloud to the
+ # excluded_clouds set, but we must also check if the cloud supports
+ # service accounts.
+ if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
+ # If NO_UPLOAD is specified, fall back to default remote identity
+ # for downstream logic but add it to excluded_clouds to skip
+ # credential file uploads.
+ remote_identity = schemas.get_default_remote_identity(
+ str(cloud).lower())
+ elif not cloud.supports_service_account_on_remote():
  raise exceptions.InvalidCloudConfigs(
  'remote_identity: SERVICE_ACCOUNT is specified in '
  f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
  'is not supported by this cloud. Remove the config or set: '
  '`remote_identity: LOCAL_CREDENTIALS`.')
- excluded_clouds = [cloud]
+ excluded_clouds.add(cloud)
+
+ for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
+ remote_identity_config = skypilot_config.get_nested(
+ (cloud_str.lower(), 'remote_identity'), None)
+ if remote_identity_config:
+ if (remote_identity_config ==
+ schemas.RemoteIdentityOptions.NO_UPLOAD.value):
+ excluded_clouds.add(cloud_obj)
+
  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)

  auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
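The per-cluster service account selection above is driven by fnmatch-style patterns in the remote_identity config. A minimal standalone sketch of that matching logic (the config contents and the pick_remote_identity helper are illustrative assumptions, not SkyPilot's actual config loader):

import fnmatch

# Hypothetical remote_identity config: a list of one-entry dicts mapping a
# cluster-name glob to the identity to use for matching clusters.
remote_identity_config = [
    {'train-*': 'sa-training@example-project.iam'},
    {'serve-prod': 'sa-serving@example-project.iam'},
]

def pick_remote_identity(cluster_name: str, default: str) -> str:
    # First matching glob wins, mirroring the loop in write_cluster_config.
    for profile in remote_identity_config:
        pattern, identity = next(iter(profile.items()))
        if fnmatch.fnmatchcase(cluster_name, pattern):
            return identity
    return default

assert pick_remote_identity('train-llm', 'LOCAL_CREDENTIALS') == 'sa-training@example-project.iam'
assert pick_remote_identity('dev-box', 'LOCAL_CREDENTIALS') == 'LOCAL_CREDENTIALS'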
@@ -1669,11 +1703,27 @@ def check_can_clone_disk_and_override_task(

  def _update_cluster_status_no_lock(
  cluster_name: str) -> Optional[Dict[str, Any]]:
- """Updates the status of the cluster.
+ """Update the cluster status.
+
+ The cluster status is updated by checking ray cluster and real status from
+ cloud.
+
+ The function will update the cached cluster status in the global state. For
+ the design of the cluster status and transition, please refer to the
+ sky/design_docs/cluster_status.md
+
+ Returns:
+ If the cluster is terminated or does not exist, return None. Otherwise
+ returns the input record with status and handle potentially updated.

  Raises:
+ exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+ not the same as the user who created the cluster.
+ exceptions.CloudUserIdentityError: if we fail to get the current user
+ identity.
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
- fetched from the cloud provider.
+ fetched from the cloud provider, or there are leaked nodes causing
+ the node number to be larger than expected.
  """
  record = global_user_state.get_cluster_from_name(cluster_name)
  if record is None:
@@ -1893,52 +1943,22 @@ def _update_cluster_status_no_lock(
  return global_user_state.get_cluster_from_name(cluster_name)


- def _update_cluster_status(
- cluster_name: str,
- acquire_per_cluster_status_lock: bool,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
- """Update the cluster status.
-
- The cluster status is updated by checking ray cluster and real status from
- cloud.
-
- The function will update the cached cluster status in the global state. For
- the design of the cluster status and transition, please refer to the
- sky/design_docs/cluster_status.md
-
- Args:
- cluster_name: The name of the cluster.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status.
- cluster_status_lock_timeout: The timeout to acquire the per-cluster
- lock.
-
- Returns:
- If the cluster is terminated or does not exist, return None. Otherwise
- returns the input record with status and handle potentially updated.
+ def _must_refresh_cluster_status(
+ record: Dict[str, Any],
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
+ ) -> bool:
+ force_refresh_for_cluster = (force_refresh_statuses is not None and
+ record['status'] in force_refresh_statuses)

- Raises:
- exceptions.ClusterOwnerIdentityMismatchError: if the current user is
- not the same as the user who created the cluster.
- exceptions.CloudUserIdentityError: if we fail to get the current user
- identity.
- exceptions.ClusterStatusFetchingError: the cluster status cannot be
- fetched from the cloud provider or there are leaked nodes causing
- the node number larger than expected.
- """
- if not acquire_per_cluster_status_lock:
- return _update_cluster_status_no_lock(cluster_name)
+ use_spot = record['handle'].launched_resources.use_spot
+ has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
+ record['autostop'] >= 0)
+ recently_refreshed = (record['status_updated_at'] is not None and
+ time.time() - record['status_updated_at'] <
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
+ is_stale = (use_spot or has_autostop) and not recently_refreshed

- try:
- with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
- timeout=cluster_status_lock_timeout):
- return _update_cluster_status_no_lock(cluster_name)
- except filelock.Timeout:
- logger.debug('Refreshing status: Failed get the lock for cluster '
- f'{cluster_name!r}. Using the cached status.')
- record = global_user_state.get_cluster_from_name(cluster_name)
- return record
+ return force_refresh_for_cluster or is_stale


  def refresh_cluster_record(
@@ -1956,16 +1976,22 @@ def refresh_cluster_record(

  Args:
  cluster_name: The name of the cluster.
- force_refresh_statuses: if specified, refresh the cluster if it has one of
- the specified statuses. Additionally, clusters satisfying the
- following conditions will always be refreshed no matter the
- argument is specified or not:
- 1. is a spot cluster, or
- 2. is a non-spot cluster, is not STOPPED, and autostop is set.
+ force_refresh_statuses: if specified, refresh the cluster if it has one
+ of the specified statuses. Additionally, clusters satisfying the
+ following conditions will be refreshed no matter the argument is
+ specified or not:
+ - the latest available status update is more than
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
+ 1. the cluster is a spot cluster, or
+ 2. cluster autostop is set and the cluster is not STOPPED.
  acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status.
+ before updating the status. Even if this is True, the lock may not be
+ acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
- lock. If timeout, the function will use the cached status.
+ lock. If it times out, the function will use the cached status. If the
+ value is <0, do not timeout (wait for the lock indefinitely). By
+ default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
+ if correctness is required, you must set this to -1.

  Returns:
  If the cluster is terminated or does not exist, return None.
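Read together, these docstring updates describe a call pattern like the following sketch, where a caller that must not act on a stale INIT status waits for the per-cluster lock indefinitely (illustrative usage only; the cluster name is made up):

from sky import status_lib
from sky.backends import backend_utils

# Force a real refresh for INIT clusters and never give up on the lock, so we
# only proceed once any in-flight provisioning has released it.
record = backend_utils.refresh_cluster_record(
    'my-cluster',
    force_refresh_statuses={status_lib.ClusterStatus.INIT},
    acquire_per_cluster_status_lock=True,
    cluster_status_lock_timeout=-1,  # <0: wait indefinitely.
)
if record is None:
    print('Cluster is terminated or does not exist.')
else:
    print('Cluster status:', record['status'])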
@@ -1986,19 +2012,58 @@ def refresh_cluster_record(
  return None
  check_owner_identity(cluster_name)

- handle = record['handle']
- if isinstance(handle, backends.CloudVmRayResourceHandle):
- use_spot = handle.launched_resources.use_spot
- has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
- record['autostop'] >= 0)
- force_refresh_for_cluster = (force_refresh_statuses is not None and
- record['status'] in force_refresh_statuses)
- if force_refresh_for_cluster or has_autostop or use_spot:
- record = _update_cluster_status(
- cluster_name,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
- return record
+ if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
+ return record
+
+ # The loop logic allows us to notice if the status was updated in the
+ # global_user_state by another process and stop trying to get the lock.
+ # The core loop logic is adapted from FileLock's implementation.
+ lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+ start_time = time.perf_counter()
+
+ # Loop until we have an up-to-date status or until we acquire the lock.
+ while True:
+ # Check to see if we can return the cached status.
+ if not _must_refresh_cluster_status(record, force_refresh_statuses):
+ return record
+
+ if not acquire_per_cluster_status_lock:
+ return _update_cluster_status_no_lock(cluster_name)
+
+ # Try to acquire the lock so we can fetch the status.
+ try:
+ with lock.acquire(blocking=False):
+ # Lock acquired.
+
+ # Check the cluster status again, since it could have been
+ # updated between our last check and acquiring the lock.
+ record = global_user_state.get_cluster_from_name(cluster_name)
+ if record is None or not _must_refresh_cluster_status(
+ record, force_refresh_statuses):
+ return record
+
+ # Update and return the cluster status.
+ return _update_cluster_status_no_lock(cluster_name)
+ except filelock.Timeout:
+ # lock.acquire() will throw a Timeout exception if the lock is not
+ # available and we have blocking=False.
+ pass
+
+ # Logic adapted from FileLock.acquire().
+ # If cluster_status_lock_timeout is <0, we will never hit this. No timeout.
+ # Otherwise, if we have timed out, return the cached status. This has
+ # the potential to cause correctness issues, but if so it is the
+ # caller's responsibility to set the timeout to -1.
+ if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
+ logger.debug('Refreshing status: Failed to get the lock for cluster '
+ f'{cluster_name!r}. Using the cached status.')
+ return record
+ time.sleep(0.05)
+
+ # Refresh for next loop iteration.
+ record = global_user_state.get_cluster_from_name(cluster_name)
+ if record is None:
+ return None


  @timeline.event
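The new loop combines a cheap cached-status check with a non-blocking lock attempt, retrying until the cache is fresh, the lock is acquired, or the timeout expires. A self-contained sketch of the same pattern, detached from cluster state (is_fresh and do_refresh are caller-supplied stand-ins, not SkyPilot functions):

import time

import filelock

def refresh_with_cache(lock_path, timeout, is_fresh, do_refresh):
    # is_fresh() -> bool, do_refresh() -> value. A negative timeout means
    # wait for the lock indefinitely, mirroring cluster_status_lock_timeout.
    lock = filelock.FileLock(lock_path)
    start = time.perf_counter()
    while True:
        if is_fresh():
            return 'cached'
        try:
            # With blocking=False, acquire() raises filelock.Timeout right
            # away if another process holds the lock.
            with lock.acquire(blocking=False):
                if is_fresh():  # Another process may have refreshed already.
                    return 'cached'
                return do_refresh()
        except filelock.Timeout:
            pass
        if 0 <= timeout < time.perf_counter() - start:
            return 'cached'  # Give up and fall back to the cached value.
        time.sleep(0.05)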
@@ -2604,15 +2669,18 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
  pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
  r'attribute \'(.*)\'')
  if returncode != 0:
+ # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
+ # the remote cluster. Remove this after 0.10.0 is released.
  attribute_error = re.findall(pattern, stderr)
- if attribute_error:
+ if attribute_error or 'SkyPilot runtime is too old' in stderr:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
- 'on the remote cluster. To update, run (existing jobs are '
- f'not interrupted): {colorama.Style.BRIGHT}sky start -f -y '
+ f'on the remote cluster: {cluster_name}. To update, run '
+ '(existing jobs will not be interrupted): '
+ f'{colorama.Style.BRIGHT}sky start -f -y '
  f'{cluster_name}{colorama.Style.RESET_ALL}'
- f'\n--- Details ---\n{stderr.strip()}\n')
+ f'\n--- Details ---\n{stderr.strip()}\n') from None


  def get_endpoints(cluster: str,
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -276,6 +276,7 @@ class RayCodeGen:
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.utils import log_utils
+ from sky.utils import subprocess_utils

  SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}

@@ -3275,14 +3276,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  encoded_script = shlex.quote(codegen)
  create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
  job_submit_cmd = (
- f'RAY_DASHBOARD_PORT=$({constants.SKY_PYTHON_CMD} -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long
- f'{cd} && {constants.SKY_RAY_CMD} job submit '
- '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
- f'--submission-id {job_id}-$(whoami) --no-wait '
- f'"{constants.SKY_PYTHON_CMD} -u {script_path} '
+ # JOB_CMD_IDENTIFIER is used to identify that the process retrieved by
+ # pid is the same driver process.
+ f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+ f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
  # Do not use &>, which is not POSIX and may not work.
  # Note that the order of ">filename 2>&1" matters.
- f'> {remote_log_path} 2>&1"')
+ f'> {remote_log_path} 2>&1')

  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
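Prefixing the command with a job identifier lets later bookkeeping confirm that a recorded PID still belongs to this job's driver rather than to a recycled PID. A rough sketch of that idea (the identifier format and the psutil-based check are assumptions for illustration; job_lib's real constant and logic may differ):

import psutil

# Assumed identifier shape: a harmless marker embedded in the command line.
JOB_CMD_IDENTIFIER = 'echo "SKYPILOT_JOB_ID <{}>"'

def pid_belongs_to_job(pid: int, job_id: int) -> bool:
    # Best-effort: compare the marker against the process's command line.
    marker = JOB_CMD_IDENTIFIER.format(job_id)
    try:
        cmdline = ' '.join(psutil.Process(pid).cmdline())
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return False
    return marker in cmdline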
@@ -3330,6 +3330,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_submit_cmd,
  stream_logs=False,
  require_outputs=True)
+ # Happens when someone calls `sky exec` but the remote is outdated for
+ # running a job, necessitating a call to `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
  if returncode == 255 and 'too long' in stdout + stderr:
  # If the generated script is too long, we retry it with dumping
  # the script to a file and running it with SSH. We use a general
@@ -3344,10 +3348,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  stream_logs=False,
  require_outputs=True)

- # Happens when someone calls `sky exec` but remote is outdated
- # necessitating calling `sky launch`.
- backend_utils.check_stale_runtime_on_remote(returncode, stdout,
- handle.cluster_name)
  subprocess_utils.handle_returncode(returncode,
  job_submit_cmd,
  f'Failed to submit job {job_id}.',
@@ -3417,6 +3417,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  stream_logs=False,
  require_outputs=True,
  separate_stderr=True)
+ # Happens when someone calls `sky exec` but the remote is outdated for
+ # adding a job, necessitating a call to `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
  # TODO(zhwu): this sometimes will unexpectedly fail, we can add
  # retry for this, after we figure out the reason.
  subprocess_utils.handle_returncode(returncode, code,
@@ -3554,7 +3558,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))

  try:
- with filelock.FileLock(
+ with timeline.FileLockEvent(
  lock_path,
  backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
  self.teardown_no_lock(
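Swapping filelock.FileLock for timeline.FileLockEvent keeps the same acquire-with-timeout semantics on the teardown path while letting the timeline record how long the cluster status lock was waited on. A minimal sketch of what such a wrapper could look like (illustrative; the real timeline.FileLockEvent in sky/utils/timeline.py may record events differently):

import time

import filelock

class FileLockEvent:
    # Acquire a file lock and report how long acquisition took.

    def __init__(self, lockfile, timeout=-1):
        self._lockfile = lockfile
        self._lock = filelock.FileLock(lockfile, timeout=timeout)

    def __enter__(self):
        start = time.perf_counter()
        self._lock.acquire()  # Raises filelock.Timeout if the timeout expires.
        print(f'waited {time.perf_counter() - start:.3f}s for {self._lockfile}')
        return self

    def __exit__(self, exc_type, exc, tb):
        self._lock.release()
        return False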
sky/cli.py CHANGED
@@ -3699,13 +3699,24 @@ def jobs_launch(
  dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)

- click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
- fg='cyan')
  dag, _ = admin_policy_utils.apply(
  dag, use_mutated_config_in_current_request=False)
- dag = sky.optimize(dag)

- if not yes:
+ if yes:
+ # Skip resource preview if -y is set, since we are probably running in
+ # a script and the user won't have a chance to review it anyway.
+ # This can save a couple of seconds.
+ click.secho(
+ f'Resources for managed job {dag.name!r} will be computed on the '
+ 'managed jobs controller, since --yes is set.',
+ fg='cyan')
+
+ else:
+ click.secho(
+ f'Managed job {dag.name!r} will be launched on (estimated):',
+ fg='cyan')
+ dag = sky.optimize(dag)
+
  prompt = f'Launching a managed job {dag.name!r}. Proceed?'
  if prompt is not None:
  click.confirm(prompt, default=True, abort=True, show_default=True)
sky/clouds/aws.py CHANGED
@@ -663,6 +663,7 @@ class AWS(clouds.Cloud):
  return AWSIdentityType.SHARED_CREDENTIALS_FILE

  @classmethod
+ @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
  def get_user_identities(cls) -> Optional[List[List[str]]]:
  """Returns a [UserId, Account] list that uniquely identifies the user.

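Caching the identity lookup with functools.lru_cache avoids repeating a slow cloud identity call within one process. A small standalone illustration of the same decorator stacking (the slow lookup is faked with a sleep):

import functools
import time

class Identity:

    @classmethod
    @functools.lru_cache(maxsize=1)  # Cache: the lookup below is slow.
    def get_user_identities(cls):
        time.sleep(1)  # Stand-in for a slow STS/identity call.
        return [['user-id', 'account-id']]

Identity.get_user_identities()  # Slow: performs the lookup once.
Identity.get_user_identities()  # Fast: served from the cache.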
sky/clouds/oci.py CHANGED
@@ -75,8 +75,6 @@ class OCI(clouds.Cloud):
  (f'Docker image is currently not supported on {cls._REPR}. '
  'You can try running docker command inside the '
  '`run` section in task.yaml.'),
- clouds.CloudImplementationFeatures.OPEN_PORTS:
- (f'Opening ports is currently not supported on {cls._REPR}.'),
  }
  if resources.use_spot:
  features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/service_catalog/aws_catalog.py CHANGED
@@ -20,6 +20,7 @@ from sky.clouds.service_catalog.data_fetchers import fetch_aws
  from sky.utils import common_utils
  from sky.utils import resources_utils
  from sky.utils import rich_utils
+ from sky.utils import timeline
  from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
@@ -100,6 +101,7 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
  return az_mappings


+ @timeline.event
  def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
  """Maps zone IDs (use1-az1) to zone names (us-east-1x).

sky/clouds/utils/oci_utils.py CHANGED
@@ -4,6 +4,8 @@ History:
  - Zhanghao Wu @ Oct 2023: Formatting and refactoring
  - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
  configuration.
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
+ SERVICE_PORT_RULE_TAG
  """
  import os

@@ -42,6 +44,9 @@ class OCIConfig:
  VCN_CIDR_INTERNET = '0.0.0.0/0'
  VCN_CIDR = '192.168.0.0/16'
  VCN_SUBNET_CIDR = '192.168.0.0/18'
+ SERVICE_PORT_RULE_TAG = 'SkyServe-Service-Port'
+ # NSG name template
+ NSG_NAME_TEMPLATE = 'nsg_{cluster_name}'

  MAX_RETRY_COUNT = 3
  RETRY_INTERVAL_BASE_SECONDS = 5
sky/execution.py CHANGED
@@ -11,10 +11,10 @@ import sky
  from sky import admin_policy
  from sky import backends
  from sky import clouds
- from sky import exceptions
  from sky import global_user_state
  from sky import optimizer
  from sky import sky_logging
+ from sky import status_lib
  from sky.backends import backend_utils
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
@@ -267,6 +267,12 @@ def _execute(
  # no-credential machine should not enter optimize(), which
  # would directly error out ('No cloud is enabled...'). Fix
  # by moving `sky check` checks out of optimize()?
+
+ controller = controller_utils.Controllers.from_name(
+ cluster_name)
+ if controller is not None:
+ logger.info(
+ f'Choosing resources for {controller.name}...')
  dag = sky.optimize(dag, minimize=optimize_target)
  task = dag.tasks[0] # Keep: dag may have been deep-copied.
  assert task.best_resources is not None, task
@@ -463,28 +469,43 @@ def launch(
  stages = None
  # Check if cluster exists and we are doing fast provisioning
  if fast and cluster_name is not None:
- maybe_handle = global_user_state.get_handle_from_cluster_name(
- cluster_name)
- if maybe_handle is not None:
- try:
- # This will throw if the cluster is not available
- backend_utils.check_cluster_available(
+ cluster_status, maybe_handle = (
+ backend_utils.refresh_cluster_status_handle(cluster_name))
+ if cluster_status == status_lib.ClusterStatus.INIT:
+ # If the cluster is INIT, it may be provisioning. We want to prevent
+ # concurrent calls from queueing up many sequential reprovision
+ # attempts. Since provisioning will hold the cluster status lock, we
+ # wait to hold that lock by force refreshing the status. This will
+ # block until the cluster finishes provisioning, and then we will
+ # correctly see that it is UP.
+ # TODO(cooperc): If multiple processes launched in parallel see that
+ # the cluster is STOPPED or does not exist, they will still all try
+ # to provision it, since we do not hold the lock continuously from
+ # the status check until the provision call. Fixing this requires a
+ # bigger refactor.
+ cluster_status, maybe_handle = (
+ backend_utils.refresh_cluster_status_handle(
  cluster_name,
- operation='executing tasks',
- check_cloud_vm_ray_backend=False,
- dryrun=dryrun)
- handle = maybe_handle
- # Get all stages
- stages = [
- Stage.SYNC_WORKDIR,
- Stage.SYNC_FILE_MOUNTS,
- Stage.PRE_EXEC,
- Stage.EXEC,
- Stage.DOWN,
- ]
- except exceptions.ClusterNotUpError:
- # Proceed with normal provisioning
- pass
+ force_refresh_statuses=[
+ # If the cluster is INIT, we want to try to grab the
+ # status lock, which should block until provisioning is
+ # finished.
+ status_lib.ClusterStatus.INIT,
+ ],
+ # Wait indefinitely to obtain the lock, so that we don't
+ # have multiple processes launching the same cluster at
+ # once.
+ cluster_status_lock_timeout=-1,
+ ))
+ if cluster_status == status_lib.ClusterStatus.UP:
+ handle = maybe_handle
+ stages = [
+ Stage.SYNC_WORKDIR,
+ Stage.SYNC_FILE_MOUNTS,
+ Stage.PRE_EXEC,
+ Stage.EXEC,
+ Stage.DOWN,
+ ]

  return _execute(
  entrypoint=entrypoint,
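From the caller's side, the effect of this change is that a repeated fast launch against the same cluster either reuses it outright (UP) or blocks on the cluster status lock until a concurrent provisioning attempt finishes (INIT), instead of queueing another reprovision. A usage sketch (the task definition and cluster name are illustrative):

import sky

task = sky.Task(run='echo hello')
task.set_resources(sky.Resources(cpus='4+'))

# With fast=True, an existing UP cluster skips provisioning and goes straight
# to the SYNC_WORKDIR/SYNC_FILE_MOUNTS/PRE_EXEC/EXEC/DOWN stages; an INIT
# cluster makes this call wait until its status settles.
sky.launch(task, cluster_name='my-cluster', fast=True)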