skypilot-nightly 1.0.0.dev20241114__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +142 -74
- sky/backends/cloud_vm_ray_backend.py +15 -11
- sky/cli.py +15 -4
- sky/clouds/aws.py +1 -0
- sky/clouds/oci.py +0 -2
- sky/clouds/service_catalog/aws_catalog.py +2 -0
- sky/clouds/utils/oci_utils.py +5 -0
- sky/execution.py +43 -22
- sky/global_user_state.py +36 -16
- sky/jobs/core.py +0 -1
- sky/jobs/utils.py +4 -3
- sky/provision/kubernetes/utils.py +2 -0
- sky/provision/oci/instance.py +12 -11
- sky/provision/oci/query_utils.py +212 -6
- sky/serve/core.py +1 -0
- sky/serve/serve_utils.py +35 -30
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +249 -138
- sky/skylet/log_lib.py +1 -34
- sky/skylet/subprocess_daemon.py +33 -13
- sky/utils/controller_utils.py +10 -9
- sky/utils/schemas.py +1 -0
- sky/utils/subprocess_utils.py +50 -0
- sky/utils/timeline.py +2 -4
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +31 -31
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241114.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '95e2f281a0441b2043ee1bea7d7cddb4e2e69782'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241116'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
@@ -100,6 +100,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
|
|
100
100
|
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
|
101
101
|
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
|
102
102
|
|
103
|
+
# Time that must elapse since the last status check before we should re-check if
|
104
|
+
# the cluster has been terminated or autostopped.
|
105
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
106
|
+
|
103
107
|
# Filelocks for updating cluster's file_mounts.
|
104
108
|
CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
|
105
109
|
'~/.sky/.{}_file_mounts.lock')
|
@@ -686,26 +690,56 @@ def write_cluster_config(
|
|
686
690
|
skypilot_config.get_nested(
|
687
691
|
(str(to_provision.cloud).lower(), 'specific_reservations'), set()))
|
688
692
|
|
693
|
+
# Remote identity handling can have 4 cases:
|
694
|
+
# 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
|
695
|
+
# 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
|
696
|
+
# 3. Custom service account: Use specified service account
|
697
|
+
# 4. NO_UPLOAD: Do not upload any credentials
|
698
|
+
#
|
699
|
+
# We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
|
700
|
+
# other cases, we exclude the cloud from credential file uploads after
|
701
|
+
# running required checks.
|
689
702
|
assert cluster_name is not None
|
690
|
-
excluded_clouds =
|
703
|
+
excluded_clouds = set()
|
691
704
|
remote_identity_config = skypilot_config.get_nested(
|
692
705
|
(str(cloud).lower(), 'remote_identity'), None)
|
693
706
|
remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
|
694
707
|
if isinstance(remote_identity_config, str):
|
695
708
|
remote_identity = remote_identity_config
|
696
709
|
if isinstance(remote_identity_config, list):
|
710
|
+
# Some clouds (e.g., AWS) support specifying multiple service accounts
|
711
|
+
# chosen based on the cluster name. Do the matching here to pick the
|
712
|
+
# correct one.
|
697
713
|
for profile in remote_identity_config:
|
698
714
|
if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
|
699
715
|
remote_identity = list(profile.values())[0]
|
700
716
|
break
|
701
717
|
if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
|
702
|
-
|
718
|
+
# If LOCAL_CREDENTIALS is not specified, we add the cloud to the
|
719
|
+
# excluded_clouds set, but we must also check if the cloud supports
|
720
|
+
# service accounts.
|
721
|
+
if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
|
722
|
+
# If NO_UPLOAD is specified, fall back to default remote identity
|
723
|
+
# for downstream logic but add it to excluded_clouds to skip
|
724
|
+
# credential file uploads.
|
725
|
+
remote_identity = schemas.get_default_remote_identity(
|
726
|
+
str(cloud).lower())
|
727
|
+
elif not cloud.supports_service_account_on_remote():
|
703
728
|
raise exceptions.InvalidCloudConfigs(
|
704
729
|
'remote_identity: SERVICE_ACCOUNT is specified in '
|
705
730
|
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
|
706
731
|
'is not supported by this cloud. Remove the config or set: '
|
707
732
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
708
|
-
excluded_clouds
|
733
|
+
excluded_clouds.add(cloud)
|
734
|
+
|
735
|
+
for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
|
736
|
+
remote_identity_config = skypilot_config.get_nested(
|
737
|
+
(cloud_str.lower(), 'remote_identity'), None)
|
738
|
+
if remote_identity_config:
|
739
|
+
if (remote_identity_config ==
|
740
|
+
schemas.RemoteIdentityOptions.NO_UPLOAD.value):
|
741
|
+
excluded_clouds.add(cloud_obj)
|
742
|
+
|
709
743
|
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
|
710
744
|
|
711
745
|
auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
|
@@ -1669,11 +1703,27 @@ def check_can_clone_disk_and_override_task(
|
|
1669
1703
|
|
1670
1704
|
def _update_cluster_status_no_lock(
|
1671
1705
|
cluster_name: str) -> Optional[Dict[str, Any]]:
|
1672
|
-
"""
|
1706
|
+
"""Update the cluster status.
|
1707
|
+
|
1708
|
+
The cluster status is updated by checking ray cluster and real status from
|
1709
|
+
cloud.
|
1710
|
+
|
1711
|
+
The function will update the cached cluster status in the global state. For
|
1712
|
+
the design of the cluster status and transition, please refer to the
|
1713
|
+
sky/design_docs/cluster_status.md
|
1714
|
+
|
1715
|
+
Returns:
|
1716
|
+
If the cluster is terminated or does not exist, return None. Otherwise
|
1717
|
+
returns the input record with status and handle potentially updated.
|
1673
1718
|
|
1674
1719
|
Raises:
|
1720
|
+
exceptions.ClusterOwnerIdentityMismatchError: if the current user is
|
1721
|
+
not the same as the user who created the cluster.
|
1722
|
+
exceptions.CloudUserIdentityError: if we fail to get the current user
|
1723
|
+
identity.
|
1675
1724
|
exceptions.ClusterStatusFetchingError: the cluster status cannot be
|
1676
|
-
fetched from the cloud provider
|
1725
|
+
fetched from the cloud provider or there are leaked nodes causing
|
1726
|
+
the node number larger than expected.
|
1677
1727
|
"""
|
1678
1728
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
1679
1729
|
if record is None:
|
@@ -1893,52 +1943,22 @@ def _update_cluster_status_no_lock(
|
|
1893
1943
|
return global_user_state.get_cluster_from_name(cluster_name)
|
1894
1944
|
|
1895
1945
|
|
1896
|
-
def
|
1897
|
-
|
1898
|
-
|
1899
|
-
|
1900
|
-
|
1901
|
-
|
1902
|
-
|
1903
|
-
The cluster status is updated by checking ray cluster and real status from
|
1904
|
-
cloud.
|
1905
|
-
|
1906
|
-
The function will update the cached cluster status in the global state. For
|
1907
|
-
the design of the cluster status and transition, please refer to the
|
1908
|
-
sky/design_docs/cluster_status.md
|
1909
|
-
|
1910
|
-
Args:
|
1911
|
-
cluster_name: The name of the cluster.
|
1912
|
-
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
|
1913
|
-
before updating the status.
|
1914
|
-
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
1915
|
-
lock.
|
1916
|
-
|
1917
|
-
Returns:
|
1918
|
-
If the cluster is terminated or does not exist, return None. Otherwise
|
1919
|
-
returns the input record with status and handle potentially updated.
|
1946
|
+
def _must_refresh_cluster_status(
|
1947
|
+
record: Dict[str, Any],
|
1948
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
|
1949
|
+
) -> bool:
|
1950
|
+
force_refresh_for_cluster = (force_refresh_statuses is not None and
|
1951
|
+
record['status'] in force_refresh_statuses)
|
1920
1952
|
|
1921
|
-
|
1922
|
-
|
1923
|
-
|
1924
|
-
|
1925
|
-
|
1926
|
-
|
1927
|
-
|
1928
|
-
the node number larger than expected.
|
1929
|
-
"""
|
1930
|
-
if not acquire_per_cluster_status_lock:
|
1931
|
-
return _update_cluster_status_no_lock(cluster_name)
|
1953
|
+
use_spot = record['handle'].launched_resources.use_spot
|
1954
|
+
has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
|
1955
|
+
record['autostop'] >= 0)
|
1956
|
+
recently_refreshed = (record['status_updated_at'] is not None and
|
1957
|
+
time.time() - record['status_updated_at'] <
|
1958
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS)
|
1959
|
+
is_stale = (use_spot or has_autostop) and not recently_refreshed
|
1932
1960
|
|
1933
|
-
|
1934
|
-
with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
|
1935
|
-
timeout=cluster_status_lock_timeout):
|
1936
|
-
return _update_cluster_status_no_lock(cluster_name)
|
1937
|
-
except filelock.Timeout:
|
1938
|
-
logger.debug('Refreshing status: Failed get the lock for cluster '
|
1939
|
-
f'{cluster_name!r}. Using the cached status.')
|
1940
|
-
record = global_user_state.get_cluster_from_name(cluster_name)
|
1941
|
-
return record
|
1961
|
+
return force_refresh_for_cluster or is_stale
|
1942
1962
|
|
1943
1963
|
|
1944
1964
|
def refresh_cluster_record(
|
@@ -1956,16 +1976,22 @@ def refresh_cluster_record(
|
|
1956
1976
|
|
1957
1977
|
Args:
|
1958
1978
|
cluster_name: The name of the cluster.
|
1959
|
-
force_refresh_statuses: if specified, refresh the cluster if it has one
|
1960
|
-
the specified statuses. Additionally, clusters satisfying the
|
1961
|
-
following conditions will
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1979
|
+
force_refresh_statuses: if specified, refresh the cluster if it has one
|
1980
|
+
of the specified statuses. Additionally, clusters satisfying the
|
1981
|
+
following conditions will be refreshed whether the argument is
|
1982
|
+
specified or not:
|
1983
|
+
- the latest available status update is more than
|
1984
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
|
1985
|
+
1. the cluster is a spot cluster, or
|
1986
|
+
2. cluster autostop is set and the cluster is not STOPPED.
|
1965
1987
|
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
|
1966
|
-
before updating the status.
|
1988
|
+
before updating the status. Even if this is True, the lock may not be
|
1989
|
+
acquired if the status does not need to be refreshed.
|
1967
1990
|
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
1968
|
-
lock. If timeout, the function will use the cached status.
|
1991
|
+
lock. If timeout, the function will use the cached status. If the
|
1992
|
+
value is <0, do not timeout (wait for the lock indefinitely). By
|
1993
|
+
default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
|
1994
|
+
if correctness is required, you must set this to -1.
|
1969
1995
|
|
1970
1996
|
Returns:
|
1971
1997
|
If the cluster is terminated or does not exist, return None.
|
@@ -1986,19 +2012,58 @@ def refresh_cluster_record(
|
|
1986
2012
|
return None
|
1987
2013
|
check_owner_identity(cluster_name)
|
1988
2014
|
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2015
|
+
if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
|
2016
|
+
return record
|
2017
|
+
|
2018
|
+
# The loop logic allows us to notice if the status was updated in the
|
2019
|
+
# global_user_state by another process and stop trying to get the lock.
|
2020
|
+
# The core loop logic is adapted from FileLock's implementation.
|
2021
|
+
lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
|
2022
|
+
start_time = time.perf_counter()
|
2023
|
+
|
2024
|
+
# Loop until we have an up-to-date status or until we acquire the lock.
|
2025
|
+
while True:
|
2026
|
+
# Check to see if we can return the cached status.
|
2027
|
+
if not _must_refresh_cluster_status(record, force_refresh_statuses):
|
2028
|
+
return record
|
2029
|
+
|
2030
|
+
if not acquire_per_cluster_status_lock:
|
2031
|
+
return _update_cluster_status_no_lock(cluster_name)
|
2032
|
+
|
2033
|
+
# Try to acquire the lock so we can fetch the status.
|
2034
|
+
try:
|
2035
|
+
with lock.acquire(blocking=False):
|
2036
|
+
# Lock acquired.
|
2037
|
+
|
2038
|
+
# Check the cluster status again, since it could have been
|
2039
|
+
# updated between our last check and acquiring the lock.
|
2040
|
+
record = global_user_state.get_cluster_from_name(cluster_name)
|
2041
|
+
if record is None or not _must_refresh_cluster_status(
|
2042
|
+
record, force_refresh_statuses):
|
2043
|
+
return record
|
2044
|
+
|
2045
|
+
# Update and return the cluster status.
|
2046
|
+
return _update_cluster_status_no_lock(cluster_name)
|
2047
|
+
except filelock.Timeout:
|
2048
|
+
# lock.acquire() will throw a Timeout exception if the lock is not
|
2049
|
+
# available and we have blocking=False.
|
2050
|
+
pass
|
2051
|
+
|
2052
|
+
# Logic adapted from FileLock.acquire().
|
2053
|
+
# If cluster_status_lock_timeout is <0, we will never hit this. No timeout.
|
2054
|
+
# Otherwise, if we have timed out, return the cached status. This has
|
2055
|
+
# the potential to cause correctness issues, but if so it is the
|
2056
|
+
# caller's responsibility to set the timeout to -1.
|
2057
|
+
if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
|
2058
|
+
logger.debug('Refreshing status: Failed get the lock for cluster '
|
2059
|
+
f'{cluster_name!r}. Using the cached status.')
|
2060
|
+
return record
|
2061
|
+
time.sleep(0.05)
|
2062
|
+
|
2063
|
+
# Refresh for next loop iteration.
|
2064
|
+
record = global_user_state.get_cluster_from_name(cluster_name)
|
2065
|
+
if record is None:
|
2066
|
+
return None
|
2002
2067
|
|
2003
2068
|
|
2004
2069
|
@timeline.event
|
@@ -2604,15 +2669,18 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
|
|
2604
2669
|
pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
|
2605
2670
|
r'attribute \'(.*)\'')
|
2606
2671
|
if returncode != 0:
|
2672
|
+
# TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
|
2673
|
+
# the remote cluster. Remove this after 0.10.0 is released.
|
2607
2674
|
attribute_error = re.findall(pattern, stderr)
|
2608
|
-
if attribute_error:
|
2675
|
+
if attribute_error or 'SkyPilot runtime is too old' in stderr:
|
2609
2676
|
with ux_utils.print_exception_no_traceback():
|
2610
2677
|
raise RuntimeError(
|
2611
2678
|
f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
|
2612
|
-
'on the remote cluster. To update, run
|
2613
|
-
|
2679
|
+
f'on the remote cluster: {cluster_name}. To update, run '
|
2680
|
+
'(existing jobs will not be interrupted): '
|
2681
|
+
f'{colorama.Style.BRIGHT}sky start -f -y '
|
2614
2682
|
f'{cluster_name}{colorama.Style.RESET_ALL}'
|
2615
|
-
f'\n--- Details ---\n{stderr.strip()}\n')
|
2683
|
+
f'\n--- Details ---\n{stderr.strip()}\n') from None
|
2616
2684
|
|
2617
2685
|
|
2618
2686
|
def get_endpoints(cluster: str,
|
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -276,6 +276,7 @@ class RayCodeGen:
|
|
276
276
|
from sky.skylet import constants
|
277
277
|
from sky.skylet import job_lib
|
278
278
|
from sky.utils import log_utils
|
279
|
+
from sky.utils import subprocess_utils
|
279
280
|
|
280
281
|
SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
|
281
282
|
|
@@ -3275,14 +3276,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3275
3276
|
encoded_script = shlex.quote(codegen)
|
3276
3277
|
create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
|
3277
3278
|
job_submit_cmd = (
|
3278
|
-
|
3279
|
-
|
3280
|
-
'
|
3281
|
-
f'
|
3282
|
-
f'"{constants.SKY_PYTHON_CMD} -u {script_path} '
|
3279
|
+
# JOB_CMD_IDENTIFIER is used to verify that the process retrieved
|
3280
|
+
# by pid is the same driver process.
|
3281
|
+
f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
|
3282
|
+
f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
|
3283
3283
|
# Do not use &>, which is not POSIX and may not work.
|
3284
3284
|
# Note that the order of ">filename 2>&1" matters.
|
3285
|
-
f'> {remote_log_path} 2>&1
|
3285
|
+
f'> {remote_log_path} 2>&1')
|
3286
3286
|
|
3287
3287
|
code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
|
3288
3288
|
job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
|
@@ -3330,6 +3330,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3330
3330
|
job_submit_cmd,
|
3331
3331
|
stream_logs=False,
|
3332
3332
|
require_outputs=True)
|
3333
|
+
# Happens when someone calls `sky exec` but remote is outdated for
|
3334
|
+
# running a job, necessitating calling `sky launch`.
|
3335
|
+
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
3336
|
+
handle.cluster_name)
|
3333
3337
|
if returncode == 255 and 'too long' in stdout + stderr:
|
3334
3338
|
# If the generated script is too long, we retry it with dumping
|
3335
3339
|
# the script to a file and running it with SSH. We use a general
|
@@ -3344,10 +3348,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3344
3348
|
stream_logs=False,
|
3345
3349
|
require_outputs=True)
|
3346
3350
|
|
3347
|
-
# Happens when someone calls `sky exec` but remote is outdated
|
3348
|
-
# necessitating calling `sky launch`.
|
3349
|
-
backend_utils.check_stale_runtime_on_remote(returncode, stdout,
|
3350
|
-
handle.cluster_name)
|
3351
3351
|
subprocess_utils.handle_returncode(returncode,
|
3352
3352
|
job_submit_cmd,
|
3353
3353
|
f'Failed to submit job {job_id}.',
|
@@ -3417,6 +3417,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3417
3417
|
stream_logs=False,
|
3418
3418
|
require_outputs=True,
|
3419
3419
|
separate_stderr=True)
|
3420
|
+
# Happens when someone calls `sky exec` but remote is outdated for
|
3421
|
+
# adding a job, necessitating calling `sky launch`.
|
3422
|
+
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
3423
|
+
handle.cluster_name)
|
3420
3424
|
# TODO(zhwu): this sometimes will unexpectedly fail, we can add
|
3421
3425
|
# retry for this, after we figure out the reason.
|
3422
3426
|
subprocess_utils.handle_returncode(returncode, code,
|
@@ -3554,7 +3558,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3554
3558
|
backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
|
3555
3559
|
|
3556
3560
|
try:
|
3557
|
-
with
|
3561
|
+
with timeline.FileLockEvent(
|
3558
3562
|
lock_path,
|
3559
3563
|
backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
|
3560
3564
|
self.teardown_no_lock(
|
sky/cli.py
CHANGED
@@ -3699,13 +3699,24 @@ def jobs_launch(
|
|
3699
3699
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
3700
3700
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
3701
3701
|
|
3702
|
-
click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
|
3703
|
-
fg='cyan')
|
3704
3702
|
dag, _ = admin_policy_utils.apply(
|
3705
3703
|
dag, use_mutated_config_in_current_request=False)
|
3706
|
-
dag = sky.optimize(dag)
|
3707
3704
|
|
3708
|
-
if
|
3705
|
+
if yes:
|
3706
|
+
# Skip resource preview if -y is set, since we are probably running in
|
3707
|
+
# a script and the user won't have a chance to review it anyway.
|
3708
|
+
# This can save a couple of seconds.
|
3709
|
+
click.secho(
|
3710
|
+
f'Resources for managed job {dag.name!r} will be computed on the '
|
3711
|
+
'managed jobs controller, since --yes is set.',
|
3712
|
+
fg='cyan')
|
3713
|
+
|
3714
|
+
else:
|
3715
|
+
click.secho(
|
3716
|
+
f'Managed job {dag.name!r} will be launched on (estimated):',
|
3717
|
+
fg='cyan')
|
3718
|
+
dag = sky.optimize(dag)
|
3719
|
+
|
3709
3720
|
prompt = f'Launching a managed job {dag.name!r}. Proceed?'
|
3710
3721
|
if prompt is not None:
|
3711
3722
|
click.confirm(prompt, default=True, abort=True, show_default=True)
|
sky/clouds/aws.py
CHANGED
@@ -663,6 +663,7 @@ class AWS(clouds.Cloud):
|
|
663
663
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
664
664
|
|
665
665
|
@classmethod
|
666
|
+
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
|
666
667
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
667
668
|
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
668
669
|
|
sky/clouds/oci.py
CHANGED
@@ -75,8 +75,6 @@ class OCI(clouds.Cloud):
|
|
75
75
|
(f'Docker image is currently not supported on {cls._REPR}. '
|
76
76
|
'You can try running docker command inside the '
|
77
77
|
'`run` section in task.yaml.'),
|
78
|
-
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
79
|
-
(f'Opening ports is currently not supported on {cls._REPR}.'),
|
80
78
|
}
|
81
79
|
if resources.use_spot:
|
82
80
|
features[clouds.CloudImplementationFeatures.STOP] = (
|
sky/clouds/service_catalog/aws_catalog.py
CHANGED
@@ -20,6 +20,7 @@ from sky.clouds.service_catalog.data_fetchers import fetch_aws
|
|
20
20
|
from sky.utils import common_utils
|
21
21
|
from sky.utils import resources_utils
|
22
22
|
from sky.utils import rich_utils
|
23
|
+
from sky.utils import timeline
|
23
24
|
from sky.utils import ux_utils
|
24
25
|
|
25
26
|
if typing.TYPE_CHECKING:
|
@@ -100,6 +101,7 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
|
|
100
101
|
return az_mappings
|
101
102
|
|
102
103
|
|
104
|
+
@timeline.event
|
103
105
|
def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
|
104
106
|
"""Maps zone IDs (use1-az1) to zone names (us-east-1x).
|
105
107
|
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -4,6 +4,8 @@ History:
|
|
4
4
|
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
|
5
5
|
- Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
|
6
6
|
configuration.
|
7
|
+
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
|
8
|
+
SERVICE_PORT_RULE_TAG
|
7
9
|
"""
|
8
10
|
import os
|
9
11
|
|
@@ -42,6 +44,9 @@ class OCIConfig:
|
|
42
44
|
VCN_CIDR_INTERNET = '0.0.0.0/0'
|
43
45
|
VCN_CIDR = '192.168.0.0/16'
|
44
46
|
VCN_SUBNET_CIDR = '192.168.0.0/18'
|
47
|
+
SERVICE_PORT_RULE_TAG = 'SkyServe-Service-Port'
|
48
|
+
# NSG name template
|
49
|
+
NSG_NAME_TEMPLATE = 'nsg_{cluster_name}'
|
45
50
|
|
46
51
|
MAX_RETRY_COUNT = 3
|
47
52
|
RETRY_INTERVAL_BASE_SECONDS = 5
|
sky/execution.py
CHANGED
@@ -11,10 +11,10 @@ import sky
|
|
11
11
|
from sky import admin_policy
|
12
12
|
from sky import backends
|
13
13
|
from sky import clouds
|
14
|
-
from sky import exceptions
|
15
14
|
from sky import global_user_state
|
16
15
|
from sky import optimizer
|
17
16
|
from sky import sky_logging
|
17
|
+
from sky import status_lib
|
18
18
|
from sky.backends import backend_utils
|
19
19
|
from sky.usage import usage_lib
|
20
20
|
from sky.utils import admin_policy_utils
|
@@ -267,6 +267,12 @@ def _execute(
|
|
267
267
|
# no-credential machine should not enter optimize(), which
|
268
268
|
# would directly error out ('No cloud is enabled...'). Fix
|
269
269
|
# by moving `sky check` checks out of optimize()?
|
270
|
+
|
271
|
+
controller = controller_utils.Controllers.from_name(
|
272
|
+
cluster_name)
|
273
|
+
if controller is not None:
|
274
|
+
logger.info(
|
275
|
+
f'Choosing resources for {controller.name}...')
|
270
276
|
dag = sky.optimize(dag, minimize=optimize_target)
|
271
277
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
272
278
|
assert task.best_resources is not None, task
|
@@ -463,28 +469,43 @@ def launch(
|
|
463
469
|
stages = None
|
464
470
|
# Check if cluster exists and we are doing fast provisioning
|
465
471
|
if fast and cluster_name is not None:
|
466
|
-
maybe_handle =
|
467
|
-
cluster_name)
|
468
|
-
if
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
+
cluster_status, maybe_handle = (
|
473
|
+
backend_utils.refresh_cluster_status_handle(cluster_name))
|
474
|
+
if cluster_status == status_lib.ClusterStatus.INIT:
|
475
|
+
# If the cluster is INIT, it may be provisioning. We want to prevent
|
476
|
+
# concurrent calls from queueing up many sequential reprovision
|
477
|
+
# attempts. Since provisioning will hold the cluster status lock, we
|
478
|
+
# wait to hold that lock by force refreshing the status. This will
|
479
|
+
# block until the cluster finishes provisioning, then correctly see
|
480
|
+
# that it is UP.
|
481
|
+
# TODO(cooperc): If multiple processes launched in parallel see that
|
482
|
+
# the cluster is STOPPED or does not exist, they will still all try
|
483
|
+
# to provision it, since we do not hold the lock continuously from
|
484
|
+
# the status check until the provision call. Fixing this requires a
|
485
|
+
# bigger refactor.
|
486
|
+
cluster_status, maybe_handle = (
|
487
|
+
backend_utils.refresh_cluster_status_handle(
|
472
488
|
cluster_name,
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
489
|
+
force_refresh_statuses=[
|
490
|
+
# If the cluster is INIT, we want to try to grab the
|
491
|
+
# status lock, which should block until provisioning is
|
492
|
+
# finished.
|
493
|
+
status_lib.ClusterStatus.INIT,
|
494
|
+
],
|
495
|
+
# Wait indefinitely to obtain the lock, so that we don't
|
496
|
+
# have multiple processes launching the same cluster at
|
497
|
+
# once.
|
498
|
+
cluster_status_lock_timeout=-1,
|
499
|
+
))
|
500
|
+
if cluster_status == status_lib.ClusterStatus.UP:
|
501
|
+
handle = maybe_handle
|
502
|
+
stages = [
|
503
|
+
Stage.SYNC_WORKDIR,
|
504
|
+
Stage.SYNC_FILE_MOUNTS,
|
505
|
+
Stage.PRE_EXEC,
|
506
|
+
Stage.EXEC,
|
507
|
+
Stage.DOWN,
|
508
|
+
]
|
488
509
|
|
489
510
|
return _execute(
|
490
511
|
entrypoint=entrypoint,
|