skypilot-nightly 1.0.0.dev20241115__py3-none-any.whl → 1.0.0.dev20241116__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +135 -70
- sky/backends/cloud_vm_ray_backend.py +1 -1
- sky/cli.py +15 -4
- sky/clouds/aws.py +1 -0
- sky/clouds/service_catalog/aws_catalog.py +2 -0
- sky/execution.py +6 -0
- sky/global_user_state.py +36 -16
- sky/provision/kubernetes/utils.py +2 -0
- sky/utils/schemas.py +1 -0
- sky/utils/timeline.py +2 -4
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD +17 -17
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '95e2f281a0441b2043ee1bea7d7cddb4e2e69782'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241116'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
@@ -100,6 +100,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
|
|
100
100
|
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
|
101
101
|
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
|
102
102
|
|
103
|
+
# Time that must elapse since the last status check before we should re-check if
|
104
|
+
# the cluster has been terminated or autostopped.
|
105
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
106
|
+
|
103
107
|
# Filelocks for updating cluster's file_mounts.
|
104
108
|
CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
|
105
109
|
'~/.sky/.{}_file_mounts.lock')
|
@@ -686,26 +690,56 @@ def write_cluster_config(
|
|
686
690
|
skypilot_config.get_nested(
|
687
691
|
(str(to_provision.cloud).lower(), 'specific_reservations'), set()))
|
688
692
|
|
693
|
+
# Remote identity handling can have 4 cases:
|
694
|
+
# 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
|
695
|
+
# 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
|
696
|
+
# 3. Custom service account: Use specified service account
|
697
|
+
# 4. NO_UPLOAD: Do not upload any credentials
|
698
|
+
#
|
699
|
+
# We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
|
700
|
+
# other cases, we exclude the cloud from credential file uploads after
|
701
|
+
# running required checks.
|
689
702
|
assert cluster_name is not None
|
690
|
-
excluded_clouds =
|
703
|
+
excluded_clouds = set()
|
691
704
|
remote_identity_config = skypilot_config.get_nested(
|
692
705
|
(str(cloud).lower(), 'remote_identity'), None)
|
693
706
|
remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
|
694
707
|
if isinstance(remote_identity_config, str):
|
695
708
|
remote_identity = remote_identity_config
|
696
709
|
if isinstance(remote_identity_config, list):
|
710
|
+
# Some clouds (e.g., AWS) support specifying multiple service accounts
|
711
|
+
# chosen based on the cluster name. Do the matching here to pick the
|
712
|
+
# correct one.
|
697
713
|
for profile in remote_identity_config:
|
698
714
|
if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
|
699
715
|
remote_identity = list(profile.values())[0]
|
700
716
|
break
|
701
717
|
if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
|
702
|
-
|
718
|
+
# If LOCAL_CREDENTIALS is not specified, we add the cloud to the
|
719
|
+
# excluded_clouds set, but we must also check if the cloud supports
|
720
|
+
# service accounts.
|
721
|
+
if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
|
722
|
+
# If NO_UPLOAD is specified, fall back to default remote identity
|
723
|
+
# for downstream logic but add it to excluded_clouds to skip
|
724
|
+
# credential file uploads.
|
725
|
+
remote_identity = schemas.get_default_remote_identity(
|
726
|
+
str(cloud).lower())
|
727
|
+
elif not cloud.supports_service_account_on_remote():
|
703
728
|
raise exceptions.InvalidCloudConfigs(
|
704
729
|
'remote_identity: SERVICE_ACCOUNT is specified in '
|
705
730
|
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
|
706
731
|
'is not supported by this cloud. Remove the config or set: '
|
707
732
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
708
|
-
excluded_clouds
|
733
|
+
excluded_clouds.add(cloud)
|
734
|
+
|
735
|
+
for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
|
736
|
+
remote_identity_config = skypilot_config.get_nested(
|
737
|
+
(cloud_str.lower(), 'remote_identity'), None)
|
738
|
+
if remote_identity_config:
|
739
|
+
if (remote_identity_config ==
|
740
|
+
schemas.RemoteIdentityOptions.NO_UPLOAD.value):
|
741
|
+
excluded_clouds.add(cloud_obj)
|
742
|
+
|
709
743
|
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
|
710
744
|
|
711
745
|
auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
|
@@ -1669,11 +1703,27 @@ def check_can_clone_disk_and_override_task(
|
|
1669
1703
|
|
1670
1704
|
def _update_cluster_status_no_lock(
|
1671
1705
|
cluster_name: str) -> Optional[Dict[str, Any]]:
|
1672
|
-
"""
|
1706
|
+
"""Update the cluster status.
|
1707
|
+
|
1708
|
+
The cluster status is updated by checking ray cluster and real status from
|
1709
|
+
cloud.
|
1710
|
+
|
1711
|
+
The function will update the cached cluster status in the global state. For
|
1712
|
+
the design of the cluster status and transition, please refer to the
|
1713
|
+
sky/design_docs/cluster_status.md
|
1714
|
+
|
1715
|
+
Returns:
|
1716
|
+
If the cluster is terminated or does not exist, return None. Otherwise
|
1717
|
+
returns the input record with status and handle potentially updated.
|
1673
1718
|
|
1674
1719
|
Raises:
|
1720
|
+
exceptions.ClusterOwnerIdentityMismatchError: if the current user is
|
1721
|
+
not the same as the user who created the cluster.
|
1722
|
+
exceptions.CloudUserIdentityError: if we fail to get the current user
|
1723
|
+
identity.
|
1675
1724
|
exceptions.ClusterStatusFetchingError: the cluster status cannot be
|
1676
|
-
fetched from the cloud provider
|
1725
|
+
fetched from the cloud provider or there are leaked nodes causing
|
1726
|
+
the node number larger than expected.
|
1677
1727
|
"""
|
1678
1728
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
1679
1729
|
if record is None:
|
@@ -1893,52 +1943,22 @@ def _update_cluster_status_no_lock(
|
|
1893
1943
|
return global_user_state.get_cluster_from_name(cluster_name)
|
1894
1944
|
|
1895
1945
|
|
1896
|
-
def
|
1897
|
-
|
1898
|
-
|
1899
|
-
|
1900
|
-
|
1901
|
-
|
1902
|
-
|
1903
|
-
The cluster status is updated by checking ray cluster and real status from
|
1904
|
-
cloud.
|
1905
|
-
|
1906
|
-
The function will update the cached cluster status in the global state. For
|
1907
|
-
the design of the cluster status and transition, please refer to the
|
1908
|
-
sky/design_docs/cluster_status.md
|
1909
|
-
|
1910
|
-
Args:
|
1911
|
-
cluster_name: The name of the cluster.
|
1912
|
-
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
|
1913
|
-
before updating the status.
|
1914
|
-
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
1915
|
-
lock.
|
1916
|
-
|
1917
|
-
Returns:
|
1918
|
-
If the cluster is terminated or does not exist, return None. Otherwise
|
1919
|
-
returns the input record with status and handle potentially updated.
|
1946
|
+
def _must_refresh_cluster_status(
|
1947
|
+
record: Dict[str, Any],
|
1948
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
|
1949
|
+
) -> bool:
|
1950
|
+
force_refresh_for_cluster = (force_refresh_statuses is not None and
|
1951
|
+
record['status'] in force_refresh_statuses)
|
1920
1952
|
|
1921
|
-
|
1922
|
-
|
1923
|
-
|
1924
|
-
|
1925
|
-
|
1926
|
-
|
1927
|
-
|
1928
|
-
the node number larger than expected.
|
1929
|
-
"""
|
1930
|
-
if not acquire_per_cluster_status_lock:
|
1931
|
-
return _update_cluster_status_no_lock(cluster_name)
|
1953
|
+
use_spot = record['handle'].launched_resources.use_spot
|
1954
|
+
has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
|
1955
|
+
record['autostop'] >= 0)
|
1956
|
+
recently_refreshed = (record['status_updated_at'] is not None and
|
1957
|
+
time.time() - record['status_updated_at'] <
|
1958
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS)
|
1959
|
+
is_stale = (use_spot or has_autostop) and not recently_refreshed
|
1932
1960
|
|
1933
|
-
|
1934
|
-
with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
|
1935
|
-
timeout=cluster_status_lock_timeout):
|
1936
|
-
return _update_cluster_status_no_lock(cluster_name)
|
1937
|
-
except filelock.Timeout:
|
1938
|
-
logger.debug('Refreshing status: Failed get the lock for cluster '
|
1939
|
-
f'{cluster_name!r}. Using the cached status.')
|
1940
|
-
record = global_user_state.get_cluster_from_name(cluster_name)
|
1941
|
-
return record
|
1961
|
+
return force_refresh_for_cluster or is_stale
|
1942
1962
|
|
1943
1963
|
|
1944
1964
|
def refresh_cluster_record(
|
@@ -1956,16 +1976,22 @@ def refresh_cluster_record(
|
|
1956
1976
|
|
1957
1977
|
Args:
|
1958
1978
|
cluster_name: The name of the cluster.
|
1959
|
-
force_refresh_statuses: if specified, refresh the cluster if it has one
|
1960
|
-
the specified statuses. Additionally, clusters satisfying the
|
1961
|
-
following conditions will
|
1962
|
-
|
1963
|
-
|
1964
|
-
|
1979
|
+
force_refresh_statuses: if specified, refresh the cluster if it has one
|
1980
|
+
of the specified statuses. Additionally, clusters satisfying the
|
1981
|
+
following conditions will be refreshed regardless of whether the
|
1982
|
+
argument is specified:
|
1983
|
+
- the latest available status update is more than
|
1984
|
+
_CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
|
1985
|
+
1. the cluster is a spot cluster, or
|
1986
|
+
2. cluster autostop is set and the cluster is not STOPPED.
|
1965
1987
|
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
|
1966
|
-
before updating the status.
|
1988
|
+
before updating the status. Even if this is True, the lock may not be
|
1989
|
+
acquired if the status does not need to be refreshed.
|
1967
1990
|
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
1968
|
-
lock. If timeout, the function will use the cached status.
|
1991
|
+
lock. If timeout, the function will use the cached status. If the
|
1992
|
+
value is <0, do not timeout (wait for the lock indefinitely). By
|
1993
|
+
default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
|
1994
|
+
if correctness is required, you must set this to -1.
|
1969
1995
|
|
1970
1996
|
Returns:
|
1971
1997
|
If the cluster is terminated or does not exist, return None.
|
@@ -1986,19 +2012,58 @@ def refresh_cluster_record(
|
|
1986
2012
|
return None
|
1987
2013
|
check_owner_identity(cluster_name)
|
1988
2014
|
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2015
|
+
if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
|
2016
|
+
return record
|
2017
|
+
|
2018
|
+
# The loop logic allows us to notice if the status was updated in the
|
2019
|
+
# global_user_state by another process and stop trying to get the lock.
|
2020
|
+
# The core loop logic is adapted from FileLock's implementation.
|
2021
|
+
lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
|
2022
|
+
start_time = time.perf_counter()
|
2023
|
+
|
2024
|
+
# Loop until we have an up-to-date status or until we acquire the lock.
|
2025
|
+
while True:
|
2026
|
+
# Check to see if we can return the cached status.
|
2027
|
+
if not _must_refresh_cluster_status(record, force_refresh_statuses):
|
2028
|
+
return record
|
2029
|
+
|
2030
|
+
if not acquire_per_cluster_status_lock:
|
2031
|
+
return _update_cluster_status_no_lock(cluster_name)
|
2032
|
+
|
2033
|
+
# Try to acquire the lock so we can fetch the status.
|
2034
|
+
try:
|
2035
|
+
with lock.acquire(blocking=False):
|
2036
|
+
# Lock acquired.
|
2037
|
+
|
2038
|
+
# Check the cluster status again, since it could have been
|
2039
|
+
# updated between our last check and acquiring the lock.
|
2040
|
+
record = global_user_state.get_cluster_from_name(cluster_name)
|
2041
|
+
if record is None or not _must_refresh_cluster_status(
|
2042
|
+
record, force_refresh_statuses):
|
2043
|
+
return record
|
2044
|
+
|
2045
|
+
# Update and return the cluster status.
|
2046
|
+
return _update_cluster_status_no_lock(cluster_name)
|
2047
|
+
except filelock.Timeout:
|
2048
|
+
# lock.acquire() will throw a Timeout exception if the lock is not
|
2049
|
+
# available and we have blocking=False.
|
2050
|
+
pass
|
2051
|
+
|
2052
|
+
# Logic adapted from FileLock.acquire().
|
2053
|
+
# If cluster_status_lock_timeout is <0, we will never hit this. No timeout.
|
2054
|
+
# Otherwise, if we have timed out, return the cached status. This has
|
2055
|
+
# the potential to cause correctness issues, but if so it is the
|
2056
|
+
# caller's responsibility to set the timeout to -1.
|
2057
|
+
if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
|
2058
|
+
logger.debug('Refreshing status: Failed get the lock for cluster '
|
2059
|
+
f'{cluster_name!r}. Using the cached status.')
|
2060
|
+
return record
|
2061
|
+
time.sleep(0.05)
|
2062
|
+
|
2063
|
+
# Refresh for next loop iteration.
|
2064
|
+
record = global_user_state.get_cluster_from_name(cluster_name)
|
2065
|
+
if record is None:
|
2066
|
+
return None
|
2002
2067
|
|
2003
2068
|
|
2004
2069
|
@timeline.event
|
@@ -3558,7 +3558,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3558
3558
|
backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
|
3559
3559
|
|
3560
3560
|
try:
|
3561
|
-
with
|
3561
|
+
with timeline.FileLockEvent(
|
3562
3562
|
lock_path,
|
3563
3563
|
backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
|
3564
3564
|
self.teardown_no_lock(
|
sky/cli.py
CHANGED
@@ -3699,13 +3699,24 @@ def jobs_launch(
|
|
3699
3699
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
3700
3700
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
3701
3701
|
|
3702
|
-
click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
|
3703
|
-
fg='cyan')
|
3704
3702
|
dag, _ = admin_policy_utils.apply(
|
3705
3703
|
dag, use_mutated_config_in_current_request=False)
|
3706
|
-
dag = sky.optimize(dag)
|
3707
3704
|
|
3708
|
-
if
|
3705
|
+
if yes:
|
3706
|
+
# Skip resource preview if -y is set, since we are probably running in
|
3707
|
+
# a script and the user won't have a chance to review it anyway.
|
3708
|
+
# This can save a couple of seconds.
|
3709
|
+
click.secho(
|
3710
|
+
f'Resources for managed job {dag.name!r} will be computed on the '
|
3711
|
+
'managed jobs controller, since --yes is set.',
|
3712
|
+
fg='cyan')
|
3713
|
+
|
3714
|
+
else:
|
3715
|
+
click.secho(
|
3716
|
+
f'Managed job {dag.name!r} will be launched on (estimated):',
|
3717
|
+
fg='cyan')
|
3718
|
+
dag = sky.optimize(dag)
|
3719
|
+
|
3709
3720
|
prompt = f'Launching a managed job {dag.name!r}. Proceed?'
|
3710
3721
|
if prompt is not None:
|
3711
3722
|
click.confirm(prompt, default=True, abort=True, show_default=True)
|
sky/clouds/aws.py
CHANGED
@@ -663,6 +663,7 @@ class AWS(clouds.Cloud):
|
|
663
663
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
664
664
|
|
665
665
|
@classmethod
|
666
|
+
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
|
666
667
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
667
668
|
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
668
669
|
|
@@ -20,6 +20,7 @@ from sky.clouds.service_catalog.data_fetchers import fetch_aws
|
|
20
20
|
from sky.utils import common_utils
|
21
21
|
from sky.utils import resources_utils
|
22
22
|
from sky.utils import rich_utils
|
23
|
+
from sky.utils import timeline
|
23
24
|
from sky.utils import ux_utils
|
24
25
|
|
25
26
|
if typing.TYPE_CHECKING:
|
@@ -100,6 +101,7 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
|
|
100
101
|
return az_mappings
|
101
102
|
|
102
103
|
|
104
|
+
@timeline.event
|
103
105
|
def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
|
104
106
|
"""Maps zone IDs (use1-az1) to zone names (us-east-1x).
|
105
107
|
|
sky/execution.py
CHANGED
@@ -267,6 +267,12 @@ def _execute(
|
|
267
267
|
# no-credential machine should not enter optimize(), which
|
268
268
|
# would directly error out ('No cloud is enabled...'). Fix
|
269
269
|
# by moving `sky check` checks out of optimize()?
|
270
|
+
|
271
|
+
controller = controller_utils.Controllers.from_name(
|
272
|
+
cluster_name)
|
273
|
+
if controller is not None:
|
274
|
+
logger.info(
|
275
|
+
f'Choosing resources for {controller.name}...')
|
270
276
|
dag = sky.optimize(dag, minimize=optimize_target)
|
271
277
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
272
278
|
assert task.best_resources is not None, task
|
sky/global_user_state.py
CHANGED
@@ -60,7 +60,8 @@ def create_table(cursor, conn):
|
|
60
60
|
owner TEXT DEFAULT null,
|
61
61
|
cluster_hash TEXT DEFAULT null,
|
62
62
|
storage_mounts_metadata BLOB DEFAULT null,
|
63
|
-
cluster_ever_up INTEGER DEFAULT 0
|
63
|
+
cluster_ever_up INTEGER DEFAULT 0,
|
64
|
+
status_updated_at INTEGER DEFAULT null)""")
|
64
65
|
|
65
66
|
# Table for Cluster History
|
66
67
|
# usage_intervals: List[Tuple[int, int]]
|
@@ -130,6 +131,10 @@ def create_table(cursor, conn):
|
|
130
131
|
# clusters were never really UP, setting it to 1 means they won't be
|
131
132
|
# auto-deleted during any failover.
|
132
133
|
value_to_replace_existing_entries=1)
|
134
|
+
|
135
|
+
db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
|
136
|
+
'INTEGER DEFAULT null')
|
137
|
+
|
133
138
|
conn.commit()
|
134
139
|
|
135
140
|
|
@@ -159,6 +164,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
159
164
|
status = status_lib.ClusterStatus.INIT
|
160
165
|
if ready:
|
161
166
|
status = status_lib.ClusterStatus.UP
|
167
|
+
status_updated_at = int(time.time())
|
162
168
|
|
163
169
|
# TODO (sumanth): Cluster history table will have multiple entries
|
164
170
|
# when the cluster failover through multiple regions (one entry per region).
|
@@ -191,7 +197,7 @@ def add_or_update_cluster(cluster_name: str,
|
|
191
197
|
# specified.
|
192
198
|
'(name, launched_at, handle, last_use, status, '
|
193
199
|
'autostop, to_down, metadata, owner, cluster_hash, '
|
194
|
-
'storage_mounts_metadata, cluster_ever_up) '
|
200
|
+
'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
|
195
201
|
'VALUES ('
|
196
202
|
# name
|
197
203
|
'?, '
|
@@ -228,7 +234,9 @@ def add_or_update_cluster(cluster_name: str,
|
|
228
234
|
'COALESCE('
|
229
235
|
'(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
|
230
236
|
# cluster_ever_up
|
231
|
-
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
|
237
|
+
'((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
|
238
|
+
# status_updated_at
|
239
|
+
'?'
|
232
240
|
')',
|
233
241
|
(
|
234
242
|
# name
|
@@ -260,6 +268,8 @@ def add_or_update_cluster(cluster_name: str,
|
|
260
268
|
# cluster_ever_up
|
261
269
|
cluster_name,
|
262
270
|
int(ready),
|
271
|
+
# status_updated_at
|
272
|
+
status_updated_at,
|
263
273
|
))
|
264
274
|
|
265
275
|
launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
|
@@ -330,11 +340,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
|
|
330
340
|
# stopped VM, which leads to timeout.
|
331
341
|
if hasattr(handle, 'stable_internal_external_ips'):
|
332
342
|
handle.stable_internal_external_ips = None
|
343
|
+
current_time = int(time.time())
|
333
344
|
_DB.cursor.execute(
|
334
|
-
'UPDATE clusters SET handle=(?), status=(?) '
|
335
|
-
'WHERE name=(?)', (
|
345
|
+
'UPDATE clusters SET handle=(?), status=(?), '
|
346
|
+
'status_updated_at=(?) WHERE name=(?)', (
|
336
347
|
pickle.dumps(handle),
|
337
348
|
status_lib.ClusterStatus.STOPPED.value,
|
349
|
+
current_time,
|
338
350
|
cluster_name,
|
339
351
|
))
|
340
352
|
_DB.conn.commit()
|
@@ -359,10 +371,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
|
|
359
371
|
|
360
372
|
def set_cluster_status(cluster_name: str,
|
361
373
|
status: status_lib.ClusterStatus) -> None:
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
374
|
+
current_time = int(time.time())
|
375
|
+
_DB.cursor.execute(
|
376
|
+
'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
|
377
|
+
(status.value, current_time, cluster_name))
|
366
378
|
count = _DB.cursor.rowcount
|
367
379
|
_DB.conn.commit()
|
368
380
|
assert count <= 1, count
|
@@ -570,15 +582,18 @@ def _load_storage_mounts_metadata(
|
|
570
582
|
|
571
583
|
def get_cluster_from_name(
|
572
584
|
cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
|
573
|
-
rows = _DB.cursor.execute(
|
574
|
-
|
585
|
+
rows = _DB.cursor.execute(
|
586
|
+
'SELECT name, launched_at, handle, last_use, status, autostop, '
|
587
|
+
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
588
|
+
'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
|
589
|
+
(cluster_name,)).fetchall()
|
575
590
|
for row in rows:
|
576
591
|
# Explicitly specify the number of fields to unpack, so that
|
577
592
|
# we can add new fields to the database in the future without
|
578
593
|
# breaking the previous code.
|
579
594
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
580
|
-
to_down, owner, cluster_hash, storage_mounts_metadata,
|
581
|
-
|
595
|
+
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
596
|
+
status_updated_at) = row[:13]
|
582
597
|
# TODO: use namedtuple instead of dict
|
583
598
|
record = {
|
584
599
|
'name': name,
|
@@ -594,6 +609,7 @@ def get_cluster_from_name(
|
|
594
609
|
'storage_mounts_metadata':
|
595
610
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
596
611
|
'cluster_ever_up': bool(cluster_ever_up),
|
612
|
+
'status_updated_at': status_updated_at,
|
597
613
|
}
|
598
614
|
return record
|
599
615
|
return None
|
@@ -601,12 +617,15 @@ def get_cluster_from_name(
|
|
601
617
|
|
602
618
|
def get_clusters() -> List[Dict[str, Any]]:
|
603
619
|
rows = _DB.cursor.execute(
|
604
|
-
'select
|
620
|
+
'select name, launched_at, handle, last_use, status, autostop, '
|
621
|
+
'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
|
622
|
+
'cluster_ever_up, status_updated_at from clusters '
|
623
|
+
'order by launched_at desc').fetchall()
|
605
624
|
records = []
|
606
625
|
for row in rows:
|
607
626
|
(name, launched_at, handle, last_use, status, autostop, metadata,
|
608
|
-
to_down, owner, cluster_hash, storage_mounts_metadata,
|
609
|
-
|
627
|
+
to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
|
628
|
+
status_updated_at) = row[:13]
|
610
629
|
# TODO: use namedtuple instead of dict
|
611
630
|
record = {
|
612
631
|
'name': name,
|
@@ -622,6 +641,7 @@ def get_clusters() -> List[Dict[str, Any]]:
|
|
622
641
|
'storage_mounts_metadata':
|
623
642
|
_load_storage_mounts_metadata(storage_mounts_metadata),
|
624
643
|
'cluster_ever_up': bool(cluster_ever_up),
|
644
|
+
'status_updated_at': status_updated_at,
|
625
645
|
}
|
626
646
|
|
627
647
|
records.append(record)
|
sky/utils/schemas.py
CHANGED
sky/utils/timeline.py
CHANGED
@@ -79,11 +79,9 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
|
|
79
79
|
class FileLockEvent:
|
80
80
|
"""Serve both as a file lock and event for the lock."""
|
81
81
|
|
82
|
-
def __init__(self, lockfile: Union[str, os.PathLike]):
|
82
|
+
def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
|
83
83
|
self._lockfile = lockfile
|
84
|
-
|
85
|
-
# pylint: disable=abstract-class-instantiated
|
86
|
-
self._lock = filelock.FileLock(self._lockfile)
|
84
|
+
self._lock = filelock.FileLock(self._lockfile, timeout)
|
87
85
|
self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
|
88
86
|
|
89
87
|
def acquire(self):
|
{skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/RECORD
RENAMED
@@ -1,14 +1,14 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=3SD_D0xLryHjVHya6CGXpUfvTyBigxV1kr8ujkBxa3g,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=2QrlLeMwKpVKYOBDwtgs9zkBvNgn9Rg3XKk9aE6_0eY,213418
|
6
6
|
sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
7
7
|
sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
|
8
8
|
sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
|
9
9
|
sky/exceptions.py,sha256=E3C2Ejcc8RUDAUQn7ar_Jr97C_AxD2rKKMmJOfLJ9d0,8965
|
10
|
-
sky/execution.py,sha256=
|
11
|
-
sky/global_user_state.py,sha256=
|
10
|
+
sky/execution.py,sha256=teXbprZ_2BYPr7HYT0-GJNxSDfBmNNdFqpms2xTvihM,27427
|
11
|
+
sky/global_user_state.py,sha256=ob3jvtG_yMPGvLlVScgeJ9pqk3FP4jhfEixw8WzFwho,29682
|
12
12
|
sky/optimizer.py,sha256=GjvKQIBtY3NlULzau_9tfa7V2KYVJRrmNrjKVIWCPIQ,59753
|
13
13
|
sky/resources.py,sha256=usmB8p7HyzyWHcevQ8HV6eIlukYJ9BC0trFOaE2kNuw,69049
|
14
14
|
sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
|
@@ -30,8 +30,8 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
|
|
30
30
|
sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
|
31
31
|
sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
|
32
32
|
sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
|
33
|
-
sky/backends/backend_utils.py,sha256=
|
34
|
-
sky/backends/cloud_vm_ray_backend.py,sha256=
|
33
|
+
sky/backends/backend_utils.py,sha256=vese_H1lOFL40r-LPhjqqM8OlgICDgEW-YtNQ95cYs8,125608
|
34
|
+
sky/backends/cloud_vm_ray_backend.py,sha256=REJ8bSe-QJzlytFes7hmxb3Nmx--zC7y3JDB2PAwv3Q,232316
|
35
35
|
sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
|
36
36
|
sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
|
37
37
|
sky/backends/wheel_utils.py,sha256=CUVOwlBtQjOMv-RSDGx2jMQ0M1D0w9ZPm0TDafJwBDI,8180
|
@@ -40,7 +40,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
40
|
sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
|
41
41
|
sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
|
42
42
|
sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
|
43
|
-
sky/clouds/aws.py,sha256=
|
43
|
+
sky/clouds/aws.py,sha256=um7-lam6BfYYgCvNAMsIY_Gty8wt8cOlCHsu3Ah3Od8,49616
|
44
44
|
sky/clouds/azure.py,sha256=38eUcB1_lt5FvDWo-G_pKIIsT1c_bCU2AifEYo7KX9Y,30756
|
45
45
|
sky/clouds/cloud.py,sha256=Y_9Hi2DhAbrqMLvb_NFPt--N5V6ua8BgbwV4xIc19KU,35216
|
46
46
|
sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
|
@@ -56,7 +56,7 @@ sky/clouds/runpod.py,sha256=_4myVdGIvQshkka8fn6mBXHgz5TZqhrNhAEM2_HrCT8,11487
|
|
56
56
|
sky/clouds/scp.py,sha256=NivPvzQxA90R37QR3fgTup8ScGfxKsXAhH0xklAj5QU,15817
|
57
57
|
sky/clouds/vsphere.py,sha256=ZzlcQBzv0aaRYXwZOrdKIGFK94LaOfDSV3lJBg9xyc4,12256
|
58
58
|
sky/clouds/service_catalog/__init__.py,sha256=cFZ3HLdQVa42xOhK2XxuB_xPIX4X1UR89InR4y2y_78,14757
|
59
|
-
sky/clouds/service_catalog/aws_catalog.py,sha256=
|
59
|
+
sky/clouds/service_catalog/aws_catalog.py,sha256=j33lNC5GXWK6CiGWZORCnumGlRODmCAT2_lfWp0YtBc,13106
|
60
60
|
sky/clouds/service_catalog/azure_catalog.py,sha256=5Q51x_WEKvQ2YSgJvZHRH3URlbwIstYuwpjaWW_wJlw,8149
|
61
61
|
sky/clouds/service_catalog/common.py,sha256=qHNLzh59W34CSSCCztu75n69TuGyDQ310SQc_P-t544,27700
|
62
62
|
sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZuLPvEVYA,1793
|
@@ -140,7 +140,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
|
|
140
140
|
sky/provision/kubernetes/instance.py,sha256=lHD1cVVEMZFLNnd7_UCVr079SY9D3dH1X3abMBRRtdI,47103
|
141
141
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
142
142
|
sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
|
143
|
-
sky/provision/kubernetes/utils.py,sha256=
|
143
|
+
sky/provision/kubernetes/utils.py,sha256=4Gqn66jE0Nzs4UBjajfUM4E4mXWYnYzdhLz6SQG44Kg,101244
|
144
144
|
sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
|
145
145
|
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
|
146
146
|
sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
|
@@ -257,9 +257,9 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
|
|
257
257
|
sky/utils/log_utils.py,sha256=oZYF45uC7GFjAqO-Je-aiX6zhtq91TP-KKaIbQNF-jY,14024
|
258
258
|
sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
|
259
259
|
sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
|
260
|
-
sky/utils/schemas.py,sha256=
|
260
|
+
sky/utils/schemas.py,sha256=yz8IKUA2oWJartaranIc9MfDZmZcIybPZUGViw1Ii1Q,29475
|
261
261
|
sky/utils/subprocess_utils.py,sha256=4pnt_QPvPuh3ylG5xlr18JlZeF6693h3fmG1uaD8qLo,8669
|
262
|
-
sky/utils/timeline.py,sha256=
|
262
|
+
sky/utils/timeline.py,sha256=ebHxKJK2HX0utGArrUgSezTPkcwav3VETa_AQS34t-E,3925
|
263
263
|
sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
|
264
264
|
sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
|
265
265
|
sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -275,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
275
275
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
276
276
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
277
277
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
283
|
-
skypilot_nightly-1.0.0.
|
278
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
279
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/METADATA,sha256=n63PW0hLjDH_3izIXS8s1yxVvm9WtasQaSh8eQshN30,19699
|
280
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
281
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
282
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
283
|
+
skypilot_nightly-1.0.0.dev20241116.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241115.dist-info → skypilot_nightly-1.0.0.dev20241116.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|