skypilot-nightly 1.0.0.dev20241115__py3-none-any.whl → 1.0.0.dev20241117__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'a404e3fc9bee7f0865f4118cfdd158de2b51ee28'
+_SKYPILOT_COMMIT_SHA = 'ed4329a724aa583dd02e325e824fd1e36ee32aaf'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241115'
+__version__ = '1.0.0.dev20241117'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -100,6 +100,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
 CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
 CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 
+# Time that must elapse since the last status check before we should re-check
+# if the cluster has been terminated or autostopped.
+_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
+
 # Filelocks for updating cluster's file_mounts.
 CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
     '~/.sky/.{}_file_mounts.lock')
@@ -686,26 +690,56 @@ def write_cluster_config(
         skypilot_config.get_nested(
             (str(to_provision.cloud).lower(), 'specific_reservations'), set()))
 
+    # Remote identity handling can have 4 cases:
+    # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
+    # 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
+    # 3. Custom service account: Use specified service account
+    # 4. NO_UPLOAD: Do not upload any credentials
+    #
+    # We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
+    # other cases, we exclude the cloud from credential file uploads after
+    # running required checks.
     assert cluster_name is not None
-    excluded_clouds = []
+    excluded_clouds = set()
     remote_identity_config = skypilot_config.get_nested(
         (str(cloud).lower(), 'remote_identity'), None)
     remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
     if isinstance(remote_identity_config, str):
         remote_identity = remote_identity_config
     if isinstance(remote_identity_config, list):
+        # Some clouds (e.g., AWS) support specifying multiple service accounts
+        # chosen based on the cluster name. Do the matching here to pick the
+        # correct one.
         for profile in remote_identity_config:
             if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
                 remote_identity = list(profile.values())[0]
                 break
     if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
-        if not cloud.supports_service_account_on_remote():
+        # If LOCAL_CREDENTIALS is not specified, we add the cloud to the
+        # excluded_clouds set, but we must also check if the cloud supports
+        # service accounts.
+        if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
+            # If NO_UPLOAD is specified, fall back to default remote identity
+            # for downstream logic but add it to excluded_clouds to skip
+            # credential file uploads.
+            remote_identity = schemas.get_default_remote_identity(
+                str(cloud).lower())
+        elif not cloud.supports_service_account_on_remote():
             raise exceptions.InvalidCloudConfigs(
                 'remote_identity: SERVICE_ACCOUNT is specified in '
                 f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
                 'is not supported by this cloud. Remove the config or set: '
                 '`remote_identity: LOCAL_CREDENTIALS`.')
-        excluded_clouds = [cloud]
+        excluded_clouds.add(cloud)
+
+    for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
+        remote_identity_config = skypilot_config.get_nested(
+            (cloud_str.lower(), 'remote_identity'), None)
+        if remote_identity_config:
+            if (remote_identity_config ==
+                    schemas.RemoteIdentityOptions.NO_UPLOAD.value):
+                excluded_clouds.add(cloud_obj)
+
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
     auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
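
Note on the profile-matching loop above: the first profile whose fnmatch-style key matches the cluster name wins. A minimal, self-contained sketch of that selection rule; the patterns and identities below are hypothetical, not taken from this diff:

    # Sketch of fnmatch-based remote-identity selection (hypothetical data).
    import fnmatch
    from typing import Dict, List, Optional

    def pick_remote_identity(cluster_name: str,
                             profiles: List[Dict[str, str]]) -> Optional[str]:
        """Return the identity of the first profile whose pattern matches."""
        for profile in profiles:
            pattern, identity = next(iter(profile.items()))
            if fnmatch.fnmatchcase(cluster_name, pattern):
                return identity
        return None

    profiles = [
        {'train-*': 'arn:aws:iam::123456789012:instance-profile/train'},
        {'*': 'LOCAL_CREDENTIALS'},  # Fallback: upload local credentials.
    ]
    print(pick_remote_identity('train-gpu-1', profiles))  # .../train
    print(pick_remote_identity('dev-box', profiles))      # LOCAL_CREDENTIALS
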
@@ -1669,11 +1703,27 @@ def check_can_clone_disk_and_override_task(
 
 def _update_cluster_status_no_lock(
         cluster_name: str) -> Optional[Dict[str, Any]]:
-    """Updates the status of the cluster.
+    """Update the cluster status.
+
+    The cluster status is updated by checking the ray cluster and the real
+    status from the cloud.
+
+    The function will update the cached cluster status in the global state. For
+    the design of the cluster status and transition, please refer to
+    sky/design_docs/cluster_status.md
+
+    Returns:
+        If the cluster is terminated or does not exist, return None. Otherwise
+        returns the input record with status and handle potentially updated.
 
     Raises:
+        exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+            not the same as the user who created the cluster.
+        exceptions.CloudUserIdentityError: if we fail to get the current user
+            identity.
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
-            fetched from the cloud provider.
+            fetched from the cloud provider, or there are leaked nodes causing
+            the node number to be larger than expected.
     """
     record = global_user_state.get_cluster_from_name(cluster_name)
     if record is None:
@@ -1893,52 +1943,22 @@ def _update_cluster_status_no_lock(
     return global_user_state.get_cluster_from_name(cluster_name)
 
 
-def _update_cluster_status(
-        cluster_name: str,
-        acquire_per_cluster_status_lock: bool,
-        cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
-) -> Optional[Dict[str, Any]]:
-    """Update the cluster status.
-
-    The cluster status is updated by checking ray cluster and real status from
-    cloud.
-
-    The function will update the cached cluster status in the global state. For
-    the design of the cluster status and transition, please refer to the
-    sky/design_docs/cluster_status.md
-
-    Args:
-        cluster_name: The name of the cluster.
-        acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
-            before updating the status.
-        cluster_status_lock_timeout: The timeout to acquire the per-cluster
-            lock.
-
-    Returns:
-        If the cluster is terminated or does not exist, return None. Otherwise
-        returns the input record with status and handle potentially updated.
+def _must_refresh_cluster_status(
+        record: Dict[str, Any],
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
+) -> bool:
+    force_refresh_for_cluster = (force_refresh_statuses is not None and
+                                 record['status'] in force_refresh_statuses)
 
-    Raises:
-        exceptions.ClusterOwnerIdentityMismatchError: if the current user is
-            not the same as the user who created the cluster.
-        exceptions.CloudUserIdentityError: if we fail to get the current user
-            identity.
-        exceptions.ClusterStatusFetchingError: the cluster status cannot be
-            fetched from the cloud provider or there are leaked nodes causing
-            the node number larger than expected.
-    """
-    if not acquire_per_cluster_status_lock:
-        return _update_cluster_status_no_lock(cluster_name)
+    use_spot = record['handle'].launched_resources.use_spot
+    has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
+                    record['autostop'] >= 0)
+    recently_refreshed = (record['status_updated_at'] is not None and
+                          time.time() - record['status_updated_at'] <
+                          _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
+    is_stale = (use_spot or has_autostop) and not recently_refreshed
 
-    try:
-        with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
-                               timeout=cluster_status_lock_timeout):
-            return _update_cluster_status_no_lock(cluster_name)
-    except filelock.Timeout:
-        logger.debug('Refreshing status: Failed get the lock for cluster '
-                     f'{cluster_name!r}. Using the cached status.')
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        return record
+    return force_refresh_for_cluster or is_stale
 
 
 def refresh_cluster_record(
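
Note on the predicate above: a refresh is forced when the caller passes a matching status set; otherwise it happens only for clusters whose liveness can change behind SkyPilot's back (spot or autostop-enabled) and whose cached status is older than the 2-second window. A standalone sketch of the same rule, with a plain dict standing in for the real cluster record:

    # Standalone sketch of the refresh predicate (simplified record fields).
    import time
    from typing import Any, Dict, Optional, Set

    CACHE_DURATION_SECONDS = 2

    def must_refresh(record: Dict[str, Any],
                     force_statuses: Optional[Set[str]] = None) -> bool:
        forced = (force_statuses is not None and
                  record['status'] in force_statuses)
        volatile = record['use_spot'] or (record['status'] != 'STOPPED' and
                                          record['autostop'] >= 0)
        fresh = (record['status_updated_at'] is not None and
                 time.time() - record['status_updated_at'] <
                 CACHE_DURATION_SECONDS)
        return forced or (volatile and not fresh)

    record = {'status': 'UP', 'use_spot': True, 'autostop': -1,
              'status_updated_at': time.time()}
    print(must_refresh(record))   # False: spot, but checked <2s ago.
    record['status_updated_at'] -= 10
    print(must_refresh(record))   # True: spot and the cache is stale.
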
@@ -1956,16 +1976,22 @@ def refresh_cluster_record(
 
     Args:
         cluster_name: The name of the cluster.
-        force_refresh_statuses: if specified, refresh the cluster if it has one of
-            the specified statuses. Additionally, clusters satisfying the
-            following conditions will always be refreshed no matter the
-            argument is specified or not:
-            1. is a spot cluster, or
-            2. is a non-spot cluster, is not STOPPED, and autostop is set.
+        force_refresh_statuses: if specified, refresh the cluster if it has one
+            of the specified statuses. Additionally, clusters satisfying the
+            following conditions will be refreshed whether or not the argument
+            is specified:
+            - the latest available status update is more than
+              _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
+              1. the cluster is a spot cluster, or
+              2. cluster autostop is set and the cluster is not STOPPED.
         acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
-            before updating the status.
+            before updating the status. Even if this is True, the lock may not
+            be acquired if the status does not need to be refreshed.
         cluster_status_lock_timeout: The timeout to acquire the per-cluster
-            lock. If timeout, the function will use the cached status.
+            lock. If timeout, the function will use the cached status. If the
+            value is <0, do not timeout (wait for the lock indefinitely). By
+            default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS.
+            Warning: if correctness is required, you must set this to -1.
 
     Returns:
         If the cluster is terminated or does not exist, return None.
@@ -1986,19 +2012,58 @@ def refresh_cluster_record(
         return None
     check_owner_identity(cluster_name)
 
-    handle = record['handle']
-    if isinstance(handle, backends.CloudVmRayResourceHandle):
-        use_spot = handle.launched_resources.use_spot
-        has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
-                        record['autostop'] >= 0)
-        force_refresh_for_cluster = (force_refresh_statuses is not None and
-                                     record['status'] in force_refresh_statuses)
-        if force_refresh_for_cluster or has_autostop or use_spot:
-            record = _update_cluster_status(
-                cluster_name,
-                acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
-                cluster_status_lock_timeout=cluster_status_lock_timeout)
-    return record
+    if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
+        return record
+
+    # The loop logic allows us to notice if the status was updated in the
+    # global_user_state by another process and stop trying to get the lock.
+    # The core loop logic is adapted from FileLock's implementation.
+    lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+    start_time = time.perf_counter()
+
+    # Loop until we have an up-to-date status or until we acquire the lock.
+    while True:
+        # Check to see if we can return the cached status.
+        if not _must_refresh_cluster_status(record, force_refresh_statuses):
+            return record
+
+        if not acquire_per_cluster_status_lock:
+            return _update_cluster_status_no_lock(cluster_name)
+
+        # Try to acquire the lock so we can fetch the status.
+        try:
+            with lock.acquire(blocking=False):
+                # Lock acquired.
+
+                # Check the cluster status again, since it could have been
+                # updated between our last check and acquiring the lock.
+                record = global_user_state.get_cluster_from_name(cluster_name)
+                if record is None or not _must_refresh_cluster_status(
+                        record, force_refresh_statuses):
+                    return record
+
+                # Update and return the cluster status.
+                return _update_cluster_status_no_lock(cluster_name)
+        except filelock.Timeout:
+            # lock.acquire() will throw a Timeout exception if the lock is not
+            # available and we have blocking=False.
+            pass
+
+        # Logic adapted from FileLock.acquire().
+        # If cluster_status_lock_timeout is <0, we will never hit this. No
+        # timeout. Otherwise, if we have timed out, return the cached status.
+        # This has the potential to cause correctness issues, but if so it is
+        # the caller's responsibility to set the timeout to -1.
+        if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
+            logger.debug('Refreshing status: Failed get the lock for cluster '
+                         f'{cluster_name!r}. Using the cached status.')
+            return record
+        time.sleep(0.05)
+
+        # Refresh for next loop iteration.
+        record = global_user_state.get_cluster_from_name(cluster_name)
+        if record is None:
+            return None
 
 
 @timeline.event
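
The loop above is a double-checked pattern: return the cache when it is fresh, try the lock without blocking, re-read the record once the lock is held, and otherwise poll until the caller's timeout elapses. A condensed sketch of the same pattern using the filelock package; read_record, needs_refresh, and do_refresh are hypothetical stand-ins for the SkyPilot helpers:

    # Condensed sketch of the non-blocking, double-checked locking loop.
    import time
    import filelock

    def refresh_with_lock(lock_path, read_record, needs_refresh, do_refresh,
                          timeout: float):
        lock = filelock.FileLock(lock_path)
        start = time.perf_counter()
        record = read_record()
        while True:
            if record is None or not needs_refresh(record):
                return record  # Cache is fresh; never touch the lock.
            try:
                # blocking=False raises filelock.Timeout if already held.
                with lock.acquire(blocking=False):
                    # Re-check: another process may have refreshed the record
                    # while we were waiting.
                    record = read_record()
                    if record is None or not needs_refresh(record):
                        return record
                    return do_refresh()
            except filelock.Timeout:
                pass
            if 0 <= timeout < time.perf_counter() - start:
                return record  # Give up; serve the possibly stale cache.
            time.sleep(0.05)
            record = read_record()
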
@@ -3558,7 +3558,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
 
         try:
-            with filelock.FileLock(
+            with timeline.FileLockEvent(
                     lock_path,
                     backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
                 self.teardown_no_lock(
sky/cli.py CHANGED
@@ -3699,13 +3699,24 @@ def jobs_launch(
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
-    click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
-                fg='cyan')
     dag, _ = admin_policy_utils.apply(
         dag, use_mutated_config_in_current_request=False)
-    dag = sky.optimize(dag)
 
-    if not yes:
+    if yes:
+        # Skip resource preview if -y is set, since we are probably running in
+        # a script and the user won't have a chance to review it anyway.
+        # This can save a couple of seconds.
+        click.secho(
+            f'Resources for managed job {dag.name!r} will be computed on the '
+            'managed jobs controller, since --yes is set.',
+            fg='cyan')
+
+    else:
+        click.secho(
+            f'Managed job {dag.name!r} will be launched on (estimated):',
+            fg='cyan')
+        dag = sky.optimize(dag)
+
         prompt = f'Launching a managed job {dag.name!r}. Proceed?'
         if prompt is not None:
             click.confirm(prompt, default=True, abort=True, show_default=True)
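
A minimal sketch (hypothetical command, not SkyPilot's actual CLI) of the fast path this hunk adds: when the user pre-confirms with -y/--yes, the expensive optimizer preview is skipped entirely, since nobody will read it before the prompt:

    import click

    @click.command()
    @click.option('--yes', '-y', is_flag=True, help='Skip confirmation.')
    def launch(yes: bool) -> None:
        if yes:
            # Fast path: no preview; resources are computed later anyway.
            click.secho('Skipping resource preview (--yes).', fg='cyan')
        else:
            click.secho('Estimated resources: ...', fg='cyan')  # Slow preview.
            click.confirm('Proceed?', default=True, abort=True,
                          show_default=True)
        click.echo('Launching...')

    if __name__ == '__main__':
        launch()
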
sky/clouds/aws.py CHANGED
@@ -663,6 +663,7 @@ class AWS(clouds.Cloud):
             return AWSIdentityType.SHARED_CREDENTIALS_FILE
 
     @classmethod
+    @functools.lru_cache(maxsize=1)  # Cache since getting identity is slow.
     def get_user_identities(cls) -> Optional[List[List[str]]]:
         """Returns a [UserId, Account] list that uniquely identifies the user.
 
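
Note on the decorator stacking above: placing functools.lru_cache under @classmethod memoizes the result keyed on the class, so the slow identity lookup runs once per process. A small sketch of the pattern (the slow call is faked with a sleep):

    import functools
    import time

    class Identity:

        @classmethod
        @functools.lru_cache(maxsize=1)
        def get(cls) -> str:
            time.sleep(1)  # Stand-in for a slow cloud identity call.
            return 'user-123'

    start = time.perf_counter()
    Identity.get()  # Slow: hits the (fake) backend.
    Identity.get()  # Fast: served from the cache.
    print(f'{time.perf_counter() - start:.2f}s')  # ~1.00s, not ~2s.
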
@@ -20,6 +20,7 @@ from sky.clouds.service_catalog.data_fetchers import fetch_aws
 from sky.utils import common_utils
 from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -100,6 +101,7 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
     return az_mappings
 
 
+@timeline.event
 def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
     """Maps zone IDs (use1-az1) to zone names (us-east-1x).
 
@@ -46,6 +46,7 @@ GPU_TO_MEMORY = {
     'RTX6000': 24576,
     'V100': 16384,
     'H100': 81920,
+    'GH200': 98304,
     'GENERAL': None
 }
 
sky/execution.py CHANGED
@@ -267,6 +267,12 @@ def _execute(
                 # no-credential machine should not enter optimize(), which
                 # would directly error out ('No cloud is enabled...'). Fix
                 # by moving `sky check` checks out of optimize()?
+
+                controller = controller_utils.Controllers.from_name(
+                    cluster_name)
+                if controller is not None:
+                    logger.info(
+                        f'Choosing resources for {controller.name}...')
                 dag = sky.optimize(dag, minimize=optimize_target)
                 task = dag.tasks[0]  # Keep: dag may have been deep-copied.
                 assert task.best_resources is not None, task
sky/global_user_state.py CHANGED
@@ -60,7 +60,8 @@ def create_table(cursor, conn):
         owner TEXT DEFAULT null,
         cluster_hash TEXT DEFAULT null,
         storage_mounts_metadata BLOB DEFAULT null,
-        cluster_ever_up INTEGER DEFAULT 0)""")
+        cluster_ever_up INTEGER DEFAULT 0,
+        status_updated_at INTEGER DEFAULT null)""")
 
     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -130,6 +131,10 @@ def create_table(cursor, conn):
         # clusters were never really UP, setting it to 1 means they won't be
         # auto-deleted during any failover.
         value_to_replace_existing_entries=1)
+
+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
+                                 'INTEGER DEFAULT null')
+
     conn.commit()
 
 
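
db_utils.add_column_to_table is SkyPilot's additive-migration helper. The general pattern is an ALTER TABLE guarded against the column already existing, so the migration is safe to re-run on every startup. A hedged sketch of that pattern in plain sqlite3 (the helper body shown here is illustrative, not the actual implementation):

    import sqlite3

    def add_column_if_missing(conn: sqlite3.Connection, table: str,
                              column: str, decl: str) -> None:
        cols = [row[1] for row in conn.execute(f'PRAGMA table_info({table})')]
        if column not in cols:
            conn.execute(f'ALTER TABLE {table} ADD COLUMN {column} {decl}')
            conn.commit()

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE clusters (name TEXT PRIMARY KEY)')
    add_column_if_missing(conn, 'clusters', 'status_updated_at',
                          'INTEGER DEFAULT null')
    add_column_if_missing(conn, 'clusters', 'status_updated_at',
                          'INTEGER DEFAULT null')  # No-op the second time.
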
@@ -159,6 +164,7 @@ def add_or_update_cluster(cluster_name: str,
         status = status_lib.ClusterStatus.INIT
     if ready:
         status = status_lib.ClusterStatus.UP
+    status_updated_at = int(time.time())
 
     # TODO (sumanth): Cluster history table will have multiple entries
     # when the cluster failover through multiple regions (one entry per region).
@@ -191,7 +197,7 @@ def add_or_update_cluster(cluster_name: str,
         # specified.
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up) '
+        'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
         'VALUES ('
         # name
         '?, '
@@ -228,7 +234,9 @@ def add_or_update_cluster(cluster_name: str,
         'COALESCE('
         '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null), '
         # cluster_ever_up
-        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?)'
+        '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
+        # status_updated_at
+        '?'
         ')',
         (
             # name
@@ -260,6 +268,8 @@ def add_or_update_cluster(cluster_name: str,
             # cluster_ever_up
             cluster_name,
             int(ready),
+            # status_updated_at
+            status_updated_at,
         ))
 
     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -330,11 +340,13 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
         # stopped VM, which leads to timeout.
         if hasattr(handle, 'stable_internal_external_ips'):
             handle.stable_internal_external_ips = None
+        current_time = int(time.time())
         _DB.cursor.execute(
-            'UPDATE clusters SET handle=(?), status=(?) '
-            'WHERE name=(?)', (
+            'UPDATE clusters SET handle=(?), status=(?), '
+            'status_updated_at=(?) WHERE name=(?)', (
                 pickle.dumps(handle),
                 status_lib.ClusterStatus.STOPPED.value,
+                current_time,
                 cluster_name,
             ))
         _DB.conn.commit()
@@ -359,10 +371,10 @@ def get_glob_cluster_names(cluster_name: str) -> List[str]:
 
 def set_cluster_status(cluster_name: str,
                        status: status_lib.ClusterStatus) -> None:
-    _DB.cursor.execute('UPDATE clusters SET status=(?) WHERE name=(?)', (
-        status.value,
-        cluster_name,
-    ))
+    current_time = int(time.time())
+    _DB.cursor.execute(
+        'UPDATE clusters SET status=(?), status_updated_at=(?) WHERE name=(?)',
+        (status.value, current_time, cluster_name))
     count = _DB.cursor.rowcount
     _DB.conn.commit()
     assert count <= 1, count
@@ -570,15 +582,18 @@ def _load_storage_mounts_metadata(
 
 def get_cluster_from_name(
         cluster_name: Optional[str]) -> Optional[Dict[str, Any]]:
-    rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)',
-                              (cluster_name,)).fetchall()
+    rows = _DB.cursor.execute(
+        'SELECT name, launched_at, handle, last_use, status, autostop, '
+        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
+        'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
+        (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
         # we can add new fields to the database in the future without
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata,
-         cluster_ever_up) = row[:12]
+         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
+         status_updated_at) = row[:13]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -594,6 +609,7 @@ def get_cluster_from_name(
             'storage_mounts_metadata':
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
+            'status_updated_at': status_updated_at,
         }
         return record
     return None
@@ -601,12 +617,15 @@ def get_cluster_from_name(
 
 def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
-        'select * from clusters order by launched_at desc').fetchall()
+        'select name, launched_at, handle, last_use, status, autostop, '
+        'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
+        'cluster_ever_up, status_updated_at from clusters '
+        'order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
-         to_down, owner, cluster_hash, storage_mounts_metadata,
-         cluster_ever_up) = row[:12]
+         to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
+         status_updated_at) = row[:13]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -622,6 +641,7 @@ def get_clusters() -> List[Dict[str, Any]]:
             'storage_mounts_metadata':
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
+            'status_updated_at': status_updated_at,
         }
 
         records.append(record)
@@ -1693,6 +1693,8 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
             else:
                 destination[key].extend(value)
         else:
+            if destination is None:
+                destination = {}
             destination[key] = value
 
 
sky/utils/schemas.py CHANGED
@@ -663,6 +663,7 @@ class RemoteIdentityOptions(enum.Enum):
     """
     LOCAL_CREDENTIALS = 'LOCAL_CREDENTIALS'
     SERVICE_ACCOUNT = 'SERVICE_ACCOUNT'
+    NO_UPLOAD = 'NO_UPLOAD'
 
 
 def get_default_remote_identity(cloud: str) -> str:
sky/utils/timeline.py CHANGED
@@ -79,11 +79,9 @@ def event(name_or_fn: Union[str, Callable], message: Optional[str] = None):
 class FileLockEvent:
     """Serve both as a file lock and event for the lock."""
 
-    def __init__(self, lockfile: Union[str, os.PathLike]):
+    def __init__(self, lockfile: Union[str, os.PathLike], timeout: float = -1):
         self._lockfile = lockfile
-        # TODO(mraheja): remove pylint disabling when filelock version updated
-        # pylint: disable=abstract-class-instantiated
-        self._lock = filelock.FileLock(self._lockfile)
+        self._lock = filelock.FileLock(self._lockfile, timeout)
         self._hold_lock_event = Event(f'[FileLock.hold]:{self._lockfile}')
 
     def acquire(self):
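
With the filelock package, a timeout of -1 means block indefinitely, so the new default preserves FileLockEvent's previous behavior while letting callers opt into a bounded wait. A small sketch of the two modes (the lock path is illustrative):

    import filelock

    blocking_lock = filelock.FileLock('/tmp/demo.lock', timeout=-1)
    with blocking_lock:      # Waits as long as it takes to acquire.
        print('held (blocking mode)')

    bounded_lock = filelock.FileLock('/tmp/demo.lock', timeout=0.1)
    try:
        with bounded_lock:   # Raises filelock.Timeout after ~0.1s if busy.
            print('held (bounded mode)')
    except filelock.Timeout:
        print('lock busy; giving up')
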
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.dev20241115
+Version: 1.0.0.dev20241117
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -1,14 +1,14 @@
-sky/__init__.py,sha256=Z1KWPa9F9FO2X9cCqN-yr4pThakcQfTxVeuiH_sd-eM,5882
+sky/__init__.py,sha256=FKXdRPZUmWj3JCCvbzjdeGgAjXo6aW7Cr2VNUxzzINE,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
-sky/cli.py,sha256=xigcV79-9ceMHiix9m5fvTwpJUkPBfpLWWNwY7_auY0,213013
+sky/cli.py,sha256=2QrlLeMwKpVKYOBDwtgs9zkBvNgn9Rg3XKk9aE6_0eY,213418
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
 sky/core.py,sha256=0-4W_DKJZgbwXuzNZKQ2R_qJxqxbqqNfyi0U0PQBKvQ,38230
 sky/dag.py,sha256=f3sJlkH4bE6Uuz3ozNtsMhcBpRx7KmC9Sa4seDKt4hU,3104
 sky/exceptions.py,sha256=E3C2Ejcc8RUDAUQn7ar_Jr97C_AxD2rKKMmJOfLJ9d0,8965
-sky/execution.py,sha256=4qSxCCTWk7vKSclIx1k03PTZaSQ0MXnoihGYYB6O7QU,27155
-sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
+sky/execution.py,sha256=teXbprZ_2BYPr7HYT0-GJNxSDfBmNNdFqpms2xTvihM,27427
+sky/global_user_state.py,sha256=ob3jvtG_yMPGvLlVScgeJ9pqk3FP4jhfEixw8WzFwho,29682
 sky/optimizer.py,sha256=GjvKQIBtY3NlULzau_9tfa7V2KYVJRrmNrjKVIWCPIQ,59753
 sky/resources.py,sha256=usmB8p7HyzyWHcevQ8HV6eIlukYJ9BC0trFOaE2kNuw,69049
 sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
@@ -30,8 +30,8 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
 sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
-sky/backends/backend_utils.py,sha256=Eb9DE-ePtwBNtN4lgt3jFhnbLnN3SlibmKyqAtlpgVE,122075
-sky/backends/cloud_vm_ray_backend.py,sha256=vEvyhlUK8-EbumF-shyDU4ZR3qeMhJuz_Qy45KmDqQI,232311
+sky/backends/backend_utils.py,sha256=vese_H1lOFL40r-LPhjqqM8OlgICDgEW-YtNQ95cYs8,125608
+sky/backends/cloud_vm_ray_backend.py,sha256=REJ8bSe-QJzlytFes7hmxb3Nmx--zC7y3JDB2PAwv3Q,232316
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
 sky/backends/wheel_utils.py,sha256=CUVOwlBtQjOMv-RSDGx2jMQ0M1D0w9ZPm0TDafJwBDI,8180
@@ -40,7 +40,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
 sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
 sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
-sky/clouds/aws.py,sha256=2STW4eaCEtxre96yVagUcewNHiYGmxHKITNEQvgBmww,49539
+sky/clouds/aws.py,sha256=um7-lam6BfYYgCvNAMsIY_Gty8wt8cOlCHsu3Ah3Od8,49616
 sky/clouds/azure.py,sha256=38eUcB1_lt5FvDWo-G_pKIIsT1c_bCU2AifEYo7KX9Y,30756
 sky/clouds/cloud.py,sha256=Y_9Hi2DhAbrqMLvb_NFPt--N5V6ua8BgbwV4xIc19KU,35216
 sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
@@ -56,7 +56,7 @@ sky/clouds/runpod.py,sha256=_4myVdGIvQshkka8fn6mBXHgz5TZqhrNhAEM2_HrCT8,11487
 sky/clouds/scp.py,sha256=NivPvzQxA90R37QR3fgTup8ScGfxKsXAhH0xklAj5QU,15817
 sky/clouds/vsphere.py,sha256=ZzlcQBzv0aaRYXwZOrdKIGFK94LaOfDSV3lJBg9xyc4,12256
 sky/clouds/service_catalog/__init__.py,sha256=cFZ3HLdQVa42xOhK2XxuB_xPIX4X1UR89InR4y2y_78,14757
-sky/clouds/service_catalog/aws_catalog.py,sha256=vTI7h5bjZg3lItT9RBaSwY1Fl0vX5UN1CgMDM6-C1pw,13059
+sky/clouds/service_catalog/aws_catalog.py,sha256=j33lNC5GXWK6CiGWZORCnumGlRODmCAT2_lfWp0YtBc,13106
 sky/clouds/service_catalog/azure_catalog.py,sha256=5Q51x_WEKvQ2YSgJvZHRH3URlbwIstYuwpjaWW_wJlw,8149
 sky/clouds/service_catalog/common.py,sha256=qHNLzh59W34CSSCCztu75n69TuGyDQ310SQc_P-t544,27700
 sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZuLPvEVYA,1793
@@ -79,7 +79,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzE
 sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
 sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
 sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=VrTTkMF5AjiplfDmvPBW-otR3oXGU3-oFouVMfIua4Q,33447
-sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=yOPmmckiQ0HU6bKXWd7YdTrsF2sql3Bs_jYNpuxlo0I,4942
+sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=MN54h0CAGPHQAeF2eTmuESq3b0-d1kDARRUM6OkivCk,4962
 sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
 sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
@@ -140,7 +140,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
 sky/provision/kubernetes/instance.py,sha256=lHD1cVVEMZFLNnd7_UCVr079SY9D3dH1X3abMBRRtdI,47103
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
-sky/provision/kubernetes/utils.py,sha256=0W5p5HtvVZrGSim_H4n2x_AwKRPGMZToQwoN0qI3qbA,101175
+sky/provision/kubernetes/utils.py,sha256=4Gqn66jE0Nzs4UBjajfUM4E4mXWYnYzdhLz6SQG44Kg,101244
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -257,9 +257,9 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
 sky/utils/log_utils.py,sha256=oZYF45uC7GFjAqO-Je-aiX6zhtq91TP-KKaIbQNF-jY,14024
 sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
 sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
-sky/utils/schemas.py,sha256=67LK87wBywblIyF-QgG5hgL1BvBuHsxeQLQBO0M5OH4,29447
+sky/utils/schemas.py,sha256=yz8IKUA2oWJartaranIc9MfDZmZcIybPZUGViw1Ii1Q,29475
 sky/utils/subprocess_utils.py,sha256=4pnt_QPvPuh3ylG5xlr18JlZeF6693h3fmG1uaD8qLo,8669
-sky/utils/timeline.py,sha256=q6jNZ0NO-mVNC3nUplfWOrb6Y68i5OB65xnkMblAcT4,4028
+sky/utils/timeline.py,sha256=ebHxKJK2HX0utGArrUgSezTPkcwav3VETa_AQS34t-E,3925
 sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
 sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -275,9 +275,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20241115.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20241115.dist-info/METADATA,sha256=f0iTz_zN3tt2o0Ty5TdCfcNH98fuUWJPFVz8dHVsybM,19699
-skypilot_nightly-1.0.0.dev20241115.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-skypilot_nightly-1.0.0.dev20241115.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20241115.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20241115.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20241117.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241117.dist-info/METADATA,sha256=JVbQomtCB0I-POH2-81YkHd0E28F51ikvaUmgIGYa8g,19699
+skypilot_nightly-1.0.0.dev20241117.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+skypilot_nightly-1.0.0.dev20241117.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241117.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241117.dist-info/RECORD,,