skypilot-nightly 1.0.0.dev20250807-py3-none-any.whl → 1.0.0.dev20250808-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (61)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -1
  3. sky/backends/cloud_vm_ray_backend.py +9 -2
  4. sky/client/cli/command.py +40 -26
  5. sky/client/sdk.py +132 -65
  6. sky/client/sdk_async.py +1 -1
  7. sky/core.py +5 -2
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → -DXZksWqf2waNHeU9YTQe}/_buildManifest.js +1 -1
  10. sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
  15. sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  25. sky/dashboard/out/jobs.html +1 -1
  26. sky/dashboard/out/users.html +1 -1
  27. sky/dashboard/out/volumes.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/execution.py +6 -4
  32. sky/global_user_state.py +8 -1
  33. sky/jobs/client/sdk.py +27 -20
  34. sky/jobs/controller.py +2 -1
  35. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  36. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  37. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  38. sky/serve/client/impl.py +11 -8
  39. sky/serve/client/sdk.py +7 -7
  40. sky/serve/serve_state.py +437 -340
  41. sky/serve/server/impl.py +2 -2
  42. sky/server/common.py +12 -8
  43. sky/server/constants.py +1 -1
  44. sky/setup_files/alembic.ini +4 -0
  45. sky/utils/cli_utils/status_utils.py +1 -1
  46. sky/utils/db/db_utils.py +31 -0
  47. sky/utils/db/migration_utils.py +5 -1
  48. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  49. sky/utils/resource_checker.py +162 -21
  50. sky/volumes/client/sdk.py +4 -4
  51. sky/workspaces/core.py +210 -6
  52. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +2 -2
  53. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +58 -55
  54. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
  55. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
  56. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
  57. /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
  58. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
  59. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
  60. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
  61. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'a167cba8230b0ffda6baa0c825fa0eb5d5ab4aa4'
+_SKYPILOT_COMMIT_SHA = 'eb83a691489c0c37aae9c22f607469ff78a74e34'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250807'
+__version__ = '1.0.0.dev20250808'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py CHANGED
@@ -121,6 +121,7 @@ CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
+WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -2760,6 +2761,9 @@ def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _include_is_managed: bool = False,
 ) -> List[Dict[str, Any]]:
     """Returns a list of cached or optionally refreshed cluster records.
 
@@ -2780,6 +2784,8 @@ def get_clusters(
             names.
         all_users: If True, return clusters from all users. If False, only
             return clusters from the current user.
+        _include_is_managed: Whether to force include clusters created by the
+            controller.
 
     Returns:
         A list of cluster records. If the cluster does not exist or has been
@@ -2788,6 +2794,13 @@ def get_clusters(
     records = global_user_state.get_clusters()
     current_user = common_utils.get_current_user()
 
+    # Filter out clusters created by the controller.
+    if (not env_options.Options.SHOW_DEBUG_INFO.get() and
+            not _include_is_managed):
+        records = [
+            record for record in records if not record.get('is_managed', False)
+        ]
+
     # Filter by user if requested
     if not all_users:
         records = [
@@ -3221,7 +3234,8 @@ def get_endpoints(cluster: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Invalid endpoint {port!r}.') from None
     cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
-                                   cluster_names=[cluster])
+                                   cluster_names=[cluster],
+                                   _include_is_managed=True)
     if not cluster_records:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterNotUpError(
@@ -3311,3 +3325,8 @@ def cluster_status_lock_id(cluster_name: str) -> str:
 def cluster_file_mounts_lock_id(cluster_name: str) -> str:
     """Get the lock ID for cluster file mounts operations."""
     return f'{cluster_name}_file_mounts'
+
+
+def workspace_lock_id(workspace_name: str) -> str:
+    """Get the lock ID for workspace operations."""
+    return f'{workspace_name}_workspace'
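The `is_managed` flag referenced above appears to be backed by the new `004_is_managed.py` migration listed in the file list: clusters created by a controller are hidden from `get_clusters` unless debug output is enabled or an internal caller opts in with `_include_is_managed=True`, as `get_endpoints` now does. A minimal, self-contained sketch of that filtering rule, using plain record dicts and hypothetical names rather than SkyPilot's real modules:

from typing import Any, Dict, List

def filter_managed(records: List[Dict[str, Any]],
                   show_debug_info: bool = False,
                   include_is_managed: bool = False) -> List[Dict[str, Any]]:
    """Drop records flagged `is_managed` unless the caller asks for them."""
    if show_debug_info or include_is_managed:
        return records
    return [r for r in records if not r.get('is_managed', False)]

# A controller-created cluster is hidden by default, but kept when an
# internal caller (like get_endpoints above) opts in.
records = [{'name': 'dev', 'is_managed': False},
           {'name': 'sky-jobs-controller-abcd', 'is_managed': True}]
assert [r['name'] for r in filter_managed(records)] == ['dev']
assert len(filter_managed(records, include_is_managed=True)) == 2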
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1177,7 +1177,8 @@ class RetryingVmProvisioner(object):
                  local_wheel_path: pathlib.Path,
                  wheel_hash: str,
                  blocked_resources: Optional[Iterable[
-                     resources_lib.Resources]] = None):
+                     resources_lib.Resources]] = None,
+                 is_managed: Optional[bool] = None):
         self._blocked_resources: Set[resources_lib.Resources] = set()
         if blocked_resources:
             # blocked_resources is not None and not empty.
@@ -1189,6 +1190,7 @@ class RetryingVmProvisioner(object):
         self._requested_features = requested_features
         self._local_wheel_path = local_wheel_path
         self._wheel_hash = wheel_hash
+        self._is_managed = is_managed
 
     def _yield_zones(
             self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1522,6 +1524,7 @@ class RetryingVmProvisioner(object):
                 cluster_handle=handle,
                 requested_resources=requested_resources,
                 ready=False,
+                is_managed=self._is_managed,
             )
 
             global_user_state.set_owner_identity_for_cluster(
@@ -2753,6 +2756,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = None
         self._optimize_target = None
         self._requested_features = set()
+        self._dump_final_script = False
+        self._is_managed = False
 
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2769,6 +2774,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
+        self._is_managed = kwargs.pop('is_managed', False)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'
 
     def check_resources_fit_cluster(
@@ -2990,7 +2996,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._requested_features,
             local_wheel_path,
             wheel_hash,
-            blocked_resources=task.blocked_resources)
+            blocked_resources=task.blocked_resources,
+            is_managed=self._is_managed)
         log_path = os.path.join(self.log_dir, 'provision.log')
         rich_utils.force_update_status(
             ux_utils.spinner_message('Launching', log_path))
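The backend now accepts an `is_managed` override through the same `**kwargs` mechanism it already used for `requested_features` and `dump_final_script` (most likely its `register_info` path, though the hunk does not show the method name), and threads it into `RetryingVmProvisioner` and from there into the cluster record. A hedged, self-contained sketch of that kwargs-threading pattern with illustrative names, not SkyPilot's real classes:

from typing import Any

class MiniBackend:
    """Toy stand-in showing the pop-with-default plus trailing-assert pattern."""

    def __init__(self) -> None:
        self._dump_final_script = False
        self._is_managed = False

    def register_info(self, **kwargs: Any) -> None:
        # Each recognized option is consumed with a default fallback...
        self._dump_final_script = kwargs.pop('dump_final_script',
                                             self._dump_final_script)
        self._is_managed = kwargs.pop('is_managed', self._is_managed)
        # ...so any leftover key is a caller bug, caught immediately.
        assert not kwargs, f'Unexpected kwargs: {kwargs}'

backend = MiniBackend()
backend.register_info(is_managed=True)
assert backend._is_managed is True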
sky/client/cli/command.py CHANGED
@@ -35,7 +35,7 @@ import sys
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
-                    Union)
+                    TypeVar, Union)
 
 import click
 import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
                               '`sky jobs launch`. `{command}` supports a '
                               'single task only.')
 
+T = TypeVar('T')
+
 
 def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
     return list(set(glob_storages))
 
 
-def _async_call_or_wait(request_id: str, async_call: bool,
-                        request_name: str) -> Any:
+def _async_call_or_wait(request_id: server_common.RequestId[T],
+                        async_call: bool, request_name: str) -> Any:
     short_request_id = request_id[:8]
    if not async_call:
         try:
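`_async_call_or_wait` and the handlers below now annotate request IDs with the type of payload the request resolves to. The real `RequestId` presumably lives in `sky/server/common.py`, which also changed in this release; based on how it is used here (sliced like a string, subscripted with a result type, and constructible with no argument as a falsy sentinel), a plausible minimal shape is a generic `str` subclass. This is an assumption-labeled sketch, not the actual implementation:

from typing import Generic, TypeVar

T = TypeVar('T')

class RequestId(str, Generic[T]):
    """A request ID annotated with the type its result decodes to."""

    def __new__(cls, value: str = '') -> 'RequestId':
        return super().__new__(cls, value)

# Usage mirroring the CLI changes: annotate helpers with the payload type,
# slice for a short display form, and use RequestId() as a falsy default.
req = RequestId('a1b2c3d4e5f6')   # would be written RequestId[List[dict]] in annotations
assert req[:8] == 'a1b2c3d4'      # still behaves like a plain str
assert not RequestId()            # empty sentinel, like the old '' default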
@@ -1411,7 +1413,7 @@ def exec(
 
 
 def _handle_jobs_queue_request(
-    request_id: str,
+    request_id: server_common.RequestId[List[Dict[str, Any]]],
     show_all: bool,
     show_user: bool,
     max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(
 
 
 def _handle_services_request(
-    request_id: str,
+    request_id: server_common.RequestId[List[Dict[str, Any]]],
     service_names: Optional[List[str]],
     show_all: bool,
     show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                                  skip_finished=True,
                                  all_users=all_users)
 
-    def submit_services() -> Optional[str]:
+    def submit_services(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         return serve_lib.status(service_names=None)
 
-    def submit_pools() -> Optional[str]:
+    def submit_pools(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         try:
             return managed_jobs.pool_status(pool_names=None)
         except exceptions.APINotSupportedError as e:
             logger.debug(f'Pools are not supported in the remote server: {e}')
             return None
 
-    def submit_workspace() -> Optional[str]:
+    def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
         try:
             return sdk.workspaces()
         except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if not (ip or show_endpoints):
         workspace_request_id = workspace_request_future.result()
 
-    managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
-                                     else managed_jobs_queue_request_id)
-    service_status_request_id = ('' if not service_status_request_id else
+    managed_jobs_queue_request_id = (server_common.RequestId()
+                                     if not managed_jobs_queue_request_id else
+                                     managed_jobs_queue_request_id)
+    service_status_request_id = (server_common.RequestId()
+                                 if not service_status_request_id else
                                  service_status_request_id)
-    pool_status_request_id = ('' if not pool_status_request_id else
+    pool_status_request_id = (server_common.RequestId()
+                              if not pool_status_request_id else
                               pool_status_request_id)
 
     # Phase 3: Get cluster records and handle special cases
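The conditional fallbacks above exist because a submit helper can return None when the remote API server does not support a feature (pools, workspaces); normalizing to an empty, falsy request ID keeps a single typed code path downstream. A tiny self-contained illustration with hypothetical names, using a plain empty string where the CLI now uses an empty `server_common.RequestId()`:

from typing import Optional

def submit_pools_maybe(server_supports_pools: bool) -> Optional[str]:
    # Stand-in for managed_jobs.pool_status(...): a request ID, or None if
    # the API server predates the feature.
    return 'req-1234abcd' if server_supports_pools else None

request_id = submit_pools_maybe(server_supports_pools=False) or ''
if request_id:
    print(f'Would fetch pool status for request {request_id[:8]}')
else:
    print('Pools not supported by this API server; section skipped.')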
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if workspace_request_id is not None:
         all_workspaces = sdk.get(workspace_request_id)
     else:
-        all_workspaces = [constants.SKYPILOT_DEFAULT_WORKSPACE]
+        all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
     active_workspace = skypilot_config.get_active_workspace()
     show_workspace = len(all_workspaces) > 1
     _show_enabled_infra(active_workspace, show_workspace)
@@ -3836,7 +3843,7 @@ def show_gpus(
             yield k8s_messages
             yield '\n\n'
 
-        result = sdk.stream_and_get(
+        list_accelerator_counts_result = sdk.stream_and_get(
             sdk.list_accelerator_counts(
                 gpus_only=True,
                 clouds=clouds_to_list,
@@ -3853,14 +3860,20 @@ def show_gpus(
 
         # "Common" GPUs
         for gpu in catalog.get_common_gpus():
-            if gpu in result:
-                gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))])
+            if gpu in list_accelerator_counts_result:
+                gpu_table.add_row([
+                    gpu,
+                    _list_to_str(list_accelerator_counts_result.pop(gpu))
+                ])
         yield from gpu_table.get_string()
 
         # Google TPUs
         for tpu in catalog.get_tpus():
-            if tpu in result:
-                tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
+            if tpu in list_accelerator_counts_result:
+                tpu_table.add_row([
+                    tpu,
+                    _list_to_str(list_accelerator_counts_result.pop(tpu))
+                ])
         if tpu_table.get_string():
             yield '\n\n'
             yield from tpu_table.get_string()
@@ -3868,7 +3881,7 @@ def show_gpus(
         # Other GPUs
         if show_all:
             yield '\n\n'
-            for gpu, qty in sorted(result.items()):
+            for gpu, qty in sorted(list_accelerator_counts_result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
             yield '\n\n'
@@ -3919,7 +3932,7 @@ def show_gpus(
 
         # For clouds other than Kubernetes, get the accelerator details
         # Case-sensitive
-        result = sdk.stream_and_get(
+        list_accelerators_result = sdk.stream_and_get(
             sdk.list_accelerators(gpus_only=True,
                                   name_filter=name,
                                   quantity_filter=quantity,
@@ -3935,8 +3948,8 @@ def show_gpus(
         # - Group by cloud
         # - Sort within each group by prices
         # - Sort groups by each cloud's (min price, min spot price)
-        new_result = {}
-        for i, (gpu, items) in enumerate(result.items()):
+        new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             df = pd.DataFrame([t._asdict() for t in items])
             # Determine the minimum prices for each cloud.
             min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
@@ -3954,14 +3967,14 @@ def show_gpus(
                 for row in df.to_records(index=False)
             ]
             new_result[gpu] = sorted_dataclasses
-        result = new_result
+        list_accelerators_result = new_result
 
         if print_section_titles and not show_all:
             yield '\n\n'
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
 
-        if not result:
+        if not list_accelerators_result:
             quantity_str = (f' with requested quantity {quantity}'
                             if quantity else '')
             cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3982,7 @@ def show_gpus(
             yield 'To show available accelerators, run: sky show-gpus --all'
             return
 
-        for i, (gpu, items) in enumerate(result.items()):
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             accelerator_table_headers = [
                 'GPU',
                 'QTY',
@@ -6039,7 +6052,8 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     if request_id is not None and log_path is not None:
         raise click.BadParameter(
             'Only one of request ID and log path can be provided.')
-    sdk.stream_and_get(request_id, log_path, tail)
+    sdk.stream_and_get(server_common.RequestId[None](request_id), log_path,
+                       tail)
 
 
 @api.command('cancel', cls=_DocumentedCodeCommand)