skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (91)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +57 -7
  4. sky/backends/cloud_vm_ray_backend.py +50 -8
  5. sky/client/cli/command.py +60 -26
  6. sky/client/sdk.py +132 -65
  7. sky/client/sdk_async.py +1 -1
  8. sky/core.py +10 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +21 -4
  34. sky/global_user_state.py +110 -1
  35. sky/jobs/client/sdk.py +27 -20
  36. sky/jobs/controller.py +2 -1
  37. sky/jobs/recovery_strategy.py +3 -0
  38. sky/jobs/server/core.py +4 -0
  39. sky/jobs/utils.py +9 -2
  40. sky/provision/__init__.py +3 -2
  41. sky/provision/aws/instance.py +5 -4
  42. sky/provision/azure/instance.py +5 -4
  43. sky/provision/cudo/instance.py +5 -4
  44. sky/provision/do/instance.py +5 -4
  45. sky/provision/fluidstack/instance.py +5 -4
  46. sky/provision/gcp/instance.py +5 -4
  47. sky/provision/hyperbolic/instance.py +5 -4
  48. sky/provision/kubernetes/instance.py +36 -6
  49. sky/provision/lambda_cloud/instance.py +5 -4
  50. sky/provision/nebius/instance.py +5 -4
  51. sky/provision/oci/instance.py +5 -4
  52. sky/provision/paperspace/instance.py +5 -4
  53. sky/provision/provisioner.py +6 -0
  54. sky/provision/runpod/instance.py +5 -4
  55. sky/provision/scp/instance.py +5 -5
  56. sky/provision/vast/instance.py +5 -5
  57. sky/provision/vsphere/instance.py +5 -4
  58. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  59. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  60. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  61. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  62. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  63. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  64. sky/serve/client/impl.py +11 -8
  65. sky/serve/client/sdk.py +7 -7
  66. sky/serve/serve_state.py +437 -340
  67. sky/serve/serve_utils.py +37 -3
  68. sky/serve/server/impl.py +2 -2
  69. sky/server/common.py +12 -8
  70. sky/server/constants.py +1 -1
  71. sky/setup_files/alembic.ini +4 -0
  72. sky/skypilot_config.py +4 -4
  73. sky/users/permission.py +1 -1
  74. sky/utils/cli_utils/status_utils.py +10 -1
  75. sky/utils/db/db_utils.py +53 -1
  76. sky/utils/db/migration_utils.py +5 -1
  77. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  78. sky/utils/resource_checker.py +162 -21
  79. sky/volumes/client/sdk.py +4 -4
  80. sky/workspaces/core.py +210 -6
  81. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
  82. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
  83. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
  85. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
  87. /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
  88. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
  89. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
  90. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'a167cba8230b0ffda6baa0c825fa0eb5d5ab4aa4'
8
+ _SKYPILOT_COMMIT_SHA = '1e311e80f4a9112a6d2c86bb78d4c225042cedbc'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250807'
38
+ __version__ = '1.0.0.dev20250812'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -142,8 +142,11 @@ def _load_config(context: Optional[str] = None):
142
142
  # show up in SkyPilot tasks. For now, we work around by using
143
143
  # DNS name instead of environment variables.
144
144
  # See issue: https://github.com/skypilot-org/skypilot/issues/2287
145
- os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
146
- os.environ['KUBERNETES_SERVICE_PORT'] = '443'
145
+ # Only set if not already present (preserving existing values)
146
+ if 'KUBERNETES_SERVICE_HOST' not in os.environ:
147
+ os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
148
+ if 'KUBERNETES_SERVICE_PORT' not in os.environ:
149
+ os.environ['KUBERNETES_SERVICE_PORT'] = '443'
147
150
  kubernetes.config.load_incluster_config()
148
151
  except kubernetes.config.config_exception.ConfigException:
149
152
  _load_config_from_kubeconfig()
@@ -121,6 +121,7 @@ CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
121
121
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
122
122
 
123
123
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
124
+ WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
124
125
 
125
126
  # Remote dir that holds our runtime files.
126
127
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -1772,8 +1773,9 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
1772
1773
 
1773
1774
  def _query_cluster_status_via_cloud_api(
1774
1775
  handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
1775
- ) -> List[status_lib.ClusterStatus]:
1776
- """Returns the status of the cluster.
1776
+ ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
1777
+ """Returns the status of the cluster as a list of tuples corresponding
1778
+ to the node status and an optional reason string for said status.
1777
1779
 
1778
1780
  Raises:
1779
1781
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
@@ -1812,9 +1814,13 @@ def _query_cluster_status_via_cloud_api(
1812
1814
  region = provider_config.get('region') or provider_config.get(
1813
1815
  'location')
1814
1816
  zone = ray_config['provider'].get('availability_zone')
1817
+ # TODO (kyuds): refactor cloud.query_status api to include reason.
1818
+ # Currently not refactoring as this API is actually supposed to be
1819
+ # deprecated soon.
1815
1820
  node_statuses = cloud.query_status(
1816
1821
  cluster_name_on_cloud,
1817
1822
  tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
1823
+ node_statuses = [(status, None) for status in node_statuses]
1818
1824
  return node_statuses
1819
1825
 
1820
1826
 
@@ -2014,8 +2020,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2014
2020
 
2015
2021
  node_statuses = _query_cluster_status_via_cloud_api(handle)
2016
2022
 
2017
- all_nodes_up = (all(
2018
- status == status_lib.ClusterStatus.UP for status in node_statuses) and
2023
+ all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
2024
+ for status in node_statuses) and
2019
2025
  len(node_statuses) == handle.launched_nodes)
2020
2026
 
2021
2027
  def get_node_counts_from_ray_status(
@@ -2120,6 +2126,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2120
2126
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
2121
2127
  # head-ip/worker-ips`.
2122
2128
  record['status'] = status_lib.ClusterStatus.UP
2129
+ # Add cluster event for instance status check.
2130
+ global_user_state.add_cluster_event(
2131
+ cluster_name,
2132
+ status_lib.ClusterStatus.UP,
2133
+ 'All nodes up + ray cluster healthy.',
2134
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2135
+ nop_if_duplicate=True)
2123
2136
  global_user_state.add_or_update_cluster(cluster_name,
2124
2137
  handle,
2125
2138
  requested_resources=None,
@@ -2204,9 +2217,19 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2204
2217
  # regardless of the ray cluster's health.
2205
2218
  # (2) Otherwise, we will reset the autostop setting, unless the cluster is
2206
2219
  # autostopping/autodowning.
2207
- is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or any(
2208
- status != status_lib.ClusterStatus.STOPPED for status in node_statuses))
2220
+ some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
2221
+ some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
2222
+ for status in node_statuses)
2223
+ is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
2224
+
2209
2225
  if is_abnormal:
2226
+ status_reason = ', '.join(
2227
+ [status[1] for status in node_statuses if status[1] is not None])
2228
+
2229
+ if some_nodes_terminated:
2230
+ init_reason = f'one or more nodes terminated ({status_reason})'
2231
+ elif some_nodes_not_stopped:
2232
+ init_reason = f'some nodes are up and some nodes are stopped ({status_reason})'
2210
2233
  logger.debug('The cluster is abnormal. Setting to INIT status. '
2211
2234
  f'node_statuses: {node_statuses}')
2212
2235
  if record['autostop'] >= 0:
@@ -2290,6 +2313,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2290
2313
  # represent that the cluster is partially preempted.
2291
2314
  # TODO(zhwu): the definition of INIT should be audited/changed.
2292
2315
  # Adding a new status UNHEALTHY for abnormal status can be a choice.
2316
+ global_user_state.add_cluster_event(
2317
+ cluster_name,
2318
+ status_lib.ClusterStatus.INIT,
2319
+ f'Cluster is abnormal because {init_reason}. Transitioned to INIT.',
2320
+ global_user_state.ClusterEventType.STATUS_CHANGE,
2321
+ nop_if_duplicate=True)
2293
2322
  global_user_state.add_or_update_cluster(cluster_name,
2294
2323
  handle,
2295
2324
  requested_resources=None,
@@ -2300,6 +2329,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2300
2329
  # STOPPED.
2301
2330
  backend = backends.CloudVmRayBackend()
2302
2331
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
2332
+ global_user_state.add_cluster_event(
2333
+ cluster_name, None, 'All nodes stopped, terminating cluster.',
2334
+ global_user_state.ClusterEventType.STATUS_CHANGE)
2303
2335
  return global_user_state.get_cluster_from_name(cluster_name)
2304
2336
 
2305
2337
 
@@ -2760,6 +2792,9 @@ def get_clusters(
2760
2792
  refresh: common.StatusRefreshMode,
2761
2793
  cluster_names: Optional[Union[str, List[str]]] = None,
2762
2794
  all_users: bool = True,
2795
+ # Internal only:
2796
+ # pylint: disable=invalid-name
2797
+ _include_is_managed: bool = False,
2763
2798
  ) -> List[Dict[str, Any]]:
2764
2799
  """Returns a list of cached or optionally refreshed cluster records.
2765
2800
 
@@ -2780,6 +2815,8 @@ def get_clusters(
2780
2815
  names.
2781
2816
  all_users: If True, return clusters from all users. If False, only
2782
2817
  return clusters from the current user.
2818
+ _include_is_managed: Whether to force include clusters created by the
2819
+ controller.
2783
2820
 
2784
2821
  Returns:
2785
2822
  A list of cluster records. If the cluster does not exist or has been
@@ -2788,6 +2825,13 @@ def get_clusters(
2788
2825
  records = global_user_state.get_clusters()
2789
2826
  current_user = common_utils.get_current_user()
2790
2827
 
2828
+ # Filter out clusters created by the controller.
2829
+ if (not env_options.Options.SHOW_DEBUG_INFO.get() and
2830
+ not _include_is_managed):
2831
+ records = [
2832
+ record for record in records if not record.get('is_managed', False)
2833
+ ]
2834
+
2791
2835
  # Filter by user if requested
2792
2836
  if not all_users:
2793
2837
  records = [
@@ -3221,7 +3265,8 @@ def get_endpoints(cluster: str,
3221
3265
  with ux_utils.print_exception_no_traceback():
3222
3266
  raise ValueError(f'Invalid endpoint {port!r}.') from None
3223
3267
  cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
3224
- cluster_names=[cluster])
3268
+ cluster_names=[cluster],
3269
+ _include_is_managed=True)
3225
3270
  if not cluster_records:
3226
3271
  with ux_utils.print_exception_no_traceback():
3227
3272
  raise exceptions.ClusterNotUpError(
@@ -3311,3 +3356,8 @@ def cluster_status_lock_id(cluster_name: str) -> str:
3311
3356
  def cluster_file_mounts_lock_id(cluster_name: str) -> str:
3312
3357
  """Get the lock ID for cluster file mounts operations."""
3313
3358
  return f'{cluster_name}_file_mounts'
3359
+
3360
+
3361
+ def workspace_lock_id(workspace_name: str) -> str:
3362
+ """Get the lock ID for workspace operations."""
3363
+ return f'{workspace_name}_workspace'
@@ -1177,7 +1177,8 @@ class RetryingVmProvisioner(object):
1177
1177
  local_wheel_path: pathlib.Path,
1178
1178
  wheel_hash: str,
1179
1179
  blocked_resources: Optional[Iterable[
1180
- resources_lib.Resources]] = None):
1180
+ resources_lib.Resources]] = None,
1181
+ is_managed: Optional[bool] = None):
1181
1182
  self._blocked_resources: Set[resources_lib.Resources] = set()
1182
1183
  if blocked_resources:
1183
1184
  # blocked_resources is not None and not empty.
@@ -1189,6 +1190,7 @@ class RetryingVmProvisioner(object):
1189
1190
  self._requested_features = requested_features
1190
1191
  self._local_wheel_path = local_wheel_path
1191
1192
  self._wheel_hash = wheel_hash
1193
+ self._is_managed = is_managed
1192
1194
 
1193
1195
  def _yield_zones(
1194
1196
  self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1522,8 +1524,16 @@ class RetryingVmProvisioner(object):
1522
1524
  cluster_handle=handle,
1523
1525
  requested_resources=requested_resources,
1524
1526
  ready=False,
1527
+ is_managed=self._is_managed,
1525
1528
  )
1526
1529
 
1530
+ # Add cluster event for actual provisioning start.
1531
+ global_user_state.add_cluster_event(
1532
+ cluster_name, status_lib.ClusterStatus.INIT,
1533
+ f'Provisioning on {to_provision.cloud.display_name()} ' +
1534
+ f'in {to_provision.region}',
1535
+ global_user_state.ClusterEventType.STATUS_CHANGE)
1536
+
1527
1537
  global_user_state.set_owner_identity_for_cluster(
1528
1538
  cluster_name, cloud_user_identity)
1529
1539
 
@@ -2753,6 +2763,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2753
2763
  self._dag = None
2754
2764
  self._optimize_target = None
2755
2765
  self._requested_features = set()
2766
+ self._dump_final_script = False
2767
+ self._is_managed = False
2756
2768
 
2757
2769
  # Command for running the setup script. It is only set when the
2758
2770
  # setup needs to be run outside the self._setup() and as part of
@@ -2769,6 +2781,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2769
2781
  self._requested_features = kwargs.pop('requested_features',
2770
2782
  self._requested_features)
2771
2783
  self._dump_final_script = kwargs.pop('dump_final_script', False)
2784
+ self._is_managed = kwargs.pop('is_managed', False)
2772
2785
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2773
2786
 
2774
2787
  def check_resources_fit_cluster(
@@ -2930,10 +2943,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2930
2943
  skip_unnecessary_provisioning)
2931
2944
  except locks.LockTimeout:
2932
2945
  if not communicated_with_user:
2933
- logger.info(f'{colorama.Fore.YELLOW}'
2934
- f'Launching delayed, check concurrent tasks: '
2935
- f'sky api status')
2936
- communicated_with_user = True
2946
+ rich_utils.force_update_status(
2947
+ ux_utils.spinner_message('Launching - blocked by ' +
2948
+ 'other requests ' +
2949
+ colorama.Style.RESET_ALL +
2950
+ colorama.Style.DIM +
2951
+ 'Check concurrent requests: ' +
2952
+ 'sky api status '))
2937
2953
 
2938
2954
  def _locked_provision(
2939
2955
  self,
@@ -2990,7 +3006,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2990
3006
  self._requested_features,
2991
3007
  local_wheel_path,
2992
3008
  wheel_hash,
2993
- blocked_resources=task.blocked_resources)
3009
+ blocked_resources=task.blocked_resources,
3010
+ is_managed=self._is_managed)
2994
3011
  log_path = os.path.join(self.log_dir, 'provision.log')
2995
3012
  rich_utils.force_update_status(
2996
3013
  ux_utils.spinner_message('Launching', log_path))
@@ -3000,6 +3017,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3000
3017
  break
3001
3018
  except exceptions.ResourcesUnavailableError as e:
3002
3019
  log_path = retry_provisioner.log_dir + '/provision.log'
3020
+
3003
3021
  error_message = (
3004
3022
  f'{colorama.Fore.RED}Failed to provision all '
3005
3023
  f'possible launchable resources.'
@@ -3016,6 +3034,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3016
3034
  hint_message = (f'\n{retry_message} '
3017
3035
  f'{ux_utils.log_path_hint(log_path)}'
3018
3036
  f'{colorama.Style.RESET_ALL}')
3037
+
3038
+ # Add cluster event for retry.
3039
+ global_user_state.add_cluster_event(
3040
+ cluster_name, status_lib.ClusterStatus.INIT,
3041
+ f'Retrying provisioning after {gap_seconds:.0f}s',
3042
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3043
+
3019
3044
  raise exceptions.ExecutionRetryableError(
3020
3045
  error_message,
3021
3046
  hint=hint_message,
@@ -3067,6 +3092,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3067
3092
  # and other necessary files to the VM.
3068
3093
  # 3. Run setup commands to install dependencies.
3069
3094
  # 4. Starting ray cluster and skylet.
3095
+
3096
+ # Add cluster event for runtime setup start
3097
+ global_user_state.add_cluster_event(
3098
+ handle.cluster_name, status_lib.ClusterStatus.INIT,
3099
+ 'Setting up SkyPilot runtime on cluster',
3100
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3101
+
3070
3102
  cluster_info = provisioner.post_provision_runtime_setup(
3071
3103
  repr(handle.launched_resources.cloud),
3072
3104
  resources_utils.ClusterName(handle.cluster_name,
@@ -3252,6 +3284,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3252
3284
  config_hash=config_hash,
3253
3285
  task_config=user_specified_task_config,
3254
3286
  )
3287
+
3288
+ # Add cluster event for successful provisioning.
3289
+ global_user_state.add_cluster_event(
3290
+ handle.cluster_name, status_lib.ClusterStatus.UP,
3291
+ 'Cluster successfully provisioned with ' +
3292
+ f'{handle.launched_nodes} nodes',
3293
+ global_user_state.ClusterEventType.STATUS_CHANGE)
3294
+
3255
3295
  usage_lib.messages.usage.update_final_cluster_status(
3256
3296
  status_lib.ClusterStatus.UP)
3257
3297
  # We still add the cluster to ssh config file on API server, this
@@ -4624,8 +4664,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4624
4664
  non_terminated_only=False)
4625
4665
 
4626
4666
  unexpected_node_state: Optional[Tuple[str, str]] = None
4627
- for node_id, node_status in node_status_dict.items():
4628
- logger.debug(f'{node_id} status: {node_status}')
4667
+ for node_id, node_status_tuple in node_status_dict.items():
4668
+ node_status, reason = node_status_tuple
4669
+ reason = '' if reason is None else f' ({reason})'
4670
+ logger.debug(f'{node_id} status: {node_status}{reason}')
4629
4671
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4630
4672
  # between "stopping/stopped" and "terminating/terminated",
4631
4673
  # so we allow for either status instead of casing on
sky/client/cli/command.py CHANGED
@@ -35,7 +35,7 @@ import sys
35
35
  import traceback
36
36
  import typing
37
37
  from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
38
- Union)
38
+ TypeVar, Union)
39
39
 
40
40
  import click
41
41
  import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
116
116
  '`sky jobs launch`. `{command}` supports a '
117
117
  'single task only.')
118
118
 
119
+ T = TypeVar('T')
120
+
119
121
 
120
122
  def _get_cluster_records_and_set_ssh_config(
121
123
  clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
224
226
  return list(set(glob_storages))
225
227
 
226
228
 
227
- def _async_call_or_wait(request_id: str, async_call: bool,
228
- request_name: str) -> Any:
229
+ def _async_call_or_wait(request_id: server_common.RequestId[T],
230
+ async_call: bool, request_name: str) -> Any:
229
231
  short_request_id = request_id[:8]
230
232
  if not async_call:
231
233
  try:
@@ -1411,7 +1413,7 @@ def exec(
1411
1413
 
1412
1414
 
1413
1415
  def _handle_jobs_queue_request(
1414
- request_id: str,
1416
+ request_id: server_common.RequestId[List[Dict[str, Any]]],
1415
1417
  show_all: bool,
1416
1418
  show_user: bool,
1417
1419
  max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(
1492
1494
 
1493
1495
 
1494
1496
  def _handle_services_request(
1495
- request_id: str,
1497
+ request_id: server_common.RequestId[List[Dict[str, Any]]],
1496
1498
  service_names: Optional[List[str]],
1497
1499
  show_all: bool,
1498
1500
  show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1879
1881
  skip_finished=True,
1880
1882
  all_users=all_users)
1881
1883
 
1882
- def submit_services() -> Optional[str]:
1884
+ def submit_services(
1885
+ ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
1883
1886
  return serve_lib.status(service_names=None)
1884
1887
 
1885
- def submit_pools() -> Optional[str]:
1888
+ def submit_pools(
1889
+ ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
1886
1890
  try:
1887
1891
  return managed_jobs.pool_status(pool_names=None)
1888
1892
  except exceptions.APINotSupportedError as e:
1889
1893
  logger.debug(f'Pools are not supported in the remote server: {e}')
1890
1894
  return None
1891
1895
 
1892
- def submit_workspace() -> Optional[str]:
1896
+ def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
1893
1897
  try:
1894
1898
  return sdk.workspaces()
1895
1899
  except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1928
1932
  if not (ip or show_endpoints):
1929
1933
  workspace_request_id = workspace_request_future.result()
1930
1934
 
1931
- managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
1932
- else managed_jobs_queue_request_id)
1933
- service_status_request_id = ('' if not service_status_request_id else
1935
+ managed_jobs_queue_request_id = (server_common.RequestId()
1936
+ if not managed_jobs_queue_request_id else
1937
+ managed_jobs_queue_request_id)
1938
+ service_status_request_id = (server_common.RequestId()
1939
+ if not service_status_request_id else
1934
1940
  service_status_request_id)
1935
- pool_status_request_id = ('' if not pool_status_request_id else
1941
+ pool_status_request_id = (server_common.RequestId()
1942
+ if not pool_status_request_id else
1936
1943
  pool_status_request_id)
1937
1944
 
1938
1945
  # Phase 3: Get cluster records and handle special cases
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1957
1964
  if workspace_request_id is not None:
1958
1965
  all_workspaces = sdk.get(workspace_request_id)
1959
1966
  else:
1960
- all_workspaces = [constants.SKYPILOT_DEFAULT_WORKSPACE]
1967
+ all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
1961
1968
  active_workspace = skypilot_config.get_active_workspace()
1962
1969
  show_workspace = len(all_workspaces) > 1
1963
1970
  _show_enabled_infra(active_workspace, show_workspace)
@@ -2974,6 +2981,8 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2974
2981
  controller = controller_utils.Controllers.from_name(controller_name)
2975
2982
  assert controller is not None, controller_name
2976
2983
 
2984
+ # TODO(tian): We also need to check pools after we allow running pools on
2985
+ # jobs controller.
2977
2986
  with rich_utils.client_status(
2978
2987
  '[bold cyan]Checking for in-progress managed jobs[/]'):
2979
2988
  try:
@@ -3070,6 +3079,21 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
3070
3079
  # controller being STOPPED or being firstly launched, i.e., there is
3071
3080
  # no in-prgress services.
3072
3081
  services = []
3082
+ except exceptions.InconsistentConsolidationModeError:
3083
+ # If this error is raised, it means the user switched to the
3084
+ # consolidation mode but the previous controller cluster is still
3085
+ # running. We should allow the user to tear down the controller
3086
+ # cluster in this case.
3087
+ with skypilot_config.override_skypilot_config(
3088
+ {'serve': {
3089
+ 'controller': {
3090
+ 'consolidation_mode': False
3091
+ }
3092
+ }}):
3093
+ # Check again with the consolidation mode disabled. This is to
3094
+ # make sure there is no in-progress services.
3095
+ request_id = serve_lib.status(service_names=None)
3096
+ services = sdk.stream_and_get(request_id)
3073
3097
 
3074
3098
  if services:
3075
3099
  service_names = [service['name'] for service in services]
@@ -3836,7 +3860,7 @@ def show_gpus(
3836
3860
  yield k8s_messages
3837
3861
  yield '\n\n'
3838
3862
 
3839
- result = sdk.stream_and_get(
3863
+ list_accelerator_counts_result = sdk.stream_and_get(
3840
3864
  sdk.list_accelerator_counts(
3841
3865
  gpus_only=True,
3842
3866
  clouds=clouds_to_list,
@@ -3853,14 +3877,20 @@ def show_gpus(
3853
3877
 
3854
3878
  # "Common" GPUs
3855
3879
  for gpu in catalog.get_common_gpus():
3856
- if gpu in result:
3857
- gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))])
3880
+ if gpu in list_accelerator_counts_result:
3881
+ gpu_table.add_row([
3882
+ gpu,
3883
+ _list_to_str(list_accelerator_counts_result.pop(gpu))
3884
+ ])
3858
3885
  yield from gpu_table.get_string()
3859
3886
 
3860
3887
  # Google TPUs
3861
3888
  for tpu in catalog.get_tpus():
3862
- if tpu in result:
3863
- tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
3889
+ if tpu in list_accelerator_counts_result:
3890
+ tpu_table.add_row([
3891
+ tpu,
3892
+ _list_to_str(list_accelerator_counts_result.pop(tpu))
3893
+ ])
3864
3894
  if tpu_table.get_string():
3865
3895
  yield '\n\n'
3866
3896
  yield from tpu_table.get_string()
@@ -3868,7 +3898,7 @@ def show_gpus(
3868
3898
  # Other GPUs
3869
3899
  if show_all:
3870
3900
  yield '\n\n'
3871
- for gpu, qty in sorted(result.items()):
3901
+ for gpu, qty in sorted(list_accelerator_counts_result.items()):
3872
3902
  other_table.add_row([gpu, _list_to_str(qty)])
3873
3903
  yield from other_table.get_string()
3874
3904
  yield '\n\n'
@@ -3919,7 +3949,7 @@ def show_gpus(
3919
3949
 
3920
3950
  # For clouds other than Kubernetes, get the accelerator details
3921
3951
  # Case-sensitive
3922
- result = sdk.stream_and_get(
3952
+ list_accelerators_result = sdk.stream_and_get(
3923
3953
  sdk.list_accelerators(gpus_only=True,
3924
3954
  name_filter=name,
3925
3955
  quantity_filter=quantity,
@@ -3935,8 +3965,8 @@ def show_gpus(
3935
3965
  # - Group by cloud
3936
3966
  # - Sort within each group by prices
3937
3967
  # - Sort groups by each cloud's (min price, min spot price)
3938
- new_result = {}
3939
- for i, (gpu, items) in enumerate(result.items()):
3968
+ new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
3969
+ for i, (gpu, items) in enumerate(list_accelerators_result.items()):
3940
3970
  df = pd.DataFrame([t._asdict() for t in items])
3941
3971
  # Determine the minimum prices for each cloud.
3942
3972
  min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
@@ -3954,14 +3984,14 @@ def show_gpus(
3954
3984
  for row in df.to_records(index=False)
3955
3985
  ]
3956
3986
  new_result[gpu] = sorted_dataclasses
3957
- result = new_result
3987
+ list_accelerators_result = new_result
3958
3988
 
3959
3989
  if print_section_titles and not show_all:
3960
3990
  yield '\n\n'
3961
3991
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3962
3992
  f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
3963
3993
 
3964
- if not result:
3994
+ if not list_accelerators_result:
3965
3995
  quantity_str = (f' with requested quantity {quantity}'
3966
3996
  if quantity else '')
3967
3997
  cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3999,7 @@ def show_gpus(
3969
3999
  yield 'To show available accelerators, run: sky show-gpus --all'
3970
4000
  return
3971
4001
 
3972
- for i, (gpu, items) in enumerate(result.items()):
4002
+ for i, (gpu, items) in enumerate(list_accelerators_result.items()):
3973
4003
  accelerator_table_headers = [
3974
4004
  'GPU',
3975
4005
  'QTY',
@@ -6039,7 +6069,11 @@ def api_logs(request_id: Optional[str], server_logs: bool,
6039
6069
  if request_id is not None and log_path is not None:
6040
6070
  raise click.BadParameter(
6041
6071
  'Only one of request ID and log path can be provided.')
6042
- sdk.stream_and_get(request_id, log_path, tail)
6072
+ # Only wrap request_id when it is provided; otherwise pass None so the
6073
+ # server accepts log_path-only streaming.
6074
+ req_id = (server_common.RequestId[None](request_id)
6075
+ if request_id is not None else None)
6076
+ sdk.stream_and_get(req_id, log_path, tail, follow=follow)
6043
6077
 
6044
6078
 
6045
6079
  @api.command('cancel', cls=_DocumentedCodeCommand)