skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +24 -2
  3. sky/backends/backend_utils.py +152 -59
  4. sky/backends/cloud_vm_ray_backend.py +56 -3
  5. sky/backends/wheel_utils.py +35 -8
  6. sky/client/cli/command.py +17 -6
  7. sky/client/common.py +5 -4
  8. sky/client/sdk.py +5 -0
  9. sky/client/sdk_async.py +8 -2
  10. sky/clouds/aws.py +118 -1
  11. sky/core.py +8 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +58 -10
  30. sky/provision/aws/config.py +78 -3
  31. sky/provision/aws/instance.py +45 -6
  32. sky/provision/docker_utils.py +1 -1
  33. sky/provision/kubernetes/utils.py +48 -26
  34. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  35. sky/server/common.py +1 -2
  36. sky/server/daemons.py +6 -0
  37. sky/server/requests/executor.py +2 -1
  38. sky/server/requests/payloads.py +4 -1
  39. sky/server/server.py +67 -58
  40. sky/setup_files/dependencies.py +25 -8
  41. sky/setup_files/setup.py +2 -0
  42. sky/sky_logging.py +28 -0
  43. sky/skylet/constants.py +6 -0
  44. sky/templates/aws-ray.yml.j2 +1 -0
  45. sky/utils/annotations.py +8 -2
  46. sky/utils/cluster_utils.py +3 -3
  47. sky/utils/db/migration_utils.py +1 -1
  48. sky/utils/kubernetes_enums.py +1 -0
  49. sky/utils/lock_events.py +94 -0
  50. sky/utils/schemas.py +6 -0
  51. sky/utils/timeline.py +24 -93
  52. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
  53. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
  54. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'ff93214498e29e0aa9a73868b73613535f96b8a3'
+_SKYPILOT_COMMIT_SHA = 'f394ebc3a46b5d2cde5afb8a765d97709ea584ed'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250828'
+__version__ = '1.0.0.dev20250831'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/adaptors/nebius.py CHANGED
@@ -62,6 +62,10 @@ def iam_token_path() -> str:
     return '~/.nebius/NEBIUS_IAM_TOKEN.txt'
 
 
+def domain_path() -> str:
+    return '~/.nebius/NEBIUS_DOMAIN.txt'
+
+
 def credentials_path() -> str:
     workspace_path = skypilot_config.get_workspace_cloud('nebius').get(
         'credentials_file_path', None)
@@ -82,6 +86,22 @@ def _get_default_credentials_path() -> str:
     return '~/.nebius/credentials.json'
 
 
+def api_domain() -> Optional[str]:
+    domain_in_ws_config = skypilot_config.get_workspace_cloud('nebius').get(
+        'domain', None)
+    if domain_in_ws_config is not None:
+        return domain_in_ws_config
+    domain_in_config = skypilot_config.get_effective_region_config(
+        cloud='nebius', region=None, keys=('domain',), default_value=None)
+    if domain_in_config is not None:
+        return domain_in_config
+    try:
+        with open(os.path.expanduser(domain_path()), encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None
+
+
 DEFAULT_REGION = 'eu-north1'
 
 NEBIUS_PROFILE_NAME = 'nebius'
@@ -215,10 +235,12 @@ def _sdk(token: Optional[str], cred_path: Optional[str]):
     # Exactly one of token or cred_path must be provided
     assert (token is None) != (cred_path is None), (token, cred_path)
     if token is not None:
-        return nebius.sdk.SDK(credentials=token)
+        return nebius.sdk.SDK(credentials=token, domain=api_domain())
     if cred_path is not None:
-        return nebius.sdk.SDK(
-            credentials_file_name=os.path.expanduser(cred_path))
+        return nebius.sdk.SDK(
+            credentials_file_name=os.path.expanduser(cred_path),
+            domain=api_domain(),
+        )
     raise ValueError('Either token or credentials file path must be provided')
 
 
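The new api_domain() helper resolves the Nebius API domain in a fixed order: workspace-level config, then the effective region config, then the ~/.nebius/NEBIUS_DOMAIN.txt file, and finally None (letting the SDK use its default endpoint). Below is a minimal illustrative sketch of that precedence, not part of the diff; load_workspace_domain and load_global_domain are hypothetical stand-ins for the skypilot_config lookups used above.

import os
from typing import Callable, Optional

# Same fallback file as the adaptor change above.
DOMAIN_FILE = '~/.nebius/NEBIUS_DOMAIN.txt'


def resolve_api_domain(
        load_workspace_domain: Callable[[], Optional[str]],
        load_global_domain: Callable[[], Optional[str]]) -> Optional[str]:
    """First non-None source wins: workspace config > global config > file."""
    for loader in (load_workspace_domain, load_global_domain):
        domain = loader()
        if domain is not None:
            return domain
    try:
        with open(os.path.expanduser(DOMAIN_FILE), encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        return None  # lets the SDK fall back to its default endpoint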
sky/backends/backend_utils.py CHANGED
@@ -928,19 +928,19 @@ def write_cluster_config(
     # Add kubernetes config fields from ~/.sky/config
     if isinstance(cloud, clouds.Kubernetes):
         cluster_config_overrides = to_provision.cluster_config_overrides
-        kubernetes_utils.combine_pod_config_fields(
-            tmp_yaml_path,
+        with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
+            tmp_yaml_str = f.read()
+        cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
+        combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
+            cluster_yaml_obj,
             cluster_config_overrides=cluster_config_overrides,
             cloud=cloud,
             context=region.name)
-        kubernetes_utils.combine_metadata_fields(
-            tmp_yaml_path,
-            cluster_config_overrides=cluster_config_overrides,
-            context=region.name)
-        yaml_obj = yaml_utils.read_yaml(tmp_yaml_path)
-        pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
-            'ray_head_default']['node_config']
+        # Write the updated YAML back to the file
+        yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
 
+        pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
+            'ray_head_default']['node_config']
         # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
         pod_config.pop('deployment_spec', None)
         pod_config.pop('pvc_spec', None)
@@ -1409,6 +1409,62 @@ def ssh_credential_from_yaml(
     return credentials
 
 
+def ssh_credentials_from_handles(
+    handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
+            credentials['docker_user'] = docker_user
+        ssh_provider_module = config['provider']['module']
+        # If we are running ssh command on kubernetes node.
+        if 'kubernetes' in ssh_provider_module:
+            credentials['disable_control_master'] = True
+        credentials_to_return.append(credentials)
+
+    return credentials_to_return
+
+
 def parallel_data_transfer_to_nodes(
     runners: List[command_runner.CommandRunner],
     source: Optional[str],
@@ -2056,7 +2112,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 f'{output}\n', stderr)
         return (*_count_healthy_nodes_from_ray(output), output, stderr)
 
+    ray_status_details: Optional[str] = None
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+        nonlocal ray_status_details
         try:
             # NOTE: fetching the IPs is very slow as it calls into
             # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -2134,19 +2193,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 # showing up
                 time.sleep(1)
 
+            ray_status_details = (
+                f'{ready_head + ready_workers}/{total_nodes} ready')
            raise RuntimeError(
                f'Refreshing status ({cluster_name!r}): ray status not showing '
                f'all nodes ({ready_head + ready_workers}/'
                f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
 
        except exceptions.FetchClusterInfoError:
+            ray_status_details = 'failed to get IPs'
            logger.debug(
                f'Refreshing status ({cluster_name!r}) failed to get IPs.')
        except RuntimeError as e:
+            if ray_status_details is None:
+                ray_status_details = str(e)
            logger.debug(common_utils.format_exception(e))
        except Exception as e:  # pylint: disable=broad-except
            # This can be raised by `external_ssh_ports()`, due to the
            # underlying call to kubernetes API.
+            ray_status_details = str(e)
            logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
                         exc_info=e)
        return False
@@ -2259,6 +2324,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
    # (2) Otherwise, we will reset the autostop setting, unless the cluster is
    # autostopping/autodowning.
    some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    # If all nodes are up and ray cluster is health, we would have returned
+    # earlier. So if all_nodes_up is True and we are here, it means the ray
+    # cluster must have been unhealthy.
+    ray_cluster_unhealthy = all_nodes_up
    some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                 for status in node_statuses)
    is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
@@ -2269,8 +2338,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
        if some_nodes_terminated:
            init_reason = 'one or more nodes terminated'
+        elif ray_cluster_unhealthy:
+            init_reason = f'ray cluster is unhealthy ({ray_status_details})'
        elif some_nodes_not_stopped:
-            init_reason = 'some nodes are up and some nodes are stopped'
+            init_reason = 'some but not all nodes are stopped'
        logger.debug('The cluster is abnormal. Setting to INIT status. '
                     f'node_statuses: {node_statuses}')
        if record['autostop'] >= 0:
@@ -2365,7 +2436,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
        # Some status reason clears after a certain time (e.g. k8s events
        # are only stored for an hour by default), so it is possible that
        # the previous event has a status reason, but now it does not.
-        init_reason_regex = f'^Cluster is abnormal because {init_reason}.*'
+        init_reason_regex = (f'^Cluster is abnormal because '
+                             f'{re.escape(init_reason)}.*')
        log_message = f'Cluster is abnormal because {init_reason}'
        if status_reason:
            log_message += f' ({status_reason})'
@@ -2385,10 +2457,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
        return global_user_state.get_cluster_from_name(cluster_name)
    # Now is_abnormal is False: either node_statuses is empty or all nodes are
    # STOPPED.
+    verb = 'terminated' if to_terminate else 'stopped'
    backend = backends.CloudVmRayBackend()
    global_user_state.add_cluster_event(
-        cluster_name, None, 'All nodes terminated, cleaning up the cluster.',
-        global_user_state.ClusterEventType.STATUS_CHANGE)
+        cluster_name,
+        None,
+        f'All nodes {verb}, cleaning up the cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE,
+        # This won't do anything for a terminated cluster, but it's needed for a
+        # stopped cluster.
+        nop_if_duplicate=True,
+    )
    backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
    return global_user_state.get_cluster_from_name(cluster_name)
 
@@ -2850,6 +2929,7 @@ def get_clusters(
    refresh: common.StatusRefreshMode,
    cluster_names: Optional[Union[str, List[str]]] = None,
    all_users: bool = True,
+    include_credentials: bool = False,
    # Internal only:
    # pylint: disable=invalid-name
    _include_is_managed: bool = False,
@@ -2862,17 +2942,14 @@ def get_clusters(
    of the clusters.
 
    Args:
-        include_controller: Whether to include controllers, e.g. jobs controller
-            or sky serve controller.
        refresh: Whether to refresh the status of the clusters. (Refreshing will
            set the status to STOPPED if the cluster cannot be pinged.)
-        cloud_filter: Sets which clouds to filer through from the global user
-            state. Supports three values, 'all' for all clouds, 'public' for
-            public clouds only, and 'local' for only local clouds.
        cluster_names: If provided, only return records for the given cluster
            names.
        all_users: If True, return clusters from all users. If False, only
            return clusters from the current user.
+        include_credentials: If True, include cluster ssh credentials in the
+            return value.
        _include_is_managed: Whether to force include clusters created by the
            controller.
 
@@ -2916,54 +2993,68 @@ def get_clusters(
        logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
        records = new_records
 
-    def _update_record_with_credentials_and_resources_str(
-            record: Optional[Dict[str, Any]]) -> None:
+    def _get_records_with_handle(
+            records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        """Filter for records that have a handle"""
+        return [
+            record for record in records
+            if record is not None and record['handle'] is not None
+        ]
+
+    def _update_records_with_resources_str(
+            records: List[Optional[Dict[str, Any]]]) -> None:
+        """Add resource str to record"""
+        for record in _get_records_with_handle(records):
+            handle = record['handle']
+            record[
+                'resources_str'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+            record[
+                'resources_str_full'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=False)
+
+    def _update_records_with_credentials(
+            records: List[Optional[Dict[str, Any]]]) -> None:
        """Add the credentials to the record.
 
        This is useful for the client side to setup the ssh config of the
        cluster.
        """
-        if record is None:
-            return
-        handle = record['handle']
-        if handle is None:
+        records_with_handle = _get_records_with_handle(records)
+        if len(records_with_handle) == 0:
            return
-        record['resources_str'] = resources_utils.get_readable_resources_repr(
-            handle, simplify=True)
-        record[
-            'resources_str_full'] = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-        credentials = ssh_credential_from_yaml(handle.cluster_yaml,
-                                               handle.docker_user,
-                                               handle.ssh_user)
-
-        if not credentials:
-            return
-        ssh_private_key_path = credentials.get('ssh_private_key', None)
-        if ssh_private_key_path is not None:
-            if not os.path.exists(os.path.expanduser(ssh_private_key_path)):
-                auth.create_ssh_key_files_from_db(ssh_private_key_path)
-            with open(os.path.expanduser(ssh_private_key_path),
-                      'r',
-                      encoding='utf-8') as f:
-                credentials['ssh_private_key_content'] = f.read()
-        else:
-            private_key_path, _ = auth.get_or_generate_keys()
-            with open(os.path.expanduser(private_key_path),
-                      'r',
-                      encoding='utf-8') as f:
-                credentials['ssh_private_key_content'] = f.read()
-        record['credentials'] = credentials
+
+        handles = [record['handle'] for record in records_with_handle]
+        credentials = ssh_credentials_from_handles(handles)
+        cached_private_keys: Dict[str, str] = {}
+        for record, credential in zip(records_with_handle, credentials):
+            if not credential:
+                continue
+            ssh_private_key_path = credential.get('ssh_private_key', None)
+            if ssh_private_key_path is not None:
+                expanded_private_key_path = os.path.expanduser(
+                    ssh_private_key_path)
+                if not os.path.exists(expanded_private_key_path):
+                    auth.create_ssh_key_files_from_db(ssh_private_key_path)
+            else:
+                private_key_path, _ = auth.get_or_generate_keys()
+                expanded_private_key_path = os.path.expanduser(private_key_path)
+            if expanded_private_key_path in cached_private_keys:
+                credential['ssh_private_key_content'] = cached_private_keys[
+                    expanded_private_key_path]
+            else:
+                with open(expanded_private_key_path, 'r',
+                          encoding='utf-8') as f:
+                    credential['ssh_private_key_content'] = f.read()
+                cached_private_keys[expanded_private_key_path] = credential[
+                    'ssh_private_key_content']
+            record['credentials'] = credential
 
    def _update_records_with_resources(
            records: List[Optional[Dict[str, Any]]]) -> None:
        """Add the resources to the record."""
-        for record in records:
-            if record is None:
-                continue
+        for record in _get_records_with_handle(records):
            handle = record['handle']
-            if handle is None:
-                continue
            record['nodes'] = handle.launched_nodes
            if handle.launched_resources is None:
                continue
@@ -2980,9 +3071,9 @@ def get_clusters(
                 if handle.launched_resources.accelerators else None)
 
    # Add auth_config to the records
-    for record in records:
-        _update_record_with_credentials_and_resources_str(record)
-
+    _update_records_with_resources_str(records)
+    if include_credentials:
+        _update_records_with_credentials(records)
    if refresh == common.StatusRefreshMode.NONE:
        # Add resources to the records
        _update_records_with_resources(records)
@@ -3022,7 +3113,9 @@ def get_clusters(
                cluster_name,
                force_refresh_statuses=force_refresh_statuses,
                acquire_per_cluster_status_lock=True)
-            _update_record_with_credentials_and_resources_str(record)
+            _update_records_with_resources_str([record])
+            if include_credentials:
+                _update_records_with_credentials([record])
        except (exceptions.ClusterStatusFetchingError,
                exceptions.CloudUserIdentityError,
                exceptions.ClusterOwnerIdentityMismatchError) as e:
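The main performance idea in the refactor above is batching: ssh_credentials_from_handles loads all cluster YAMLs with a single get_cluster_yaml_dict_multiple call, and _update_records_with_credentials reads each private-key file at most once per get_clusters call. A minimal illustrative sketch of that read-once caching pattern follows; it is not part of the diff and the function name is hypothetical, not a SkyPilot API.

from typing import Dict, List


def read_files_once(paths: List[str]) -> List[str]:
    """Return the content for each path, reading every distinct file once."""
    cache: Dict[str, str] = {}
    contents: List[str] = []
    for path in paths:
        if path not in cache:
            with open(path, 'r', encoding='utf-8') as f:
                cache[path] = f.read()
        contents.append(cache[path])
    return contents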
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -65,6 +65,7 @@ from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import directory_utils
 from sky.utils import env_options
+from sky.utils import lock_events
 from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -2498,7 +2499,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
        self.stable_internal_external_ips = stable_internal_external_ips
 
    @context_utils.cancellation_guard
-    @annotations.lru_cache(scope='global')
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
    @timeline.event
    def get_command_runners(self,
                            force_cached: bool = False,
@@ -2854,7 +2860,12 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
        self.is_grpc_enabled = False
 
    @context_utils.cancellation_guard
-    @annotations.lru_cache(scope='global')
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
    @timeline.event
    def get_command_runners(self,
                            force_cached: bool = False,
@@ -3112,7 +3123,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        retry_until_up: bool = False,
        skip_unnecessary_provisioning: bool = False,
    ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
-        with timeline.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
            # Try to launch the exiting cluster first. If no existing
            # cluster, this function will create a to_provision_config
            # with required resources.
@@ -5141,6 +5157,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        # Take a random resource in order to get resource info that applies
        # to all resources.
        one_task_resource = list(task.resources)[0]
+
        # Assume resources share the same ports.
        for resource in task.resources:
            assert resource.ports == one_task_resource.ports
@@ -5181,6 +5198,42 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        if one_task_resource.docker_login_config is not None:
            to_provision = to_provision.copy(
                _docker_login_config=one_task_resource.docker_login_config)
+
+        # cluster_config_overrides should be the same for all resources.
+        for resource in task.resources:
+            assert (resource.cluster_config_overrides ==
+                    one_task_resource.cluster_config_overrides)
+        if isinstance(to_provision.cloud, clouds.Kubernetes):
+            # Warn users if the Kubernetes pod config is different
+            # from the existing cluster.
+            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+                cluster_name)
+            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+            desired_cluster_yaml_obj = (
+                kubernetes_utils.combine_pod_config_fields_and_metadata(
+                    actual_cluster_yaml_obj,
+                    cluster_config_overrides=one_task_resource.
+                    cluster_config_overrides,
+                    cloud=to_provision.cloud,
+                    context=to_provision.region))
+
+            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+                return (yaml_obj.get('available_node_types',
+                                     {}).get('ray_head_default',
+                                             {}).get('node_config', {}))
+
+            if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+                    actual_cluster_yaml_obj):
+                # pylint: disable=line-too-long
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+                    f'pod config than the existing cluster. The existing '
+                    f'cluster will be used with its current pod config.'
+                    f'To apply use your task\'s new pod config:\n'
+                    f'  • Use a new cluster'
+                    f'  • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+                    f'{colorama.Style.RESET_ALL}')
+
        return RetryingVmProvisioner.ToProvisionConfig(
            cluster_name,
            to_provision,
sky/backends/wheel_utils.py CHANGED
@@ -16,6 +16,7 @@ import pathlib
 import re
 import shutil
 import subprocess
+import sys
 import tempfile
 from typing import Optional, Tuple
 
@@ -133,19 +134,45 @@ def _build_sky_wheel() -> pathlib.Path:
    # It is important to normalize the path, otherwise 'pip wheel' would
    # treat the directory as a file and generate an empty wheel.
    norm_path = str(tmp_dir) + os.sep
+    # TODO(#5046): Consider adding native UV support for building wheels.
+    # Use `python -m pip` instead of `pip3` for better compatibility across
+    # different environments (conda, venv, UV, system Python, etc.)
    try:
-        # TODO(suquark): For python>=3.7, 'subprocess.run' supports capture
-        # of the output.
        subprocess.run([
-            'pip3', 'wheel', '--no-deps', norm_path, '--wheel-dir',
+            sys.executable, '-m', 'pip', 'wheel', '--no-deps', norm_path,
+            '--wheel-dir',
            str(tmp_dir)
        ],
-                       stdout=subprocess.DEVNULL,
-                       stderr=subprocess.PIPE,
-                       check=True)
+                       capture_output=True,
+                       check=True,
+                       text=True)
    except subprocess.CalledProcessError as e:
-        raise RuntimeError('Failed to build pip wheel for SkyPilot. '
-                           f'Error message: {e.stderr.decode()}') from e
+        error_msg = e.stderr
+        if 'No module named pip' in error_msg:
+            # pip module not found - provide helpful suggestions based on
+            # the available package managers
+            if shutil.which('uv'):
+                msg = ('pip module not found. Since you have UV installed, '
+                       'you can install pip by running:\n'
+                       '  uv pip install pip')
+            elif shutil.which('conda'):
+                msg = (
+                    'pip module not found. Since you have conda installed, '
+                    'you can install pip by running:\n'
+                    '  conda install pip')
+            else:
+                msg = ('pip module not found. Please install pip for your '
+                       f'Python environment ({sys.executable}).')
+        else:
+            # Other pip errors
+            msg = f'pip wheel command failed. Error: {error_msg}'
+        raise RuntimeError('Failed to build pip wheel for SkyPilot.\n' +
+                           msg) from e
+    except FileNotFoundError as e:
+        # Python executable not found (extremely rare)
+        raise RuntimeError(
+            f'Failed to build pip wheel for SkyPilot. '
+            f'Python executable not found: {sys.executable}') from e
 
    try:
        wheel_path = next(tmp_dir.glob(_WHEEL_PATTERN))
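The wheel-building change swaps the bare pip3 binary for sys.executable -m pip, so the wheel is always built with the pip that belongs to the running interpreter (conda, venv, uv-managed, or system Python). A minimal standalone sketch of that invocation pattern, not part of the diff and with an illustrative function name:

import subprocess
import sys


def build_wheel(package_dir: str, out_dir: str) -> None:
    """Build a wheel using the pip of the current interpreter."""
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'wheel', '--no-deps', package_dir,
         '--wheel-dir', out_dir],
        capture_output=True,
        text=True,
        check=False)
    if result.returncode != 0:
        # Surface pip's stderr, e.g. 'No module named pip' in stripped-down envs.
        raise RuntimeError(f'pip wheel failed:\n{result.stderr}')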
sky/client/cli/command.py CHANGED
@@ -143,7 +143,10 @@ def _get_cluster_records_and_set_ssh_config(
    # TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
    if clusters is not None:
        all_users = True
-    request_id = sdk.status(clusters, refresh=refresh, all_users=all_users)
+    request_id = sdk.status(clusters,
+                            refresh=refresh,
+                            all_users=all_users,
+                            _include_credentials=True)
    cluster_records = sdk.stream_and_get(request_id)
    # Update the SSH config for all clusters
    for record in cluster_records:
@@ -1655,7 +1658,9 @@ def _show_endpoint(query_clusters: Optional[List[str]],
    return
 
 
-def _show_enabled_infra(active_workspace: str, show_workspace: bool):
+def _show_enabled_infra(
+        active_workspace: str, show_workspace: bool,
+        enabled_clouds_request_id: server_common.RequestId[List[str]]):
    """Show the enabled infrastructure."""
    workspace_str = ''
    if show_workspace:
@@ -1663,8 +1668,7 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
    title = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra'
             f'{workspace_str}:'
             f'{colorama.Style.RESET_ALL} ')
-    all_infras = sdk.get(
-        sdk.enabled_clouds(workspace=active_workspace, expand=True))
+    all_infras = sdk.get(enabled_clouds_request_id)
    click.echo(f'{title}{", ".join(all_infras)}\n')
 
 
@@ -1878,6 +1882,11 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                  f'{colorama.Style.RESET_ALL}')
        return None
 
+    active_workspace = skypilot_config.get_active_workspace()
+
+    def submit_enabled_clouds():
+        return sdk.enabled_clouds(workspace=active_workspace, expand=True)
+
    managed_jobs_queue_request_id = None
    service_status_request_id = None
    workspace_request_id = None
@@ -1893,6 +1902,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
        pools_request_future = executor.submit(submit_pools)
        if not (ip or show_endpoints):
            workspace_request_future = executor.submit(submit_workspace)
+            enabled_clouds_request_future = executor.submit(submit_enabled_clouds)
 
    # Get the request IDs
    if show_managed_jobs:
@@ -1903,6 +1913,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
        pool_status_request_id = pools_request_future.result()
    if not (ip or show_endpoints):
        workspace_request_id = workspace_request_future.result()
+        enabled_clouds_request_id = enabled_clouds_request_future.result()
 
    managed_jobs_queue_request_id = (server_common.RequestId()
                                     if not managed_jobs_queue_request_id else
@@ -1937,9 +1948,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
            all_workspaces = sdk.get(workspace_request_id)
        else:
            all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
-        active_workspace = skypilot_config.get_active_workspace()
        show_workspace = len(all_workspaces) > 1
-        _show_enabled_infra(active_workspace, show_workspace)
+        _show_enabled_infra(active_workspace, show_workspace,
+                            enabled_clouds_request_id)
        click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
                   f'{colorama.Style.RESET_ALL}')
 
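The sky status change adds one more request (enabled_clouds) to the batch the CLI already submits in parallel, so the extra round trip overlaps with the other queries instead of adding latency. The general pattern, sketched with concurrent.futures (not part of the diff; the submit callables and function name are illustrative):

from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Sequence


def submit_all(submitters: Sequence[Callable[[], str]]) -> List[str]:
    """Submit every request concurrently and collect the request IDs in order."""
    with ThreadPoolExecutor(max_workers=max(1, len(submitters))) as executor:
        futures = [executor.submit(fn) for fn in submitters]
        return [future.result() for future in futures]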
sky/client/common.py CHANGED
@@ -82,10 +82,11 @@ def download_logs_from_api_server(
            local_machine_prefix) for remote_path in paths_on_api_server
    }
    body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
-    response = requests.post(f'{server_common.get_server_url()}/download',
-                             json=json.loads(body.model_dump_json()),
-                             stream=True,
-                             cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/download',
+        json=json.loads(body.model_dump_json()),
+        stream=True)
    if response.status_code == 200:
        remote_home_path = response.headers.get('X-Home-Path')
        assert remote_home_path is not None, response.headers