skypilot-nightly 1.0.0.dev20250828__py3-none-any.whl → 1.0.0.dev20250831__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +24 -2
- sky/backends/backend_utils.py +152 -59
- sky/backends/cloud_vm_ray_backend.py +56 -3
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +17 -6
- sky/client/common.py +5 -4
- sky/client/sdk.py +5 -0
- sky/client/sdk_async.py +8 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +8 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6dae1cd599a34def.js → webpack-6e76f636a048e145.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +58 -10
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/docker_utils.py +1 -1
- sky/provision/kubernetes/utils.py +48 -26
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/server/common.py +1 -2
- sky/server/daemons.py +6 -0
- sky/server/requests/executor.py +2 -1
- sky/server/requests/payloads.py +4 -1
- sky/server/server.py +67 -58
- sky/setup_files/dependencies.py +25 -8
- sky/setup_files/setup.py +2 -0
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +6 -0
- sky/utils/timeline.py +24 -93
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/METADATA +36 -48
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/RECORD +59 -57
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{9DW6d9jaP2kZt0NcgIfFa → FtHzmn6BMJ5PzqHhEY51g}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250828.dist-info → skypilot_nightly-1.0.0.dev20250831.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'f394ebc3a46b5d2cde5afb8a765d97709ea584ed'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250831'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/adaptors/nebius.py
CHANGED
@@ -62,6 +62,10 @@ def iam_token_path() -> str:
     return '~/.nebius/NEBIUS_IAM_TOKEN.txt'
 
 
+def domain_path() -> str:
+    return '~/.nebius/NEBIUS_DOMAIN.txt'
+
+
 def credentials_path() -> str:
     workspace_path = skypilot_config.get_workspace_cloud('nebius').get(
         'credentials_file_path', None)
@@ -82,6 +86,22 @@ def _get_default_credentials_path() -> str:
     return '~/.nebius/credentials.json'
 
 
+def api_domain() -> Optional[str]:
+    domain_in_ws_config = skypilot_config.get_workspace_cloud('nebius').get(
+        'domain', None)
+    if domain_in_ws_config is not None:
+        return domain_in_ws_config
+    domain_in_config = skypilot_config.get_effective_region_config(
+        cloud='nebius', region=None, keys=('domain',), default_value=None)
+    if domain_in_config is not None:
+        return domain_in_config
+    try:
+        with open(os.path.expanduser(domain_path()), encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None
+
+
 DEFAULT_REGION = 'eu-north1'
 
 NEBIUS_PROFILE_NAME = 'nebius'
@@ -215,10 +235,12 @@ def _sdk(token: Optional[str], cred_path: Optional[str]):
     # Exactly one of token or cred_path must be provided
     assert (token is None) != (cred_path is None), (token, cred_path)
     if token is not None:
-        return nebius.sdk.SDK(credentials=token)
+        return nebius.sdk.SDK(credentials=token, domain=api_domain())
     if cred_path is not None:
         return nebius.sdk.SDK(
-            credentials_file_name=os.path.expanduser(cred_path)
+            credentials_file_name=os.path.expanduser(cred_path),
+            domain=api_domain(),
+        )
     raise ValueError('Either token or credentials file path must be provided')
 
 
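Taken together, the new Nebius adaptor code resolves the API domain in a fixed order: the workspace-level `nebius` config, then the cloud-level `nebius` config, then the optional `~/.nebius/NEBIUS_DOMAIN.txt` file, and finally `None` (in which case the Nebius SDK presumably falls back to its built-in default). A minimal standalone sketch of that fallback chain; the dict-based config stand-ins and the example value are illustrative only, not SkyPilot or Nebius APIs:

import os
from typing import Optional

# Illustrative stand-ins for the two skypilot_config lookups used by
# api_domain(); real values would come from ~/.sky/config.yaml.
WORKSPACE_NEBIUS_CONFIG: dict = {}  # e.g. {'domain': 'api.nebius.example'}
CLOUD_NEBIUS_CONFIG: dict = {}      # e.g. {'domain': 'api.nebius.example'}
DOMAIN_FILE = '~/.nebius/NEBIUS_DOMAIN.txt'


def resolve_api_domain() -> Optional[str]:
    """Mirrors the fallback order of the new api_domain() helper."""
    domain = WORKSPACE_NEBIUS_CONFIG.get('domain')  # 1. workspace config
    if domain is not None:
        return domain
    domain = CLOUD_NEBIUS_CONFIG.get('domain')      # 2. cloud-level config
    if domain is not None:
        return domain
    try:                                            # 3. optional local file
        with open(os.path.expanduser(DOMAIN_FILE), encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        return None                                 # 4. leave it to the SDK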
sky/backends/backend_utils.py
CHANGED
@@ -928,19 +928,19 @@ def write_cluster_config(
     # Add kubernetes config fields from ~/.sky/config
     if isinstance(cloud, clouds.Kubernetes):
         cluster_config_overrides = to_provision.cluster_config_overrides
-
-
+        with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
+            tmp_yaml_str = f.read()
+        cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
+        combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
+            cluster_yaml_obj,
             cluster_config_overrides=cluster_config_overrides,
             cloud=cloud,
             context=region.name)
-
-
-            cluster_config_overrides=cluster_config_overrides,
-            context=region.name)
-        yaml_obj = yaml_utils.read_yaml(tmp_yaml_path)
-        pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
-            'ray_head_default']['node_config']
+        # Write the updated YAML back to the file
+        yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
 
+        pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
+            'ray_head_default']['node_config']
         # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
         pod_config.pop('deployment_spec', None)
         pod_config.pop('pvc_spec', None)
@@ -1409,6 +1409,62 @@ def ssh_credential_from_yaml(
     return credentials
 
 
+def ssh_credentials_from_handles(
+    handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
+            credentials['docker_user'] = docker_user
+        ssh_provider_module = config['provider']['module']
+        # If we are running ssh command on kubernetes node.
+        if 'kubernetes' in ssh_provider_module:
+            credentials['disable_control_master'] = True
+        credentials_to_return.append(credentials)
+
+    return credentials_to_return
+
+
 def parallel_data_transfer_to_nodes(
     runners: List[command_runner.CommandRunner],
     source: Optional[str],
@@ -2056,7 +2112,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             f'{output}\n', stderr)
         return (*_count_healthy_nodes_from_ray(output), output, stderr)
 
+    ray_status_details: Optional[str] = None
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+        nonlocal ray_status_details
         try:
             # NOTE: fetching the IPs is very slow as it calls into
             # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -2134,19 +2193,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 # showing up
                 time.sleep(1)
 
+            ray_status_details = (
+                f'{ready_head + ready_workers}/{total_nodes} ready')
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
                 f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
 
         except exceptions.FetchClusterInfoError:
+            ray_status_details = 'failed to get IPs'
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
         except RuntimeError as e:
+            if ray_status_details is None:
+                ray_status_details = str(e)
            logger.debug(common_utils.format_exception(e))
         except Exception as e:  # pylint: disable=broad-except
             # This can be raised by `external_ssh_ports()`, due to the
             # underlying call to kubernetes API.
+            ray_status_details = str(e)
             logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
                          exc_info=e)
             return False
@@ -2259,6 +2324,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    # If all nodes are up and ray cluster is health, we would have returned
+    # earlier. So if all_nodes_up is True and we are here, it means the ray
+    # cluster must have been unhealthy.
+    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
@@ -2269,8 +2338,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
         if some_nodes_terminated:
             init_reason = 'one or more nodes terminated'
+        elif ray_cluster_unhealthy:
+            init_reason = f'ray cluster is unhealthy ({ray_status_details})'
         elif some_nodes_not_stopped:
-            init_reason = 'some
+            init_reason = 'some but not all nodes are stopped'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
         if record['autostop'] >= 0:
@@ -2365,7 +2436,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             # Some status reason clears after a certain time (e.g. k8s events
             # are only stored for an hour by default), so it is possible that
             # the previous event has a status reason, but now it does not.
-            init_reason_regex = f'^Cluster is abnormal because
+            init_reason_regex = (f'^Cluster is abnormal because '
+                                 f'{re.escape(init_reason)}.*')
             log_message = f'Cluster is abnormal because {init_reason}'
             if status_reason:
                 log_message += f' ({status_reason})'
@@ -2385,10 +2457,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         return global_user_state.get_cluster_from_name(cluster_name)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
+    verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
-        cluster_name,
-
+        cluster_name,
+        None,
+        f'All nodes {verb}, cleaning up the cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE,
+        # This won't do anything for a terminated cluster, but it's needed for a
+        # stopped cluster.
+        nop_if_duplicate=True,
+    )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
     return global_user_state.get_cluster_from_name(cluster_name)
 
@@ -2850,6 +2929,7 @@ def get_clusters(
         refresh: common.StatusRefreshMode,
         cluster_names: Optional[Union[str, List[str]]] = None,
         all_users: bool = True,
+        include_credentials: bool = False,
         # Internal only:
         # pylint: disable=invalid-name
         _include_is_managed: bool = False,
@@ -2862,17 +2942,14 @@ def get_clusters(
     of the clusters.
 
     Args:
-        include_controller: Whether to include controllers, e.g. jobs controller
-            or sky serve controller.
         refresh: Whether to refresh the status of the clusters. (Refreshing will
             set the status to STOPPED if the cluster cannot be pinged.)
-        cloud_filter: Sets which clouds to filer through from the global user
-            state. Supports three values, 'all' for all clouds, 'public' for
-            public clouds only, and 'local' for only local clouds.
         cluster_names: If provided, only return records for the given cluster
             names.
         all_users: If True, return clusters from all users. If False, only
            return clusters from the current user.
+        include_credentials: If True, include cluster ssh credentials in the
+            return value.
        _include_is_managed: Whether to force include clusters created by the
            controller.
 
@@ -2916,54 +2993,68 @@ def get_clusters(
         logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
     records = new_records
 
-    def
-
+    def _get_records_with_handle(
+            records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        """Filter for records that have a handle"""
+        return [
+            record for record in records
+            if record is not None and record['handle'] is not None
+        ]
+
+    def _update_records_with_resources_str(
+            records: List[Optional[Dict[str, Any]]]) -> None:
+        """Add resource str to record"""
+        for record in _get_records_with_handle(records):
+            handle = record['handle']
+            record[
+                'resources_str'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+            record[
+                'resources_str_full'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=False)
+
+    def _update_records_with_credentials(
+            records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the credentials to the record.
 
         This is useful for the client side to setup the ssh config of the
         cluster.
         """
-
-
-            handle = record['handle']
-            if handle is None:
+        records_with_handle = _get_records_with_handle(records)
+        if len(records_with_handle) == 0:
             return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        auth.
-
-
-
-
-
-
-
-
-
-
-
+
+        handles = [record['handle'] for record in records_with_handle]
+        credentials = ssh_credentials_from_handles(handles)
+        cached_private_keys: Dict[str, str] = {}
+        for record, credential in zip(records_with_handle, credentials):
+            if not credential:
+                continue
+            ssh_private_key_path = credential.get('ssh_private_key', None)
+            if ssh_private_key_path is not None:
+                expanded_private_key_path = os.path.expanduser(
+                    ssh_private_key_path)
+                if not os.path.exists(expanded_private_key_path):
+                    auth.create_ssh_key_files_from_db(ssh_private_key_path)
+            else:
+                private_key_path, _ = auth.get_or_generate_keys()
+                expanded_private_key_path = os.path.expanduser(private_key_path)
+            if expanded_private_key_path in cached_private_keys:
+                credential['ssh_private_key_content'] = cached_private_keys[
+                    expanded_private_key_path]
+            else:
+                with open(expanded_private_key_path, 'r',
+                          encoding='utf-8') as f:
+                    credential['ssh_private_key_content'] = f.read()
+                cached_private_keys[expanded_private_key_path] = credential[
+                    'ssh_private_key_content']
+            record['credentials'] = credential
 
     def _update_records_with_resources(
             records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the resources to the record."""
-        for record in records:
-            if record is None:
-                continue
+        for record in _get_records_with_handle(records):
             handle = record['handle']
-            if handle is None:
-                continue
             record['nodes'] = handle.launched_nodes
             if handle.launched_resources is None:
                 continue
@@ -2980,9 +3071,9 @@ def get_clusters(
                 if handle.launched_resources.accelerators else None)
 
     # Add auth_config to the records
-
-
-
+    _update_records_with_resources_str(records)
+    if include_credentials:
+        _update_records_with_credentials(records)
     if refresh == common.StatusRefreshMode.NONE:
         # Add resources to the records
         _update_records_with_resources(records)
@@ -3022,7 +3113,9 @@ def get_clusters(
                 cluster_name,
                 force_refresh_statuses=force_refresh_statuses,
                 acquire_per_cluster_status_lock=True)
-
+            _update_records_with_resources_str([record])
+            if include_credentials:
+                _update_records_with_credentials([record])
         except (exceptions.ClusterStatusFetchingError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -65,6 +65,7 @@ from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import directory_utils
 from sky.utils import env_options
+from sky.utils import lock_events
 from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -2498,7 +2499,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.stable_internal_external_ips = stable_internal_external_ips
 
     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -2854,7 +2860,12 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
         self.is_grpc_enabled = False
 
     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -3112,7 +3123,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         retry_until_up: bool = False,
         skip_unnecessary_provisioning: bool = False,
     ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
-        with
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
             # Try to launch the exiting cluster first. If no existing
             # cluster, this function will create a to_provision_config
             # with required resources.
@@ -5141,6 +5157,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Take a random resource in order to get resource info that applies
         # to all resources.
         one_task_resource = list(task.resources)[0]
+
         # Assume resources share the same ports.
         for resource in task.resources:
             assert resource.ports == one_task_resource.ports
@@ -5181,6 +5198,42 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         if one_task_resource.docker_login_config is not None:
             to_provision = to_provision.copy(
                 _docker_login_config=one_task_resource.docker_login_config)
+
+        # cluster_config_overrides should be the same for all resources.
+        for resource in task.resources:
+            assert (resource.cluster_config_overrides ==
+                    one_task_resource.cluster_config_overrides)
+        if isinstance(to_provision.cloud, clouds.Kubernetes):
+            # Warn users if the Kubernetes pod config is different
+            # from the existing cluster.
+            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+                cluster_name)
+            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
+            desired_cluster_yaml_obj = (
+                kubernetes_utils.combine_pod_config_fields_and_metadata(
+                    actual_cluster_yaml_obj,
+                    cluster_config_overrides=one_task_resource.
+                    cluster_config_overrides,
+                    cloud=to_provision.cloud,
+                    context=to_provision.region))
+
+            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+                return (yaml_obj.get('available_node_types',
+                                     {}).get('ray_head_default',
+                                             {}).get('node_config', {}))
+
+            if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
+                    actual_cluster_yaml_obj):
+                # pylint: disable=line-too-long
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
+                    f'pod config than the existing cluster. The existing '
+                    f'cluster will be used with its current pod config.'
+                    f'To apply use your task\'s new pod config:\n'
+                    f'  • Use a new cluster'
+                    f'  • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
+                    f'{colorama.Style.RESET_ALL}')
+
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
sky/backends/wheel_utils.py
CHANGED
@@ -16,6 +16,7 @@ import pathlib
 import re
 import shutil
 import subprocess
+import sys
 import tempfile
 from typing import Optional, Tuple
 
@@ -133,19 +134,45 @@ def _build_sky_wheel() -> pathlib.Path:
     # It is important to normalize the path, otherwise 'pip wheel' would
     # treat the directory as a file and generate an empty wheel.
     norm_path = str(tmp_dir) + os.sep
+    # TODO(#5046): Consider adding native UV support for building wheels.
+    # Use `python -m pip` instead of `pip3` for better compatibility across
+    # different environments (conda, venv, UV, system Python, etc.)
     try:
-        # TODO(suquark): For python>=3.7, 'subprocess.run' supports capture
-        # of the output.
         subprocess.run([
-            '
+            sys.executable, '-m', 'pip', 'wheel', '--no-deps', norm_path,
+            '--wheel-dir',
             str(tmp_dir)
         ],
-
-
-
+                       capture_output=True,
+                       check=True,
+                       text=True)
     except subprocess.CalledProcessError as e:
-
-
+        error_msg = e.stderr
+        if 'No module named pip' in error_msg:
+            # pip module not found - provide helpful suggestions based on
+            # the available package managers
+            if shutil.which('uv'):
+                msg = ('pip module not found. Since you have UV installed, '
+                       'you can install pip by running:\n'
+                       '  uv pip install pip')
+            elif shutil.which('conda'):
+                msg = (
+                    'pip module not found. Since you have conda installed, '
+                    'you can install pip by running:\n'
+                    '  conda install pip')
+            else:
+                msg = ('pip module not found. Please install pip for your '
+                       f'Python environment ({sys.executable}).')
+        else:
+            # Other pip errors
+            msg = f'pip wheel command failed. Error: {error_msg}'
+        raise RuntimeError('Failed to build pip wheel for SkyPilot.\n' +
+                           msg) from e
+    except FileNotFoundError as e:
+        # Python executable not found (extremely rare)
+        raise RuntimeError(
+            f'Failed to build pip wheel for SkyPilot. '
+            f'Python executable not found: {sys.executable}') from e
 
     try:
         wheel_path = next(tmp_dir.glob(_WHEEL_PATTERN))
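The substance of this change is that the wheel is now built with the interpreter that is running SkyPilot (`sys.executable -m pip`) instead of whatever `pip3` happens to be on PATH, which keeps the build inside the active conda/venv/UV environment. A minimal sketch of that invocation pattern, using `pip --version` as a harmless stand-in for the real `wheel` command:

import subprocess
import sys

# Run pip as a module of the *current* interpreter, so the same Python
# environment that imports sky is the one whose pip gets invoked.
result = subprocess.run(
    [sys.executable, '-m', 'pip', '--version'],
    capture_output=True,
    check=True,
    text=True)
print(result.stdout.strip())  # e.g. 'pip 24.x from ... (python 3.x)'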
sky/client/cli/command.py
CHANGED
@@ -143,7 +143,10 @@ def _get_cluster_records_and_set_ssh_config(
     # TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
     if clusters is not None:
         all_users = True
-    request_id = sdk.status(clusters,
+    request_id = sdk.status(clusters,
+                            refresh=refresh,
+                            all_users=all_users,
+                            _include_credentials=True)
     cluster_records = sdk.stream_and_get(request_id)
     # Update the SSH config for all clusters
     for record in cluster_records:
@@ -1655,7 +1658,9 @@ def _show_endpoint(query_clusters: Optional[List[str]],
         return
 
 
-def _show_enabled_infra(
+def _show_enabled_infra(
+        active_workspace: str, show_workspace: bool,
+        enabled_clouds_request_id: server_common.RequestId[List[str]]):
     """Show the enabled infrastructure."""
     workspace_str = ''
     if show_workspace:
@@ -1663,8 +1668,7 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
     title = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra'
              f'{workspace_str}:'
             f'{colorama.Style.RESET_ALL} ')
-    all_infras = sdk.get(
-        sdk.enabled_clouds(workspace=active_workspace, expand=True))
+    all_infras = sdk.get(enabled_clouds_request_id)
     click.echo(f'{title}{", ".join(all_infras)}\n')
 
 
@@ -1878,6 +1882,11 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                     f'{colorama.Style.RESET_ALL}')
         return None
 
+    active_workspace = skypilot_config.get_active_workspace()
+
+    def submit_enabled_clouds():
+        return sdk.enabled_clouds(workspace=active_workspace, expand=True)
+
     managed_jobs_queue_request_id = None
     service_status_request_id = None
     workspace_request_id = None
@@ -1893,6 +1902,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             pools_request_future = executor.submit(submit_pools)
         if not (ip or show_endpoints):
             workspace_request_future = executor.submit(submit_workspace)
+            enabled_clouds_request_future = executor.submit(submit_enabled_clouds)
 
         # Get the request IDs
         if show_managed_jobs:
@@ -1903,6 +1913,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             pool_status_request_id = pools_request_future.result()
         if not (ip or show_endpoints):
             workspace_request_id = workspace_request_future.result()
+            enabled_clouds_request_id = enabled_clouds_request_future.result()
 
     managed_jobs_queue_request_id = (server_common.RequestId()
                                      if not managed_jobs_queue_request_id else
@@ -1937,9 +1948,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
            all_workspaces = sdk.get(workspace_request_id)
        else:
            all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
-        active_workspace = skypilot_config.get_active_workspace()
        show_workspace = len(all_workspaces) > 1
-        _show_enabled_infra(active_workspace, show_workspace
+        _show_enabled_infra(active_workspace, show_workspace,
+                            enabled_clouds_request_id)
        click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
                   f'{colorama.Style.RESET_ALL}')
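The CLI changes above follow the pattern already used by `sky status`: SDK calls such as `sdk.enabled_clouds()` only submit a request to the API server and return a request ID, so submissions can be fired from worker threads up front and the blocking `sdk.get()` deferred until the value is needed. A rough sketch of that pattern under those assumptions; the `sdk` parameter stands in for `sky.client.sdk`:

from concurrent.futures import ThreadPoolExecutor


def enabled_infra(sdk, active_workspace: str):
    """Submit the enabled-clouds request early, fetch its result late."""
    with ThreadPoolExecutor(max_workers=1) as executor:
        # Submission is cheap: it returns a request ID, not the infra list.
        future = executor.submit(sdk.enabled_clouds,
                                 workspace=active_workspace,
                                 expand=True)
        request_id = future.result()
    # ... other CLI work could happen here before the result is needed ...
    return sdk.get(request_id)  # blocks until the server-side request finishes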
sky/client/common.py
CHANGED
@@ -82,10 +82,11 @@ def download_logs_from_api_server(
             local_machine_prefix) for remote_path in paths_on_api_server
     }
     body = payloads.DownloadBody(folder_paths=list(paths_on_api_server),)
-    response =
-
-
-
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/download',
+        json=json.loads(body.model_dump_json()),
+        stream=True)
     if response.status_code == 200:
         remote_home_path = response.headers.get('X-Home-Path')
         assert remote_home_path is not None, response.headers