skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic (see the registry page for details).
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +200 -78
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +224 -38
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -11
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +12 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +2 -3
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED

@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '<previous commit SHA>'
+_SKYPILOT_COMMIT_SHA = 'c5a7c4995b9a92ce1c005ad783d2725c7f7f9af2'

@@ -37,7 +37,7 @@ def _get_git_commit():
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250919'
+__version__ = '1.0.0.dev20250925'
 __root_dir__ = directory_utils.get_sky_dir()
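Both identifiers are baked into sky/__init__.py at wheel-build time, so the installed nightly can be verified at runtime. A minimal check; the expected values below are the ones written by this build:

    import sky

    # Both attributes are set at wheel-build time, as shown in the diff above.
    print(sky.__version__)  # expected: '1.0.0.dev20250925'
    print(sky.__commit__)   # expected: the baked-in commit SHA shown above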
sky/backends/backend.py
CHANGED

@@ -95,6 +95,12 @@ class Backend(Generic[_ResourceHandleType]):
                      envs_and_secrets: Dict[str, str]) -> None:
         return self._sync_workdir(handle, workdir, envs_and_secrets)

+    @timeline.event
+    @usage_lib.messages.usage.update_runtime('download_file')
+    def download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                      remote_file_path: str) -> None:
+        return self._download_file(handle, local_file_path, remote_file_path)
+
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
     def sync_file_mounts(

@@ -172,6 +178,10 @@ class Backend(Generic[_ResourceHandleType]):
                       envs_and_secrets: Dict[str, str]) -> None:
         raise NotImplementedError

+    def _download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                       remote_file_path: str) -> None:
+        raise NotImplementedError
+
     def _sync_file_mounts(
         self,
         handle: _ResourceHandleType,
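The new download_file entry point mirrors the existing sync hooks: the public method wraps timeline and usage bookkeeping and delegates to a per-backend _download_file. A minimal sketch of a backend that satisfies this contract; the DemoBackend class, its handle type, and the scp transport are illustrative assumptions, not part of this diff:

    import subprocess


    class DemoHandle:
        """Hypothetical resource handle carrying the head node address."""

        def __init__(self, head_ip: str) -> None:
            self.head_ip = head_ip


    class DemoBackend:
        """Illustrative backend following the Backend.download_file contract."""

        def download_file(self, handle: DemoHandle, local_file_path: str,
                          remote_file_path: str) -> None:
            # Public entry point; SkyPilot's Backend also records timeline and
            # usage events here before delegating to the private hook.
            return self._download_file(handle, local_file_path,
                                       remote_file_path)

        def _download_file(self, handle: DemoHandle, local_file_path: str,
                           remote_file_path: str) -> None:
            # Pull a single file from the remote head node to the local machine.
            subprocess.run(
                ['scp', f'{handle.head_ip}:{remote_file_path}', local_file_path],
                check=True)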
sky/backends/backend_utils.py
CHANGED

@@ -52,6 +52,7 @@ from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
+from sky.utils import context as context_lib
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options

@@ -1843,7 +1844,9 @@ def check_owner_identity(cluster_name: str) -> None:
     """
     if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
         return
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if record is None:
         return
     handle = record['handle']

@@ -1930,6 +1933,7 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
     }


+@context_utils.cancellation_guard
 def _query_cluster_status_via_cloud_api(
     handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
 ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:

@@ -2137,7 +2141,10 @@ def check_can_clone_disk_and_override_task(
     return task, handle


-def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+def _update_cluster_status(
+        cluster_name: str,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Update the cluster status.

     The cluster status is updated by checking ray cluster and real status from

@@ -2164,7 +2171,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         fetched from the cloud provider or there are leaked nodes causing
         the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     handle = record['handle']

@@ -2340,7 +2350,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=True,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)

     # All cases below are transitioning the cluster to non-UP states.
     launched_resources = handle.launched_resources.assert_launchable()

@@ -2552,7 +2565,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=False,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
     verb = 'terminated' if to_terminate else 'stopped'

@@ -2567,7 +2583,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         nop_if_duplicate=True,
     )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
-    return global_user_state.get_cluster_from_name(cluster_name)
+    return global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)


 def _must_refresh_cluster_status(

@@ -2589,12 +2608,13 @@ def _must_refresh_cluster_status(


 def refresh_cluster_record(
-        [previous parameter list; lines not preserved in this extract]
+        cluster_name: str,
+        *,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+        acquire_per_cluster_status_lock: bool = True,
+        cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.

     The function will update the cached cluster status in the global state. For

@@ -2634,7 +2654,11 @@ def refresh_cluster_record(
         the node number larger than expected.
     """

-    record = global_user_state.get_cluster_from_name(cluster_name)
+    ctx = context_lib.get()
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are

@@ -2653,12 +2677,16 @@ def refresh_cluster_record(

     # Loop until we have an up-to-date status or until we acquire the lock.
     while True:
+        # Check if the context is canceled.
+        if ctx is not None and ctx.is_canceled():
+            raise asyncio.CancelledError()
         # Check to see if we can return the cached status.
         if not _must_refresh_cluster_status(record, force_refresh_statuses):
            return record

        if not acquire_per_cluster_status_lock:
-            return _update_cluster_status(cluster_name)
+            return _update_cluster_status(cluster_name, include_user_info,
+                                          summary_response)

        # Try to acquire the lock so we can fetch the status.
        try:

@@ -2666,12 +2694,16 @@ def refresh_cluster_record(
            # Check the cluster status again, since it could have been
            # updated between our last check and acquiring the lock.
            record = global_user_state.get_cluster_from_name(
-                cluster_name)
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
            if record is None or not _must_refresh_cluster_status(
                    record, force_refresh_statuses):
                return record
            # Update and return the cluster status.
-            return _update_cluster_status(cluster_name)
+            return _update_cluster_status(cluster_name,
+                                          include_user_info,
+                                          summary_response)

        except locks.LockTimeout:
            # lock.acquire() will throw a Timeout exception if the lock is not

@@ -2692,7 +2724,10 @@ def refresh_cluster_record(
            time.sleep(lock.poll_interval)

            # Refresh for next loop iteration.
-            record = global_user_state.get_cluster_from_name(cluster_name)
+            record = global_user_state.get_cluster_from_name(
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
            if record is None:
                return None

@@ -2717,7 +2752,9 @@ def refresh_cluster_status_handle(
        cluster_name,
        force_refresh_statuses=force_refresh_statuses,
        acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
-        cluster_status_lock_timeout=cluster_status_lock_timeout)
+        cluster_status_lock_timeout=cluster_status_lock_timeout,
+        include_user_info=False,
+        summary_response=True)
    if record is None:
        return None, None
    return record['status'], record['handle']

@@ -2768,7 +2805,9 @@ def check_cluster_available(
        exceptions.CloudUserIdentityError: if we fail to get the current user
            identity.
    """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
    if dryrun:
        assert record is not None, cluster_name
        return record['handle']

@@ -2955,7 +2994,8 @@ def is_controller_accessible(
            f'fatal, but {controller_name} commands/calls may hang or return '
            'stale information, when the controller is not up.\n'
            f'  Details: {common_utils.format_exception(e, use_bracket=True)}')
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(
+        cluster_name, include_user_info=False, summary_response=True)
    if record is not None:
        controller_status, handle = record['status'], record['handle']
        # We check the connection even if the cluster has a cached status UP

@@ -3012,22 +3052,98 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'


-def _get_glob_clusters(
+def _get_glob_clusters(
+        clusters: List[str],
+        silent: bool = False,
+        workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
     """Returns a list of clusters that match the glob pattern."""
     glob_clusters = []
     for cluster in clusters:
-        glob_cluster = global_user_state.get_glob_cluster_names(
+        glob_cluster = global_user_state.get_glob_cluster_names(
+            cluster, workspaces_filter=workspaces_filter)
         if len(glob_cluster) == 0 and not silent:
             logger.info(f'Cluster {cluster} not found.')
         glob_clusters.extend(glob_cluster)
     return list(set(glob_clusters))


+def _refresh_cluster(
+        cluster_name: str,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    try:
+        record = refresh_cluster_record(
+            cluster_name,
+            force_refresh_statuses=force_refresh_statuses,
+            acquire_per_cluster_status_lock=True,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+    except (exceptions.ClusterStatusFetchingError,
+            exceptions.CloudUserIdentityError,
+            exceptions.ClusterOwnerIdentityMismatchError) as e:
+        # Do not fail the entire refresh process. The caller will
+        # handle the 'UNKNOWN' status, and collect the errors into
+        # a table.
+        record = {'status': 'UNKNOWN', 'error': e}
+    return record
+
+
+def refresh_cluster_records() -> None:
+    """Refreshes the status of all clusters, except managed clusters.
+
+    Used by the background status refresh daemon.
+    This function is a stripped-down version of get_clusters, with only the
+    bare bones refresh logic.
+
+    Returns:
+        None
+
+    Raises:
+        None
+    """
+    exclude_managed_clusters = True
+    if env_options.Options.SHOW_DEBUG_INFO.get():
+        exclude_managed_clusters = False
+    cluster_names = global_user_state.get_cluster_names(
+        exclude_managed_clusters=exclude_managed_clusters,)
+
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+
+    def _refresh_cluster_record(cluster_name):
+        return _refresh_cluster(cluster_name,
+                                force_refresh_statuses=set(
+                                    status_lib.ClusterStatus),
+                                include_user_info=False,
+                                summary_response=True)
+
+    if len(cluster_names) > 0:
+        # Do not refresh the clusters that have an active launch request.
+        subprocess_utils.run_in_parallel(_refresh_cluster_record,
+                                         cluster_names_without_launch_request)
+
+
 def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
     include_credentials: bool = False,
+    summary_response: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
     _include_is_managed: bool = False,

@@ -3055,6 +3171,23 @@ def get_clusters(
     A list of cluster records. If the cluster does not exist or has been
         terminated, the record will be omitted from the returned list.
     """
+    accessible_workspaces = workspaces_core.get_workspaces()
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)

     exclude_managed_clusters = False
     if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):

@@ -3062,34 +3195,24 @@ def get_clusters(
     user_hashes_filter = None
     if not all_users:
         user_hashes_filter = {common_utils.get_current_user().id}
-    accessible_workspaces = workspaces_core.get_workspaces()
-
     records = global_user_state.get_clusters(
         exclude_managed_clusters=exclude_managed_clusters,
         user_hashes_filter=user_hashes_filter,
-        workspaces_filter=accessible_workspaces)
+        workspaces_filter=accessible_workspaces,
+        cluster_names=cluster_names,
+        summary_response=summary_response)

     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL

     if cluster_names is not None:
-        [removed lines not preserved in this extract]
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
-            else:
-                not_exist_cluster_names.append(cluster_name)
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
+        record_names = {record['name'] for record in records}
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
             logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
-        records = new_records

     def _get_records_with_handle(
             records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:

@@ -3099,7 +3222,7 @@ def get_clusters(
            if record is not None and record['handle'] is not None
        ]

-    def _update_records_with_resources_str(
+    def _update_records_with_handle_info(
            records: List[Optional[Dict[str, Any]]]) -> None:
        """Add resource str to record"""
        for record in _get_records_with_handle(records):

@@ -3110,6 +3233,8 @@ def get_clusters(
            record[
                'resources_str_full'] = resources_utils.get_readable_resources_repr(
                    handle, simplify=False)
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud

    def _update_records_with_credentials(
            records: List[Optional[Dict[str, Any]]]) -> None:

@@ -3149,7 +3274,7 @@ def get_clusters(
            record['credentials'] = credential

    def _update_records_with_resources(
-            [removed line not preserved in this extract]
+            records: List[Optional[Dict[str, Any]]],) -> None:
        """Add the resources to the record."""
        for record in _get_records_with_handle(records):
            handle = record['handle']

@@ -3168,8 +3293,8 @@ def get_clusters(
                f'{handle.launched_resources.accelerators}'
                if handle.launched_resources.accelerators else None)

-    # Add resources str to the records
-    _update_records_with_resources_str(records)
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
    if include_credentials:
        _update_records_with_credentials(records)
    if refresh == common.StatusRefreshMode.NONE:

@@ -3190,47 +3315,44 @@ def get_clusters(
    else:
        force_refresh_statuses = None

-    def [per-cluster refresh helper; several lines not preserved in this extract]
-            status=[requests_lib.RequestStatus.RUNNING],
-            cluster_names=[cluster_name],
-            include_request_names=['sky.launch']))
-        if len(request) > 0:
-            # There is an active launch request on the cluster,
-            # so we don't want to update the cluster status until
-            # the request is completed.
-            logger.debug(f'skipping refresh for cluster {cluster_name} '
-                         'as there is an active launch request')
-            return global_user_state.get_cluster_from_name(cluster_name)
-        try:
-            record = refresh_cluster_record(
-                cluster_name,
-                force_refresh_statuses=force_refresh_statuses,
-                acquire_per_cluster_status_lock=True)
-            _update_records_with_resources_str([record])
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        if 'error' not in record:
+            _update_records_with_handle_info([record])
            if include_credentials:
                _update_records_with_credentials([record])
-        except (exceptions.ClusterStatusFetchingError,
-                exceptions.CloudUserIdentityError,
-                exceptions.ClusterOwnerIdentityMismatchError) as e:
-            # Do not fail the entire refresh process. The caller will
-            # handle the 'UNKNOWN' status, and collect the errors into
-            # a table.
-            record = {'status': 'UNKNOWN', 'error': e}
-        progress.update(task, advance=1)
+        progress.update(task, advance=1)
        return record

    cluster_names = [record['name'] for record in records]
-    [removed lines not preserved in this extract]
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
+    updated_records = [
+        record for record in records
+        if record['name'] in cluster_names_with_launch_request
+    ]
+    if len(cluster_names_without_launch_request) > 0:
        with progress:
            updated_records = subprocess_utils.run_in_parallel(
-                _refresh_cluster, cluster_names)
+                _refresh_cluster_record, cluster_names_without_launch_request)

    # Show information for removed clusters.
    kept_records = []
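These refactors thread two new flags through every cluster lookup: include_user_info (whether user metadata is joined onto the record) and summary_response (whether a trimmed record is returned). A hedged sketch of calling the refreshed API with the new keywords; the cluster name is a placeholder and the printed fields are the ones the diff itself reads:

    from sky.backends import backend_utils

    # Refresh one cluster's cached record the way refresh_cluster_status_handle
    # now does: skip the user-info join and ask for the summary form.
    record = backend_utils.refresh_cluster_record(
        'my-cluster',                         # hypothetical cluster name
        force_refresh_statuses=None,          # only refresh when the cache is stale
        acquire_per_cluster_status_lock=True,
        include_user_info=False,              # new in this release
        summary_response=True)                # new in this release
    if record is not None:
        print(record['status'], record['handle'])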
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -116,6 +116,9 @@ Path = str
 SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
 SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
+# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+# from interfering with the Ray cluster in the user's task (if any).
+UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']

 logger = sky_logging.init_logger(__name__)

@@ -712,6 +715,8 @@ class RayCodeGen:
            done
            echo "skypilot: cached mount uploaded complete"
        fi""")
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
        self._code += [
            sky_env_vars_dict_str,
            textwrap.dedent(f"""\

@@ -721,6 +726,7 @@ class RayCodeGen:
        script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)

        if script is not None:
+            script=f'{unset_ray_env_vars}; {{script}}'
            script += rclone_flush_script
            sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}

@@ -3261,9 +3267,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            # Usage Collection:
            usage_lib.messages.usage.update_cluster_resources(
                handle.launched_nodes, launched_resources)
-            [removed line not preserved in this extract]
-            if
-            usage_lib.messages.usage.update_cluster_status(
+            status = global_user_state.get_status_from_cluster_name(cluster_name)
+            if status is not None:
+                usage_lib.messages.usage.update_cluster_status(status)

            assert launched_resources.region is not None, handle

@@ -3532,8 +3538,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                    error_message + '\n' + str(e),
                    failover_history=e.failover_history) from None
            if dryrun:
-                [removed lines not preserved in this extract]
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                return handle if handle is not None else None, False

            if config_dict['provisioning_skipped']:
                # Skip further provisioning.

@@ -3541,10 +3548,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                # ('handle', 'provision_record', 'resources_vars')
                # We need to return the handle - but it should be the existing
                # handle for the cluster.
-                [removed lines not preserved in this extract]
-                return
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                assert handle is not None, (cluster_name, handle)
+                return handle, True

            if 'provision_record' in config_dict:
                # New provisioner is used here.

@@ -3939,6 +3946,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
        # Need this `-i` option to make sure `source ~/.bashrc` work
        setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
        runners = handle.get_command_runners(avoid_ssh_control=True)

        def _setup_node(node_id: int) -> None:

@@ -4088,6 +4098,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
        logger.info(
            ux_utils.finishing_message('Setup completed.', setup_log_path))

+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
+
    def _exec_code_on_head(
        self,
        handle: CloudVmRayResourceHandle,

@@ -4992,10 +5014,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                f'{handle.cluster_name!r}. Assuming the cluster is still '
                'up.')
        if not cluster_status_fetched:
-            [removed line not preserved in this extract]
+            status = global_user_state.get_status_from_cluster_name(
                handle.cluster_name)
-            prev_cluster_status = record[
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
        if prev_cluster_status is None:
            # When the cluster is not in the cluster table, we guarantee that
            # all related resources / cache / config are cleaned up, i.e. it

@@ -5568,7 +5589,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            exceptions.InvalidClusterNameError: If the cluster name is invalid.
            # TODO(zhwu): complete the list of exceptions.
        """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
        if record is None:
            handle_before_refresh = None
            status_before_refresh = None

@@ -5589,6 +5611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            cluster_name,
            force_refresh_statuses={status_lib.ClusterStatus.INIT},
            acquire_per_cluster_status_lock=False,
+            include_user_info=False,
+            summary_response=True,
        )
        if record is not None:
            prev_cluster_status = record['status']
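Both the setup command and the generated run script are now prefixed with an unset chain built from UNSET_RAY_ENV_VARS, so the user's processes do not inherit RAY_RAYLET_PID from the Ray cluster inside the SkyPilot runtime. A standalone sketch of the prefix construction; the setup file path is a placeholder:

    # Mirrors the prefix construction added in cloud_vm_ray_backend.py.
    UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']

    unset_ray_env_vars = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
    setup_cmd = '/bin/bash -i /tmp/sky_setup_<run_timestamp> 2>&1'  # placeholder path
    setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
    # -> 'unset RAY_RAYLET_PID; /bin/bash -i /tmp/sky_setup_<run_timestamp> 2>&1'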
sky/backends/local_docker_backend.py
CHANGED

@@ -189,6 +189,15 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                       ' a NoOp. If you are running sky exec, your workdir has not'
                       ' been updated.')

+    def _download_file(self, handle: LocalDockerResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        # Copy from docker container to local
+        container = self.containers[handle]
+        copy_cmd = (
+            f'docker cp {container.name}:{remote_file_path} {local_file_path}')
+        subprocess.run(copy_cmd, shell=True, check=True)
+
     def _sync_file_mounts(
         self,
         handle: LocalDockerResourceHandle,