skypilot-nightly 1.0.0.dev20250731__py3-none-any.whl → 1.0.0.dev20250802__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +6 -1
- sky/backends/cloud_vm_ray_backend.py +2 -1
- sky/catalog/data_fetchers/fetch_nebius.py +31 -7
- sky/client/cli/command.py +40 -14
- sky/client/cli/flags.py +15 -0
- sky/client/sdk.py +80 -10
- sky/client/sdk.pyi +4 -0
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-13145516b19858fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/execution.py +5 -3
- sky/jobs/client/sdk.py +5 -1
- sky/provision/runpod/utils.py +27 -12
- sky/resources.py +17 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +2 -124
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/setup_files/dependencies.py +1 -1
- sky/sky_logging.py +30 -0
- sky/skylet/autostop_lib.py +96 -8
- sky/skylet/constants.py +4 -3
- sky/skylet/events.py +27 -13
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/utils/schemas.py +29 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/METADATA +4 -3
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/RECORD +55 -54
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
- sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Optional
|
|
|
5
5
|
import urllib.request
|
|
6
6
|
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
|
8
|
+
_SKYPILOT_COMMIT_SHA = '6ddfb3cd7d476b51f9309c547338407d6eca092c'
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _get_git_commit():
|
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
__commit__ = _get_git_commit()
|
|
38
|
-
__version__ = '1.0.0.dev20250731'
|
|
38
|
+
__version__ = '1.0.0.dev20250802'
|
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
40
40
|
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
|
@@ -38,6 +38,7 @@ from sky.provision import instance_setup
|
|
|
38
38
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
39
39
|
from sky.serve import serve_utils
|
|
40
40
|
from sky.server.requests import requests as requests_lib
|
|
41
|
+
from sky.skylet import autostop_lib
|
|
41
42
|
from sky.skylet import constants
|
|
42
43
|
from sky.usage import usage_lib
|
|
43
44
|
from sky.utils import cluster_utils
|
|
@@ -2238,7 +2239,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2238
2239
|
success = True
|
|
2239
2240
|
reset_local_autostop = True
|
|
2240
2241
|
try:
|
|
2241
|
-
backend.set_autostop(
|
|
2242
|
+
backend.set_autostop(
|
|
2243
|
+
handle,
|
|
2244
|
+
-1,
|
|
2245
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
2246
|
+
stream_logs=False)
|
|
2242
2247
|
except exceptions.CommandError as e:
|
|
2243
2248
|
success = False
|
|
2244
2249
|
if e.returncode == 255:
|
|
sky/backends/cloud_vm_ray_backend.py
CHANGED
|
@@ -4650,6 +4650,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4650
4650
|
def set_autostop(self,
|
|
4651
4651
|
handle: CloudVmRayResourceHandle,
|
|
4652
4652
|
idle_minutes_to_autostop: Optional[int],
|
|
4653
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor],
|
|
4653
4654
|
down: bool = False,
|
|
4654
4655
|
stream_logs: bool = True) -> None:
|
|
4655
4656
|
# The core.autostop() function should have already checked that the
|
|
@@ -4697,7 +4698,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4697
4698
|
assert (handle.launched_resources is not None and
|
|
4698
4699
|
handle.launched_resources.cloud is not None), handle
|
|
4699
4700
|
code = autostop_lib.AutostopCodeGen.set_autostop(
|
|
4700
|
-
idle_minutes_to_autostop, self.NAME, down)
|
|
4701
|
+
idle_minutes_to_autostop, self.NAME, wait_for, down)
|
|
4701
4702
|
returncode, _, stderr = self.run_on_head(handle,
|
|
4702
4703
|
code,
|
|
4703
4704
|
require_outputs=True,
|
|
sky/catalog/data_fetchers/fetch_nebius.py
CHANGED
|
@@ -31,17 +31,19 @@ class PresetInfo:
|
|
|
31
31
|
Attributes:
|
|
32
32
|
region (str): The geographical region where the preset is available.
|
|
33
33
|
fullname (str): The full name of the preset, a combination of platform
|
|
34
|
-
|
|
34
|
+
and preset name.
|
|
35
35
|
name (str): The name of the preset.
|
|
36
36
|
platform_name (str): The name of the platform the preset belongs to.
|
|
37
37
|
gpu (int): The number of GPUs in the preset.
|
|
38
38
|
vcpu (int): The number of virtual CPUs in the preset.
|
|
39
39
|
memory_gib (int): The amount of memory in GiB in the preset.
|
|
40
40
|
accelerator_manufacturer (str | None): The manufacturer of the
|
|
41
|
-
|
|
41
|
+
accelerator (e.g., "NVIDIA"), or None if no accelerator.
|
|
42
42
|
accelerator_name (str | None): The name of the accelerator
|
|
43
|
-
|
|
43
|
+
(e.g., "H100"), or None if no accelerator.
|
|
44
44
|
price_hourly (decimal.Decimal): The hourly price of the preset.
|
|
45
|
+
spot_price (decimal.Decimal): The spot (preemptible) price
|
|
46
|
+
of the preset.
|
|
45
47
|
"""
|
|
46
48
|
|
|
47
49
|
region: str
|
|
@@ -54,6 +56,7 @@ class PresetInfo:
|
|
|
54
56
|
accelerator_manufacturer: Optional[str]
|
|
55
57
|
accelerator_name: Optional[str]
|
|
56
58
|
price_hourly: decimal.Decimal
|
|
59
|
+
spot_price: decimal.Decimal
|
|
57
60
|
|
|
58
61
|
|
|
59
62
|
def _format_decimal(value: decimal.Decimal) -> str:
|
|
@@ -66,7 +69,7 @@ def _format_decimal(value: decimal.Decimal) -> str:
|
|
|
66
69
|
Returns:
|
|
67
70
|
str: The formatted string representation of the decimal.
|
|
68
71
|
"""
|
|
69
|
-
formatted_value = f'{value:f}'
|
|
72
|
+
formatted_value = f'{value:f}'
|
|
70
73
|
integer_part, decimal_part = formatted_value.split(
|
|
71
74
|
'.') if '.' in formatted_value else (formatted_value, '')
|
|
72
75
|
if len(decimal_part) < 2:
|
|
@@ -111,20 +114,38 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
|
|
|
111
114
|
preset=preset.name,
|
|
112
115
|
)),
|
|
113
116
|
))
|
|
114
|
-
|
|
115
117
|
price_request = billing().EstimateBatchRequest(
|
|
116
118
|
resource_specs=[estimate_spec])
|
|
119
|
+
|
|
120
|
+
# Form the specification for the spot price request
|
|
121
|
+
spot_estimate_spec = billing().ResourceSpec(
|
|
122
|
+
compute_instance_spec=compute().CreateInstanceRequest(
|
|
123
|
+
metadata=nebius_common().ResourceMetadata(
|
|
124
|
+
parent_id=parent_id,),
|
|
125
|
+
spec=compute().InstanceSpec(
|
|
126
|
+
resources=compute().ResourcesSpec(
|
|
127
|
+
platform=platform_name,
|
|
128
|
+
preset=preset.name,
|
|
129
|
+
),
|
|
130
|
+
preemptible=compute().PreemptibleSpec(priority=1),
|
|
131
|
+
),
|
|
132
|
+
))
|
|
133
|
+
spot_price_request = billing().EstimateBatchRequest(
|
|
134
|
+
resource_specs=[spot_estimate_spec])
|
|
135
|
+
|
|
117
136
|
# Start future for each preset
|
|
118
137
|
futures.append((
|
|
119
138
|
platform,
|
|
120
139
|
preset,
|
|
121
140
|
calculator_service.estimate_batch(price_request,
|
|
122
141
|
timeout=TIMEOUT),
|
|
142
|
+
calculator_service.estimate_batch(spot_price_request,
|
|
143
|
+
timeout=TIMEOUT),
|
|
123
144
|
))
|
|
124
145
|
|
|
125
146
|
# wait all futures to complete and collect results
|
|
126
147
|
result = []
|
|
127
|
-
for platform, preset, future in futures:
|
|
148
|
+
for platform, preset, future, future_spot in futures:
|
|
128
149
|
platform_name = platform.metadata.name
|
|
129
150
|
result.append(
|
|
130
151
|
PresetInfo(
|
|
@@ -141,6 +162,8 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
|
|
|
141
162
|
if platform_name.startswith('gpu-') else '',
|
|
142
163
|
price_hourly=decimal.Decimal(
|
|
143
164
|
future.wait().hourly_cost.general.total.cost),
|
|
165
|
+
spot_price=decimal.Decimal(
|
|
166
|
+
future_spot.wait().hourly_cost.general.total.cost),
|
|
144
167
|
))
|
|
145
168
|
|
|
146
169
|
return result
|
|
@@ -196,7 +219,8 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
|
|
|
196
219
|
'Price': _format_decimal(preset.price_hourly),
|
|
197
220
|
'Region': preset.region,
|
|
198
221
|
'GpuInfo': gpu_info,
|
|
199
|
-
'SpotPrice':
|
|
222
|
+
'SpotPrice': _format_decimal(preset.spot_price)
|
|
223
|
+
if preset.spot_price else '',
|
|
200
224
|
})
|
|
201
225
|
|
|
202
226
|
|
sky/client/cli/command.py
CHANGED
|
@@ -62,7 +62,9 @@ from sky.provision.kubernetes import constants as kubernetes_constants
|
|
|
62
62
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
63
63
|
from sky.server import common as server_common
|
|
64
64
|
from sky.server import constants as server_constants
|
|
65
|
+
from sky.server import versions
|
|
65
66
|
from sky.server.requests import requests
|
|
67
|
+
from sky.skylet import autostop_lib
|
|
66
68
|
from sky.skylet import constants
|
|
67
69
|
from sky.skylet import job_lib
|
|
68
70
|
from sky.usage import usage_lib
|
|
@@ -1011,11 +1013,11 @@ def _handle_infra_cloud_region_zone_options(infra: Optional[str],
|
|
|
1011
1013
|
required=False,
|
|
1012
1014
|
help=('Automatically stop the cluster after this many minutes '
|
|
1013
1015
|
'of idleness, i.e., no running or pending jobs in the cluster\'s job '
|
|
1014
|
-
'queue. Idleness gets reset
|
|
1015
|
-
'are found in the job queue. '
|
|
1016
|
+
'queue. Idleness gets reset depending on the ``--wait-for`` flag. '
|
|
1016
1017
|
'Setting this flag is equivalent to '
|
|
1017
1018
|
'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
|
|
1018
1019
|
'. If not set, the cluster will not be autostopped.'))
|
|
1020
|
+
@flags.wait_for_option('idle-minutes-to-autostop')
|
|
1019
1021
|
@click.option(
|
|
1020
1022
|
'--down',
|
|
1021
1023
|
default=False,
|
|
@@ -1101,6 +1103,7 @@ def launch(
|
|
|
1101
1103
|
network_tier: Optional[str],
|
|
1102
1104
|
ports: Tuple[str, ...],
|
|
1103
1105
|
idle_minutes_to_autostop: Optional[int],
|
|
1106
|
+
wait_for: Optional[str],
|
|
1104
1107
|
down: bool, # pylint: disable=redefined-outer-name
|
|
1105
1108
|
retry_until_up: bool,
|
|
1106
1109
|
yes: bool,
|
|
@@ -1195,6 +1198,8 @@ def launch(
|
|
|
1195
1198
|
cluster_name=cluster,
|
|
1196
1199
|
backend=backend,
|
|
1197
1200
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
1201
|
+
wait_for=autostop_lib.AutostopWaitFor.from_str(wait_for)
|
|
1202
|
+
if wait_for is not None else None,
|
|
1198
1203
|
down=down,
|
|
1199
1204
|
retry_until_up=retry_until_up,
|
|
1200
1205
|
no_setup=no_setup,
|
|
@@ -1827,6 +1832,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
|
1827
1832
|
show_endpoints = endpoints or endpoint is not None
|
|
1828
1833
|
show_single_endpoint = endpoint is not None
|
|
1829
1834
|
show_services = show_services and not any([clusters, ip, endpoints])
|
|
1835
|
+
remote_api_version = versions.get_remote_api_version()
|
|
1836
|
+
if remote_api_version is None or remote_api_version < 12:
|
|
1837
|
+
show_pools = False
|
|
1830
1838
|
|
|
1831
1839
|
query_clusters: Optional[List[str]] = None if not clusters else clusters
|
|
1832
1840
|
refresh_mode = common.StatusRefreshMode.NONE
|
|
@@ -2545,6 +2553,7 @@ def stop(
|
|
|
2545
2553
|
required=False,
|
|
2546
2554
|
help=('Set the idle minutes before autostopping the cluster. '
|
|
2547
2555
|
'See the doc above for detailed semantics.'))
|
|
2556
|
+
@flags.wait_for_option('idle-minutes')
|
|
2548
2557
|
@click.option(
|
|
2549
2558
|
'--cancel',
|
|
2550
2559
|
default=False,
|
|
@@ -2567,6 +2576,7 @@ def autostop(
|
|
|
2567
2576
|
all: bool, # pylint: disable=redefined-builtin
|
|
2568
2577
|
all_users: bool,
|
|
2569
2578
|
idle_minutes: Optional[int],
|
|
2579
|
+
wait_for: Optional[str],
|
|
2570
2580
|
cancel: bool, # pylint: disable=redefined-outer-name
|
|
2571
2581
|
down: bool, # pylint: disable=redefined-outer-name
|
|
2572
2582
|
yes: bool,
|
|
@@ -2576,8 +2586,7 @@ def autostop(
|
|
|
2576
2586
|
"""Schedule an autostop or autodown for cluster(s).
|
|
2577
2587
|
|
|
2578
2588
|
Autostop/autodown will automatically stop or teardown a cluster when it
|
|
2579
|
-
becomes idle for a specified duration.
|
|
2580
|
-
in-progress (pending/running) jobs in a cluster's job queue.
|
|
2589
|
+
becomes idle for a specified duration.
|
|
2581
2590
|
|
|
2582
2591
|
CLUSTERS are the names (or glob patterns) of the clusters to stop. If both
|
|
2583
2592
|
CLUSTERS and ``--all`` are supplied, the latter takes precedence.
|
|
@@ -2590,6 +2599,11 @@ def autostop(
|
|
|
2590
2599
|
|
|
2591
2600
|
- An autostop idle time is set.
|
|
2592
2601
|
|
|
2602
|
+
- An SSH session is active (To disable this, set ``--wait-for jobs``).
|
|
2603
|
+
|
|
2604
|
+
To disable the idleness timer completely and set a hard time limit, set
|
|
2605
|
+
``--wait-for none``.
|
|
2606
|
+
|
|
2593
2607
|
Example 1: say a cluster with autostop set to 2 hours has been idle for 1
|
|
2594
2608
|
hour, then autostop is reset to 30 minutes. The cluster will not be
|
|
2595
2609
|
immediately autostopped. Instead, the idleness timer restarts counting
|
|
@@ -2610,6 +2624,9 @@ def autostop(
|
|
|
2610
2624
|
# Cancel autostop for a specific cluster.
|
|
2611
2625
|
sky autostop cluster_name --cancel
|
|
2612
2626
|
\b
|
|
2627
|
+
# Autostop this cluster after 60 minutes, regardless of activity.
|
|
2628
|
+
sky autostop cluster_name -i 60 --wait-for none
|
|
2629
|
+
\b
|
|
2613
2630
|
# Autodown this cluster after 60 minutes of idleness.
|
|
2614
2631
|
sky autostop cluster_name -i 60 --down
|
|
2615
2632
|
"""
|
|
@@ -2621,13 +2638,16 @@ def autostop(
|
|
|
2621
2638
|
idle_minutes = -1
|
|
2622
2639
|
elif idle_minutes is None:
|
|
2623
2640
|
idle_minutes = 5
|
|
2624
|
-
_down_or_stop_clusters(
|
|
2625
|
-
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
|
|
2630
|
-
|
|
2641
|
+
_down_or_stop_clusters(
|
|
2642
|
+
clusters,
|
|
2643
|
+
apply_to_all=all,
|
|
2644
|
+
all_users=all_users,
|
|
2645
|
+
down=down,
|
|
2646
|
+
no_confirm=yes,
|
|
2647
|
+
idle_minutes_to_autostop=idle_minutes,
|
|
2648
|
+
wait_for=autostop_lib.AutostopWaitFor.from_str(wait_for)
|
|
2649
|
+
if wait_for is not None else None,
|
|
2650
|
+
async_call=async_call)
|
|
2631
2651
|
|
|
2632
2652
|
|
|
2633
2653
|
@cli.command(cls=_DocumentedCodeCommand)
|
|
@@ -2646,11 +2666,11 @@ def autostop(
|
|
|
2646
2666
|
required=False,
|
|
2647
2667
|
help=('Automatically stop the cluster after this many minutes '
|
|
2648
2668
|
'of idleness, i.e., no running or pending jobs in the cluster\'s job '
|
|
2649
|
-
'queue. Idleness gets reset
|
|
2650
|
-
'are found in the job queue. '
|
|
2669
|
+
'queue. Idleness gets reset depending on the ``--wait-for`` flag. '
|
|
2651
2670
|
'Setting this flag is equivalent to '
|
|
2652
2671
|
'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
|
|
2653
2672
|
'. If not set, the cluster will not be autostopped.'))
|
|
2673
|
+
@flags.wait_for_option('idle-minutes-to-autostop')
|
|
2654
2674
|
@click.option(
|
|
2655
2675
|
'--down',
|
|
2656
2676
|
default=False,
|
|
@@ -2689,6 +2709,7 @@ def start(
|
|
|
2689
2709
|
all: bool,
|
|
2690
2710
|
yes: bool,
|
|
2691
2711
|
idle_minutes_to_autostop: Optional[int],
|
|
2712
|
+
wait_for: Optional[str],
|
|
2692
2713
|
down: bool, # pylint: disable=redefined-outer-name
|
|
2693
2714
|
retry_until_up: bool,
|
|
2694
2715
|
force: bool,
|
|
@@ -2848,6 +2869,8 @@ def start(
|
|
|
2848
2869
|
request_ids = subprocess_utils.run_in_parallel(
|
|
2849
2870
|
lambda name: sdk.start(name,
|
|
2850
2871
|
idle_minutes_to_autostop,
|
|
2872
|
+
autostop_lib.AutostopWaitFor.from_str(wait_for)
|
|
2873
|
+
if wait_for is not None else None,
|
|
2851
2874
|
retry_until_up,
|
|
2852
2875
|
down=down,
|
|
2853
2876
|
force=force), to_start)
|
|
@@ -3077,6 +3100,7 @@ def _down_or_stop_clusters(
|
|
|
3077
3100
|
no_confirm: bool = True,
|
|
3078
3101
|
purge: bool = False,
|
|
3079
3102
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
3103
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
|
|
3080
3104
|
async_call: bool = False) -> None:
|
|
3081
3105
|
"""Tears down or (auto-)stops a cluster (or all clusters).
|
|
3082
3106
|
|
|
@@ -3094,6 +3118,7 @@ def _down_or_stop_clusters(
|
|
|
3094
3118
|
purge: If True, forcefully remove the clusters from the cluster table.
|
|
3095
3119
|
idle_minutes_to_autostop: The number of minutes to wait before
|
|
3096
3120
|
automatically stopping the cluster.
|
|
3121
|
+
wait_for: Determines the condition for resetting the idleness timer.
|
|
3097
3122
|
async_call: If True, send the request asynchronously.
|
|
3098
3123
|
"""
|
|
3099
3124
|
if down:
|
|
@@ -3237,7 +3262,8 @@ def _down_or_stop_clusters(
|
|
|
3237
3262
|
success_progress = False
|
|
3238
3263
|
if idle_minutes_to_autostop is not None:
|
|
3239
3264
|
try:
|
|
3240
|
-
request_id = sdk.autostop(name, idle_minutes_to_autostop,
|
|
3265
|
+
request_id = sdk.autostop(name, idle_minutes_to_autostop,
|
|
3266
|
+
wait_for, down)
|
|
3241
3267
|
request_ids.append(request_id)
|
|
3242
3268
|
_async_call_or_wait(
|
|
3243
3269
|
request_id, async_call,
|
sky/client/cli/flags.py
CHANGED
|
@@ -7,6 +7,7 @@ import click
|
|
|
7
7
|
import dotenv
|
|
8
8
|
|
|
9
9
|
from sky import skypilot_config
|
|
10
|
+
from sky.skylet import autostop_lib
|
|
10
11
|
from sky.utils import resources_utils
|
|
11
12
|
|
|
12
13
|
|
|
@@ -340,3 +341,17 @@ def all_users_option(helptext: Optional[str] = None):
|
|
|
340
341
|
help=helptext)(func)
|
|
341
342
|
|
|
342
343
|
return return_option_decorator
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def wait_for_option(pair: str):
|
|
347
|
+
"""A decorator for the --wait-for option."""
|
|
348
|
+
|
|
349
|
+
def return_option_decorator(func):
|
|
350
|
+
return click.option(
|
|
351
|
+
'--wait-for',
|
|
352
|
+
type=click.Choice(autostop_lib.AutostopWaitFor.supported_modes()),
|
|
353
|
+
default=None,
|
|
354
|
+
required=False,
|
|
355
|
+
help=autostop_lib.AutostopWaitFor.cli_help_message(pair=pair))(func)
|
|
356
|
+
|
|
357
|
+
return return_option_decorator
|
sky/client/sdk.py
CHANGED
|
@@ -10,19 +10,14 @@ Usage example:
|
|
|
10
10
|
statuses = sky.get(request_id)
|
|
11
11
|
|
|
12
12
|
"""
|
|
13
|
-
import base64
|
|
14
|
-
import binascii
|
|
15
13
|
from http import cookiejar
|
|
16
14
|
import json
|
|
17
15
|
import logging
|
|
18
16
|
import os
|
|
19
|
-
import pathlib
|
|
20
17
|
import subprocess
|
|
21
|
-
import time
|
|
22
18
|
import typing
|
|
23
19
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
24
20
|
from urllib import parse as urlparse
|
|
25
|
-
import webbrowser
|
|
26
21
|
|
|
27
22
|
import click
|
|
28
23
|
import colorama
|
|
@@ -37,8 +32,10 @@ from sky.client import common as client_common
|
|
|
37
32
|
from sky.client import oauth as oauth_lib
|
|
38
33
|
from sky.server import common as server_common
|
|
39
34
|
from sky.server import rest
|
|
35
|
+
from sky.server import versions
|
|
40
36
|
from sky.server.requests import payloads
|
|
41
37
|
from sky.server.requests import requests as requests_lib
|
|
38
|
+
from sky.skylet import autostop_lib
|
|
42
39
|
from sky.skylet import constants
|
|
43
40
|
from sky.usage import usage_lib
|
|
44
41
|
from sky.utils import admin_policy_utils
|
|
@@ -57,7 +54,12 @@ from sky.utils import ux_utils
|
|
|
57
54
|
from sky.utils.kubernetes import ssh_utils
|
|
58
55
|
|
|
59
56
|
if typing.TYPE_CHECKING:
|
|
57
|
+
import base64
|
|
58
|
+
import binascii
|
|
60
59
|
import io
|
|
60
|
+
import pathlib
|
|
61
|
+
import time
|
|
62
|
+
import webbrowser
|
|
61
63
|
|
|
62
64
|
import psutil
|
|
63
65
|
import requests
|
|
@@ -65,6 +67,14 @@ if typing.TYPE_CHECKING:
|
|
|
65
67
|
import sky
|
|
66
68
|
from sky import backends
|
|
67
69
|
else:
|
|
70
|
+
# only used in api_login()
|
|
71
|
+
base64 = adaptors_common.LazyImport('base64')
|
|
72
|
+
binascii = adaptors_common.LazyImport('binascii')
|
|
73
|
+
pathlib = adaptors_common.LazyImport('pathlib')
|
|
74
|
+
time = adaptors_common.LazyImport('time')
|
|
75
|
+
# only used in dashboard() and api_login()
|
|
76
|
+
webbrowser = adaptors_common.LazyImport('webbrowser')
|
|
77
|
+
# only used in api_stop()
|
|
68
78
|
psutil = adaptors_common.LazyImport('psutil')
|
|
69
79
|
|
|
70
80
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -375,6 +385,7 @@ def launch(
|
|
|
375
385
|
cluster_name: Optional[str] = None,
|
|
376
386
|
retry_until_up: bool = False,
|
|
377
387
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
388
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
|
|
378
389
|
dryrun: bool = False,
|
|
379
390
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
380
391
|
backend: Optional['backends.Backend'] = None,
|
|
@@ -424,6 +435,15 @@ def launch(
|
|
|
424
435
|
``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
|
|
425
436
|
config specified in the task' resources will be overridden by
|
|
426
437
|
this parameter.
|
|
438
|
+
wait_for: determines the condition for resetting the idleness timer.
|
|
439
|
+
This option works in conjunction with ``idle_minutes_to_autostop``.
|
|
440
|
+
Choices:
|
|
441
|
+
|
|
442
|
+
1. "jobs_and_ssh" (default) - Wait for all jobs to complete
|
|
443
|
+
AND all SSH sessions to disconnect.
|
|
444
|
+
2. "jobs" - Wait for all jobs to complete.
|
|
445
|
+
3. "none" - Stop immediately after idle time expires,
|
|
446
|
+
regardless of running jobs or SSH connections.
|
|
427
447
|
dryrun: if True, do not actually launch the cluster.
|
|
428
448
|
down: Tear down the cluster after all jobs finish (successfully or
|
|
429
449
|
abnormally). If --idle-minutes-to-autostop is also set, the
|
|
@@ -487,12 +507,27 @@ def launch(
|
|
|
487
507
|
raise NotImplementedError('clone_disk_from is not implemented yet. '
|
|
488
508
|
'Please contact the SkyPilot team if you '
|
|
489
509
|
'need this feature at slack.skypilot.co.')
|
|
510
|
+
|
|
511
|
+
remote_api_version = versions.get_remote_api_version()
|
|
512
|
+
if wait_for is not None and (remote_api_version is None or
|
|
513
|
+
remote_api_version < 13):
|
|
514
|
+
logger.warning('wait_for is not supported in your API server. '
|
|
515
|
+
'Please upgrade to a newer API server to use it.')
|
|
516
|
+
|
|
490
517
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
|
491
518
|
# Override the autostop config from command line flags to task YAML.
|
|
492
519
|
for task in dag.tasks:
|
|
493
520
|
for resource in task.resources:
|
|
494
|
-
|
|
495
|
-
|
|
521
|
+
if remote_api_version is None or remote_api_version < 13:
|
|
522
|
+
# An older server would not recognize the wait_for field
|
|
523
|
+
# in the schema, so we need to omit it.
|
|
524
|
+
resource.override_autostop_config(
|
|
525
|
+
down=down, idle_minutes=idle_minutes_to_autostop)
|
|
526
|
+
else:
|
|
527
|
+
resource.override_autostop_config(
|
|
528
|
+
down=down,
|
|
529
|
+
idle_minutes=idle_minutes_to_autostop,
|
|
530
|
+
wait_for=wait_for)
|
|
496
531
|
if resource.autostop_config is not None:
|
|
497
532
|
# For backward-compatbility, get the final autostop config for
|
|
498
533
|
# admin policy.
|
|
@@ -825,6 +860,7 @@ def download_logs(cluster_name: str,
|
|
|
825
860
|
def start(
|
|
826
861
|
cluster_name: str,
|
|
827
862
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
863
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
|
|
828
864
|
retry_until_up: bool = False,
|
|
829
865
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
830
866
|
force: bool = False,
|
|
@@ -851,6 +887,15 @@ def start(
|
|
|
851
887
|
flag is equivalent to running ``sky.launch()`` and then
|
|
852
888
|
``sky.autostop(idle_minutes=<minutes>)``. If not set, the
|
|
853
889
|
cluster will not be autostopped.
|
|
890
|
+
wait_for: determines the condition for resetting the idleness timer.
|
|
891
|
+
This option works in conjunction with ``idle_minutes_to_autostop``.
|
|
892
|
+
Choices:
|
|
893
|
+
|
|
894
|
+
1. "jobs_and_ssh" (default) - Wait for all jobs to complete
|
|
895
|
+
AND all SSH sessions to disconnect.
|
|
896
|
+
2. "jobs" - Wait for all jobs to complete.
|
|
897
|
+
3. "none" - Stop immediately after idle time expires,
|
|
898
|
+
regardless of running jobs or SSH connections.
|
|
854
899
|
retry_until_up: whether to retry launching the cluster until it is
|
|
855
900
|
up.
|
|
856
901
|
down: Autodown the cluster: tear down the cluster after specified
|
|
@@ -879,9 +924,16 @@ def start(
|
|
|
879
924
|
sky.exceptions.ClusterOwnerIdentitiesMismatchError: if the cluster to
|
|
880
925
|
restart was launched by a different user.
|
|
881
926
|
"""
|
|
927
|
+
remote_api_version = versions.get_remote_api_version()
|
|
928
|
+
if wait_for is not None and (remote_api_version is None or
|
|
929
|
+
remote_api_version < 13):
|
|
930
|
+
logger.warning('wait_for is not supported in your API server. '
|
|
931
|
+
'Please upgrade to a newer API server to use it.')
|
|
932
|
+
|
|
882
933
|
body = payloads.StartBody(
|
|
883
934
|
cluster_name=cluster_name,
|
|
884
935
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
936
|
+
wait_for=wait_for,
|
|
885
937
|
retry_until_up=retry_until_up,
|
|
886
938
|
down=down,
|
|
887
939
|
force=force,
|
|
@@ -982,9 +1034,10 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
|
982
1034
|
@server_common.check_server_healthy_or_start
|
|
983
1035
|
@annotations.client_api
|
|
984
1036
|
def autostop(
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
1037
|
+
cluster_name: str,
|
|
1038
|
+
idle_minutes: int,
|
|
1039
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
|
|
1040
|
+
down: bool = False, # pylint: disable=redefined-outer-name
|
|
988
1041
|
) -> server_common.RequestId:
|
|
989
1042
|
"""Schedules an autostop/autodown for a cluster.
|
|
990
1043
|
|
|
@@ -1015,6 +1068,15 @@ def autostop(
|
|
|
1015
1068
|
idle_minutes: the number of minutes of idleness (no pending/running
|
|
1016
1069
|
jobs) after which the cluster will be stopped automatically. Setting
|
|
1017
1070
|
to a negative number cancels any autostop/autodown setting.
|
|
1071
|
+
wait_for: determines the condition for resetting the idleness timer.
|
|
1072
|
+
This option works in conjunction with ``idle_minutes``.
|
|
1073
|
+
Choices:
|
|
1074
|
+
|
|
1075
|
+
1. "jobs_and_ssh" (default) - Wait for all jobs to complete
|
|
1076
|
+
AND all SSH sessions to disconnect.
|
|
1077
|
+
2. "jobs" - Wait for all jobs to complete.
|
|
1078
|
+
3. "none" - Stop immediately after idle time expires,
|
|
1079
|
+
regardless of running jobs or SSH connections.
|
|
1018
1080
|
down: if true, use autodown (tear down the cluster; non-restartable),
|
|
1019
1081
|
rather than autostop (restartable).
|
|
1020
1082
|
|
|
@@ -1034,9 +1096,16 @@ def autostop(
|
|
|
1034
1096
|
sky.exceptions.CloudUserIdentityError: if we fail to get the current
|
|
1035
1097
|
user identity.
|
|
1036
1098
|
"""
|
|
1099
|
+
remote_api_version = versions.get_remote_api_version()
|
|
1100
|
+
if wait_for is not None and (remote_api_version is None or
|
|
1101
|
+
remote_api_version < 13):
|
|
1102
|
+
logger.warning('wait_for is not supported in your API server. '
|
|
1103
|
+
'Please upgrade to a newer API server to use it.')
|
|
1104
|
+
|
|
1037
1105
|
body = payloads.AutostopBody(
|
|
1038
1106
|
cluster_name=cluster_name,
|
|
1039
1107
|
idle_minutes=idle_minutes,
|
|
1108
|
+
wait_for=wait_for,
|
|
1040
1109
|
down=down,
|
|
1041
1110
|
)
|
|
1042
1111
|
response = server_common.make_authenticated_request(
|
|
@@ -2343,6 +2412,7 @@ def api_login(endpoint: Optional[str] = None,
|
|
|
2343
2412
|
_save_config_updates(endpoint=endpoint)
|
|
2344
2413
|
dashboard_url = server_common.get_dashboard_url(endpoint)
|
|
2345
2414
|
|
|
2415
|
+
server_common.get_api_server_status.cache_clear()
|
|
2346
2416
|
# After successful authentication, check server health again to get user
|
|
2347
2417
|
# identity
|
|
2348
2418
|
server_status, final_api_server_info = server_common.check_server_healthy(
|
sky/client/sdk.pyi
CHANGED
|
@@ -14,6 +14,7 @@ from sky import skypilot_config as skypilot_config
|
|
|
14
14
|
from sky.server import common as server_common
|
|
15
15
|
from sky.server import rest as rest
|
|
16
16
|
from sky.server.requests import payloads as payloads
|
|
17
|
+
from sky.skylet import autostop_lib as autostop_lib
|
|
17
18
|
from sky.skylet import constants as constants
|
|
18
19
|
from sky.usage import usage_lib as usage_lib
|
|
19
20
|
from sky.utils import admin_policy_utils as admin_policy_utils
|
|
@@ -104,6 +105,7 @@ def launch(task: Union['sky.Task', 'sky.Dag'],
|
|
|
104
105
|
cluster_name: Optional[str] = ...,
|
|
105
106
|
retry_until_up: bool = ...,
|
|
106
107
|
idle_minutes_to_autostop: Optional[int] = ...,
|
|
108
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
|
|
107
109
|
dryrun: bool = ...,
|
|
108
110
|
down: bool = ...,
|
|
109
111
|
backend: Optional['backends.Backend'] = ...,
|
|
@@ -142,6 +144,7 @@ def download_logs(cluster_name: str,
|
|
|
142
144
|
|
|
143
145
|
def start(cluster_name: str,
|
|
144
146
|
idle_minutes_to_autostop: Optional[int] = ...,
|
|
147
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
|
|
145
148
|
retry_until_up: bool = ...,
|
|
146
149
|
down: bool = ...,
|
|
147
150
|
force: bool = ...) -> server_common.RequestId:
|
|
@@ -158,6 +161,7 @@ def stop(cluster_name: str, purge: bool = ...) -> server_common.RequestId:
|
|
|
158
161
|
|
|
159
162
|
def autostop(cluster_name: str,
|
|
160
163
|
idle_minutes: int,
|
|
164
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
|
|
161
165
|
down: bool = ...) -> server_common.RequestId:
|
|
162
166
|
...
|
|
163
167
|
|
sky/core.py
CHANGED
|
@@ -25,6 +25,7 @@ from sky.clouds import cloud as sky_cloud
|
|
|
25
25
|
from sky.jobs.server import core as managed_jobs_core
|
|
26
26
|
from sky.provision.kubernetes import constants as kubernetes_constants
|
|
27
27
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
28
|
+
from sky.skylet import autostop_lib
|
|
28
29
|
from sky.skylet import constants
|
|
29
30
|
from sky.skylet import job_lib
|
|
30
31
|
from sky.skylet import log_lib
|
|
@@ -403,6 +404,8 @@ def cost_report(days: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
403
404
|
def _start(
|
|
404
405
|
cluster_name: str,
|
|
405
406
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
407
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = (
|
|
408
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
|
|
406
409
|
retry_until_up: bool = False,
|
|
407
410
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
408
411
|
force: bool = False,
|
|
@@ -473,7 +476,7 @@ def _start(
|
|
|
473
476
|
all_file_mounts=None,
|
|
474
477
|
storage_mounts=storage_mounts)
|
|
475
478
|
if idle_minutes_to_autostop is not None:
|
|
476
|
-
backend.set_autostop(handle, idle_minutes_to_autostop, down
|
|
479
|
+
backend.set_autostop(handle, idle_minutes_to_autostop, wait_for, down)
|
|
477
480
|
return handle
|
|
478
481
|
|
|
479
482
|
|
|
@@ -481,6 +484,8 @@ def _start(
|
|
|
481
484
|
def start(
|
|
482
485
|
cluster_name: str,
|
|
483
486
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
487
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = (
|
|
488
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
|
|
484
489
|
retry_until_up: bool = False,
|
|
485
490
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
486
491
|
force: bool = False,
|
|
@@ -535,6 +540,7 @@ def start(
|
|
|
535
540
|
'`idle_minutes_to_autostop` must be set if `down` is True.')
|
|
536
541
|
return _start(cluster_name,
|
|
537
542
|
idle_minutes_to_autostop,
|
|
543
|
+
wait_for,
|
|
538
544
|
retry_until_up,
|
|
539
545
|
down,
|
|
540
546
|
force=force)
|
|
@@ -651,6 +657,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
|
|
|
651
657
|
def autostop(
|
|
652
658
|
cluster_name: str,
|
|
653
659
|
idle_minutes: int,
|
|
660
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = autostop_lib.
|
|
661
|
+
DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
654
662
|
down: bool = False, # pylint: disable=redefined-outer-name
|
|
655
663
|
) -> None:
|
|
656
664
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
@@ -740,7 +748,7 @@ def autostop(
|
|
|
740
748
|
f'see reason above.') from e
|
|
741
749
|
|
|
742
750
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
|
743
|
-
backend.set_autostop(handle, idle_minutes, down)
|
|
751
|
+
backend.set_autostop(handle, idle_minutes, wait_for, down)
|
|
744
752
|
|
|
745
753
|
|
|
746
754
|
# ==================
|
sky/dashboard/out/404.html
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
|
|
1
|
+
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/b3227360726f12eb.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/b3227360726f12eb.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-13145516b19858fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a67ae198457b9886.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/2JNCZ4daQBotwWRNGi6aE/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2JNCZ4daQBotwWRNGi6aE/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"2JNCZ4daQBotwWRNGi6aE","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
|