skypilot-nightly 1.0.0.dev20250801__py3-none-any.whl → 1.0.0.dev20250804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (51)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +10 -2
  3. sky/backends/cloud_vm_ray_backend.py +2 -1
  4. sky/catalog/data_fetchers/fetch_nebius.py +31 -7
  5. sky/client/cli/command.py +42 -20
  6. sky/client/cli/flags.py +15 -0
  7. sky/client/sdk.py +80 -10
  8. sky/client/sdk.pyi +4 -0
  9. sky/core.py +10 -2
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
  15. sky/dashboard/out/_next/static/chunks/{webpack-42cd1b19a6b01078.js → webpack-13145516b19858fb.js} +1 -1
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/execution.py +5 -3
  31. sky/jobs/client/sdk.py +5 -1
  32. sky/provision/kubernetes/utils.py +32 -2
  33. sky/resources.py +17 -4
  34. sky/server/constants.py +1 -1
  35. sky/server/requests/payloads.py +3 -0
  36. sky/setup_files/dependencies.py +1 -1
  37. sky/skylet/autostop_lib.py +96 -8
  38. sky/skylet/constants.py +3 -2
  39. sky/skylet/events.py +27 -13
  40. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  41. sky/utils/schemas.py +6 -0
  42. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/METADATA +4 -3
  43. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/RECORD +48 -48
  44. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
  45. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
  47. sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_ssgManifest.js +0 -0
  48. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/WHEEL +0 -0
  49. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/entry_points.txt +0 -0
  50. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/licenses/LICENSE +0 -0
  51. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '45dfcb679d71b0fb8abe30470b1d8c6b6c33ef73'
8
+ _SKYPILOT_COMMIT_SHA = '641a8e762a1d86274420f7a9d9cea6657b5be5b4'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250801'
38
+ __version__ = '1.0.0.dev20250804'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/backends/backend_utils.py CHANGED
@@ -38,6 +38,7 @@ from sky.provision import instance_setup
38
38
  from sky.provision.kubernetes import utils as kubernetes_utils
39
39
  from sky.serve import serve_utils
40
40
  from sky.server.requests import requests as requests_lib
41
+ from sky.skylet import autostop_lib
41
42
  from sky.skylet import constants
42
43
  from sky.usage import usage_lib
43
44
  from sky.utils import cluster_utils
@@ -922,7 +923,10 @@ def write_cluster_config(
922
923
  cluster_config_overrides=cluster_config_overrides,
923
924
  cloud=cloud,
924
925
  context=region.name)
925
- kubernetes_utils.combine_metadata_fields(tmp_yaml_path, region.name)
926
+ kubernetes_utils.combine_metadata_fields(
927
+ tmp_yaml_path,
928
+ cluster_config_overrides=cluster_config_overrides,
929
+ context=region.name)
926
930
  yaml_obj = common_utils.read_yaml(tmp_yaml_path)
927
931
  pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
928
932
  'ray_head_default']['node_config']
@@ -2238,7 +2242,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
2238
2242
  success = True
2239
2243
  reset_local_autostop = True
2240
2244
  try:
2241
- backend.set_autostop(handle, -1, stream_logs=False)
2245
+ backend.set_autostop(
2246
+ handle,
2247
+ -1,
2248
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
2249
+ stream_logs=False)
2242
2250
  except exceptions.CommandError as e:
2243
2251
  success = False
2244
2252
  if e.returncode == 255:
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -4650,6 +4650,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4650
4650
  def set_autostop(self,
4651
4651
  handle: CloudVmRayResourceHandle,
4652
4652
  idle_minutes_to_autostop: Optional[int],
4653
+ wait_for: Optional[autostop_lib.AutostopWaitFor],
4653
4654
  down: bool = False,
4654
4655
  stream_logs: bool = True) -> None:
4655
4656
  # The core.autostop() function should have already checked that the
@@ -4697,7 +4698,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4697
4698
  assert (handle.launched_resources is not None and
4698
4699
  handle.launched_resources.cloud is not None), handle
4699
4700
  code = autostop_lib.AutostopCodeGen.set_autostop(
4700
- idle_minutes_to_autostop, self.NAME, down)
4701
+ idle_minutes_to_autostop, self.NAME, wait_for, down)
4701
4702
  returncode, _, stderr = self.run_on_head(handle,
4702
4703
  code,
4703
4704
  require_outputs=True,
sky/catalog/data_fetchers/fetch_nebius.py CHANGED
@@ -31,17 +31,19 @@ class PresetInfo:
31
31
  Attributes:
32
32
  region (str): The geographical region where the preset is available.
33
33
  fullname (str): The full name of the preset, a combination of platform
34
- and preset name.
34
+ and preset name.
35
35
  name (str): The name of the preset.
36
36
  platform_name (str): The name of the platform the preset belongs to.
37
37
  gpu (int): The number of GPUs in the preset.
38
38
  vcpu (int): The number of virtual CPUs in the preset.
39
39
  memory_gib (int): The amount of memory in GiB in the preset.
40
40
  accelerator_manufacturer (str | None): The manufacturer of the
41
- accelerator (e.g., "NVIDIA"), or None if no accelerator.
41
+ accelerator (e.g., "NVIDIA"), or None if no accelerator.
42
42
  accelerator_name (str | None): The name of the accelerator
43
- (e.g., "H100"), or None if no accelerator.
43
+ (e.g., "H100"), or None if no accelerator.
44
44
  price_hourly (decimal.Decimal): The hourly price of the preset.
45
+ spot_price (decimal.Decimal): The spot (preemptible) price
46
+ of the preset.
45
47
  """
46
48
 
47
49
  region: str
@@ -54,6 +56,7 @@ class PresetInfo:
54
56
  accelerator_manufacturer: Optional[str]
55
57
  accelerator_name: Optional[str]
56
58
  price_hourly: decimal.Decimal
59
+ spot_price: decimal.Decimal
57
60
 
58
61
 
59
62
  def _format_decimal(value: decimal.Decimal) -> str:
@@ -66,7 +69,7 @@ def _format_decimal(value: decimal.Decimal) -> str:
66
69
  Returns:
67
70
  str: The formatted string representation of the decimal.
68
71
  """
69
- formatted_value = f'{value:f}'.rstrip('0').rstrip('.')
72
+ formatted_value = f'{value:f}'
70
73
  integer_part, decimal_part = formatted_value.split(
71
74
  '.') if '.' in formatted_value else (formatted_value, '')
72
75
  if len(decimal_part) < 2:
@@ -111,20 +114,38 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
111
114
  preset=preset.name,
112
115
  )),
113
116
  ))
114
-
115
117
  price_request = billing().EstimateBatchRequest(
116
118
  resource_specs=[estimate_spec])
119
+
120
+ # Form the specification for the spot price request
121
+ spot_estimate_spec = billing().ResourceSpec(
122
+ compute_instance_spec=compute().CreateInstanceRequest(
123
+ metadata=nebius_common().ResourceMetadata(
124
+ parent_id=parent_id,),
125
+ spec=compute().InstanceSpec(
126
+ resources=compute().ResourcesSpec(
127
+ platform=platform_name,
128
+ preset=preset.name,
129
+ ),
130
+ preemptible=compute().PreemptibleSpec(priority=1),
131
+ ),
132
+ ))
133
+ spot_price_request = billing().EstimateBatchRequest(
134
+ resource_specs=[spot_estimate_spec])
135
+
117
136
  # Start future for each preset
118
137
  futures.append((
119
138
  platform,
120
139
  preset,
121
140
  calculator_service.estimate_batch(price_request,
122
141
  timeout=TIMEOUT),
142
+ calculator_service.estimate_batch(spot_price_request,
143
+ timeout=TIMEOUT),
123
144
  ))
124
145
 
125
146
  # wait all futures to complete and collect results
126
147
  result = []
127
- for platform, preset, future in futures:
148
+ for platform, preset, future, future_spot in futures:
128
149
  platform_name = platform.metadata.name
129
150
  result.append(
130
151
  PresetInfo(
@@ -141,6 +162,8 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
141
162
  if platform_name.startswith('gpu-') else '',
142
163
  price_hourly=decimal.Decimal(
143
164
  future.wait().hourly_cost.general.total.cost),
165
+ spot_price=decimal.Decimal(
166
+ future_spot.wait().hourly_cost.general.total.cost),
144
167
  ))
145
168
 
146
169
  return result
@@ -196,7 +219,8 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
196
219
  'Price': _format_decimal(preset.price_hourly),
197
220
  'Region': preset.region,
198
221
  'GpuInfo': gpu_info,
199
- 'SpotPrice': '',
222
+ 'SpotPrice': _format_decimal(preset.spot_price)
223
+ if preset.spot_price else '',
200
224
  })
201
225
 
202
226
 
sky/client/cli/command.py CHANGED
@@ -62,8 +62,8 @@ from sky.provision.kubernetes import constants as kubernetes_constants
62
62
  from sky.provision.kubernetes import utils as kubernetes_utils
63
63
  from sky.server import common as server_common
64
64
  from sky.server import constants as server_constants
65
- from sky.server import versions
66
65
  from sky.server.requests import requests
66
+ from sky.skylet import autostop_lib
67
67
  from sky.skylet import constants
68
68
  from sky.skylet import job_lib
69
69
  from sky.usage import usage_lib
@@ -1012,11 +1012,11 @@ def _handle_infra_cloud_region_zone_options(infra: Optional[str],
1012
1012
  required=False,
1013
1013
  help=('Automatically stop the cluster after this many minutes '
1014
1014
  'of idleness, i.e., no running or pending jobs in the cluster\'s job '
1015
- 'queue. Idleness gets reset whenever setting-up/running/pending jobs '
1016
- 'are found in the job queue. '
1015
+ 'queue. Idleness gets reset depending on the ``--wait-for`` flag. '
1017
1016
  'Setting this flag is equivalent to '
1018
1017
  'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
1019
1018
  '. If not set, the cluster will not be autostopped.'))
1019
+ @flags.wait_for_option('idle-minutes-to-autostop')
1020
1020
  @click.option(
1021
1021
  '--down',
1022
1022
  default=False,
@@ -1102,6 +1102,7 @@ def launch(
1102
1102
  network_tier: Optional[str],
1103
1103
  ports: Tuple[str, ...],
1104
1104
  idle_minutes_to_autostop: Optional[int],
1105
+ wait_for: Optional[str],
1105
1106
  down: bool, # pylint: disable=redefined-outer-name
1106
1107
  retry_until_up: bool,
1107
1108
  yes: bool,
@@ -1196,6 +1197,8 @@ def launch(
1196
1197
  cluster_name=cluster,
1197
1198
  backend=backend,
1198
1199
  idle_minutes_to_autostop=idle_minutes_to_autostop,
1200
+ wait_for=autostop_lib.AutostopWaitFor.from_str(wait_for)
1201
+ if wait_for is not None else None,
1199
1202
  down=down,
1200
1203
  retry_until_up=retry_until_up,
1201
1204
  no_setup=no_setup,
@@ -1828,9 +1831,6 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1828
1831
  show_endpoints = endpoints or endpoint is not None
1829
1832
  show_single_endpoint = endpoint is not None
1830
1833
  show_services = show_services and not any([clusters, ip, endpoints])
1831
- remote_api_version = versions.get_remote_api_version()
1832
- if remote_api_version is None or remote_api_version < 12:
1833
- show_pools = False
1834
1834
 
1835
1835
  query_clusters: Optional[List[str]] = None if not clusters else clusters
1836
1836
  refresh_mode = common.StatusRefreshMode.NONE
@@ -1882,7 +1882,11 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1882
1882
  return serve_lib.status(service_names=None)
1883
1883
 
1884
1884
  def submit_pools() -> Optional[str]:
1885
- return managed_jobs.pool_status(pool_names=None)
1885
+ try:
1886
+ return managed_jobs.pool_status(pool_names=None)
1887
+ except exceptions.APINotSupportedError as e:
1888
+ logger.debug(f'Pools are not supported in the remote server: {e}')
1889
+ return None
1886
1890
 
1887
1891
  def submit_workspace() -> Optional[str]:
1888
1892
  try:
@@ -2005,7 +2009,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
2005
2009
  controller_utils.Controllers.JOBS_CONTROLLER.value.
2006
2010
  in_progress_hint(False).format(job_info=job_info))
2007
2011
 
2008
- if show_pools:
2012
+ if show_pools and pool_status_request_id:
2009
2013
  num_pools = None
2010
2014
  if managed_jobs_query_interrupted:
2011
2015
  msg = 'KeyboardInterrupt'
@@ -2549,6 +2553,7 @@ def stop(
2549
2553
  required=False,
2550
2554
  help=('Set the idle minutes before autostopping the cluster. '
2551
2555
  'See the doc above for detailed semantics.'))
2556
+ @flags.wait_for_option('idle-minutes')
2552
2557
  @click.option(
2553
2558
  '--cancel',
2554
2559
  default=False,
@@ -2571,6 +2576,7 @@ def autostop(
2571
2576
  all: bool, # pylint: disable=redefined-builtin
2572
2577
  all_users: bool,
2573
2578
  idle_minutes: Optional[int],
2579
+ wait_for: Optional[str],
2574
2580
  cancel: bool, # pylint: disable=redefined-outer-name
2575
2581
  down: bool, # pylint: disable=redefined-outer-name
2576
2582
  yes: bool,
@@ -2580,8 +2586,7 @@ def autostop(
2580
2586
  """Schedule an autostop or autodown for cluster(s).
2581
2587
 
2582
2588
  Autostop/autodown will automatically stop or teardown a cluster when it
2583
- becomes idle for a specified duration. Idleness means there are no
2584
- in-progress (pending/running) jobs in a cluster's job queue.
2589
+ becomes idle for a specified duration.
2585
2590
 
2586
2591
  CLUSTERS are the names (or glob patterns) of the clusters to stop. If both
2587
2592
  CLUSTERS and ``--all`` are supplied, the latter takes precedence.
@@ -2594,6 +2599,11 @@ def autostop(
2594
2599
 
2595
2600
  - An autostop idle time is set.
2596
2601
 
2602
+ - An SSH session is active (To disable this, set ``--wait-for jobs``).
2603
+
2604
+ To disable the idleness timer completely and set a hard time limit, set
2605
+ ``--wait-for none``.
2606
+
2597
2607
  Example 1: say a cluster with autostop set to 2 hours has been idle for 1
2598
2608
  hour, then autostop is reset to 30 minutes. The cluster will not be
2599
2609
  immediately autostopped. Instead, the idleness timer restarts counting
@@ -2614,6 +2624,9 @@ def autostop(
2614
2624
  # Cancel autostop for a specific cluster.
2615
2625
  sky autostop cluster_name --cancel
2616
2626
  \b
2627
+ # Autostop this cluster after 60 minutes, regardless of activity.
2628
+ sky autostop cluster_name -i 60 --wait-for none
2629
+ \b
2617
2630
  # Autodown this cluster after 60 minutes of idleness.
2618
2631
  sky autostop cluster_name -i 60 --down
2619
2632
  """
@@ -2625,13 +2638,16 @@ def autostop(
2625
2638
  idle_minutes = -1
2626
2639
  elif idle_minutes is None:
2627
2640
  idle_minutes = 5
2628
- _down_or_stop_clusters(clusters,
2629
- apply_to_all=all,
2630
- all_users=all_users,
2631
- down=down,
2632
- no_confirm=yes,
2633
- idle_minutes_to_autostop=idle_minutes,
2634
- async_call=async_call)
2641
+ _down_or_stop_clusters(
2642
+ clusters,
2643
+ apply_to_all=all,
2644
+ all_users=all_users,
2645
+ down=down,
2646
+ no_confirm=yes,
2647
+ idle_minutes_to_autostop=idle_minutes,
2648
+ wait_for=autostop_lib.AutostopWaitFor.from_str(wait_for)
2649
+ if wait_for is not None else None,
2650
+ async_call=async_call)
2635
2651
 
2636
2652
 
2637
2653
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2650,11 +2666,11 @@ def autostop(
2650
2666
  required=False,
2651
2667
  help=('Automatically stop the cluster after this many minutes '
2652
2668
  'of idleness, i.e., no running or pending jobs in the cluster\'s job '
2653
- 'queue. Idleness gets reset whenever setting-up/running/pending jobs '
2654
- 'are found in the job queue. '
2669
+ 'queue. Idleness gets reset depending on the ``--wait-for`` flag. '
2655
2670
  'Setting this flag is equivalent to '
2656
2671
  'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
2657
2672
  '. If not set, the cluster will not be autostopped.'))
2673
+ @flags.wait_for_option('idle-minutes-to-autostop')
2658
2674
  @click.option(
2659
2675
  '--down',
2660
2676
  default=False,
@@ -2693,6 +2709,7 @@ def start(
2693
2709
  all: bool,
2694
2710
  yes: bool,
2695
2711
  idle_minutes_to_autostop: Optional[int],
2712
+ wait_for: Optional[str],
2696
2713
  down: bool, # pylint: disable=redefined-outer-name
2697
2714
  retry_until_up: bool,
2698
2715
  force: bool,
@@ -2852,6 +2869,8 @@ def start(
2852
2869
  request_ids = subprocess_utils.run_in_parallel(
2853
2870
  lambda name: sdk.start(name,
2854
2871
  idle_minutes_to_autostop,
2872
+ autostop_lib.AutostopWaitFor.from_str(wait_for)
2873
+ if wait_for is not None else None,
2855
2874
  retry_until_up,
2856
2875
  down=down,
2857
2876
  force=force), to_start)
@@ -3081,6 +3100,7 @@ def _down_or_stop_clusters(
3081
3100
  no_confirm: bool = True,
3082
3101
  purge: bool = False,
3083
3102
  idle_minutes_to_autostop: Optional[int] = None,
3103
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
3084
3104
  async_call: bool = False) -> None:
3085
3105
  """Tears down or (auto-)stops a cluster (or all clusters).
3086
3106
 
@@ -3098,6 +3118,7 @@ def _down_or_stop_clusters(
3098
3118
  purge: If True, forcefully remove the clusters from the cluster table.
3099
3119
  idle_minutes_to_autostop: The number of minutes to wait before
3100
3120
  automatically stopping the cluster.
3121
+ wait_for: Determines the condition for resetting the idleness timer.
3101
3122
  async_call: If True, send the request asynchronously.
3102
3123
  """
3103
3124
  if down:
@@ -3241,7 +3262,8 @@ def _down_or_stop_clusters(
3241
3262
  success_progress = False
3242
3263
  if idle_minutes_to_autostop is not None:
3243
3264
  try:
3244
- request_id = sdk.autostop(name, idle_minutes_to_autostop, down)
3265
+ request_id = sdk.autostop(name, idle_minutes_to_autostop,
3266
+ wait_for, down)
3245
3267
  request_ids.append(request_id)
3246
3268
  _async_call_or_wait(
3247
3269
  request_id, async_call,
sky/client/cli/flags.py CHANGED
@@ -7,6 +7,7 @@ import click
7
7
  import dotenv
8
8
 
9
9
  from sky import skypilot_config
10
+ from sky.skylet import autostop_lib
10
11
  from sky.utils import resources_utils
11
12
 
12
13
 
@@ -340,3 +341,17 @@ def all_users_option(helptext: Optional[str] = None):
340
341
  help=helptext)(func)
341
342
 
342
343
  return return_option_decorator
344
+
345
+
346
+ def wait_for_option(pair: str):
347
+ """A decorator for the --wait-for option."""
348
+
349
+ def return_option_decorator(func):
350
+ return click.option(
351
+ '--wait-for',
352
+ type=click.Choice(autostop_lib.AutostopWaitFor.supported_modes()),
353
+ default=None,
354
+ required=False,
355
+ help=autostop_lib.AutostopWaitFor.cli_help_message(pair=pair))(func)
356
+
357
+ return return_option_decorator
sky/client/sdk.py CHANGED
@@ -10,19 +10,14 @@ Usage example:
10
10
  statuses = sky.get(request_id)
11
11
 
12
12
  """
13
- import base64
14
- import binascii
15
13
  from http import cookiejar
16
14
  import json
17
15
  import logging
18
16
  import os
19
- import pathlib
20
17
  import subprocess
21
- import time
22
18
  import typing
23
19
  from typing import Any, Dict, List, Optional, Tuple, Union
24
20
  from urllib import parse as urlparse
25
- import webbrowser
26
21
 
27
22
  import click
28
23
  import colorama
@@ -37,8 +32,10 @@ from sky.client import common as client_common
37
32
  from sky.client import oauth as oauth_lib
38
33
  from sky.server import common as server_common
39
34
  from sky.server import rest
35
+ from sky.server import versions
40
36
  from sky.server.requests import payloads
41
37
  from sky.server.requests import requests as requests_lib
38
+ from sky.skylet import autostop_lib
42
39
  from sky.skylet import constants
43
40
  from sky.usage import usage_lib
44
41
  from sky.utils import admin_policy_utils
@@ -57,7 +54,12 @@ from sky.utils import ux_utils
57
54
  from sky.utils.kubernetes import ssh_utils
58
55
 
59
56
  if typing.TYPE_CHECKING:
57
+ import base64
58
+ import binascii
60
59
  import io
60
+ import pathlib
61
+ import time
62
+ import webbrowser
61
63
 
62
64
  import psutil
63
65
  import requests
@@ -65,6 +67,14 @@ if typing.TYPE_CHECKING:
65
67
  import sky
66
68
  from sky import backends
67
69
  else:
70
+ # only used in api_login()
71
+ base64 = adaptors_common.LazyImport('base64')
72
+ binascii = adaptors_common.LazyImport('binascii')
73
+ pathlib = adaptors_common.LazyImport('pathlib')
74
+ time = adaptors_common.LazyImport('time')
75
+ # only used in dashboard() and api_login()
76
+ webbrowser = adaptors_common.LazyImport('webbrowser')
77
+ # only used in api_stop()
68
78
  psutil = adaptors_common.LazyImport('psutil')
69
79
 
70
80
  logger = sky_logging.init_logger(__name__)
@@ -375,6 +385,7 @@ def launch(
375
385
  cluster_name: Optional[str] = None,
376
386
  retry_until_up: bool = False,
377
387
  idle_minutes_to_autostop: Optional[int] = None,
388
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
378
389
  dryrun: bool = False,
379
390
  down: bool = False, # pylint: disable=redefined-outer-name
380
391
  backend: Optional['backends.Backend'] = None,
@@ -424,6 +435,15 @@ def launch(
424
435
  ``sky.autostop(idle_minutes=<minutes>)``. If set, the autostop
425
436
  config specified in the task' resources will be overridden by
426
437
  this parameter.
438
+ wait_for: determines the condition for resetting the idleness timer.
439
+ This option works in conjunction with ``idle_minutes_to_autostop``.
440
+ Choices:
441
+
442
+ 1. "jobs_and_ssh" (default) - Wait for all jobs to complete
443
+ AND all SSH sessions to disconnect.
444
+ 2. "jobs" - Wait for all jobs to complete.
445
+ 3. "none" - Stop immediately after idle time expires,
446
+ regardless of running jobs or SSH connections.
427
447
  dryrun: if True, do not actually launch the cluster.
428
448
  down: Tear down the cluster after all jobs finish (successfully or
429
449
  abnormally). If --idle-minutes-to-autostop is also set, the
@@ -487,12 +507,27 @@ def launch(
487
507
  raise NotImplementedError('clone_disk_from is not implemented yet. '
488
508
  'Please contact the SkyPilot team if you '
489
509
  'need this feature at slack.skypilot.co.')
510
+
511
+ remote_api_version = versions.get_remote_api_version()
512
+ if wait_for is not None and (remote_api_version is None or
513
+ remote_api_version < 13):
514
+ logger.warning('wait_for is not supported in your API server. '
515
+ 'Please upgrade to a newer API server to use it.')
516
+
490
517
  dag = dag_utils.convert_entrypoint_to_dag(task)
491
518
  # Override the autostop config from command line flags to task YAML.
492
519
  for task in dag.tasks:
493
520
  for resource in task.resources:
494
- resource.override_autostop_config(
495
- down=down, idle_minutes=idle_minutes_to_autostop)
521
+ if remote_api_version is None or remote_api_version < 13:
522
+ # An older server would not recognize the wait_for field
523
+ # in the schema, so we need to omit it.
524
+ resource.override_autostop_config(
525
+ down=down, idle_minutes=idle_minutes_to_autostop)
526
+ else:
527
+ resource.override_autostop_config(
528
+ down=down,
529
+ idle_minutes=idle_minutes_to_autostop,
530
+ wait_for=wait_for)
496
531
  if resource.autostop_config is not None:
497
532
  # For backward-compatbility, get the final autostop config for
498
533
  # admin policy.
@@ -825,6 +860,7 @@ def download_logs(cluster_name: str,
825
860
  def start(
826
861
  cluster_name: str,
827
862
  idle_minutes_to_autostop: Optional[int] = None,
863
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
828
864
  retry_until_up: bool = False,
829
865
  down: bool = False, # pylint: disable=redefined-outer-name
830
866
  force: bool = False,
@@ -851,6 +887,15 @@ def start(
851
887
  flag is equivalent to running ``sky.launch()`` and then
852
888
  ``sky.autostop(idle_minutes=<minutes>)``. If not set, the
853
889
  cluster will not be autostopped.
890
+ wait_for: determines the condition for resetting the idleness timer.
891
+ This option works in conjunction with ``idle_minutes_to_autostop``.
892
+ Choices:
893
+
894
+ 1. "jobs_and_ssh" (default) - Wait for all jobs to complete
895
+ AND all SSH sessions to disconnect.
896
+ 2. "jobs" - Wait for all jobs to complete.
897
+ 3. "none" - Stop immediately after idle time expires,
898
+ regardless of running jobs or SSH connections.
854
899
  retry_until_up: whether to retry launching the cluster until it is
855
900
  up.
856
901
  down: Autodown the cluster: tear down the cluster after specified
@@ -879,9 +924,16 @@ def start(
879
924
  sky.exceptions.ClusterOwnerIdentitiesMismatchError: if the cluster to
880
925
  restart was launched by a different user.
881
926
  """
927
+ remote_api_version = versions.get_remote_api_version()
928
+ if wait_for is not None and (remote_api_version is None or
929
+ remote_api_version < 13):
930
+ logger.warning('wait_for is not supported in your API server. '
931
+ 'Please upgrade to a newer API server to use it.')
932
+
882
933
  body = payloads.StartBody(
883
934
  cluster_name=cluster_name,
884
935
  idle_minutes_to_autostop=idle_minutes_to_autostop,
936
+ wait_for=wait_for,
885
937
  retry_until_up=retry_until_up,
886
938
  down=down,
887
939
  force=force,
@@ -982,9 +1034,10 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
982
1034
  @server_common.check_server_healthy_or_start
983
1035
  @annotations.client_api
984
1036
  def autostop(
985
- cluster_name: str,
986
- idle_minutes: int,
987
- down: bool = False # pylint: disable=redefined-outer-name
1037
+ cluster_name: str,
1038
+ idle_minutes: int,
1039
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
1040
+ down: bool = False, # pylint: disable=redefined-outer-name
988
1041
  ) -> server_common.RequestId:
989
1042
  """Schedules an autostop/autodown for a cluster.
990
1043
 
@@ -1015,6 +1068,15 @@ def autostop(
1015
1068
  idle_minutes: the number of minutes of idleness (no pending/running
1016
1069
  jobs) after which the cluster will be stopped automatically. Setting
1017
1070
  to a negative number cancels any autostop/autodown setting.
1071
+ wait_for: determines the condition for resetting the idleness timer.
1072
+ This option works in conjunction with ``idle_minutes``.
1073
+ Choices:
1074
+
1075
+ 1. "jobs_and_ssh" (default) - Wait for all jobs to complete
1076
+ AND all SSH sessions to disconnect.
1077
+ 2. "jobs" - Wait for all jobs to complete.
1078
+ 3. "none" - Stop immediately after idle time expires,
1079
+ regardless of running jobs or SSH connections.
1018
1080
  down: if true, use autodown (tear down the cluster; non-restartable),
1019
1081
  rather than autostop (restartable).
1020
1082
 
@@ -1034,9 +1096,16 @@ def autostop(
1034
1096
  sky.exceptions.CloudUserIdentityError: if we fail to get the current
1035
1097
  user identity.
1036
1098
  """
1099
+ remote_api_version = versions.get_remote_api_version()
1100
+ if wait_for is not None and (remote_api_version is None or
1101
+ remote_api_version < 13):
1102
+ logger.warning('wait_for is not supported in your API server. '
1103
+ 'Please upgrade to a newer API server to use it.')
1104
+
1037
1105
  body = payloads.AutostopBody(
1038
1106
  cluster_name=cluster_name,
1039
1107
  idle_minutes=idle_minutes,
1108
+ wait_for=wait_for,
1040
1109
  down=down,
1041
1110
  )
1042
1111
  response = server_common.make_authenticated_request(
@@ -2343,6 +2412,7 @@ def api_login(endpoint: Optional[str] = None,
2343
2412
  _save_config_updates(endpoint=endpoint)
2344
2413
  dashboard_url = server_common.get_dashboard_url(endpoint)
2345
2414
 
2415
+ server_common.get_api_server_status.cache_clear()
2346
2416
  # After successful authentication, check server health again to get user
2347
2417
  # identity
2348
2418
  server_status, final_api_server_info = server_common.check_server_healthy(
sky/client/sdk.pyi CHANGED
@@ -14,6 +14,7 @@ from sky import skypilot_config as skypilot_config
14
14
  from sky.server import common as server_common
15
15
  from sky.server import rest as rest
16
16
  from sky.server.requests import payloads as payloads
17
+ from sky.skylet import autostop_lib as autostop_lib
17
18
  from sky.skylet import constants as constants
18
19
  from sky.usage import usage_lib as usage_lib
19
20
  from sky.utils import admin_policy_utils as admin_policy_utils
@@ -104,6 +105,7 @@ def launch(task: Union['sky.Task', 'sky.Dag'],
104
105
  cluster_name: Optional[str] = ...,
105
106
  retry_until_up: bool = ...,
106
107
  idle_minutes_to_autostop: Optional[int] = ...,
108
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
107
109
  dryrun: bool = ...,
108
110
  down: bool = ...,
109
111
  backend: Optional['backends.Backend'] = ...,
@@ -142,6 +144,7 @@ def download_logs(cluster_name: str,
142
144
 
143
145
  def start(cluster_name: str,
144
146
  idle_minutes_to_autostop: Optional[int] = ...,
147
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
145
148
  retry_until_up: bool = ...,
146
149
  down: bool = ...,
147
150
  force: bool = ...) -> server_common.RequestId:
@@ -158,6 +161,7 @@ def stop(cluster_name: str, purge: bool = ...) -> server_common.RequestId:
158
161
 
159
162
  def autostop(cluster_name: str,
160
163
  idle_minutes: int,
164
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = ...,
161
165
  down: bool = ...) -> server_common.RequestId:
162
166
  ...
163
167