skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'eb83a691489c0c37aae9c22f607469ff78a74e34'
+ _SKYPILOT_COMMIT_SHA = '58649973a7c706775528a419f46ae024e59f4603'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250808'
+ __version__ = '1.0.0.dev20250814'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))


@@ -98,6 +98,7 @@ from sky.client.sdk import cancel
  from sky.client.sdk import cost_report
  from sky.client.sdk import down
  from sky.client.sdk import download_logs
+ from sky.client.sdk import endpoints
  from sky.client.sdk import exec  # pylint: disable=redefined-builtin
  from sky.client.sdk import get
  from sky.client.sdk import job_status
@@ -194,6 +195,7 @@ __all__ = [
      'down',
      'autostop',
      'cost_report',
+     'endpoints',
      # core APIs Job Management
      'queue',
      'cancel',
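
The newly exported endpoints call rounds out the top-level SDK surface. A minimal usage sketch, assuming the client SDK's convention that calls return a request ID resolved via sky.get() (the cluster name here is illustrative):

import sky

# Hypothetical cluster name; under the async client SDK convention,
# endpoints() returns a request ID and sky.get() resolves it.
request_id = sky.endpoints('my-cluster')
endpoints = sky.get(request_id)  # e.g. {8080: 'http://1.2.3.4:8080'}
print(endpoints)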
sky/adaptors/kubernetes.py CHANGED
@@ -142,8 +142,11 @@ def _load_config(context: Optional[str] = None):
              # show up in SkyPilot tasks. For now, we work around by using
              # DNS name instead of environment variables.
              # See issue: https://github.com/skypilot-org/skypilot/issues/2287
-             os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
-             os.environ['KUBERNETES_SERVICE_PORT'] = '443'
+             # Only set if not already present (preserving existing values)
+             if 'KUBERNETES_SERVICE_HOST' not in os.environ:
+                 os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+             if 'KUBERNETES_SERVICE_PORT' not in os.environ:
+                 os.environ['KUBERNETES_SERVICE_PORT'] = '443'
              kubernetes.config.load_incluster_config()
          except kubernetes.config.config_exception.ConfigException:
              _load_config_from_kubeconfig()
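
The guarded assignments keep any values already injected into the pod environment. A behaviorally equivalent sketch using os.environ.setdefault:

import os

# Equivalent to the guarded assignments above: setdefault() only writes
# when the key is absent, leaving existing environment values untouched.
os.environ.setdefault('KUBERNETES_SERVICE_HOST', 'kubernetes.default.svc')
os.environ.setdefault('KUBERNETES_SERVICE_PORT', '443')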
sky/backends/backend_utils.py CHANGED
@@ -13,11 +13,13 @@ import sys
  import tempfile
  import time
  import typing
- from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
+ from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
+                     TypeVar, Union)
  import uuid

  import colorama
  from packaging import version
+ import psutil
  from typing_extensions import Literal

  import sky
@@ -61,6 +63,7 @@ from sky.utils import ux_utils
  from sky.workspaces import core as workspaces_core

  if typing.TYPE_CHECKING:
+     import grpc
      import requests
      from requests import adapters
      from requests.packages.urllib3.util import retry as retry_lib
@@ -79,6 +82,8 @@ else:
      adapters = adaptors_common.LazyImport('requests.adapters')
      retry_lib = adaptors_common.LazyImport(
          'requests.packages.urllib3.util.retry')
+     # To avoid requiring grpcio to be installed on the client side.
+     grpc = adaptors_common.LazyImport('grpc')

  logger = sky_logging.init_logger(__name__)

@@ -1773,13 +1778,15 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:

  def _query_cluster_status_via_cloud_api(
      handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
- ) -> List[status_lib.ClusterStatus]:
-     """Returns the status of the cluster.
+ ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+     """Returns the status of the cluster as a list of tuples corresponding
+     to the node status and an optional reason string for said status.

      Raises:
          exceptions.ClusterStatusFetchingError: the cluster status cannot be
              fetched from the cloud provider.
      """
+     cluster_name = handle.cluster_name
      cluster_name_on_cloud = handle.cluster_name_on_cloud
      cluster_name_in_hint = common_utils.cluster_name_in_hint(
          handle.cluster_name, cluster_name_on_cloud)
@@ -1797,7 +1804,8 @@ def _query_cluster_status_via_cloud_api(
      cloud_name = repr(handle.launched_resources.cloud)
      try:
          node_status_dict = provision_lib.query_instances(
-             cloud_name, cluster_name_on_cloud, provider_config)
+             cloud_name, cluster_name, cluster_name_on_cloud,
+             provider_config)
          logger.debug(f'Querying {cloud_name} cluster '
                       f'{cluster_name_in_hint} '
                       f'status:\n{pprint.pformat(node_status_dict)}')
@@ -1813,9 +1821,13 @@ def _query_cluster_status_via_cloud_api(
          region = provider_config.get('region') or provider_config.get(
              'location')
          zone = ray_config['provider'].get('availability_zone')
+         # TODO (kyuds): refactor cloud.query_status api to include reason.
+         # Currently not refactoring as this API is actually supposed to be
+         # deprecated soon.
          node_statuses = cloud.query_status(
              cluster_name_on_cloud,
              tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+         node_statuses = [(status, None) for status in node_statuses]
      return node_statuses


@@ -2015,8 +2027,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

      node_statuses = _query_cluster_status_via_cloud_api(handle)

-     all_nodes_up = (all(
-         status == status_lib.ClusterStatus.UP for status in node_statuses) and
+     all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+                         for status in node_statuses) and
                      len(node_statuses) == handle.launched_nodes)

      def get_node_counts_from_ray_status(
@@ -2121,6 +2133,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
      # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
      # head-ip/worker-ips`.
      record['status'] = status_lib.ClusterStatus.UP
+     # Add cluster event for instance status check.
+     global_user_state.add_cluster_event(
+         cluster_name,
+         status_lib.ClusterStatus.UP,
+         'All nodes up + ray cluster healthy.',
+         global_user_state.ClusterEventType.STATUS_CHANGE,
+         nop_if_duplicate=True)
      global_user_state.add_or_update_cluster(cluster_name,
                                              handle,
                                              requested_resources=None,
@@ -2205,9 +2224,19 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
      # regardless of the ray cluster's health.
      # (2) Otherwise, we will reset the autostop setting, unless the cluster is
      # autostopping/autodowning.
-     is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or any(
-         status != status_lib.ClusterStatus.STOPPED for status in node_statuses))
+     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+                                  for status in node_statuses)
+     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
      if is_abnormal:
+         status_reason = ', '.join(
+             [status[1] for status in node_statuses if status[1] is not None])
+
+         if some_nodes_terminated:
+             init_reason = 'one or more nodes terminated'
+         elif some_nodes_not_stopped:
+             init_reason = 'some nodes are up and some nodes are stopped'
          logger.debug('The cluster is abnormal. Setting to INIT status. '
                       f'node_statuses: {node_statuses}')
          if record['autostop'] >= 0:
@@ -2291,6 +2320,22 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
          # represent that the cluster is partially preempted.
          # TODO(zhwu): the definition of INIT should be audited/changed.
          # Adding a new status UNHEALTHY for abnormal status can be a choice.
+         init_reason_regex = None
+         if not status_reason:
+             # If there is not a status reason, don't re-add (and overwrite) the
+             # event if there is already an event with the same reason which may
+             # have a status reason.
+             # Some status reason clears after a certain time (e.g. k8s events
+             # are only stored for an hour by default), so it is possible that
+             # the previous event has a status reason, but now it does not.
+             init_reason_regex = f'^Cluster is abnormal because {init_reason} .*'
+         global_user_state.add_cluster_event(
+             cluster_name,
+             status_lib.ClusterStatus.INIT,
+             f'Cluster is abnormal because {init_reason} ({status_reason}). Transitioned to INIT.',
+             global_user_state.ClusterEventType.STATUS_CHANGE,
+             nop_if_duplicate=True,
+             duplicate_regex=init_reason_regex)
          global_user_state.add_or_update_cluster(cluster_name,
                                                  handle,
                                                  requested_resources=None,
@@ -2301,6 +2346,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
      # STOPPED.
      backend = backends.CloudVmRayBackend()
      backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
+     global_user_state.add_cluster_event(
+         cluster_name, None, 'All nodes stopped, terminating cluster.',
+         global_user_state.ClusterEventType.STATUS_CHANGE)
      return global_user_state.get_cluster_from_name(cluster_name)

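The nop_if_duplicate/duplicate_regex arguments suppress repeated STATUS_CHANGE rows while keeping an earlier, more informative message. A minimal sketch of the deduplication idea, assuming an in-memory store of the latest event message per cluster (the real implementation lives in sky/global_user_state.py):

import re
from typing import Dict, Optional

_last_event: Dict[str, str] = {}  # cluster name -> most recent event message

def add_event(cluster: str, message: str,
              nop_if_duplicate: bool = False,
              duplicate_regex: Optional[str] = None) -> None:
    last = _last_event.get(cluster)
    if nop_if_duplicate and last is not None:
        # An exact repeat, or a regex match against the previous message,
        # counts as a duplicate and is dropped.
        if last == message or (duplicate_regex is not None and
                               re.match(duplicate_regex, last)):
            return
    _last_event[cluster] = message

add_event('c', 'Cluster is abnormal because one or more nodes terminated '
               '(spot preemption). Transitioned to INIT.')
add_event('c', 'Cluster is abnormal because one or more nodes terminated '
               '(). Transitioned to INIT.',
          nop_if_duplicate=True,
          duplicate_regex='^Cluster is abnormal because one or more nodes '
                          'terminated .*')
print(_last_event['c'])  # the earlier, reason-bearing message is kept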
 
@@ -3330,3 +3378,49 @@ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
  def workspace_lock_id(workspace_name: str) -> str:
      """Get the lock ID for workspace operations."""
      return f'{workspace_name}_workspace'
+
+
+ T = TypeVar('T')
+
+
+ def invoke_skylet_with_retries(
+         handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+         func: Callable[..., T]) -> T:
+     """Generic helper for making Skylet gRPC requests.
+
+     This method handles the common pattern of:
+     1. Try the gRPC request
+     2. If SSH tunnel is closed, recreate it and retry
+     """
+     max_attempts = 3
+     backoff = common_utils.Backoff(initial_backoff=0.5)
+     last_exception: Optional[Exception] = None
+
+     for _ in range(max_attempts):
+         try:
+             return func()
+         except grpc.RpcError as e:
+             last_exception = e
+             if e.code() == grpc.StatusCode.INTERNAL:
+                 with ux_utils.print_exception_no_traceback():
+                     raise exceptions.SkyletInternalError(e.details())
+             elif e.code() == grpc.StatusCode.UNAVAILABLE:
+                 recreate_tunnel = True
+                 try:
+                     if handle.skylet_ssh_tunnel is not None:
+                         proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
+                         if proc.is_running(
+                         ) and proc.status() != psutil.STATUS_ZOMBIE:
+                             recreate_tunnel = False
+                 except psutil.NoSuchProcess:
+                     pass
+
+                 if recreate_tunnel:
+                     handle.open_and_update_skylet_tunnel()
+
+                 time.sleep(backoff.current_backoff())
+             else:
+                 raise e
+
+     raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
+                        ) from last_exception
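
The retry loop relies on common_utils.Backoff for its sleep schedule. A minimal sketch of an exponential-backoff helper with the same current_backoff() shape; the multiplier, cap, and jitter here are assumptions, not SkyPilot's exact values:

import random

class Backoff:
    """Exponential backoff: each call roughly doubles the previous delay."""

    def __init__(self, initial_backoff: float = 0.5,
                 multiplier: float = 2.0, max_backoff: float = 30.0):
        self._next = initial_backoff
        self._multiplier = multiplier
        self._max = max_backoff

    def current_backoff(self) -> float:
        delay = self._next * random.uniform(0.8, 1.2)  # jitter concurrent callers
        self._next = min(self._next * self._multiplier, self._max)
        return min(delay, self._max)

b = Backoff(initial_backoff=0.5)
print([round(b.current_backoff(), 2) for _ in range(4)])  # ~0.5, 1, 2, 4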
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1,5 +1,6 @@
  """Backend: runs on cloud virtual machines, managed by Ray."""
  import copy
+ import dataclasses
  import enum
  import inspect
  import json
@@ -20,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                      Union)

  import colorama
+ import psutil
  import yaml

  import sky
@@ -37,6 +39,7 @@ from sky import resources as resources_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky import task as task_lib
+ from sky.adaptors import common as adaptors_common
  from sky.backends import backend_utils
  from sky.backends import wheel_utils
  from sky.clouds import cloud as sky_cloud
@@ -76,7 +79,18 @@ from sky.utils import ux_utils
  from sky.utils import volume as volume_lib

  if typing.TYPE_CHECKING:
+     import grpc
+
      from sky import dag
+     from sky.schemas.generated import autostopv1_pb2
+     from sky.schemas.generated import autostopv1_pb2_grpc
+ else:
+     # To avoid requiring grpcio to be installed on the client side.
+     grpc = adaptors_common.LazyImport('grpc')
+     autostopv1_pb2 = adaptors_common.LazyImport(
+         'sky.schemas.generated.autostopv1_pb2')
+     autostopv1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.autostopv1_pb2_grpc')

  Path = str
@@ -1527,6 +1541,13 @@ class RetryingVmProvisioner(object):
              is_managed=self._is_managed,
          )

+         # Add cluster event for actual provisioning start.
+         global_user_state.add_cluster_event(
+             cluster_name, status_lib.ClusterStatus.INIT,
+             f'Provisioning on {to_provision.cloud.display_name()} ' +
+             f'in {to_provision.region}',
+             global_user_state.ClusterEventType.STATUS_CHANGE)
+
          global_user_state.set_owner_identity_for_cluster(
              cluster_name, cloud_user_identity)

@@ -2199,6 +2220,12 @@
          return config_dict


+ @dataclasses.dataclass
+ class SSHTunnelInfo:
+     port: int
+     pid: int
+
+
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
      """A pickle-able handle to a cluster created by CloudVmRayBackend.

@@ -2218,10 +2245,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
      - (optional) Launched resources
      - (optional) Docker user name
      - (optional) If TPU(s) are managed, a path to a deletion script.
+     - (optional) Skylet SSH tunnel info.
      """
      # Bump if any fields get added/removed/changed, and add backward
      # compaitibility logic in __setstate__.
-     _VERSION = 10
+     _VERSION = 11

      def __init__(
          self,
@@ -2254,6 +2282,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
          self.launched_nodes = launched_nodes
          self.launched_resources = launched_resources
          self.docker_user: Optional[str] = None
+         self.is_grpc_enabled = True
+         self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None

      def __repr__(self):
          return (f'ResourceHandle('
@@ -2269,7 +2299,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
              f'\n\tlaunched_resources={self.launched_nodes}x '
              f'{self.launched_resources}, '
              f'\n\tdocker_user={self.docker_user},'
-             f'\n\tssh_user={self.ssh_user}')
+             f'\n\tssh_user={self.ssh_user},'
+             f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
+             f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')

      def get_cluster_name(self):
          return self.cluster_name
@@ -2593,6 +2625,66 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                                                 cluster_config_file)
          self.docker_user = docker_user

+     def get_grpc_channel(self) -> 'grpc.Channel':
+         if self.skylet_ssh_tunnel is None:
+             self.open_and_update_skylet_tunnel()
+         assert self.skylet_ssh_tunnel is not None
+         return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
+
+     def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
+         """Clean up an SSH tunnel by terminating the process."""
+         try:
+             proc = psutil.Process(tunnel_info.pid)
+             if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
+                 logger.debug(
+                     f'Terminating SSH tunnel process {tunnel_info.pid}')
+                 proc.terminate()
+                 try:
+                     proc.wait(timeout=3)
+                 except psutil.TimeoutExpired:
+                     proc.kill()
+                     proc.wait(timeout=1)
+         except psutil.NoSuchProcess:
+             pass
+         except Exception as e:  # pylint: disable=broad-except
+             logger.warning(
+                 f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
+
+     def open_and_update_skylet_tunnel(self) -> None:
+         """Opens an SSH tunnel to the Skylet on the head node,
+         updates the cluster handle, and persists it to the database."""
+         local_port = common_utils.find_free_port(10000)
+         runners = self.get_command_runners()
+         head_runner = runners[0]
+         if isinstance(head_runner, command_runner.SSHCommandRunner):
+             # Disabling ControlMaster makes things easier to reason about
+             # with respect to resource management/ownership,
+             # as killing the process will close the tunnel too.
+             head_runner.disable_control_master = True
+
+         cmd = head_runner.port_forward_command([(local_port,
+                                                  constants.SKYLET_GRPC_PORT)])
+         ssh_tunnel_proc = subprocess.Popen(cmd)
+         tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
+         try:
+             grpc.channel_ready_future(
+                 grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
+                     timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
+             # Clean up existing tunnel before setting up the new one.
+             if self.skylet_ssh_tunnel is not None:
+                 self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
+             self.skylet_ssh_tunnel = tunnel_info
+             global_user_state.update_cluster_handle(self.cluster_name, self)
+         except grpc.FutureTimeoutError as e:
+             self._cleanup_ssh_tunnel(tunnel_info)
+             logger.warning(
+                 f'Skylet gRPC channel for cluster {self.cluster_name} not '
+                 f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
+             raise e
+         except Exception as e:
+             self._cleanup_ssh_tunnel(tunnel_info)
+             raise e
+
      @property
      def cluster_yaml(self) -> Optional[str]:
          if self._cluster_yaml is None:
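
grpc.channel_ready_future is the stock grpcio primitive used above to block until the tunnel's channel connects. A self-contained sketch of that readiness check (the address and timeout are illustrative; without a live server this prints False):

import grpc  # requires grpcio

def wait_for_channel(address: str, timeout: float = 5.0) -> bool:
    """Returns True if a gRPC channel to `address` becomes ready in time."""
    channel = grpc.insecure_channel(address)
    try:
        grpc.channel_ready_future(channel).result(timeout=timeout)
        return True
    except grpc.FutureTimeoutError:
        return False
    finally:
        channel.close()

print(wait_for_channel('localhost:50051', timeout=1.0))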
@@ -2690,6 +2782,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                  os.path.expanduser(state['_cluster_yaml'])):
              state['_cluster_yaml'] = None

+         if version < 11:
+             state['is_grpc_enabled'] = False
+             state['skylet_ssh_tunnel'] = None
+
          self.__dict__.update(state)

          # Because the update_cluster_ips and update_ssh_ports
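
The version < 11 branch is the handle's pickle-migration pattern: handles serialized before the bump get safe defaults for fields that did not exist yet. A stripped-down sketch of the pattern; only the two field names are taken from the diff, the rest is illustrative:

class Handle:
    _VERSION = 11  # bump whenever pickled fields change

    def __setstate__(self, state):
        version = state.pop('_version', 0)
        if version < 11:
            # Fields introduced in v11: give older pickles safe defaults.
            state.setdefault('is_grpc_enabled', False)
            state.setdefault('skylet_ssh_tunnel', None)
        self.__dict__.update(state)

# Simulate restoring a handle that was pickled before version 11.
h = Handle.__new__(Handle)
h.__setstate__({'_version': 10})
print(h.is_grpc_enabled, h.skylet_ssh_tunnel)  # False None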
@@ -2729,6 +2825,27 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
          return [command_runner.LocalProcessCommandRunner()]


+ class SkyletClient:
+     """The client to interact with a remote cluster through Skylet."""
+
+     def __init__(self, channel: 'grpc.Channel'):
+         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+
+     def set_autostop(
+         self,
+         request: 'autostopv1_pb2.SetAutostopRequest',
+         timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'autostopv1_pb2.SetAutostopResponse':
+         return self._autostop_stub.SetAutostop(request, timeout=timeout)
+
+     def is_autostopping(
+         self,
+         request: 'autostopv1_pb2.IsAutostoppingRequest',
+         timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'autostopv1_pb2.IsAutostoppingResponse':
+         return self._autostop_stub.IsAutostopping(request, timeout=timeout)
+
+
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
      """Backend: runs on cloud virtual machines, managed by Ray.
@@ -2936,10 +3053,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                          skip_unnecessary_provisioning)
                  except locks.LockTimeout:
                      if not communicated_with_user:
-                         logger.info(f'{colorama.Fore.YELLOW}'
-                                     f'Launching delayed, check concurrent tasks: '
-                                     f'sky api status')
-                         communicated_with_user = True
+                         rich_utils.force_update_status(
+                             ux_utils.spinner_message('Launching - blocked by ' +
+                                                      'other requests ' +
+                                                      colorama.Style.RESET_ALL +
+                                                      colorama.Style.DIM +
+                                                      'Check concurrent requests: ' +
+                                                      'sky api status '))

      def _locked_provision(
          self,
@@ -3007,6 +3127,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  break
              except exceptions.ResourcesUnavailableError as e:
                  log_path = retry_provisioner.log_dir + '/provision.log'
+
                  error_message = (
                      f'{colorama.Fore.RED}Failed to provision all '
                      f'possible launchable resources.'
@@ -3023,6 +3144,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  hint_message = (f'\n{retry_message} '
                                  f'{ux_utils.log_path_hint(log_path)}'
                                  f'{colorama.Style.RESET_ALL}')
+
+                 # Add cluster event for retry.
+                 global_user_state.add_cluster_event(
+                     cluster_name, status_lib.ClusterStatus.INIT,
+                     f'Retrying provisioning after {gap_seconds:.0f}s',
+                     global_user_state.ClusterEventType.STATUS_CHANGE)
+
                  raise exceptions.ExecutionRetryableError(
                      error_message,
                      hint=hint_message,
@@ -3074,6 +3202,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          # and other necessary files to the VM.
          # 3. Run setup commands to install dependencies.
          # 4. Starting ray cluster and skylet.
+
+         # Add cluster event for runtime setup start
+         global_user_state.add_cluster_event(
+             handle.cluster_name, status_lib.ClusterStatus.INIT,
+             'Setting up SkyPilot runtime on cluster',
+             global_user_state.ClusterEventType.STATUS_CHANGE)
+
          cluster_info = provisioner.post_provision_runtime_setup(
              repr(handle.launched_resources.cloud),
              resources_utils.ClusterName(handle.cluster_name,
@@ -3259,6 +3394,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              config_hash=config_hash,
              task_config=user_specified_task_config,
          )
+
+         # Add cluster event for successful provisioning.
+         global_user_state.add_cluster_event(
+             handle.cluster_name, status_lib.ClusterStatus.UP,
+             'Cluster successfully provisioned with ' +
+             f'{handle.launched_nodes} nodes',
+             global_user_state.ClusterEventType.STATUS_CHANGE)
+
          usage_lib.messages.usage.update_final_cluster_status(
              status_lib.ClusterStatus.UP)
          # We still add the cluster to ssh config file on API server, this
@@ -4626,13 +4769,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              logger.debug(f'instance statuses attempt {attempts + 1}')
              node_status_dict = provision_lib.query_instances(
                  repr(cloud),
+                 handle.cluster_name,
                  cluster_name_on_cloud,
                  config['provider'],
                  non_terminated_only=False)

              unexpected_node_state: Optional[Tuple[str, str]] = None
-             for node_id, node_status in node_status_dict.items():
-                 logger.debug(f'{node_id} status: {node_status}')
+             for node_id, node_status_tuple in node_status_dict.items():
+                 node_status, reason = node_status_tuple
+                 reason = '' if reason is None else f' ({reason})'
+                 logger.debug(f'{node_id} status: {node_status}{reason}')
              # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
              # between "stopping/stopped" and "terminating/terminated",
              # so we allow for either status instead of casing on
@@ -4733,17 +4879,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          # Check if we're stopping spot
          assert (handle.launched_resources is not None and
                  handle.launched_resources.cloud is not None), handle
-         code = autostop_lib.AutostopCodeGen.set_autostop(
-             idle_minutes_to_autostop, self.NAME, wait_for, down)
-         returncode, _, stderr = self.run_on_head(handle,
-                                                  code,
-                                                  require_outputs=True,
-                                                  stream_logs=stream_logs)
-         subprocess_utils.handle_returncode(returncode,
-                                            code,
-                                            'Failed to set autostop',
-                                            stderr=stderr,
-                                            stream_logs=stream_logs)
+         if handle.is_grpc_enabled:
+             request = autostopv1_pb2.SetAutostopRequest(
+                 idle_minutes=idle_minutes_to_autostop,
+                 backend=self.NAME,
+                 wait_for=wait_for.to_protobuf() if wait_for is not None else
+                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
+                 down=down,
+             )
+             backend_utils.invoke_skylet_with_retries(
+                 handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                 set_autostop(request))
+         else:
+             logger.info(
+                 'Using legacy remote execution for set_autostop on '
+                 'cluster %s.', handle.cluster_name)
+             code = autostop_lib.AutostopCodeGen.set_autostop(
+                 idle_minutes_to_autostop, self.NAME, wait_for, down)
+             returncode, _, stderr = self.run_on_head(
+                 handle, code, require_outputs=True, stream_logs=stream_logs)
+             subprocess_utils.handle_returncode(returncode,
+                                                code,
+                                                'Failed to set autostop',
+                                                stderr=stderr,
+                                                stream_logs=stream_logs)
          global_user_state.set_cluster_autostop_value(
              handle.cluster_name, idle_minutes_to_autostop, down)

@@ -4768,18 +4927,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              # The head node of the cluster is not UP or in an abnormal state.
              # We cannot check if the cluster is autostopping.
              return False
-         code = autostop_lib.AutostopCodeGen.is_autostopping()
-         returncode, stdout, stderr = self.run_on_head(handle,
-                                                       code,
-                                                       require_outputs=True,
-                                                       stream_logs=stream_logs)
-
-         if returncode == 0:
-             return message_utils.decode_payload(stdout)
-         logger.debug('Failed to check if cluster is autostopping with '
-                      f'{returncode}: {stdout+stderr}\n'
-                      f'Command: {code}')
-         return False
+         if handle.is_grpc_enabled:
+             request = autostopv1_pb2.IsAutostoppingRequest()
+             response = backend_utils.invoke_skylet_with_retries(
+                 handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                 is_autostopping(request))
+             return response.is_autostopping
+         else:
+             logger.info(
+                 'Using legacy remote execution for is_autostopping on '
+                 'cluster %s.', handle.cluster_name)
+             code = autostop_lib.AutostopCodeGen.is_autostopping()
+             returncode, stdout, stderr = self.run_on_head(
+                 handle, code, require_outputs=True, stream_logs=stream_logs)
+             if returncode == 0:
+                 return message_utils.decode_payload(stdout)
+             logger.debug('Failed to check if cluster is autostopping with '
+                          f'{returncode}: {stdout+stderr}\n'
+                          f'Command: {code}')
+             return False

      # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
      # can support its own command runner.
sky/catalog/cudo_catalog.py CHANGED
@@ -4,7 +4,7 @@ import typing
  from typing import Dict, List, Optional, Tuple, Union

  from sky.catalog import common
- import sky.provision.cudo.cudo_machine_type as cudo_mt
+ from sky.provision.cudo import cudo_machine_type as cudo_mt
  from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
sky/catalog/data_fetchers/fetch_cudo.py CHANGED
@@ -9,7 +9,7 @@ import os

  import cudo_compute

- import sky.provision.cudo.cudo_utils as utils
+ from sky.provision.cudo import cudo_utils as utils

  VMS_CSV = 'cudo/vms.csv'

sky/catalog/data_fetchers/fetch_nebius.py CHANGED
@@ -22,6 +22,8 @@ TIMEOUT = 10
  PARENT_ID_TEMPLATE = 'project-{}public-images'
  ACCELERATOR_MANUFACTURER = 'NVIDIA'

+ VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}
+

  @dataclass
  class PresetInfo:
@@ -196,17 +198,18 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
                           key=lambda x:
                           (bool(x.gpu), x.region, x.platform_name, x.vcpu)):
          gpu_info = ''
-         if preset.gpu > 0:
+         if preset.gpu > 0 and preset.accelerator_name:
              gpu_info_dict = {
                  'Gpus': [{
                      'Name': preset.accelerator_name,
                      'Manufacturer': preset.accelerator_manufacturer,
                      'Count': preset.gpu,
                      'MemoryInfo': {
-                         'SizeInMiB': preset.memory_gib * 1024 // preset.gpu
+                         'SizeInMiB': VRAM.get(preset.accelerator_name, 0)
                      },
                  }],
-                 'TotalGpuMemoryInMiB': preset.memory_gib * 1024,
+                 'TotalGpuMemoryInMiB': VRAM.get(preset.accelerator_name, 0)
+                                        * preset.gpu,
              }
              gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')
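
The catalog fix swaps the old per-GPU estimate (host memory divided by GPU count) for a per-accelerator VRAM lookup. A small sketch contrasting the two for an 8xH100 preset (the host-memory figure is illustrative):

VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}  # MiB

gpu_count = 8
host_memory_gib = 1600  # illustrative host RAM for an 8xH100 preset
accelerator = 'H100'

old_size_mib = host_memory_gib * 1024 // gpu_count  # 204800: host RAM, wrong
new_size_mib = VRAM.get(accelerator, 0)             # 81920: actual HBM per GPU
total_mib = new_size_mib * gpu_count                # 655360

print(old_size_mib, new_size_mib, total_mib)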