skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes exactly as they appear in that registry.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/core.py CHANGED
@@ -1,25 +1,38 @@
 """SDK functions for cluster/job management."""
-import getpass
+import os
+import shlex
 import typing
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import colorama

 from sky import backends
+from sky import check as sky_check
 from sky import clouds
 from sky import dag
 from sky import data
 from sky import exceptions
 from sky import global_user_state
+from sky import models
 from sky import sky_logging
-from sky import status_lib
 from sky import task
 from sky.backends import backend_utils
+from sky.clouds import service_catalog
+from sky.jobs.server import core as managed_jobs_core
+from sky.provision.kubernetes import constants as kubernetes_constants
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
+from sky.utils.kubernetes import kubernetes_deploy_utils

 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -30,14 +43,15 @@ logger = sky_logging.init_logger(__name__)
 # = Cluster Management =
 # ======================

-# pylint: disable=redefined-builtin
-

 @usage_lib.entrypoint
-def status(cluster_names: Optional[Union[str, List[str]]] = None,
-           refresh: bool = False) -> List[Dict[str, Any]]:
+def status(
+    cluster_names: Optional[Union[str, List[str]]] = None,
+    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
+    all_users: bool = False,
+) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get cluster statuses.
+    """Gets cluster statuses.

     If cluster_names is given, return those clusters. Otherwise, return all
     clusters.
@@ -56,6 +70,10 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
             'autostop': (int) idle time before autostop,
             'to_down': (bool) whether autodown is used instead of autostop,
             'metadata': (dict) metadata of the cluster,
+            'user_hash': (str) user hash of the cluster owner,
+            'user_name': (str) user name of the cluster owner,
+            'resources_str': (str) the resource string representation of the
+                cluster,
         }

     Each cluster can have one of the following statuses:
@@ -104,9 +122,91 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
         cluster. If a cluster is found to be terminated or not found, it will
         be omitted from the returned list.
     """
-    return backend_utils.get_clusters(include_controller=True,
-                                      refresh=refresh,
-                                      cluster_names=cluster_names)
+    clusters = backend_utils.get_clusters(refresh=refresh,
+                                          cluster_names=cluster_names,
+                                          all_users=all_users)
+    return clusters
+
+
+def status_kubernetes(
+) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List[Dict[str, Any]], Optional[str]]:
+    """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
+
+    Managed jobs and services are also included in the clusters returned.
+    The caller must parse the controllers to identify which clusters are run
+    as managed jobs or services.
+    all_clusters, unmanaged_clusters, all_jobs, context
+    Returns:
+        A tuple containing:
+        - all_clusters: List of KubernetesSkyPilotClusterInfoPayload with info
+          for all clusters, including managed jobs, services and controllers.
+        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfoPayload with
+          info for all clusters excluding managed jobs and services.
+          Controllers are included.
+        - all_jobs: List of managed jobs from all controllers. Each entry is a
+          dictionary job info, see jobs.queue_from_kubernetes_pod for details.
+        - context: Kubernetes context used to fetch the cluster information.
+    """
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        pods = kubernetes_utils.get_skypilot_pods(context)
+    except exceptions.ResourcesUnavailableError as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Failed to get SkyPilot pods from '
                             f'Kubernetes: {str(e)}') from e
+    all_clusters, jobs_controllers, _ = (kubernetes_utils.process_skypilot_pods(
+        pods, context))
+    all_jobs = []
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                '[bold cyan]Checking in-progress managed jobs[/]')) as spinner:
+        for i, job_controller_info in enumerate(jobs_controllers):
+            user = job_controller_info.user
+            pod = job_controller_info.pods[0]
+            status_message = '[bold cyan]Checking managed jobs controller'
+            if len(jobs_controllers) > 1:
+                status_message += f's ({i + 1}/{len(jobs_controllers)})'
+            spinner.update(f'{status_message}[/]')
+            try:
+                job_list = managed_jobs_core.queue_from_kubernetes_pod(
+                    pod.metadata.name)
+            except RuntimeError as e:
+                logger.warning('Failed to get managed jobs from controller '
                               f'{pod.metadata.name}: {str(e)}')
+                job_list = []
+            # Add user field to jobs
+            for job in job_list:
+                job['user'] = user
+            all_jobs.extend(job_list)
+    # Reconcile cluster state between managed jobs and clusters:
+    # To maintain a clear separation between regular SkyPilot clusters
+    # and those from managed jobs, we need to exclude the latter from
+    # the main cluster list.
+    # We do this by reconstructing managed job cluster names from each
+    # job's name and ID. We then use this set to filter out managed
+    # clusters from the main cluster list. This is necessary because there
+    # are no identifiers distinguishing clusters from managed jobs from
+    # regular clusters.
+    managed_job_cluster_names = set()
+    for job in all_jobs:
+        # Managed job cluster name is <job_name>-<job_id>
+        managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
+        managed_job_cluster_names.add(managed_cluster_name)
+    unmanaged_clusters = [
+        c for c in all_clusters
+        if c.cluster_name not in managed_job_cluster_names
+    ]
+    all_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in all_clusters
+    ]
+    unmanaged_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in unmanaged_clusters
+    ]
+    return all_clusters, unmanaged_clusters, all_jobs, context


 def endpoints(cluster: str,
@@ -126,7 +226,10 @@ def endpoints(cluster: str,
         RuntimeError: if the cluster has no ports to be exposed or no endpoints
             are exposed yet.
     """
-    return backend_utils.get_endpoints(cluster=cluster, port=port)
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                f'Fetching endpoints for cluster {cluster}')):
+        return backend_utils.get_endpoints(cluster=cluster, port=port)


 @usage_lib.entrypoint
@@ -173,6 +276,9 @@ def cost_report() -> List[Dict[str, Any]]:

     for cluster_report in cluster_reports:
         cluster_report['total_cost'] = get_total_cost(cluster_report)
+        cluster_report['cloud'] = str(cluster_report['resources'].cloud)
+        cluster_report['accelerators'] = cluster_report[
+            'resources'].accelerators

     return cluster_reports

@@ -188,7 +294,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -266,8 +373,7 @@ def start(
             many minute of idleness, i.e., no running or pending jobs in the
             cluster's job queue. Idleness gets reset whenever setting-up/
             running/pending jobs are found in the job queue. Setting this
-            flag is equivalent to running
-            ``sky.launch(..., detach_run=True, ...)`` and then
+            flag is equivalent to running ``sky.launch()`` and then
             ``sky.autostop(idle_minutes=<minutes>)``. If not set, the
             cluster will not be autostopped.
         retry_until_up: whether to retry launching the cluster until it is
@@ -279,12 +385,13 @@
             Useful for upgrading SkyPilot runtime.

     Raises:
-        ValueError: argument values are invalid: (1) the specified cluster does
-            not exist; (2) if ``down`` is set to True but
-            ``idle_minutes_to_autostop`` is None; (3) if the specified cluster is
-            the managed jobs controller, and either ``idle_minutes_to_autostop``
-            is not None or ``down`` is True (omit them to use the default
-            autostop settings).
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -310,10 +417,45 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
     return message


+@usage_lib.entrypoint
+def down(cluster_name: str, purge: bool = False) -> None:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Tears down a cluster.
+
+    Tearing down a cluster will delete all associated resources (all billing
+    stops), and any data on the attached disks will be lost. Accelerators
+    (e.g., TPUs) that are part of the cluster will be deleted too.
+
+    Args:
+        cluster_name: name of the cluster to down.
+        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
+            table, even if the actual cluster termination failed on the cloud.
+            WARNING: This flag should only be set sparingly in certain manual
+            troubleshooting scenarios; with it set, it is the user's
+            responsibility to ensure there are no leaked instances and related
+            resources.
+
+    Raises:
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
+        RuntimeError: failed to tear down the cluster.
+        sky.exceptions.NotSupportedError: the specified cluster is the managed
+            jobs controller.
+    """
+    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
+
+    usage_lib.record_cluster_name_for_current_operation(cluster_name)
+    backend = backend_utils.get_backend_from_handle(handle)
+    backend.teardown(handle, terminate=True, purge=purge)
+
+
 @usage_lib.entrypoint
 def stop(cluster_name: str, purge: bool = False) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Stop a cluster.
+    """Stops a cluster.

     Data on attached disks is not lost when a cluster is stopped. Billing for
     the instances will stop, while the disks will still be charged. Those
@@ -332,7 +474,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
             related resources.

     Raises:
-        ValueError: the specified cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -343,7 +486,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
             f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     backend = backend_utils.get_backend_from_handle(handle)

@@ -368,39 +512,6 @@ def stop(cluster_name: str, purge: bool = False) -> None:
     backend.teardown(handle, terminate=False, purge=purge)


-@usage_lib.entrypoint
-def down(cluster_name: str, purge: bool = False) -> None:
-    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tear down a cluster.
-
-    Tearing down a cluster will delete all associated resources (all billing
-    stops), and any data on the attached disks will be lost. Accelerators
-    (e.g., TPUs) that are part of the cluster will be deleted too.
-
-    Args:
-        cluster_name: name of the cluster to down.
-        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
-            table, even if the actual cluster termination failed on the cloud.
-            WARNING: This flag should only be set sparingly in certain manual
-            troubleshooting scenarios; with it set, it is the user's
-            responsibility to ensure there are no leaked instances and related
-            resources.
-
-    Raises:
-        ValueError: the specified cluster does not exist.
-        RuntimeError: failed to tear down the cluster.
-        sky.exceptions.NotSupportedError: the specified cluster is the managed
-            jobs controller.
-    """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-    if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
-
-    usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend = backend_utils.get_backend_from_handle(handle)
-    backend.teardown(handle, terminate=True, purge=purge)
-
-
 @usage_lib.entrypoint
 def autostop(
     cluster_name: str,
@@ -408,7 +519,7 @@ def autostop(
     down: bool = False,  # pylint: disable=redefined-outer-name
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Schedule an autostop/autodown for a cluster.
+    """Schedules an autostop/autodown for a cluster.

     Autostop/autodown will automatically stop or teardown a cluster when it
     becomes idle for a specified duration. Idleness means there are no
@@ -441,7 +552,7 @@ def autostop(
             rather than autostop (restartable).

     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -515,7 +626,7 @@ def queue(cluster_name: str,
           skip_finished: bool = False,
           all_users: bool = False) -> List[dict]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get the job queue of a cluster.
+    """Gets the job queue of a cluster.

     Please refer to the sky.cli.queue for the document.

@@ -526,6 +637,7 @@ def queue(cluster_name: str,
                 'job_id': (int) job id,
                 'job_name': (str) job name,
                 'username': (str) username,
+                'user_hash': (str) user hash,
                 'submitted_at': (int) timestamp of submitted,
                 'start_at': (int) timestamp of started,
                 'end_at': (int) timestamp of ended,
@@ -535,7 +647,7 @@ def queue(cluster_name: str,
             }
         ]
     raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -546,10 +658,10 @@ def queue(cluster_name: str,
         exceptions.CommandError: if failed to get the job queue with ssh.
     """
     all_jobs = not skip_finished
-    username: Optional[str] = getpass.getuser()
+    user_hash: Optional[str] = common_utils.get_user_hash()
     if all_users:
-        username = None
-    code = job_lib.JobLibCodeGen.get_job_queue(username, all_jobs)
+        user_hash = None
+    code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)

     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -576,25 +688,29 @@
 def cancel(
     cluster_name: str,
     all: bool = False,
+    all_users: bool = False,
     job_ids: Optional[List[int]] = None,
     # pylint: disable=invalid-name
+    # Internal only:
     _try_cancel_if_cluster_is_init: bool = False,
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Cancel jobs on a cluster.
+    """Cancels jobs on a cluster.

     Please refer to the sky.cli.cancel for the document.

-    When `all` is False and `job_ids` is None, cancel the latest running job.
+    When none of `job_ids`, `all` and `all_users` is set, cancel the latest
+    running job.

     Additional arguments:
-        _try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
+        try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
            even if the cluster is not UP, but the head node is still alive.
            This is used by the jobs controller to cancel the job when the
            worker node is preempted in the spot cluster.

     Raises:
-        ValueError: if arguments are invalid, or the cluster does not exist.
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -606,9 +722,9 @@ def cancel(
     controller_utils.check_cluster_name_not_controller(
         cluster_name, operation_str='Cancelling jobs')

-    if all and job_ids:
-        raise ValueError('Cannot specify both `all` and `job_ids`. To cancel '
-                         'all jobs, set `job_ids` to None.')
+    if all and job_ids is not None:
+        raise exceptions.NotSupportedError(
+            'Cannot specify both --all and job IDs.')

     # Check the status of the cluster.
     handle = None
@@ -635,42 +751,47 @@ def cancel(

     backend = backend_utils.get_backend_from_handle(handle)

-    if all:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          f'Cancelling all jobs on cluster {cluster_name!r}...'
-                          f'{colorama.Style.RESET_ALL}')
-    elif job_ids is None:
-        # all = False, job_ids is None => cancel the latest running job.
+    if all_users:
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
-            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
-    elif len(job_ids):
-        # all = False, len(job_ids) > 0 => cancel the specified jobs.
+    elif all:
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling all your jobs on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
+    elif job_ids is not None:
         jobs_str = ', '.join(map(str, job_ids))
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
             f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
     else:
-        # all = False, len(job_ids) == 0 => no jobs to cancel.
-        return
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')

-    backend.cancel_jobs(handle, job_ids, all)
+    backend.cancel_jobs(handle,
+                        job_ids,
+                        cancel_all=all or all_users,
+                        user_hash=common_utils.get_user_hash())


 @usage_lib.entrypoint
 def tail_logs(cluster_name: str,
               job_id: Optional[int],
-              follow: bool = True) -> None:
+              follow: bool = True,
+              tail: int = 0) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tail the logs of a job.
+    """Tails the logs of a job.

     Please refer to the sky.cli.tail_logs for the document.

     Raises:
-        ValueError: arguments are invalid or the cluster is not supported or
-            the cluster does not exist.
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -686,16 +807,8 @@ def tail_logs(cluster_name: str,
     )
     backend = backend_utils.get_backend_from_handle(handle)

-    job_str = f'job {job_id}'
-    if job_id is None:
-        job_str = 'the last job'
-    sky_logging.print(
-        f'{colorama.Fore.YELLOW}'
-        f'Tailing logs of {job_str} on cluster {cluster_name!r}...'
-        f'{colorama.Style.RESET_ALL}')
-
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend.tail_logs(handle, job_id, follow=follow)
+    backend.tail_logs(handle, job_id, follow=follow, tail=tail)


 @usage_lib.entrypoint
@@ -704,7 +817,7 @@ def download_logs(
     job_ids: Optional[List[str]],
     local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Download the logs of jobs.
+    """Downloads the logs of jobs.

     Args:
         cluster_name: (str) name of the cluster.
@@ -712,7 +825,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -729,7 +842,7 @@ def download_logs(
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend

-    if job_ids is not None and len(job_ids) == 0:
+    if job_ids is not None and not job_ids:
         return {}

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
@@ -757,7 +870,7 @@ def job_status(cluster_name: str,
         If job_ids is None and there is no job on the cluster, it will return
         {None: None}.
     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -778,7 +891,7 @@ def job_status(cluster_name: str,
             f'of type {backend.__class__.__name__!r}.')
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle

-    if job_ids is not None and len(job_ids) == 0:
+    if job_ids is not None and not job_ids:
         return {}

     sky_logging.print(f'{colorama.Fore.YELLOW}'
@@ -796,7 +909,7 @@ def job_status(cluster_name: str,
 @usage_lib.entrypoint
 def storage_ls() -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Get the storages.
+    """Gets the storages.

     Returns:
         [
@@ -818,7 +931,7 @@ def storage_ls() -> List[Dict[str, Any]]:
 @usage_lib.entrypoint
 def storage_delete(name: str) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Delete a storage.
+    """Deletes a storage.

     Raises:
         ValueError: If the storage does not exist.
@@ -828,7 +941,148 @@ def storage_delete(name: str) -> None:
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
     else:
-        store_object = data.Storage(name=handle.storage_name,
-                                    source=handle.source,
-                                    sync_on_reconstruction=False)
-        store_object.delete()
+        storage_object = data.Storage(name=handle.storage_name,
+                                      source=handle.source,
+                                      sync_on_reconstruction=False)
+        storage_object.delete()
+
+
+# ===================
+# = Catalog Observe =
+# ===================
+@usage_lib.entrypoint
+def enabled_clouds() -> List[clouds.Cloud]:
+    return global_user_state.get_cached_enabled_clouds()
+
+
+@usage_lib.entrypoint
+def realtime_kubernetes_gpu_availability(
+    context: Optional[str] = None,
+    name_filter: Optional[str] = None,
+    quantity_filter: Optional[int] = None
+) -> List[models.RealtimeGpuAvailability]:
+
+    counts, capacity, available = service_catalog.list_accelerator_realtime(
+        gpus_only=True,
+        clouds='kubernetes',
+        name_filter=name_filter,
+        region_filter=context,
+        quantity_filter=quantity_filter,
+        case_sensitive=False)
+    assert (set(counts.keys()) == set(capacity.keys()) == set(
+        available.keys())), (f'Keys of counts ({list(counts.keys())}), '
+                             f'capacity ({list(capacity.keys())}), '
+                             f'and available ({list(available.keys())}) '
+                             'must be same.')
+    if len(counts) == 0:
+        err_msg = 'No GPUs found in Kubernetes cluster. '
+        debug_msg = 'To further debug, run: sky check '
+        if name_filter is not None:
+            gpu_info_msg = f' {name_filter!r}'
+            if quantity_filter is not None:
+                gpu_info_msg += (' with requested quantity'
+                                 f' {quantity_filter}')
+            err_msg = (f'Resources{gpu_info_msg} not found '
+                       'in Kubernetes cluster. ')
+            debug_msg = ('To show available accelerators on kubernetes,'
+                         ' run: sky show-gpus --cloud kubernetes ')
+        full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
+                        debug_msg)
+        raise ValueError(full_err_msg)
+
+    realtime_gpu_availability_list: List[models.RealtimeGpuAvailability] = []
+
+    for gpu, _ in sorted(counts.items()):
+        realtime_gpu_availability_list.append(
+            models.RealtimeGpuAvailability(
+                gpu,
+                counts.pop(gpu),
+                capacity[gpu],
+                available[gpu],
+            ))
+    return realtime_gpu_availability_list
+
+
+# =================
+# = Local Cluster =
+# =================
+@usage_lib.entrypoint
+def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
+             ssh_key: Optional[str], cleanup: bool) -> None:
+    """Creates a local or remote cluster."""
+
+    def _validate_args(ips, ssh_user, ssh_key, cleanup):
+        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
+        # all must be specified
+        if bool(ips) or bool(ssh_user) or bool(ssh_key):
+            if not (ips and ssh_user and ssh_key):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'All ips, ssh_user, and ssh_key must be specified '
+                        'together.')
+
+        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
+        # are all provided
+        if cleanup and not (ips and ssh_user and ssh_key):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'cleanup can only be used with ips, ssh_user and ssh_key.')
+
+    _validate_args(ips, ssh_user, ssh_key, cleanup)
+
+    # If remote deployment arguments are specified, run remote up script
+    if ips:
+        assert ssh_user is not None and ssh_key is not None
+        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
+                                                      cleanup)
+    else:
+        # Run local deployment (kind) if no remote args are specified
+        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+
+
+def local_down() -> None:
+    """Tears down the Kubernetes cluster started by local_up."""
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
+                                    'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = shlex.split(down_script_path)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')

+        if returncode == 0:
+            cluster_removed = True
+        elif returncode == 100:
+            logger.info(ux_utils.error_message('Local cluster does not exist.'))
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Failed to create local cluster. '
+                                   f'Stdout: {stdout}'
+                                   f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        logger.info(
+            ux_utils.finishing_message('Local cluster removed.',
+                                       log_path=log_path,
+                                       is_local=True))
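Usage sketch (illustrative, not part of the diff): the sky/core.py changes above add multi-user semantics (all_users arguments, plus user_hash, user_name and resources_str fields in status() records), a tail argument for tail_logs(), and a dedicated sky.exceptions.ClusterDoesNotExist error in place of ValueError for missing clusters. The snippet below relies only on the signatures shown in the diff; the cluster name 'my-cluster' is a placeholder, and calling sky.core directly may not be the intended entry point in this release, which also introduces a client SDK under sky/client/sdk.py.

    from sky import core
    from sky import exceptions

    # status() now accepts all_users; each record carries the new
    # 'user_name' and 'resources_str' fields documented in the diff.
    for record in core.status(all_users=True):
        print(record['user_name'], record['resources_str'])

    # tail_logs() gained a tail= parameter; a missing cluster now raises
    # ClusterDoesNotExist instead of ValueError.
    try:
        core.tail_logs('my-cluster', job_id=None, follow=False, tail=100)
    except exceptions.ClusterDoesNotExist:
        print('Cluster my-cluster does not exist.')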