skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,22 @@
1
1
  """Utilities for sky status."""
2
+ import typing
2
3
  from typing import Any, Callable, Dict, List, Optional
3
4
 
4
5
  import click
5
6
  import colorama
6
7
 
7
8
  from sky import backends
8
- from sky import status_lib
9
9
  from sky.skylet import constants
10
+ from sky.utils import common_utils
10
11
  from sky.utils import log_utils
11
12
  from sky.utils import resources_utils
13
+ from sky.utils import status_lib
14
+
15
+ if typing.TYPE_CHECKING:
16
+ from sky.provision.kubernetes import utils as kubernetes_utils
17
+
18
+ if typing.TYPE_CHECKING:
19
+ from sky.provision.kubernetes import utils as kubernetes_utils
12
20
 
13
21
  COMMAND_TRUNC_LENGTH = 25
14
22
  NUM_COST_REPORT_LINES = 5
@@ -19,25 +27,6 @@ _ClusterRecord = Dict[str, Any]
19
27
  _ClusterCostReportRecord = Dict[str, Any]
20
28
 
21
29
 
22
- def truncate_long_string(s: str, max_length: int = 35) -> str:
23
- if len(s) <= max_length:
24
- return s
25
- splits = s.split(' ')
26
- if len(splits[0]) > max_length:
27
- return splits[0][:max_length] + '...' # Use '…'?
28
- # Truncate on word boundary.
29
- i = 0
30
- total = 0
31
- for i, part in enumerate(splits):
32
- total += len(part)
33
- if total >= max_length:
34
- break
35
- prefix = ' '.join(splits[:i])
36
- if len(prefix) < max_length:
37
- prefix += s[len(prefix):max_length]
38
- return prefix + '...'
39
-
40
-
41
30
  class StatusColumn:
42
31
  """One column of the displayed cluster table"""
43
32
 
@@ -54,12 +43,14 @@ class StatusColumn:
54
43
  def calc(self, record):
55
44
  val = self.calc_func(record)
56
45
  if self.trunc_length != 0:
57
- val = truncate_long_string(str(val), self.trunc_length)
46
+ val = common_utils.truncate_long_string(str(val), self.trunc_length)
58
47
  return val
59
48
 
60
49
 
61
50
  def show_status_table(cluster_records: List[_ClusterRecord],
62
- show_all: bool) -> int:
51
+ show_all: bool,
52
+ show_user: bool,
53
+ query_clusters: Optional[List[str]] = None) -> int:
63
54
  """Compute cluster table values and display.
64
55
 
65
56
  Returns:
@@ -70,6 +61,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
70
61
 
71
62
  status_columns = [
72
63
  StatusColumn('NAME', _get_name),
64
+ ]
65
+ if show_user:
66
+ status_columns.append(StatusColumn('USER', _get_user_name))
67
+ status_columns.append(
68
+ StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
69
+
70
+ status_columns += [
73
71
  StatusColumn('LAUNCHED', _get_launched),
74
72
  StatusColumn('RESOURCES',
75
73
  _get_resources,
@@ -101,7 +99,21 @@ def show_status_table(cluster_records: List[_ClusterRecord],
101
99
 
102
100
  if cluster_records:
103
101
  click.echo(cluster_table)
104
- else:
102
+
103
+ if query_clusters:
104
+ cluster_names = {record['name'] for record in cluster_records}
105
+ not_found_clusters = [
106
+ repr(cluster)
107
+ for cluster in query_clusters
108
+ if cluster not in cluster_names
109
+ ]
110
+ cluster_str = 'Cluster'
111
+ if len(not_found_clusters) > 1:
112
+ cluster_str += 's'
113
+ cluster_str += ' '
114
+ cluster_str += ', '.join(not_found_clusters)
115
+ click.echo(f'{cluster_str} not found.')
116
+ elif not cluster_records:
105
117
  click.echo('No existing clusters.')
106
118
  return num_pending_autostop
107
119
 
@@ -202,6 +214,8 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
202
214
  # _ClusterCostReportRecord, which is okay as we guarantee the queried fields
203
215
  # exist in those cases.
204
216
  _get_name = (lambda cluster_record: cluster_record['name'])
217
+ _get_user_hash = (lambda cluster_record: cluster_record['user_hash'])
218
+ _get_user_name = (lambda cluster_record: cluster_record.get('user_name', '-'))
205
219
  _get_launched = (lambda cluster_record: log_utils.readable_time_duration(
206
220
  cluster_record['launched_at']))
207
221
  _get_region = (
@@ -220,6 +234,8 @@ def _get_status_colored(cluster_record: _ClusterRecord) -> str:
220
234
 
221
235
 
222
236
  def _get_resources(cluster_record: _ClusterRecord) -> str:
237
+ if 'resources_str' in cluster_record:
238
+ return cluster_record['resources_str']
223
239
  handle = cluster_record['handle']
224
240
  if isinstance(handle, backends.LocalDockerResourceHandle):
225
241
  resources_str = 'docker'
@@ -316,3 +332,45 @@ def _get_estimated_cost_for_cost_report(
316
332
  return '-'
317
333
 
318
334
  return f'$ {cost:.2f}'
335
+
336
+
337
+ def show_kubernetes_cluster_status_table(
338
+ clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
339
+ show_all: bool) -> None:
340
+ """Compute cluster table values and display for Kubernetes clusters."""
341
+ status_columns = [
342
+ StatusColumn('USER', lambda c: c.user),
343
+ StatusColumn('NAME', lambda c: c.cluster_name),
344
+ StatusColumn('LAUNCHED',
345
+ lambda c: log_utils.readable_time_duration(c.launched_at)),
346
+ StatusColumn('RESOURCES',
347
+ lambda c: c.resources_str,
348
+ trunc_length=70 if not show_all else 0),
349
+ StatusColumn('STATUS', lambda c: c.status.colored_str()),
350
+ # TODO(romilb): We should consider adding POD_NAME field here when --all
351
+ # is passed to help users fetch pod name programmatically.
352
+ ]
353
+
354
+ columns = [
355
+ col.name for col in status_columns if col.show_by_default or show_all
356
+ ]
357
+ cluster_table = log_utils.create_table(columns)
358
+
359
+ # Sort table by user, then by cluster name
360
+ sorted_clusters = sorted(clusters, key=lambda c: (c.user, c.cluster_name))
361
+
362
+ for cluster in sorted_clusters:
363
+ row = []
364
+ for status_column in status_columns:
365
+ if status_column.show_by_default or show_all:
366
+ row.append(status_column.calc(cluster))
367
+ cluster_table.add_row(row)
368
+
369
+ if clusters:
370
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
371
+ f'SkyPilot clusters'
372
+ f'{colorama.Style.RESET_ALL}')
373
+ click.echo(cluster_table)
374
+ else:
375
+ click.echo('No SkyPilot resources found in the '
376
+ 'active Kubernetes context.')
@@ -0,0 +1,356 @@
1
+ """Utility functions for cluster yaml file."""
2
+
3
+ import functools
4
+ import glob
5
+ import os
6
+ import re
7
+ import textwrap
8
+ from typing import Dict, List, Optional
9
+ import uuid
10
+
11
+ from sky.skylet import constants
12
+ from sky.utils import command_runner
13
+ from sky.utils import common_utils
14
+ from sky.utils import timeline
15
+
16
+ # The cluster yaml used to create the current cluster where the module is
17
+ # called.
18
+ SKY_CLUSTER_YAML_REMOTE_PATH = '~/.sky/sky_ray.yml'
19
+
20
+
21
+ def get_provider_name(config: dict) -> str:
22
+ """Return the name of the provider."""
23
+
24
+ provider_module = config['provider']['module']
25
+ # Examples:
26
+ # 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
27
+ # 'sky.provision.aws' -> 'aws'
28
+ provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
29
+ provider_module)
30
+ assert provider_search is not None, config
31
+ provider_name = provider_search.group(1).lower()
32
+ # Special handling for lambda_cloud as Lambda cloud is registered as lambda.
33
+ if provider_name == 'lambda_cloud':
34
+ provider_name = 'lambda'
35
+ return provider_name
36
+
37
+
38
+ class SSHConfigHelper(object):
39
+ """Helper for handling local SSH configuration."""
40
+
41
+ ssh_conf_path = '~/.ssh/config'
42
+ ssh_conf_lock_path = os.path.expanduser('~/.sky/locks/.ssh_config.lock')
43
+ ssh_conf_per_cluster_lock_path = os.path.expanduser(
44
+ '~/.sky/locks/.ssh_config_{}.lock')
45
+ ssh_cluster_path = constants.SKY_USER_FILE_PATH + '/ssh/{}'
46
+ ssh_cluster_key_path = constants.SKY_USER_FILE_PATH + '/ssh-keys/{}.key'
47
+
48
+ @classmethod
49
+ def _get_generated_config(cls, autogen_comment: str, host_name: str,
50
+ ip: str, username: str, ssh_key_path: str,
51
+ proxy_command: Optional[str], port: int,
52
+ docker_proxy_command: Optional[str]):
53
+ if proxy_command is not None:
54
+ # Already checked in resources
55
+ assert docker_proxy_command is None, (
56
+ 'Cannot specify both proxy_command and docker_proxy_command.')
57
+ proxy = f'ProxyCommand {proxy_command}'
58
+ elif docker_proxy_command is not None:
59
+ proxy = f'ProxyCommand {docker_proxy_command}'
60
+ else:
61
+ proxy = ''
62
+ # StrictHostKeyChecking=no skips the host key check for the first
63
+ # time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile/dev/null
64
+ # prevent the host key from being added to the known_hosts file and
65
+ # always return an empty file for known hosts, making the ssh think
66
+ # this is a first-time connection, and thus skipping the host key
67
+ # check.
68
+ # Not adding SSH agent forwarding by default here to avoid implicitly
69
+ # using users' SSH keys in their local agent. Plus on sky launch side we
70
+ # are not default adding SSH agent forwarding either.
71
+ codegen = textwrap.dedent(f"""\
72
+ {autogen_comment}
73
+ Host {host_name}
74
+ HostName {ip}
75
+ User {username}
76
+ IdentityFile {ssh_key_path}
77
+ IdentitiesOnly yes
78
+ StrictHostKeyChecking no
79
+ UserKnownHostsFile=/dev/null
80
+ GlobalKnownHostsFile=/dev/null
81
+ Port {port}
82
+ {proxy}
83
+ """.rstrip())
84
+ codegen = codegen + '\n'
85
+ return codegen
86
+
87
+ @classmethod
88
+ def generate_local_key_file(cls, cluster_name: str,
89
+ auth_config: Dict[str, str]) -> str:
90
+ key_content = auth_config.pop('ssh_private_key_content', None)
91
+ if key_content is not None:
92
+ cluster_private_key_path = cls.ssh_cluster_key_path.format(
93
+ cluster_name)
94
+ expanded_cluster_private_key_path = os.path.expanduser(
95
+ cluster_private_key_path)
96
+ expanded_cluster_private_key_dir = os.path.dirname(
97
+ expanded_cluster_private_key_path)
98
+ os.makedirs(expanded_cluster_private_key_dir,
99
+ exist_ok=True,
100
+ mode=0o700)
101
+ with open(expanded_cluster_private_key_path,
102
+ 'w',
103
+ encoding='utf-8',
104
+ opener=functools.partial(os.open, mode=0o600)) as f:
105
+ f.write(key_content)
106
+ auth_config['ssh_private_key'] = cluster_private_key_path
107
+ return auth_config['ssh_private_key']
108
+
109
+ @classmethod
110
+ @timeline.FileLockEvent(ssh_conf_lock_path)
111
+ def add_cluster(
112
+ cls,
113
+ cluster_name: str,
114
+ ips: List[str],
115
+ auth_config: Dict[str, str],
116
+ ports: List[int],
117
+ docker_user: Optional[str] = None,
118
+ ssh_user: Optional[str] = None,
119
+ ):
120
+ """Add authentication information for cluster to local SSH config file.
121
+
122
+ If a host with `cluster_name` already exists and the configuration was
123
+ not added by sky, then `ip` is used to identify the host instead in the
124
+ file.
125
+
126
+ If a host with `cluster_name` already exists and the configuration was
127
+ added by sky (e.g. a spot instance), then the configuration is
128
+ overwritten.
129
+
130
+ Args:
131
+ cluster_name: Cluster name (see `sky status`)
132
+ ips: List of public IP addresses in the cluster. First IP is head
133
+ node.
134
+ auth_config: `auth` in cluster yaml.
135
+ ports: List of port numbers for SSH corresponding to ips
136
+ docker_user: If not None, use this user to ssh into the docker
137
+ ssh_user: Override the ssh_user in auth_config
138
+ """
139
+ if ssh_user is None:
140
+ username = auth_config['ssh_user']
141
+ else:
142
+ username = ssh_user
143
+ if docker_user is not None:
144
+ username = docker_user
145
+
146
+ key_path = cls.generate_local_key_file(cluster_name, auth_config)
147
+ key_path = os.path.expanduser(key_path)
148
+ sky_autogen_comment = ('# Added by sky (use `sky stop/down '
149
+ f'{cluster_name}` to remove)')
150
+ ip = ips[0]
151
+ if docker_user is not None:
152
+ ip = 'localhost'
153
+
154
+ config_path = os.path.expanduser(cls.ssh_conf_path)
155
+ os.makedirs(os.path.dirname(config_path), exist_ok=True, mode=0o700)
156
+
157
+ if not os.path.exists(config_path):
158
+ config = ['\n']
159
+ with open(config_path,
160
+ 'w',
161
+ encoding='utf-8',
162
+ opener=functools.partial(os.open, mode=0o644)) as f:
163
+ f.writelines(config)
164
+
165
+ with open(config_path, 'r', encoding='utf-8') as f:
166
+ config = f.readlines()
167
+
168
+ ssh_dir = cls.ssh_cluster_path.format('')
169
+ os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
170
+
171
+ # Handle Include on top of Config file
172
+ include_str = f'Include {cls.ssh_cluster_path.format("*")}'
173
+ found = False
174
+ for i, line in enumerate(config):
175
+ config_str = line.strip()
176
+ if config_str == include_str:
177
+ found = True
178
+ break
179
+ if 'Host' in config_str:
180
+ break
181
+ if not found:
182
+ # Did not find Include string. Insert `Include` lines.
183
+ with open(config_path, 'w', encoding='utf-8') as f:
184
+ config.insert(
185
+ 0, '# Added by SkyPilot for ssh config of all clusters\n'
186
+ f'{include_str}\n')
187
+ f.write(''.join(config).strip())
188
+ f.write('\n' * 2)
189
+
190
+ proxy_command = auth_config.get('ssh_proxy_command', None)
191
+
192
+ docker_proxy_command_generator = None
193
+ if docker_user is not None:
194
+ docker_proxy_command_generator = lambda ip, port: ' '.join(
195
+ ['ssh'] + command_runner.ssh_options_list(
196
+ key_path, ssh_control_name=None, port=port) +
197
+ ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
198
+
199
+ codegen = ''
200
+ # Add the nodes to the codegen
201
+ for i, ip in enumerate(ips):
202
+ docker_proxy_command = None
203
+ port = ports[i]
204
+ if docker_proxy_command_generator is not None:
205
+ docker_proxy_command = docker_proxy_command_generator(ip, port)
206
+ ip = 'localhost'
207
+ port = constants.DEFAULT_DOCKER_PORT
208
+ node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
209
+ # TODO(romilb): Update port number when k8s supports multinode
210
+ codegen += cls._get_generated_config(
211
+ sky_autogen_comment, node_name, ip, username, key_path,
212
+ proxy_command, port, docker_proxy_command) + '\n'
213
+
214
+ cluster_config_path = os.path.expanduser(
215
+ cls.ssh_cluster_path.format(cluster_name))
216
+
217
+ with open(cluster_config_path,
218
+ 'w',
219
+ encoding='utf-8',
220
+ opener=functools.partial(os.open, mode=0o644)) as f:
221
+ f.write(codegen)
222
+
223
+ @classmethod
224
+ def _remove_stale_cluster_config_for_backward_compatibility(
225
+ cls,
226
+ cluster_name: str,
227
+ ip: str,
228
+ auth_config: Dict[str, str],
229
+ docker_user: Optional[str] = None,
230
+ ):
231
+ """Remove authentication information for cluster from local SSH config.
232
+
233
+ If no existing host matching the provided specification is found, then
234
+ nothing is removed.
235
+
236
+ Args:
237
+ ip: Head node's IP address.
238
+ auth_config: `auth` in cluster yaml.
239
+ docker_user: If not None, use this user to ssh into the docker
240
+ """
241
+ username = auth_config['ssh_user']
242
+ config_path = os.path.expanduser(cls.ssh_conf_path)
243
+ cluster_config_path = os.path.expanduser(
244
+ cls.ssh_cluster_path.format(cluster_name))
245
+ if not os.path.exists(config_path):
246
+ return
247
+
248
+ with open(config_path, 'r', encoding='utf-8') as f:
249
+ config = f.readlines()
250
+
251
+ start_line_idx = None
252
+
253
+ # Scan the config for the cluster name.
254
+ for i, line in enumerate(config):
255
+ next_line = config[i + 1] if i + 1 < len(config) else ''
256
+ if docker_user is None:
257
+ found = (line.strip() == f'HostName {ip}' and
258
+ next_line.strip() == f'User {username}')
259
+ else:
260
+ found = (line.strip() == 'HostName localhost' and
261
+ next_line.strip() == f'User {docker_user}')
262
+ if found:
263
+ # Find the line starting with ProxyCommand and contains ip
264
+ found = False
265
+ for idx in range(i, len(config)):
266
+ # Stop if we reach an empty line, which means a new host
267
+ if not config[idx].strip():
268
+ break
269
+ if config[idx].strip().startswith('ProxyCommand'):
270
+ proxy_command_line = config[idx].strip()
271
+ if proxy_command_line.endswith(f'@{ip}'):
272
+ found = True
273
+ break
274
+ if found:
275
+ start_line_idx = i - 1
276
+ break
277
+
278
+ if start_line_idx is not None:
279
+ # Scan for end of previous config.
280
+ cursor = start_line_idx
281
+ while cursor > 0 and len(config[cursor].strip()) > 0:
282
+ cursor -= 1
283
+ prev_end_line_idx = cursor
284
+
285
+ # Scan for end of the cluster config.
286
+ end_line_idx = None
287
+ cursor = start_line_idx + 1
288
+ start_line_idx -= 1 # remove auto-generated comment
289
+ while cursor < len(config):
290
+ if config[cursor].strip().startswith(
291
+ '# ') or config[cursor].strip().startswith('Host '):
292
+ end_line_idx = cursor
293
+ break
294
+ cursor += 1
295
+
296
+ # Remove sky-generated config and update the file.
297
+ config[prev_end_line_idx:end_line_idx] = [
298
+ '\n'
299
+ ] if end_line_idx is not None else []
300
+ with open(config_path, 'w', encoding='utf-8') as f:
301
+ f.write(''.join(config).strip())
302
+ f.write('\n' * 2)
303
+
304
+ # Delete include statement if it exists in the config.
305
+ sky_autogen_comment = ('# Added by sky (use `sky stop/down '
306
+ f'{cluster_name}` to remove)')
307
+ with open(config_path, 'r', encoding='utf-8') as f:
308
+ config = f.readlines()
309
+
310
+ for i, line in enumerate(config):
311
+ config_str = line.strip()
312
+ if f'Include {cluster_config_path}' in config_str:
313
+ with open(config_path, 'w', encoding='utf-8') as f:
314
+ if i < len(config) - 1 and config[i + 1] == '\n':
315
+ del config[i + 1]
316
+ # Delete Include string
317
+ del config[i]
318
+ # Delete Sky Autogen Comment
319
+ if i > 0 and sky_autogen_comment in config[i - 1].strip():
320
+ del config[i - 1]
321
+ f.write(''.join(config))
322
+ break
323
+ if 'Host' in config_str:
324
+ break
325
+
326
+ @classmethod
327
+ def remove_cluster(cls, cluster_name: str):
328
+ """Remove auth information for cluster from ~/.sky/ssh/<cluster_name>.
329
+
330
+ If no existing host matching the provided specification is found, then
331
+ nothing is removed.
332
+
333
+ Args:
334
+ cluster_name: Cluster name.
335
+ """
336
+
337
+ with timeline.FileLockEvent(
338
+ cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
339
+ cluster_config_path = os.path.expanduser(
340
+ cls.ssh_cluster_path.format(cluster_name))
341
+ common_utils.remove_file_if_exists(cluster_config_path)
342
+
343
+ @classmethod
344
+ def list_cluster_names(cls) -> List[str]:
345
+ """List all names of clusters with SSH config set up."""
346
+ cluster_config_dir = os.path.expanduser(cls.ssh_cluster_path.format(''))
347
+ return [
348
+ os.path.basename(path)
349
+ for path in glob.glob(os.path.join(cluster_config_dir, '*'))
350
+ ]
351
+
352
+
353
+ def generate_cluster_name():
354
+ # TODO: change this ID formatting to something more pleasant.
355
+ # User name is helpful in non-isolated accounts, e.g., GCP, Azure.
356
+ return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'