skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,73 @@
1
+ """Helper script to strip path information from exec auth in a kubeconfig file.
2
+
3
+ This script processes a kubeconfig file and removes any path information from
4
+ the 'command' field in the exec configuration, leaving only the executable name.
5
+ This is useful when moving between different environments where auth plugin
6
+ executables might be installed in different locations.
7
+
8
+ It assumes the target environment has the auth executable available in PATH.
9
+ If not, you'll need to update your environment container to include the auth
10
+ executable in PATH.
11
+
12
+ Usage:
13
+ python -m sky.utils.kubernetes.exec_kubeconfig_converter
14
+ """
15
+ import argparse
16
+ import os
17
+
18
+ import yaml
19
+
20
+
21
def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
    """Strip path information from exec plugin commands in a kubeconfig file.

    Each entry under ``users`` is inspected; when its ``user.exec.command``
    contains a directory component, the command is replaced by its basename.
    The modified config is written to ``output_path`` only if at least one
    command was changed; otherwise nothing is written.

    Args:
        kubeconfig_path (str): Path to the input kubeconfig file
        output_path (str): Path where the modified kubeconfig will be saved
    """
    with open(kubeconfig_path, 'r', encoding='utf-8') as file:
        # An empty YAML document loads as None; treat it as an empty config
        # so the lookups below do not fail.
        config = yaml.safe_load(file) or {}

    updated = False
    # `users:`, `user:` and `exec:` may be present with an explicit null
    # value, in which case .get() returns None rather than the default.
    for user in config.get('users') or []:
        exec_info = ((user or {}).get('user') or {}).get('exec') or {}
        current_command = exec_info.get('command', '')
        if not current_command:
            continue

        # Strip the path and keep only the executable name
        executable = os.path.basename(current_command)
        if executable != current_command:
            # exec_info is the dict stored inside config, so this mutation
            # is reflected in what gets dumped below.
            exec_info['command'] = executable
            updated = True

    if updated:
        with open(output_path, 'w', encoding='utf-8') as file:
            yaml.safe_dump(config, file)
        print('Kubeconfig updated with path-less exec auth. '
              f'Saved to {output_path}')
    else:
        print('No updates made. No exec-based auth commands paths found.')
50
+
51
+
52
def main():
    """Command-line entry point: parse the paths and convert the kubeconfig.

    Reads ``--input``/``-i`` and ``--output``/``-o`` (both default to files
    under ``~/.kube``) and passes them to ``strip_auth_plugin_paths``.
    """
    default_input = os.path.expanduser('~/.kube/config')
    default_output = os.path.expanduser('~/.kube/config.converted')
    description = (
        'Strip path information from exec plugin commands in a '
        'kubeconfig file. Used to prepare kubeconfigs for deployment '
        'with SkyPilot.')

    arg_parser = argparse.ArgumentParser(description=description)
    arg_parser.add_argument(
        '--input',
        '-i',
        default=default_input,
        help='Input kubeconfig file path (default: %(default)s)')
    arg_parser.add_argument(
        '--output',
        '-o',
        default=default_output,
        help='Output kubeconfig file path (default: %(default)s)')

    cli_args = arg_parser.parse_args()
    strip_auth_plugin_paths(cli_args.input, cli_args.output)
70
+
71
+
72
+ if __name__ == '__main__':
73
+ main()
@@ -0,0 +1,336 @@
1
+ #!/bin/bash
2
+ # This script creates a new k8s Service Account and generates a kubeconfig with
3
+ # its credentials. This Service Account has the minimal permissions necessary for
4
+ # SkyPilot. The kubeconfig is written in the current directory.
5
+ #
6
+ # Before running this script, you must configure your local kubectl to point to
7
+ # the right k8s cluster and have admin-level access.
8
+ #
9
+ # By default, this script will create a service account "sky-sa" in "default"
10
+ # namespace. If you want to use a different namespace or service account name:
11
+ #
12
+ # * Specify SKYPILOT_NAMESPACE env var to override the default namespace where the service account is created.
13
+ # * Specify SKYPILOT_SA_NAME env var to override the default service account name.
14
+ # * Specify SKIP_SA_CREATION=1 to skip creating the service account and use an existing one
15
+ # * Specify SUPER_USER=1 to create a service account with cluster-admin permissions
16
+ #
17
+ # Usage:
18
+ # # Create "sky-sa" service account with minimal permissions in "default" namespace and generate kubeconfig
19
+ # $ ./generate_kubeconfig.sh
20
+ #
21
+ # # Create "my-sa" service account with minimal permissions in "my-namespace" namespace and generate kubeconfig
22
+ # $ SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
23
+ #
24
+ # # Use an existing service account "my-sa" in "my-namespace" namespace and generate kubeconfig
25
+ # $ SKIP_SA_CREATION=1 SKYPILOT_SA_NAME=my-sa SKYPILOT_NAMESPACE=my-namespace ./generate_kubeconfig.sh
26
+ #
27
+ # # Create "sky-sa" service account with cluster-admin permissions in "default" namespace
28
+ # $ SUPER_USER=1 ./generate_kubeconfig.sh
29
+
30
+ set -eu -o pipefail
31
+
32
+ # Allow passing in common name and username in environment. If not provided,
33
+ # use default.
34
+ SKYPILOT_SA=${SKYPILOT_SA_NAME:-sky-sa}
35
+ NAMESPACE=${SKYPILOT_NAMESPACE:-default}
36
+ SUPER_USER=${SUPER_USER:-0}
37
+
38
+ echo "Service account: ${SKYPILOT_SA}"
39
+ echo "Namespace: ${NAMESPACE}"
40
+ echo "Super user permissions: ${SUPER_USER}"
41
+
42
+ # Set OS specific values.
43
# Select the flag that makes `base64` decode on this platform.
# Glob patterns are used so that Linux variants whose OSTYPE carries a suffix
# (e.g. linux-gnueabihf) are accepted instead of aborting as "Unknown OS".
case "${OSTYPE}" in
  linux-gnu*|linux-musl*)
    BASE64_DECODE_FLAG="-d"
    ;;
  darwin*)
    BASE64_DECODE_FLAG="-D"
    ;;
  *)
    echo "Unknown OS ${OSTYPE}"
    exit 1
    ;;
esac
53
+
54
+ # If the user has set SKIP_SA_CREATION=1, skip creating the service account.
55
+ if [ -z ${SKIP_SA_CREATION+x} ]; then
56
+ echo "Creating the Kubernetes Service Account with ${SUPER_USER:+super user}${SUPER_USER:-minimal} RBAC permissions."
57
+ if [ "${SUPER_USER}" = "1" ]; then
58
+ # Create service account with cluster-admin permissions
59
+ kubectl apply -f - <<EOF
60
+ apiVersion: v1
61
+ kind: Namespace
62
+ metadata:
63
+ name: ${NAMESPACE}
64
+ labels:
65
+ parent: skypilot
66
+ ---
67
+ kind: ServiceAccount
68
+ apiVersion: v1
69
+ metadata:
70
+ name: ${SKYPILOT_SA}
71
+ namespace: ${NAMESPACE}
72
+ labels:
73
+ parent: skypilot
74
+ ---
75
+ apiVersion: rbac.authorization.k8s.io/v1
76
+ kind: ClusterRoleBinding
77
+ metadata:
78
+ name: ${SKYPILOT_SA}-cluster-admin
79
+ labels:
80
+ parent: skypilot
81
+ subjects:
82
+ - kind: ServiceAccount
83
+ name: ${SKYPILOT_SA}
84
+ namespace: ${NAMESPACE}
85
+ roleRef:
86
+ kind: ClusterRole
87
+ name: cluster-admin
88
+ apiGroup: rbac.authorization.k8s.io
89
+ EOF
90
+ else
91
+ # Original RBAC rules for minimal permissions
92
+ kubectl apply -f - <<EOF
93
+ # Create/update namespace specified by the user
94
+ apiVersion: v1
95
+ kind: Namespace
96
+ metadata:
97
+ name: ${NAMESPACE}
98
+ labels:
99
+ parent: skypilot
100
+ ---
101
+ kind: ServiceAccount
102
+ apiVersion: v1
103
+ metadata:
104
+ name: ${SKYPILOT_SA}
105
+ namespace: ${NAMESPACE}
106
+ labels:
107
+ parent: skypilot
108
+ ---
109
+ # Role for the service account
110
+ kind: Role
111
+ apiVersion: rbac.authorization.k8s.io/v1
112
+ metadata:
113
+ name: ${SKYPILOT_SA}-role
114
+ namespace: ${NAMESPACE}
115
+ labels:
116
+ parent: skypilot
117
+ rules:
118
+ - apiGroups: ["*"] # Required for creating pods, services, secrets and other necessary resources in the namespace.
119
+ resources: ["*"]
120
+ verbs: ["*"]
121
+ ---
122
+ # RoleBinding for the service account
123
+ kind: RoleBinding
124
+ apiVersion: rbac.authorization.k8s.io/v1
125
+ metadata:
126
+ name: ${SKYPILOT_SA}-rb
127
+ namespace: ${NAMESPACE}
128
+ labels:
129
+ parent: skypilot
130
+ subjects:
131
+ - kind: ServiceAccount
132
+ name: ${SKYPILOT_SA}
133
+ roleRef:
134
+ kind: Role
135
+ name: ${SKYPILOT_SA}-role
136
+ apiGroup: rbac.authorization.k8s.io
137
+ ---
138
+ # ClusterRole for the service account
139
+ kind: ClusterRole
140
+ apiVersion: rbac.authorization.k8s.io/v1
141
+ metadata:
142
+ name: ${SKYPILOT_SA}-cluster-role
143
+ namespace: ${NAMESPACE}
144
+ labels:
145
+ parent: skypilot
146
+ rules:
147
+ - apiGroups: [""]
148
+ resources: ["nodes"] # Required for getting node resources.
149
+ verbs: ["get", "list", "watch"]
150
+ - apiGroups: ["node.k8s.io"]
151
+ resources: ["runtimeclasses"] # Required for autodetecting the runtime class of the nodes.
152
+ verbs: ["get", "list", "watch"]
153
+ - apiGroups: ["networking.k8s.io"] # Required for exposing services through ingresses
154
+ resources: ["ingressclasses"]
155
+ verbs: ["get", "list", "watch"]
156
+ - apiGroups: [""] # Required for sky show-gpus command
157
+ resources: ["pods"]
158
+ verbs: ["get", "list"]
159
+ ---
160
+ # ClusterRoleBinding for the service account
161
+ apiVersion: rbac.authorization.k8s.io/v1
162
+ kind: ClusterRoleBinding
163
+ metadata:
164
+ name: ${SKYPILOT_SA}-cluster-role-binding
165
+ namespace: ${NAMESPACE}
166
+ labels:
167
+ parent: skypilot
168
+ subjects:
169
+ - kind: ServiceAccount
170
+ name: ${SKYPILOT_SA}
171
+ namespace: ${NAMESPACE}
172
+ roleRef:
173
+ kind: ClusterRole
174
+ name: ${SKYPILOT_SA}-cluster-role
175
+ apiGroup: rbac.authorization.k8s.io
176
+ ---
177
+ # Optional: If using object store mounting, create the skypilot-system namespace
178
+ apiVersion: v1
179
+ kind: Namespace
180
+ metadata:
181
+ name: skypilot-system
182
+ labels:
183
+ parent: skypilot
184
+ ---
185
+ # Optional: If using object store mounting, create role in the skypilot-system
186
+ # namespace to create FUSE device manager.
187
+ kind: Role
188
+ apiVersion: rbac.authorization.k8s.io/v1
189
+ metadata:
190
+ name: skypilot-system-service-account-role
191
+ namespace: skypilot-system
192
+ labels:
193
+ parent: skypilot
194
+ rules:
195
+ - apiGroups: ["*"]
196
+ resources: ["*"]
197
+ verbs: ["*"]
198
+ ---
199
+ # Optional: If using object store mounting, create rolebinding in the skypilot-system
200
+ # namespace to create FUSE device manager.
201
+ apiVersion: rbac.authorization.k8s.io/v1
202
+ kind: RoleBinding
203
+ metadata:
204
+ name: ${SKYPILOT_SA}-skypilot-system-role-binding-${NAMESPACE}
205
+ namespace: skypilot-system # Do not change this namespace
206
+ labels:
207
+ parent: skypilot
208
+ subjects:
209
+ - kind: ServiceAccount
210
+ name: ${SKYPILOT_SA}
211
+ namespace: ${NAMESPACE}
212
+ roleRef:
213
+ kind: Role
214
+ name: skypilot-system-service-account-role
215
+ apiGroup: rbac.authorization.k8s.io
216
+ EOF
217
+ fi
218
+ # Apply optional ingress-related roles, but don't make the script fail if it fails
219
+ kubectl apply -f - <<EOF || echo "Failed to apply optional ingress-related roles. Nginx ingress is likely not installed. This is not critical and the script will continue."
220
+ # Optional: Role for accessing ingress resources
221
+ apiVersion: rbac.authorization.k8s.io/v1
222
+ kind: Role
223
+ metadata:
224
+ name: ${SKYPILOT_SA}-role-ingress-nginx
225
+ namespace: ingress-nginx # Do not change this namespace
226
+ labels:
227
+ parent: skypilot
228
+ rules:
229
+ - apiGroups: [""]
230
+ resources: ["services"]
231
+ verbs: ["list", "get", "watch"]
232
+ - apiGroups: ["rbac.authorization.k8s.io"]
233
+ resources: ["roles", "rolebindings"]
234
+ verbs: ["list", "get", "watch"]
235
+ ---
236
+ # Optional: RoleBinding for accessing ingress resources
237
+ apiVersion: rbac.authorization.k8s.io/v1
238
+ kind: RoleBinding
239
+ metadata:
240
+ name: ${SKYPILOT_SA}-rolebinding-ingress-nginx
241
+ namespace: ingress-nginx # Do not change this namespace
242
+ labels:
243
+ parent: skypilot
244
+ subjects:
245
+ - kind: ServiceAccount
246
+ name: ${SKYPILOT_SA}
247
+ namespace: ${NAMESPACE}
248
+ roleRef:
249
+ kind: Role
250
+ name: ${SKYPILOT_SA}-role-ingress-nginx # Use the same name as the ingress-nginx Role defined above
251
+ apiGroup: rbac.authorization.k8s.io
252
+ EOF
253
+ fi
254
+
255
+ # Checks if secret entry was defined for Service account. If defined it means that Kubernetes server has a
256
+ # version below 1.24, otherwise one must manually create the secret and bind it to the Service account to have a non-expiring token.
257
+ # After Kubernetes v1.24 Service accounts no longer generate automatic tokens/secrets.
258
+ # We can use kubectl create token but the token has an expiration time.
259
+ # https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.24.md#urgent-upgrade-notes
260
# Look up the name of the first secret attached to the service account.
# The result is empty when the account has no auto-generated secret.
SA_SECRET_NAME=$(kubectl get -n "${NAMESPACE}" "sa/${SKYPILOT_SA}" -o "jsonpath={.secrets[0]..name}")
# Quote the expansion so the emptiness test is well-formed for any value,
# rather than relying on `[ -z ]` happening to evaluate true with no operand.
if [ -z "${SA_SECRET_NAME}" ]
then
  # Create the secret and bind it to the desired SA
  kubectl apply -f - <<EOF
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: ${SKYPILOT_SA}
  namespace: ${NAMESPACE}
  annotations:
    kubernetes.io/service-account.name: "${SKYPILOT_SA}"
  labels:
    parent: skypilot
EOF

  # The secret created above is named after the service account.
  SA_SECRET_NAME=${SKYPILOT_SA}
fi
279
+
280
+ # Sleep for 2 seconds to allow the secret to be created before fetching it.
281
+ sleep 2
282
+
283
+ # Note: service account token is stored base64-encoded in the secret but must
284
+ # be plaintext in kubeconfig.
285
+ SA_TOKEN=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['token']}" | base64 ${BASE64_DECODE_FLAG})
286
+ CA_CERT=$(kubectl get -n ${NAMESPACE} secrets/${SA_SECRET_NAME} -o "jsonpath={.data['ca\.crt']}")
287
+
288
+ # Extract cluster IP from the current context
289
+ CURRENT_CONTEXT=$(kubectl config current-context)
290
+ CURRENT_CLUSTER=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"${CURRENT_CONTEXT}\"})].context.cluster}")
291
+ CURRENT_CLUSTER_ADDR=$(kubectl config view -o jsonpath="{.clusters[?(@.name == \"${CURRENT_CLUSTER}\"})].cluster.server}")
292
+
293
+ echo "Writing kubeconfig."
294
+ cat > kubeconfig <<EOF
295
+ apiVersion: v1
296
+ clusters:
297
+ - cluster:
298
+ certificate-authority-data: ${CA_CERT}
299
+ server: ${CURRENT_CLUSTER_ADDR}
300
+ name: ${CURRENT_CLUSTER}
301
+ contexts:
302
+ - context:
303
+ cluster: ${CURRENT_CLUSTER}
304
+ user: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
305
+ namespace: ${NAMESPACE}
306
+ name: ${CURRENT_CONTEXT}
307
+ current-context: ${CURRENT_CONTEXT}
308
+ kind: Config
309
+ preferences: {}
310
+ users:
311
+ - name: ${CURRENT_CLUSTER}-${SKYPILOT_SA}
312
+ user:
313
+ token: ${SA_TOKEN}
314
+ EOF
315
+
316
+ echo "---
317
+ Done!
318
+
319
+ Kubeconfig using service acccount '${SKYPILOT_SA}' in namespace '${NAMESPACE}' written at $(pwd)/kubeconfig
320
+
321
+ Copy the generated kubeconfig file to your ~/.kube/ directory to use it with
322
+ kubectl and skypilot:
323
+
324
+ # Backup your existing kubeconfig file
325
+ mv ~/.kube/config ~/.kube/config.bak
326
+ cp kubeconfig ~/.kube/config
327
+
328
+ # Verify that you can access the cluster
329
+ kubectl get pods
330
+
331
+ Also add this to your ~/.sky/config.yaml to use the new service account:
332
+
333
+ # ~/.sky/config.yaml
334
+ kubernetes:
335
+ remote_identity: ${SKYPILOT_SA}
336
+ "
@@ -101,7 +101,7 @@ def label():
101
101
  # Get the list of nodes with GPUs
102
102
  gpu_nodes = []
103
103
  for node in nodes:
104
- if 'nvidia.com/gpu' in node.status.capacity:
104
+ if kubernetes_utils.get_gpu_resource_key() in node.status.capacity:
105
105
  gpu_nodes.append(node)
106
106
 
107
107
  print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
@@ -115,7 +115,7 @@ def label():
115
115
  print('Continuing without using nvidia RuntimeClass. '
116
116
  'This may fail on K3s clusters. '
117
117
  'For more details, refer to K3s deployment notes at: '
118
- 'https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long
118
+ 'https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long
119
119
  nvidia_exists = False
120
120
 
121
121
  if nvidia_exists:
@@ -139,10 +139,10 @@ def label():
139
139
  # Create the job for this node
140
140
  batch_v1.create_namespaced_job(namespace, job_manifest)
141
141
  print(f'Created GPU labeler job for node {node_name}')
142
- if len(gpu_nodes) == 0:
142
+ if not gpu_nodes:
143
143
  print('No GPU nodes found in the cluster. If you have GPU nodes, '
144
144
  'please ensure that they have the label '
145
- '`nvidia.com/gpu: <number of GPUs>`')
145
+ f'`{kubernetes_utils.get_gpu_resource_key()}: <number of GPUs>`')
146
146
  else:
147
147
  print('GPU labeling started - this may take 10 min or more to complete.'
148
148
  '\nTo check the status of GPU labeling jobs, run '
@@ -14,9 +14,10 @@ spec:
14
14
  containers:
15
15
  - name: gpu-labeler
16
16
  image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
17
- command:
18
- - "python"
19
- - "/label_gpus.py"
17
+ command: ["/bin/bash", "-i", "-c"]
18
+ args:
19
+ - |
20
+ python /label_gpus.py
20
21
  env:
21
22
  - name: MY_NODE_NAME
22
23
  valueFrom:
@@ -0,0 +1,228 @@
1
+ """Utility functions for deploying Kubernetes clusters."""
2
+ import os
3
+ import shlex
4
+ import subprocess
5
+ import tempfile
6
+ from typing import List
7
+
8
+ from sky import check as sky_check
9
+ from sky import sky_logging
10
+ from sky.backends import backend_utils
11
+ from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.skylet import constants
13
+ from sky.skylet import log_lib
14
+ from sky.utils import log_utils
15
+ from sky.utils import rich_utils
16
+ from sky.utils import subprocess_utils
17
+ from sky.utils import ux_utils
18
+
19
+ logger = sky_logging.init_logger(__name__)
20
+
21
+
22
def deploy_remote_cluster(ip_list: List[str], ssh_user: str, ssh_key: str,
                          cleanup: bool):
    """Runs deploy_remote_cluster.sh to deploy or clean up a remote cluster.

    The IP list and the SSH key are written to temporary files whose paths
    are passed to the script; the files are removed when the function exits.

    Args:
        ip_list: IP addresses, written one per line to a temporary file.
        ssh_user: SSH user name passed to the script.
        ssh_key: Contents of the SSH private key, written to a temporary
            file with mode 0600.
        cleanup: If True, pass `--cleanup` to the script instead of
            deploying.

    Raises:
        RuntimeError: If the script exits with a non-zero return code.
    """
    path_to_package = os.path.dirname(__file__)
    up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(up_script_path))

    # Create temporary files for the IPs and SSH key
    with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
        tempfile.NamedTemporaryFile(mode='w') as key_file:

        # Write IPs and SSH key to temporary files. Flush so that the script,
        # which opens the files by name, sees the full contents.
        ip_file.write('\n'.join(ip_list))
        ip_file.flush()

        key_file.write(ssh_key)
        key_file.flush()
        os.chmod(key_file.name, 0o600)

        # Build the argv list directly rather than formatting a string and
        # re-splitting it: a path or user name containing whitespace or
        # quotes would otherwise be broken into several arguments.
        deploy_command = [
            up_script_path, ip_file.name, ssh_user, key_file.name
        ]
        if cleanup:
            deploy_command.append('--cleanup')

        # Setup logging paths
        run_timestamp = sky_logging.get_run_timestamp()
        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                                'local_up.log')

        # Check if ~/.kube/config exists:
        if os.path.exists(os.path.expanduser('~/.kube/config')):
            logger.info('Found existing kube config. '
                        'It will be backed up to ~/.kube/config.bak.')
        if cleanup:
            msg_str = 'Cleaning up remote cluster...'
        else:
            msg_str = 'Deploying remote cluster...'
        with rich_utils.safe_status(
                ux_utils.spinner_message(msg_str,
                                         log_path=log_path,
                                         is_local=True)):
            returncode, _, stderr = log_lib.run_with_log(
                cmd=deploy_command,
                log_path=log_path,
                require_outputs=True,
                stream_logs=False,
                line_processor=log_utils.SkyRemoteUpLineProcessor(
                    log_path=log_path, is_local=True),
                cwd=cwd)

        if returncode != 0:
            with ux_utils.print_exception_no_traceback():
                log_hint = ux_utils.log_path_hint(log_path, is_local=True)
                raise RuntimeError('Failed to deploy remote cluster. '
                                   f'Full log: {log_hint}'
                                   f'\nError: {stderr}')

        # Reaching this point means the script succeeded.
        if cleanup:
            logger.info(
                ux_utils.finishing_message(
                    '🎉 Remote cluster cleaned up successfully.',
                    log_path=log_path,
                    is_local=True))
        else:
            logger.info(
                ux_utils.finishing_message(
                    '🎉 Remote cluster deployed successfully.',
                    log_path=log_path,
                    is_local=True))
97
+
98
+
99
def deploy_local_cluster(gpus: bool):
    """Runs create_cluster.sh to create a local cluster and reports results.

    After the script finishes, `sky check` is run for Kubernetes. If a new
    cluster was created, a summary with the CPU count (and, when GPU support
    is enabled, the GPU type and count queried via kubectl) is logged.

    Args:
        gpus: Whether GPU support is requested. It is only honored if
            `backend_utils.check_local_gpus()` also reports GPUs.

    Raises:
        RuntimeError: If the script exits with a return code other than 0
            (created) or 100 (reported as "cluster already exists").
    """
    cluster_created = False

    # Check if GPUs are available on the host
    local_gpus_available = backend_utils.check_local_gpus()
    gpus = gpus and local_gpus_available

    # Check if ~/.kube/config exists:
    if os.path.exists(os.path.expanduser('~/.kube/config')):
        curr_context = kubernetes_utils.get_current_kube_config_context_name()
        skypilot_context = 'kind-skypilot'
        if curr_context is not None and curr_context != skypilot_context:
            logger.info(
                f'Current context in kube config: {curr_context}'
                '\nWill automatically switch to kind-skypilot after the local '
                'cluster is created.')
    message_str = 'Creating local cluster{}...'
    message_str = message_str.format((' with GPU support (this may take up '
                                      'to 15 minutes)') if gpus else '')
    path_to_package = os.path.dirname(__file__)
    up_script_path = os.path.join(path_to_package, 'create_cluster.sh')

    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(up_script_path))
    # Build the argv list directly so that an install path containing
    # whitespace or quotes is not split into several arguments.
    run_command = [up_script_path, '--gpus'] if gpus else [up_script_path]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_up.log')
    logger.info(message_str)

    with rich_utils.safe_status(
            ux_utils.spinner_message(message_str,
                                     log_path=log_path,
                                     is_local=True)):
        returncode, _, stderr = log_lib.run_with_log(
            cmd=run_command,
            log_path=log_path,
            require_outputs=True,
            stream_logs=False,
            line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
                                                             is_local=True),
            cwd=cwd)

    # Kind always writes to stderr even if it succeeds.
    # If the failure happens after the cluster is created, we need
    # to strip all stderr of "No kind clusters found.", which is
    # printed when querying with kind get clusters.
    stderr = stderr.replace('No kind clusters found.\n', '')

    if returncode == 0:
        cluster_created = True
    elif returncode == 100:
        logger.info(
            ux_utils.finishing_message(
                'Local cluster already exists.\n',
                log_path=log_path,
                is_local=True,
                follow_up_message=
                'If you want to delete it instead, run: sky local down'))
    else:
        with ux_utils.print_exception_no_traceback():
            log_hint = ux_utils.log_path_hint(log_path, is_local=True)
            raise RuntimeError('Failed to create local cluster. '
                               f'Full log: {log_hint}'
                               f'\nError: {stderr}')
    # Run sky check
    with rich_utils.safe_status('[bold cyan]Running sky check...'):
        sky_check.check(clouds=['kubernetes'], quiet=True)
    if cluster_created:
        # Prepare completion message which shows CPU and GPU count
        # Get number of CPUs
        p = subprocess_utils.run(
            'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
            capture_output=True)
        num_cpus = int(p.stdout.decode('utf-8'))

        # GPU count/type parsing
        gpu_message = ''
        gpu_hint = ''
        if gpus:
            # Get GPU model by querying the node labels
            label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
            gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
                gpu_type_output = subprocess.check_output(gpu_type_cmd,
                                                          shell=True,
                                                          text=True)
                gpu_type_str = gpu_type_output.strip() + ' '
            except subprocess.CalledProcessError as e:
                # With text=True the captured output is already a str, so it
                # must not be decoded again.
                logger.warning(f'Failed to get GPU type: {e.output}')
                gpu_type_str = ''

            # Get number of GPUs (sum of nvidia.com/gpu resources)
            gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\''  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
                gpu_count_output = subprocess.check_output(gpu_count_command,
                                                           shell=True,
                                                           text=True)
                # Remove any extra whitespace
                gpu_count = gpu_count_output.strip()
                gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
            except subprocess.CalledProcessError as e:
                # See above: e.output is a str because text=True was used.
                logger.warning(f'Failed to get GPU count: {e.output}')
                gpu_message = f' with {gpu_type_str}GPU support'

            gpu_hint = ('\nHint: To see the list of GPUs in the cluster, '
                        'run \'sky show-gpus --cloud kubernetes\'')

        if num_cpus < 2:
            logger.info('Warning: Local cluster has less than 2 CPUs. '
                        'This may cause issues with running tasks.')
        logger.info(
            ux_utils.finishing_message(
                message=(f'Local Kubernetes cluster created successfully with '
                         f'{num_cpus} CPUs{gpu_message}.'),
                log_path=log_path,
                is_local=True,
                follow_up_message=(
                    '\n`sky launch` can now run tasks locally.\n'
                    'Hint: To change the number of CPUs, change your docker '
                    'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
                    f'{gpu_hint}')))