skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ingress.yml.j2:

@@ -2,9 +2,16 @@ ingress_spec:
   apiVersion: networking.k8s.io/v1
   kind: Ingress
   metadata:
+    labels:
+      {%- for label_key, label_value in labels.items() %}
+      {{ label_key }}: {{ label_value|tojson }}
+      {%- endfor %}
     annotations:
       nginx.ingress.kubernetes.io/use-regex: "true"
       nginx.ingress.kubernetes.io/rewrite-target: /$2
+      {%- for key, value in annotations.items() %}
+      {{ key }}: {{ value|tojson }}
+      {%- endfor %}
     name: {{ ingress_name }}
     namespace: {{ namespace }}
   spec:
sky/templates/kubernetes-loadbalancer.yml.j2:

@@ -5,6 +5,13 @@ service_spec:
     name: {{ service_name }}
     labels:
       parent: skypilot
+      {%- for label_key, label_value in labels.items() %}
+      {{ label_key }}: {{ label_value|tojson }}
+      {%- endfor %}
+    annotations:
+      {%- for key, value in annotations.items() %}
+      {{ key }}: {{ value|tojson }}
+      {%- endfor %}
   spec:
     type: LoadBalancer
     selector:
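Both templates quote values with Jinja2's built-in `tojson` filter, so arbitrary user-supplied labels and annotations stay valid YAML after rendering. A minimal sketch of the effect (the label key and value are made up; requires the jinja2 package):

python3 - <<'EOF'
import jinja2

# Same loop shape as the templates above, reduced to the labels case.
tpl = jinja2.Environment().from_string(
    "labels:\n"
    "  {%- for k, v in labels.items() %}\n"
    "  {{ k }}: {{ v|tojson }}\n"
    "  {%- endfor %}")
# A value with spaces would break bare YAML; tojson emits it quoted:
#   labels:
#     skypilot-user: "alice smith"
print(tpl.render(labels={"skypilot-user": "alice smith"}))
EOF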
sky/templates/kubernetes-port-forward-proxy-command.sh (renamed from kubernetes-port-forward-proxy-command.sh.j2):

@@ -1,6 +1,41 @@
 #!/usr/bin/env bash
 set -uo pipefail
 
+KUBE_CONTEXT=""
+KUBE_NAMESPACE=""
+
+# Parse flags
+while getopts ":c:n:" opt; do
+  case ${opt} in
+    c)
+      KUBE_CONTEXT="$OPTARG"
+      ;;
+    n)
+      KUBE_NAMESPACE="$OPTARG"
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
+      exit 1
+      ;;
+    :)
+      echo "Option -$OPTARG requires an argument." >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Shift the processed options away so that $1 becomes the pod name
+shift $((OPTIND -1))
+
+# Check if pod name is passed as an argument
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 <pod_name> [-c kube_context] [-n kube_namespace]" >&2
+  exit 1
+fi
+
+POD_NAME="$1" # The first positional argument is the name of the pod
+
 # Checks if socat is installed
 if ! command -v socat > /dev/null; then
   echo "Using 'port-forward' mode to run ssh session on Kubernetes instances requires 'socat' to be installed. Please install 'socat'" >&2
@@ -18,7 +53,21 @@ fi
 # This is preferred because of socket re-use issues in kubectl port-forward,
 # see - https://github.com/kubernetes/kubernetes/issues/74551#issuecomment-769185879
 KUBECTL_OUTPUT=$(mktemp)
-kubectl port-forward svc/{{ ssh_jump_name }} :22 > "${KUBECTL_OUTPUT}" 2>&1 &
+KUBECTL_ARGS=()
+
+if [ -n "$KUBE_CONTEXT" ]; then
+  KUBECTL_ARGS+=("--context=$KUBE_CONTEXT")
+fi
+# If context is not provided, it means we are using incluster auth. In this case,
+# we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
+if [ -z "$KUBE_CONTEXT" ]; then
+  KUBECTL_ARGS+=("--kubeconfig=/dev/null")
+fi
+if [ -n "$KUBE_NAMESPACE" ]; then
+  KUBECTL_ARGS+=("--namespace=$KUBE_NAMESPACE")
+fi
+
+kubectl "${KUBECTL_ARGS[@]}" port-forward pod/"${POD_NAME}" :22 > "${KUBECTL_OUTPUT}" 2>&1 &
 
 # Capture the PID for the backgrounded kubectl command
 K8S_PORT_FWD_PID=$!
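With both flags set, the assembled array makes the backgrounded command equivalent to the following sketch; note the target also changed from the shared `svc/{{ ssh_jump_name }}` jump service to the cluster's own pod:

kubectl --context="$KUBE_CONTEXT" --namespace="$KUBE_NAMESPACE" port-forward pod/"$POD_NAME" :22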
@@ -49,12 +98,7 @@ while ! nc -z 127.0.0.1 "${local_port}"; do
   sleep 0.1
 done
 
-# To avoid errors when many concurrent requests are sent (see https://github.com/skypilot-org/skypilot/issues/2628),
-# we add a random delay before establishing the socat connection.
-# Empirically, this needs to be at least 1 second. We set this to be random between 1 and 2 seconds.
-sleep $(shuf -i 10-20 -n 1 | awk '{printf "%f", $1/10}')
-
 # Establishes two directional byte streams to handle stdin/stdout between
 # terminal and the jump pod.
 # socat process terminates when port-forward terminates.
-socat - tcp:127.0.0.1:"${local_port}"
+socat - tcp:127.0.0.1:"${local_port}"
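The script relays its stdin/stdout through socat, which is the shape of an SSH ProxyCommand; presumably SkyPilot wires it in roughly like this (host, context, namespace, and pod names are placeholders):

ssh -o ProxyCommand='bash kubernetes-port-forward-proxy-command.sh -c my-context -n my-namespace my-pod' sky@my-cluster-head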
sky/templates/kubernetes-ray.yml.j2:

@@ -18,12 +18,20 @@ provider:
 
   region: kubernetes
 
-  # The namespace to create the Ray cluster in.
+
   namespace: {{k8s_namespace}}
 
+  # The kubecontext used to connect to the Kubernetes cluster.
+  {% if k8s_context is not none %}
+  context: {{k8s_context}}
+  {% endif %}
+
   # This should be one of KubernetesPortMode
   port_mode: {{k8s_port_mode}}
 
+  # The networking mode used to ssh to pods. One of KubernetesNetworkingMode.
+  networking_mode: {{k8s_networking_mode}}
+
   # We use internal IPs since we set up a port-forward between the kubernetes
   # cluster and the local machine, or directly use NodePort to reach the
   # head node.
@@ -214,7 +222,9 @@ provider:
     - protocol: TCP
       port: 22
       targetPort: 22
-  # Service that maps to the head node of the Ray cluster.
+  # Service that maps to the head node of the Ray cluster, so that the
+  # worker nodes can find the head node using
+  # {{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local
   - apiVersion: v1
     kind: Service
     metadata:
@@ -227,18 +237,12 @@ provider:
       # names.
       name: {{cluster_name_on_cloud}}-head
     spec:
+      # Create a headless service so that the head node can be reached by
+      # the worker nodes with any port number.
+      clusterIP: None
       # This selector must match the head node pod's selector below.
       selector:
         component: {{cluster_name_on_cloud}}-head
-      ports:
-      - name: client
-        protocol: TCP
-        port: 10001
-        targetPort: 10001
-      - name: dashboard
-        protocol: TCP
-        port: 8265
-        targetPort: 8265
 
 # Specify the pod type for the ray head node (as configured below).
 head_node_type: ray_head_default
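Since a headless service (`clusterIP: None`) resolves the service DNS name straight to the head pod's IP, workers can reach the head on any port without the service listing each one, which is why the explicit `ports:` entries above could be dropped. A sketch of the lookup from a worker (cluster name, namespace, and port are placeholders; the template's own worker wait loop does the same with `nc`):

getent hosts my-cluster-head.default.svc.cluster.local   # resolves to the head pod IP
nc -z -w 1 my-cluster-head.default.svc.cluster.local 6380  # reachable on any listening port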
@@ -261,7 +265,7 @@ available_node_types:
         skypilot-user: {{ user }}
         # Custom tags for the pods
         {%- for label_key, label_value in labels.items() %}
-        {{ label_key }}: {{ label_value }}
+        {{ label_key }}: {{ label_value|tojson }}
         {%- endfor %}
         {% if k8s_fuse_device_required %}
         annotations:
@@ -272,13 +276,28 @@ available_node_types:
       # serviceAccountName: skypilot-service-account
       serviceAccountName: {{k8s_service_account_name}}
       automountServiceAccountToken: {{k8s_automount_sa_token}}
-
       restartPolicy: Never
 
-      # Add node selector if GPUs are requested:
-      {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
+      # Add node selector if GPU/TPUs are requested:
+      {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
       nodeSelector:
+        {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
        {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
+        {% endif %}
+        {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
+        {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
+        {% endif %}
+        {% if k8s_spot_label_key is not none %}
+        {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
+        {% endif %}
+      {% endif %}
+
+      {% if k8s_spot_label_key is not none %}
+      tolerations:
+        - key: {{k8s_spot_label_key}}
+          operator: Equal
+          value: {{k8s_spot_label_value|tojson}}
+          effect: NoSchedule
       {% endif %}
 
       # This volume allocates shared memory for Ray to use for its plasma
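The toleration pairs with the node selector: spot node pools on managed Kubernetes typically carry a matching `NoSchedule` taint, so a pod must both select and tolerate those nodes to land on them. A sketch of verifying this (assuming GKE's spot key; other providers use different label and taint keys):

kubectl get nodes -l cloud.google.com/gke-spot=true
kubectl describe node my-spot-node | grep -A1 Taints   # expect cloud.google.com/gke-spot=true:NoSchedule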
@@ -298,10 +317,209 @@ available_node_types:
         - name: ray-node
           imagePullPolicy: IfNotPresent
           image: {{image_id}}
+          env:
+            - name: SKYPILOT_POD_NODE_TYPE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.labels['ray-node-type']
+            {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
+            - name: {{ key }}
+              value: {{ value }}
+            {% endfor %}
           # Do not change this command - it keeps the pod alive until it is
           # explicitly killed.
           command: ["/bin/bash", "-c", "--"]
-          args: ['trap : TERM INT; sleep infinity & wait;']
+          args:
+            - |
+              # For backwards compatibility, we put a marker file in the pod
+              # to indicate that the pod is running with the changes introduced
+              # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
+              # TODO: Remove this marker file and it's usage in setup_commands
+              # after v0.10.0 release.
+              touch /tmp/skypilot_is_nimbus
+
+              # Helper function to conditionally use sudo
+              # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
+              prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
+              [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
+
+              STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
+
+              # STEP 1: Run apt update, install missing packages, and set up ssh.
+              (
+                (
+                  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
+                    echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
+                  PACKAGES="rsync curl netcat gcc patch pciutils fuse openssh-server";
+
+                  # Separate packages into two groups: packages that are installed first
+                  # so that curl and rsync are available sooner to unblock the following
+                  # conda installation and rsync.
+                  set -e
+                  INSTALL_FIRST="";
+                  MISSING_PACKAGES="";
+                  for pkg in $PACKAGES; do
+                    if [ "$pkg" == "netcat" ]; then
+                      if ! dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; then
+                        INSTALL_FIRST="$INSTALL_FIRST netcat-openbsd";
+                      fi
+                    elif ! dpkg -l | grep -q "^ii $pkg "; then
+                      if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
+                        INSTALL_FIRST="$INSTALL_FIRST $pkg";
+                      else
+                        MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
+                      fi
+                    fi
+                  done;
+                  if [ ! -z "$INSTALL_FIRST" ]; then
+                    echo "Installing core packages: $INSTALL_FIRST";
+                    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
+                  fi;
+                  # SSH and other packages are not necessary, so we disable set -e
+                  set +e
+
+                  if [ ! -z "$MISSING_PACKAGES" ]; then
+                    echo "Installing missing packages: $MISSING_PACKAGES";
+                    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
+                  fi;
+
+                  $(prefix_cmd) mkdir -p /var/run/sshd;
+                  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
+                  $(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
+                  cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A;
+                  $(prefix_cmd) mkdir -p ~/.ssh;
+                  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
+                  $(prefix_cmd) chmod 700 ~/.ssh;
+                  $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
+                  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
+                  $(prefix_cmd) service ssh restart;
+                  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
+                ) > /tmp/${STEPS[0]}.log 2>&1 || {
+                  echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
+                  cat /tmp/${STEPS[0]}.log
+                  exit 1
+                }
+              ) &
+
+              # STEP 2: Install conda, ray and skypilot (for dependencies); start
+              # ray cluster.
+              (
+                (
+                  set -e
+                  mkdir -p ~/.sky
+                  # Wait for `curl` package to be installed before installing conda
+                  # and ray.
+                  until dpkg -l | grep -q "^ii curl "; do
+                    sleep 0.1
+                    echo "Waiting for curl package to be installed..."
+                  done
+                  {{ conda_installation_commands }}
+                  {{ ray_installation_commands }}
+                  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
+                  touch /tmp/ray_skypilot_installation_complete
+                  echo "=== Ray and skypilot installation completed ==="
+
+                  # Disable set -e, as we have some commands that are ok to fail
+                  # after the ray start.
+                  # TODO(zhwu): this is a hack, we should fix the commands that are
+                  # ok to fail.
+                  if [ "$SKYPILOT_POD_NODE_TYPE" == "head" ]; then
+                    set +e
+                    {{ ray_head_start_command }}
+                  else
+                    # Start ray worker on the worker pod.
+                    # Wait until the head pod is available with an IP address
+                    export SKYPILOT_RAY_HEAD_IP="{{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local"
+                    export SKYPILOT_RAY_PORT={{skypilot_ray_port}}
+                    # Wait until the ray cluster is started on the head pod
+                    until dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; do
+                      sleep 0.1
+                      echo "Waiting for netcat package to be installed..."
+                    done
+                    until nc -z -w 1 ${SKYPILOT_RAY_HEAD_IP} ${SKYPILOT_RAY_PORT}; do
+                      sleep 0.1
+                    done
+
+                    set +e
+                    {{ ray_worker_start_command }}
+                  fi
+                ) > /tmp/${STEPS[1]}.log 2>&1 || {
+                  echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
+                  cat /tmp/${STEPS[1]}.log
+                  exit 1
+                }
+              ) &
+
+
+              # STEP 3: Set up environment variables; this should be relatively fast.
+              (
+                (
+                  set -e
+                  if [ $(id -u) -eq 0 ]; then
+                    echo 'alias sudo=""' >> ~/.bashrc; echo succeed;
+                  else
+                    if command -v sudo >/dev/null 2>&1; then
+                      timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || { echo 52; exit 52; };
+                    else
+                      { echo 52; exit 52; };
+                    fi;
+                  fi;
+                  printenv | while IFS='=' read -r key value; do echo "export $key=\"$value\""; done > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
+                ) > /tmp/${STEPS[2]}.log 2>&1 || {
+                  echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
+                  cat /tmp/${STEPS[2]}.log
+                  exit 1
+                }
+              ) &
+
+              function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
+
+              # Tails file and checks every 5 sec for
+              # open file handlers with write access
+              # closes if none exist
+              monitor_file() {
+                tail -f $file &
+                TAIL_PID=$!
+                while kill -0 $TAIL_PID 2> /dev/null; do
+                  # only two PIDs should be accessing the file
+                  # the log appender and log tailer
+                  if [ $(mylsof $file | wc -l) -lt 2 ]; then
+                    kill $TAIL_PID
+                    break
+                  fi
+                  # Sleep for 5 seconds before checking again. Do not make this
+                  # too short as it will consume CPU, and too long will cause
+                  # the file to be closed too late keeping the pod alive.
+                  sleep 5
+                done
+              }
+
+              log_tail() {
+                FILE_PATTERN="~/sky_logs/*/tasks/*.log"
+                while ! ls $(eval echo $FILE_PATTERN) 1> /dev/null 2>&1; do
+                  sleep 1
+                done
+
+                # Keep track of already monitored files
+                already_monitored=""
+
+                # Infinite loop to continuously check for new files
+                while true; do
+                  for file in $(eval echo $FILE_PATTERN); do
+                    if echo $already_monitored | grep -q $file; then
+                      # File is already being monitored
+                      continue
+                    fi
+
+                    # Monitor the new file
+                    monitor_file $file &
+                    already_monitored="${already_monitored} ${file}"
+                  done
+                  sleep 0.1
+                done
+              }
+              trap : TERM INT; log_tail || sleep infinity & wait
+
           ports:
             - containerPort: 22 # Used for SSH
             - containerPort: {{ray_port}} # Redis port
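Because the three `STEPS` above run asynchronously inside the pod's args, failures surface as `/tmp/<step>.failed` marker files next to the corresponding logs, so a pod that came up wrong can be inspected directly. A sketch (the pod name is a placeholder):

kubectl exec my-cluster-head -- bash -c '
  for step in apt-ssh-setup runtime-setup env-setup; do
    [ -f /tmp/${step}.failed ] && echo "${step}: FAILED";
    [ -f /tmp/${step}.log ] && tail -n 5 /tmp/${step}.log;
  done'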
@@ -330,34 +548,82 @@ available_node_types:
           requests:
             cpu: {{cpus}}
             memory: {{memory}}G
-            nvidia.com/gpu: {{accelerator_count}}
+            {% if k8s_resource_key is not none %}
+            # Number of requested google.com/tpu must be equal to the total
+            # number of available TPU chips on the TPU slice node either it
+            # being a node from multi-host TPU slice or single-host TPU
+            # slice. Example reference:
+            # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
+            {{k8s_resource_key}}: {{accelerator_count}}
+            {% endif %}
             {% if k8s_fuse_device_required %}
             # Kubernetes resource exposed by the fuse device manager
             # https://gitlab.com/arm-research/smarter/smarter-device-manager
             smarter-devices/fuse: "1"
             {% endif %}
+          {% if k8s_resource_key is not none or k8s_fuse_device_required %}
           limits:
-            nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
+            # Limits need to be defined for GPU/TPU requests
+            {% if k8s_resource_key is not none %}
+            {{k8s_resource_key}}: {{accelerator_count}}
+            {% endif %}
             {% if k8s_fuse_device_required %}
             smarter-devices/fuse: "1"
             {% endif %}
-
+          {% endif %}
+
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo DEBIAN_FRONTEND=noninteractive apt install gcc patch pciutils rsync fuse curl -y;
+  # Line 'for step in ..': check if any failure indicator exists for the setup done in pod args and print the error message. This is only a best effort, as the
+  # commands in pod args are asynchronous and we cannot guarantee the failure indicators are created before the setup commands finish.
+  - |
     mkdir -p ~/.ssh; touch ~/.ssh/config;
-    {{ conda_installation_commands }}
-    {{ ray_skypilot_installation_commands }}
+    {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
+    start_epoch=$(date +%s);
+    echo "=== Logs for asynchronous ray and skypilot installation ===";
+    if [ -f /tmp/skypilot_is_nimbus ]; then
+      echo "=== Logs for asynchronous ray and skypilot installation ===";
+      [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
+        { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
+      [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
+    fi
+    end_epoch=$(date +%s);
+    echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
+    start_epoch=$(date +%s);
+    {{ skypilot_wheel_installation_commands }}
+    end_epoch=$(date +%s);
+    echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
+    start_epoch=$(date +%s);
     sudo touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
-    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf');
+    ulimit -n 1048576;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
+    end_epoch=$(date +%s);
+    echo "=== Setup system configs and fuse completed in $(($end_epoch - $start_epoch)) secs ===";
+    for step in $STEPS; do [ -f "/tmp/${step}.failed" ] && { echo "Error: /tmp/${step}.failed found:"; cat /tmp/${step}.log; exit 1; } || true; done;
+    {% if tpu_requested %}
+    # The /tmp/tpu_logs directory is where TPU-related logs, such as logs from
+    # the TPU runtime, are written. These capture runtime information about the
+    # TPU execution, including any warnings, errors, or general activity of
+    # the TPU driver. By default, the /tmp/tpu_logs directory is created with
+    # 755 permissions, and the user of the provisioned pod is not necessarily
+    # a root. Hence, we need to update the write permission so the logs can be
+    # properly written.
+    # TODO(Doyoung): Investigate to see why TPU workload fails to run without
+    # execution permission, such as granting 766 to log file. Check if it's a
+    # must and see if there's a workaround to grant minimum permission.
+    sudo chmod 777 /tmp/tpu_logs;
+    {% endif %}
 
 # Format: `REMOTE_PATH : LOCAL_PATH`
 file_mounts: {
@@ -365,6 +631,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
   {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
   {%- endfor %}
 }
 
sky/templates/lambda-ray.yml.j2:

@@ -5,9 +5,29 @@ max_workers: {{num_nodes - 1}}
 upscaling_speed: {{num_nodes - 1}}
 idle_timeout_minutes: 60
 
+{%- if docker_image is not none %}
+docker:
+  image: {{docker_image}}
+  container_name: {{docker_container_name}}
+  run_options:
+    - --ulimit nofile=1048576:1048576
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
+{%- endif %}
+
 provider:
   type: external
-  module: sky.skylet.providers.lambda_cloud.LambdaNodeProvider
+  module: sky.provision.lambda
   region: {{region}}
   # Disable launch config check for worker nodes as it can cause resource
   # leakage.
@@ -25,14 +45,6 @@ available_node_types:
     resources: {}
     node_config:
       InstanceType: {{instance_type}}
-  {% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      InstanceType: {{instance_type}}
-  {%- endif %}
 
 head_node_type: ray_head_default
 
@@ -42,6 +54,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
   {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
   {%- endfor %}
 }
 
@@ -58,13 +71,16 @@ initialization_commands: []
 # current num items (num SSH connections): 1
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'rm ..': there is another installation of pip.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo systemctl stop unattended-upgrades || true;
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
     sudo systemctl disable unattended-upgrades || true;
     sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
     sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -78,34 +94,8 @@ setup_commands:
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
 
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
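After this setup command runs, each node's SSH config (reconstructed from the printf above) lets nodes within a cluster reach each other with the shared cluster key:

cat ~/.ssh/config
# Host *
#  StrictHostKeyChecking no
#  IdentityFile ~/.ssh/sky-cluster-key
#  IdentityFile ~/.ssh/id_rsa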