skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.nebius
11
+ region: "{{region}}"
12
+
13
+ auth:
14
+ ssh_user: ubuntu
15
+ ssh_private_key: {{ssh_private_key}}
16
+
17
+ available_node_types:
18
+ ray_head_default:
19
+ resources: {}
20
+ node_config:
21
+ InstanceType: {{instance_type}}
22
+ ImageId: {{image_id}}
23
+ DiskSize: {{disk_size}}
24
+ UserData: |
25
+ users:
26
+ - name: skypilot:ssh_user
27
+ shell: /bin/bash
28
+ sudo: ALL=(ALL) NOPASSWD:ALL
29
+ ssh_authorized_keys:
30
+ - |-
31
+ skypilot:ssh_public_key_content
32
+
33
+ head_node_type: ray_head_default
34
+
35
+ # Format: `REMOTE_PATH : LOCAL_PATH`
36
+ file_mounts: {
37
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
38
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
39
+ {%- for remote_path, local_path in credentials.items() %}
40
+ "{{remote_path}}": "{{local_path}}",
41
+ {%- endfor %}
42
+ }
43
+
44
+ rsync_exclude: []
45
+
46
+ initialization_commands: []
47
+
48
+ # List of shell commands to run to set up nodes.
49
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
50
+ # connection, which is expensive. Try your best to co-locate commands into fewer
51
+ # items!
52
+ #
53
+ # Increment the following for catching performance bugs easier:
54
+ # current num items (num SSH connections): 1
55
+ setup_commands:
56
+ # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
57
+ # Create ~/.ssh/config file in case the file does not exist in the image.
58
+ # Line 'rm ..': there is another installation of pip.
59
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
60
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
61
+ # Line 'mkdir -p ..': disable host key check
62
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
63
+ - {%- for initial_setup_command in initial_setup_commands %}
64
+ {{ initial_setup_command }}
65
+ {%- endfor %}
66
+ sudo systemctl stop unattended-upgrades || true;
67
+ sudo systemctl disable unattended-upgrades || true;
68
+ sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
69
+ sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
70
+ sudo pkill -9 apt-get;
71
+ sudo pkill -9 dpkg;
72
+ sudo dpkg --configure -a;
73
+ mkdir -p ~/.ssh; touch ~/.ssh/config;
74
+ {{ conda_installation_commands }}
75
+ {{ ray_skypilot_installation_commands }}
76
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
77
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
78
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
79
+ [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
7
7
 
8
8
  provider:
9
9
  type: external
10
- module: sky.skylet.providers.oci.OCINodeProvider
10
+ module: sky.provision.oci
11
11
  region: {{region}}
12
12
  cache_stopped_nodes: True
13
13
  # Disable launch config check for worker nodes as it can cause resource leakage.
@@ -16,7 +16,11 @@ provider:
16
16
  disable_launch_config_check: true
17
17
 
18
18
  auth:
19
+ {% if os_type == "ubuntu" %}
19
20
  ssh_user: ubuntu
21
+ {% else %}
22
+ ssh_user: opc
23
+ {% endif %}
20
24
  ssh_private_key: {{ssh_private_key}}
21
25
 
22
26
  available_node_types:
@@ -35,25 +39,6 @@ available_node_types:
35
39
  Preemptible: {{use_spot}}
36
40
  AuthorizedKey: |
37
41
  skypilot:ssh_public_key_content
38
- {% if num_nodes > 1 %}
39
- ray_worker_default:
40
- min_workers: {{num_nodes - 1}}
41
- max_workers: {{num_nodes - 1}}
42
- resources: {}
43
- node_config:
44
- InstanceType: {{instance_type}}
45
- VCPUs: {{cpus}}
46
- MemoryInGbs: {{memory}}
47
- BootVolumeSize: {{disk_size}}
48
- BootVolumePerf: {{vpu}}
49
- AvailabilityDomain: {{zone}}
50
- ImageId: {{image}}
51
- AppCatalogListingId: {{app_catalog_listing_id}}
52
- ResourceVersion: {{resource_version}}
53
- Preemptible: {{use_spot}}
54
- AuthorizedKey: |
55
- skypilot:ssh_public_key_content
56
- {%- endif %}
57
42
 
58
43
  head_node_type: ray_head_default
59
44
 
@@ -63,12 +48,10 @@ file_mounts: {
63
48
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
64
49
  {%- for remote_path, local_path in credentials.items() %}
65
50
  "{{remote_path}}": "{{local_path}}",
51
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
66
52
  {%- endfor %}
67
53
  }
68
54
 
69
- rsync_exclude: []
70
-
71
- initialization_commands: []
72
55
 
73
56
  # List of shell commands to run to set up nodes.
74
57
  # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
@@ -79,58 +62,36 @@ initialization_commands: []
79
62
  # current num items (num SSH connections): 1
80
63
  setup_commands:
81
64
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
82
- # Create ~/.ssh/config file in case the file does not exist in the image.
65
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
83
66
  # Line 'rm ..': there is another installation of pip.
84
67
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
85
68
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
86
69
  # Line 'mkdir -p ..': disable host key check
87
70
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
88
- - sudo systemctl stop unattended-upgrades || true;
71
+ - echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true;
72
+ {%- if os_type == "ubuntu" %}
73
+ sudo systemctl stop unattended-upgrades || true;
89
74
  sudo systemctl disable unattended-upgrades || true;
90
75
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
91
76
  sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
92
77
  sudo pkill -9 apt-get;
93
78
  sudo pkill -9 dpkg;
94
79
  sudo dpkg --configure -a;
95
- ([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true);
80
+ {%- else %}
81
+ sudo /usr/libexec/oci-growfs -y || true;
82
+ sudo systemctl stop firewalld || true;
83
+ sudo systemctl disable firewalld || true;
84
+ {%- endif %}
96
85
  mkdir -p ~/.ssh; touch ~/.ssh/config;
97
86
  {{ conda_installation_commands }}
98
87
  {{ ray_skypilot_installation_commands }}
99
88
  touch ~/.sudo_as_admin_successful;
100
89
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
101
90
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
102
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
91
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
103
92
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
104
93
  sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT;
105
94
 
106
- # Command to start ray on the head node. You don't need to change this.
107
- # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
108
- # connection, which is expensive. Try your best to co-locate commands into fewer
109
- # items! The same comment applies for worker_start_ray_commands.
110
- #
111
- # Increment the following for catching performance bugs easier:
112
- # current num items (num SSH connections): 2
113
- head_start_ray_commands:
114
- # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
115
- # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
116
- # all the sessions to be reloaded. This is a workaround.
117
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
118
- which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
119
- {{dump_port_command}}; {{ray_head_wait_initialized_command}}
120
-
121
- {%- if num_nodes > 1 %}
122
- worker_start_ray_commands:
123
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
124
- which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
125
- {%- else %}
126
- worker_start_ray_commands: []
127
- {%- endif %}
128
-
129
- head_node: {}
130
- worker_nodes: {}
95
+ # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
96
+ # We do not need to list them here anymore.
131
97
 
132
- # These fields are required for external cloud providers.
133
- head_setup_commands: []
134
- worker_setup_commands: []
135
- cluster_synced_files: []
136
- file_mounts_sync_continuously: False
@@ -11,9 +11,9 @@ docker:
11
11
  container_name: {{docker_container_name}}
12
12
  run_options:
13
13
  - --ulimit nofile=1048576:1048576
14
- {%- if custom_resources is not none %}
15
- --gpus all
16
- {%- endif %}
14
+ {%- for run_option in docker_run_options %}
15
+ - {{run_option}}
16
+ {%- endfor %}
17
17
  {%- if docker_login_config is not none %}
18
18
  docker_login_config:
19
19
  username: |-
@@ -51,6 +51,7 @@ file_mounts: {
51
51
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
52
52
  {%- for remote_path, local_path in credentials.items() %}
53
53
  "{{remote_path}}": "{{local_path}}",
54
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
54
55
  {%- endfor %}
55
56
  }
56
57
 
@@ -67,13 +68,16 @@ initialization_commands: []
67
68
  # current num items (num SSH connections): 1
68
69
  setup_commands:
69
70
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
70
- # Create ~/.ssh/config file in case the file does not exist in the image.
71
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
71
72
  # Line 'rm ..': there is another installation of pip.
72
73
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
73
74
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
74
75
  # Line 'mkdir -p ..': disable host key check
75
76
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
76
- - sudo systemctl stop unattended-upgrades || true;
77
+ - {%- for initial_setup_command in initial_setup_commands %}
78
+ {{ initial_setup_command }}
79
+ {%- endfor %}
80
+ sudo systemctl stop unattended-upgrades || true;
77
81
  sudo systemctl disable unattended-upgrades || true;
78
82
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
79
83
  sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -85,5 +89,5 @@ setup_commands:
85
89
  {{ ray_skypilot_installation_commands }}
86
90
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
87
91
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
88
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
92
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
89
93
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
@@ -10,9 +10,22 @@ provider:
10
10
  module: sky.provision.runpod
11
11
  region: "{{region}}"
12
12
  disable_launch_config_check: true
13
+ # For RunPod, we directly set the image id for the docker as runtime environment
14
+ # support, thus we need to prevent the DockerInitializer from detecting the docker field
15
+ # and performing the initialization. Therefore we put the docker login config in
16
+ # the provider config here.
17
+ {%- if docker_login_config is not none %}
18
+ docker_login_config:
19
+ username: |-
20
+ {{docker_login_config.username}}
21
+ password: |-
22
+ {{docker_login_config.password}}
23
+ server: |-
24
+ {{docker_login_config.server}}
25
+ {%- endif %}
13
26
 
14
27
  auth:
15
- ssh_user: root
28
+ ssh_user: {{docker_username_for_runpod}}
16
29
  ssh_private_key: {{ssh_private_key}}
17
30
 
18
31
  available_node_types:
@@ -21,6 +34,11 @@ available_node_types:
21
34
  node_config:
22
35
  InstanceType: {{instance_type}}
23
36
  DiskSize: {{disk_size}}
37
+ ImageId: {{image_id}}
38
+ PublicKey: |-
39
+ skypilot:ssh_public_key_content
40
+ Preemptible: {{use_spot}}
41
+ BidPerGPU: {{bid_per_gpu}}
24
42
 
25
43
  head_node_type: ray_head_default
26
44
 
@@ -30,6 +48,7 @@ file_mounts: {
30
48
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
31
49
  {%- for remote_path, local_path in credentials.items() %}
32
50
  "{{remote_path}}": "{{local_path}}",
51
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
33
52
  {%- endfor %}
34
53
  }
35
54
 
@@ -46,13 +65,16 @@ initialization_commands: []
46
65
  # current num items (num SSH connections): 1
47
66
  setup_commands:
48
67
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
49
- # Create ~/.ssh/config file in case the file does not exist in the image.
68
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
50
69
  # Line 'rm ..': there is another installation of pip.
51
70
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
52
71
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
53
72
  # Line 'mkdir -p ..': disable host key check
54
73
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
55
- - sudo systemctl stop unattended-upgrades || true;
74
+ - {%- for initial_setup_command in initial_setup_commands %}
75
+ {{ initial_setup_command }}
76
+ {%- endfor %}
77
+ sudo systemctl stop unattended-upgrades || true;
56
78
  sudo systemctl disable unattended-upgrades || true;
57
79
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
58
80
  sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -65,7 +87,7 @@ setup_commands:
65
87
  touch ~/.sudo_as_admin_successful;
66
88
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
67
89
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
68
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
90
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
69
91
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
70
92
 
71
93
  # Command to start ray clusters are now placed in `sky.provision.instance_setup`.
@@ -46,6 +46,7 @@ file_mounts: {
46
46
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
47
47
  {%- for remote_path, local_path in credentials.items() %}
48
48
  "{{remote_path}}": "{{local_path}}",
49
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
49
50
  {%- endfor %}
50
51
  }
51
52
 
@@ -61,8 +62,8 @@ initialization_commands: []
61
62
  # Increment the following for catching performance bugs easier:
62
63
  # current num items (num SSH connections): 1
63
64
  setup_commands:
65
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
64
66
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
65
- # Create ~/.ssh/config file in case the file does not exist in the custom image.
66
67
  # We set auto_activate_base to be false for pre-installed conda.
67
68
  # This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
68
69
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
@@ -74,7 +75,7 @@ setup_commands:
74
75
  {{ ray_skypilot_installation_commands }}
75
76
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
76
77
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
77
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
78
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
78
79
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
79
80
 
80
81
  # Command to start ray on the head node. You don't need to change this.
@@ -15,6 +15,11 @@ setup: |
15
15
  {{cmd}}
16
16
  {%- endfor %}
17
17
 
18
+ {% if controller_envs.get('SKYPILOT_DEV') != '0' %}
19
+ grep -q 'export SKYPILOT_DEV=' ~/.bashrc || echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
20
+ grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
21
+ {% endif %}
22
+
18
23
  # Install serve dependencies.
19
24
  # TODO(tian): Gather those into serve constants.
20
25
  pip list | grep uvicorn > /dev/null 2>&1 || pip install uvicorn > /dev/null 2>&1
@@ -23,10 +28,16 @@ setup: |
23
28
 
24
29
  file_mounts:
25
30
  {{remote_task_yaml_path}}: {{local_task_yaml_path}}
26
- {{remote_user_config_path}}: skypilot:local_skypilot_config_path
31
+ {%- if local_user_config_path is not none %}
32
+ {{remote_user_config_path}}: {{local_user_config_path}}
33
+ {%- endif %}
27
34
  {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
28
35
  {{remote_catalog_path}}: {{local_catalog_path}}
29
36
  {%- endfor %}
37
+ {%- if use_tls %}
38
+ {{remote_tls_keyfile}}: {{local_tls_keyfile}}
39
+ {{remote_tls_certfile}}: {{local_tls_certfile}}
40
+ {%- endif %}
30
41
 
31
42
  run: |
32
43
  # Activate the Python environment, so that cloud SDKs can be found in the
@@ -0,0 +1,36 @@
1
#!/bin/bash
# Relay stdin/stdout to the API server's /ssh-proxy endpoint through `nc`,
# after sending a hand-built WebSocket upgrade request.
#
# Usage: <script> <api_server> <context> <namespace> <pod_name>
#   api_server is split on ':' below, so it is assumed to be `host:port`.

# Check if all required arguments are provided
if [ "$#" -ne 4 ]; then
    echo "Usage: $0 <api_server> <context> <namespace> <pod_name>" >&2
    exit 1
fi

API_SERVER="$1"
CONTEXT="$2"
NAMESPACE="$3"
POD_NAME="$4"

# Extract host and port from API_SERVER.
# The expansion is quoted so that whitespace or glob characters in the
# argument cannot be word-split or expanded by the shell.
HOST=$(printf '%s\n' "$API_SERVER" | cut -d: -f1)
PORT=$(printf '%s\n' "$API_SERVER" | cut -d: -f2)

# Check if nc is installed
if ! command -v nc &> /dev/null
then
    echo "nc (netcat) could not be found. Please install it first." >&2
    echo "You can install it using: sudo apt-get install netcat" >&2
    exit 1
fi

# Construct the WebSocket upgrade request.
# The "\r\n" sequences are kept literal here and expanded by printf '%b' below.
UPGRADE_REQUEST="GET /ssh-proxy?context=$CONTEXT&namespace=$NAMESPACE&pod_name=$POD_NAME HTTP/1.1\r\n"
UPGRADE_REQUEST+="Host: $API_SERVER\r\n"
UPGRADE_REQUEST+="Upgrade: websocket\r\n"
UPGRADE_REQUEST+="Connection: Upgrade\r\n"
UPGRADE_REQUEST+="Sec-WebSocket-Key: dGhlIHNhbXBsZSBub25jZQ==\r\n"
UPGRADE_REQUEST+="Sec-WebSocket-Version: 13\r\n"
UPGRADE_REQUEST+="\r\n"

# Send the upgrade request and then relay data.
# printf '%b' interprets the backslash escapes like `echo -e`, without the
# option-parsing pitfalls of echo; host and port are quoted for nc.
(printf '%b' "$UPGRADE_REQUEST"; cat) | nc "$HOST" "$PORT"
@@ -0,0 +1,70 @@
1
+ cluster_name: {{cluster_name_on_cloud}}
2
+
3
+ # The maximum number of worker nodes to launch in addition to the head node.
4
+ max_workers: {{num_nodes - 1}}
5
+ upscaling_speed: {{num_nodes - 1}}
6
+ idle_timeout_minutes: 60
7
+
8
+ provider:
9
+ type: external
10
+ module: sky.provision.vast
11
+ region: "{{region}}"
12
+ disable_launch_config_check: true
13
+
14
+ auth:
15
+ ssh_user: root
16
+ ssh_private_key: {{ssh_private_key}}
17
+
18
+ available_node_types:
19
+ ray_head_default:
20
+ resources: {}
21
+ node_config:
22
+ InstanceType: {{instance_type}}
23
+ DiskSize: {{disk_size}}
24
+ ImageId: {{image_id}}
25
+ Preemptible: {{use_spot}}
26
+ PublicKey: |-
27
+ skypilot:ssh_public_key_content
28
+
29
+ head_node_type: ray_head_default
30
+
31
+ # Format: `REMOTE_PATH : LOCAL_PATH`
32
+ file_mounts: {
33
+ "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
34
+ "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
35
+ {%- for remote_path, local_path in credentials.items() %}
36
+ "{{remote_path}}": "{{local_path}}",
37
+ {%- endfor %}
38
+ }
39
+
40
+ rsync_exclude: []
41
+
42
+ initialization_commands: []
43
+
44
+ # List of shell commands to run to set up nodes.
45
+ # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
46
+ # connection, which is expensive. Try your best to co-locate commands into fewer
47
+ # items!
48
+ #
49
+ # Increment the following for catching performance bugs easier:
50
+ # current num items (num SSH connections): 1
51
+ setup_commands:
52
+ # Create ~/.ssh/config file in case the file does not exist in the image.
53
+ # Line 'rm ..': there is another installation of pip.
54
+ # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
55
+ # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
56
+ # Line 'mkdir -p ..': disable host key check
57
+ # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
58
+ - {%- for initial_setup_command in initial_setup_commands %}
59
+ {{ initial_setup_command }}
60
+ {%- endfor %}
61
+ mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
62
+ {{ conda_installation_commands }}
63
+ {{ ray_skypilot_installation_commands }}
64
+ sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
65
+ sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
66
+ (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
67
+
68
+
69
+ # Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
70
+ # We do not need to list them here anymore.
@@ -14,6 +14,7 @@ provider:
14
14
  auth:
15
15
  ssh_user: ubuntu
16
16
  ssh_private_key: {{ssh_private_key}}
17
+ ssh_public_key: skypilot:ssh_public_key_content
17
18
 
18
19
  available_node_types:
19
20
  ray.head.default:
@@ -29,6 +30,7 @@ file_mounts: {
29
30
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
30
31
  {%- for remote_path, local_path in credentials.items() %}
31
32
  "{{remote_path}}": "{{local_path}}",
33
+ "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
32
34
  {%- endfor %}
33
35
  }
34
36
 
@@ -45,13 +47,16 @@ initialization_commands: []
45
47
  # current num items (num SSH connections): 1
46
48
  setup_commands:
47
49
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
48
- # Create ~/.ssh/config file in case the file does not exist in the image.
50
+ # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
49
51
  # Line 'rm ..': there is another installation of pip.
50
52
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
51
53
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
52
54
  # Line 'mkdir -p ..': disable host key check
53
55
  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
54
- - sudo systemctl stop unattended-upgrades || true;
56
+ - {%- for initial_setup_command in initial_setup_commands %}
57
+ {{ initial_setup_command }}
58
+ {%- endfor %}
59
+ sudo systemctl stop unattended-upgrades || true;
55
60
  sudo systemctl disable unattended-upgrades || true;
56
61
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
57
62
  sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -64,5 +69,5 @@ setup_commands:
64
69
  {{ ray_skypilot_installation_commands }}
65
70
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
66
71
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
67
- mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
72
+ mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
68
73
  [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ """Starting a websocket with SkyPilot API server to proxy SSH to a k8s pod.
3
+
4
+ This script is useful for users who do not have local Kubernetes credentials.
5
+ """
6
+ import asyncio
7
+ import os
8
+ import sys
9
+
10
+ import websockets
11
+
12
+
13
async def main(url: str) -> None:
    """Relay local stdin/stdout over a websocket connection to ``url``.

    When stdin is a terminal it is switched to raw mode for the lifetime of
    the relay, and its previous attributes are restored on exit, including
    when the relay raises.

    Args:
        url: Websocket URL to connect to.
    """
    async with websockets.connect(url, ping_interval=None) as websocket:
        stdin_fd = sys.stdin.fileno()
        saved_tty_attrs = None
        if os.isatty(stdin_fd):
            # pylint: disable=import-outside-toplevel
            import termios
            import tty

            # Remember the current settings before entering raw mode so the
            # terminal can be put back the way it was found.
            saved_tty_attrs = termios.tcgetattr(stdin_fd)
            tty.setraw(stdin_fd)

        try:
            # Run both relay directions concurrently until they finish.
            await asyncio.gather(stdin_to_websocket(websocket),
                                 websocket_to_stdout(websocket))
        finally:
            # `termios` is only bound when the tty branch above ran, which is
            # exactly when `saved_tty_attrs` is set.
            if saved_tty_attrs:
                termios.tcsetattr(stdin_fd, termios.TCSADRAIN,
                                  saved_tty_attrs)
31
+
32
+
33
async def stdin_to_websocket(websocket):
    """Forward everything read from stdin to ``websocket`` until EOF.

    The blocking stdin reads run in the event loop's default executor so the
    loop stays free for the opposite relay direction. Any error is reported
    on stderr instead of being raised, and the websocket is always closed
    when this coroutine finishes.

    Args:
        websocket: Object exposing awaitable ``send(data)`` and ``close()``.
    """
    # Upper bound for one read. `read1` returns as soon as any data is
    # available (at most one underlying read), so a large bound improves
    # throughput for bulk transfers without delaying interactive keystrokes.
    max_chunk_size = 65536
    try:
        loop = asyncio.get_running_loop()
        while True:
            data = await loop.run_in_executor(None, sys.stdin.buffer.read1,
                                              max_chunk_size)
            if not data:
                # Empty read means EOF on stdin.
                break
            await websocket.send(data)
    except Exception as e:  # pylint: disable=broad-except
        print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
    finally:
        await websocket.close()
45
+
46
+
47
async def websocket_to_stdout(websocket):
    """Write every message received from ``websocket`` to stdout.

    Runs until ``recv()`` raises. A closed connection and any other error are
    both reported on stderr rather than raised.

    Args:
        websocket: Object exposing an awaitable ``recv()``.
    """
    try:
        loop = asyncio.get_running_loop()
        while True:
            message = await websocket.recv()
            if isinstance(message, str):
                # Text frames are delivered as `str`, but `stdout.buffer`
                # only accepts bytes; encode instead of failing the relay.
                message = message.encode('utf-8')
            sys.stdout.buffer.write(message)
            # Flush in the executor so a slow consumer does not block the
            # event loop.
            await loop.run_in_executor(None, sys.stdout.buffer.flush)
    except websockets.exceptions.ConnectionClosed:
        print('WebSocket connection closed', file=sys.stderr)
    except Exception as e:  # pylint: disable=broad-except
        print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
58
+
59
+
60
if __name__ == '__main__':
    # Expected invocation: <script> <server_url> <cluster_name>
    # Fail with a usage message instead of an IndexError traceback when an
    # argument is missing. Extra arguments are tolerated, as before.
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <server_url> <cluster_name>',
              file=sys.stderr)
        sys.exit(1)
    # Drop leading/trailing slashes so the path below joins cleanly.
    server_url = sys.argv[1].strip('/')
    websocket_url = (f'ws://{server_url}/kubernetes-pod-ssh-proxy'
                     f'?cluster_name={sys.argv[2]}')
    asyncio.run(main(websocket_url))
sky/usage/constants.py CHANGED
@@ -3,7 +3,6 @@
3
3
  LOG_URL = 'http://usage.skypilot.co:9090/loki/api/v1/push' # pylint: disable=line-too-long
4
4
 
5
5
  USAGE_MESSAGE_SCHEMA_VERSION = 1
6
-
7
6
  PRIVACY_POLICY_PATH = '~/.sky/privacy_policy'
8
7
 
9
8
  USAGE_POLICY_MESSAGE = (
@@ -15,3 +14,13 @@ USAGE_POLICY_MESSAGE = (
15
14
 
16
15
  USAGE_MESSAGE_REDACT_KEYS = ['setup', 'run', 'envs']
17
16
  USAGE_MESSAGE_REDACT_TYPES = {str, dict}
17
+
18
+ # Env var for the usage run id. This is used by the API server to associate
19
+ # the usage run id of a request from client to the actual functions invoked.
20
+ USAGE_RUN_ID_ENV_VAR = 'SKYPILOT_USAGE_RUN_ID'
21
+
22
+ # The file stores the usage run id on a remote cluster, so that the heartbeat
23
+ # on that remote cluster can be associated with the usage run id. This file is
24
+ # initialized when the cluster is first launched in:
25
+ # sky.provision.instance_setup.start_skylet_on_head_node
26
+ USAGE_RUN_ID_FILE = '~/.sky/usage_run_id'