skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -11,9 +11,9 @@ docker:
   container_name: {{docker_container_name}}
   run_options:
     - --ulimit nofile=1048576:1048576
-    {%- if custom_resources is not none %}
-    - --gpus all
-    {%- endif %}
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
   {%- if docker_login_config is not none %}
   docker_login_config:
     username: |-
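Across the provider templates in this release, the hard-coded `--gpus all` guard is replaced by a `docker_run_options` loop, so callers can inject arbitrary `docker run` flags. A minimal sketch of what the new snippet renders to, using plain Jinja2 (illustrative only, not SkyPilot's own rendering code):

import jinja2

snippet = jinja2.Template(
    'run_options:\n'
    '  - --ulimit nofile=1048576:1048576\n'
    '  {%- for run_option in docker_run_options %}\n'
    '  - {{run_option}}\n'
    '  {%- endfor %}')

# Any docker-run flags can now be injected; passing ['--gpus all']
# reproduces the old hardcoded behavior exactly.
print(snippet.render(docker_run_options=['--gpus all', '--shm-size=2g']))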
@@ -72,8 +72,11 @@ available_node_types:
           Ebs:
             VolumeSize: {{disk_size}}
             VolumeType: {{disk_tier}}
-            {% if custom_disk_perf %}
+            Encrypted: {{disk_encrypted}}
+            {% if disk_iops %}
             Iops: {{disk_iops}}
+            {% endif %}
+            {% if disk_throughput %}
             Throughput: {{disk_throughput}}
             {% endif %}
       {% if use_spot %}
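The single `custom_disk_perf` guard is split so that `Iops` and `Throughput` are emitted independently, and `Encrypted` is always written. A hedged sketch of the resulting behavior, with variable names taken from the hunk above (whitespace-control markers added here for clean output):

import jinja2

ebs = jinja2.Template(
    'Ebs:\n'
    '  VolumeSize: {{disk_size}}\n'
    '  VolumeType: {{disk_tier}}\n'
    '  Encrypted: {{disk_encrypted}}\n'
    '  {%- if disk_iops %}\n'
    '  Iops: {{disk_iops}}\n'
    '  {%- endif %}\n'
    '  {%- if disk_throughput %}\n'
    '  Throughput: {{disk_throughput}}\n'
    '  {%- endif %}')

# Renders Iops but omits Throughput entirely, which was impossible under
# the old both-or-neither `custom_disk_perf` flag.
print(ebs.render(disk_size=256, disk_tier='gp3', disk_encrypted=True,
                 disk_iops=6000, disk_throughput=None))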
@@ -83,6 +86,12 @@ available_node_types:
       # SpotOptions:
       #   MaxPrice: MAX_HOURLY_PRICE
       {% endif %}
+      CapacityReservationSpecification:
+        CapacityReservationPreference: open
+        {% if specific_reservations %}
+        CapacityReservationTarget:
+          CapacityReservationId: {{specific_reservations}}
+        {% endif %}
       # Use cloud init in UserData to set up the authorized_keys to get
       # around the number of keys limit and permission issues with
       # ec2.describe_key_pairs.
@@ -113,6 +122,15 @@ available_node_types:
           - path: /etc/apt/apt.conf.d/10cloudinit-disable
             content: |
               APT::Periodic::Enable "0";
+          - path: /etc/apt/apt.conf.d/52unattended-upgrades-local
+            content: |
+              Unattended-Upgrade::DevRelease "false";
+              Unattended-Upgrade::Allowed-Origins {};
+          bootcmd:
+            - systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service
+            - systemctl disable apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service
+            - systemctl mask apt-daily.service apt-daily-upgrade.service unattended-upgrades.service
+            - systemctl daemon-reload
           TagSpecifications:
             - ResourceType: instance
               Tags:
@@ -122,6 +140,9 @@ available_node_types:
                 - Key: {{ label_key }}
                   Value: {{ label_value|tojson }}
                 {%- endfor %}
+      # Use IDMSv2
+      MetadataOptions:
+        HttpTokens: required
 
 head_node_type: ray.head.default
 
@@ -131,6 +152,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -143,7 +165,7 @@ file_mounts: {
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1
 setup_commands:
-  # Create ~/.ssh/config file in case the file does not exist in the custom image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # We set auto_activate_base to be false for pre-installed conda.
   # This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
   # Line "conda config --remove channels": remove the default channel set in the default AWS image as it cannot be accessed.
@@ -152,7 +174,12 @@ setup_commands:
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  # Line 'mkdir -p ~/.ssh ...': adding the key in the ssh config to allow interconnection for nodes in the cluster
+  # Line 'rm ~/.aws/credentials': explicitly remove the credentials file to be safe. This is to guard against the case where the credential files was uploaded once as `remote_identity` was not set in a previous launch.
   - mkdir -p ~/.ssh; touch ~/.ssh/config;
+    {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
     {{ conda_installation_commands }}
     conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
     {{ ray_skypilot_installation_commands }}
@@ -160,8 +187,11 @@ setup_commands:
     {%- if docker_image is none %}
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     {%- endif %}
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
-    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
+    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
+    {%- if remote_identity != 'LOCAL_CREDENTIALS' %}
+    rm ~/.aws/credentials || true;
+    {%- endif %}
 
 # Command to start ray clusters are now placed in `sky.provision.instance_setup`.
 # We do not need to list it here anymore.
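Several templates in this release now append `IdentityFile` entries to `~/.ssh/config` via the `grep -Pzo ... || printf ...` one-liner. The pattern is an idempotent append: write the block only if it is not already present, so repeated provisioning runs do not duplicate it. A simplified Python equivalent (the shell version matches a regex across lines; this sketch checks an exact substring):

import os

BLOCK = ('Host *\n'
         '  StrictHostKeyChecking no\n'
         '  IdentityFile ~/.ssh/sky-cluster-key\n'
         '  IdentityFile ~/.ssh/id_rsa\n')

def ensure_ssh_config(path: str = '~/.ssh/config') -> None:
    path = os.path.expanduser(path)
    os.makedirs(os.path.dirname(path), exist_ok=True)  # mkdir -p ~/.ssh
    existing = open(path).read() if os.path.exists(path) else ''
    if BLOCK not in existing:                          # grep -Pzo -q ... ||
        with open(path, 'a') as f:                     # printf ... >> config
            f.write(BLOCK)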
@@ -11,29 +11,30 @@ docker:
   container_name: {{docker_container_name}}
   run_options:
     - --ulimit nofile=1048576:1048576
-    {%- if custom_resources is not none %}
-    - --gpus all
-    {%- endif %}
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
 {%- endif %}
 
 provider:
   type: external
-  module: sky.skylet.providers.azure.AzureNodeProvider
+  module: sky.provision.azure
   location: {{region}}
   # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
   # For Azure, ray distinguishes different instances by the resource_group,
   # instead of the cluster_name. This ensures that ray creates new instances
   # for different cluster_name.
   resource_group: {{resource_group}}
-  {%- if docker_login_config is not none %}
-  # We put docker login config in provider section because ray's schema disabled
-  # additionalProperties for docker config.
-  # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227
-  docker_login_config:
-    username: {{docker_login_config.username}}
-    password: {{docker_login_config.password}}
-    server: {{docker_login_config.server}}
-  {%- endif %}
+  use_external_resource_group: {{use_external_resource_group}}
   # Keep (otherwise cannot reuse when re-provisioning).
   # teardown(terminate=True) will override this.
   cache_stopped_nodes: True
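The `module:` switch from `sky.skylet.providers.azure.AzureNodeProvider` (a Ray autoscaler NodeProvider class) to `sky.provision.azure` (a module under SkyPilot's own provisioner) recurs for GCP below, and the file list shows the same migration for Lambda and OCI (`sky/provision/lambda_cloud/`, `sky/provision/oci/` added; the corresponding `sky/skylet/providers/*` trees removed). A hedged sketch of how such a dotted module string can be resolved at runtime; the actual dispatch lives in SkyPilot's provisioner (see `sky/provision/__init__.py` in the file list) and may differ:

import importlib

# provider section parsed from the rendered cluster YAML:
provider_config = {'type': 'external', 'module': 'sky.provision.azure'}

# Resolve the dotted path to a module; provisioning entry points are then
# looked up as module-level functions rather than NodeProvider methods.
provision_module = importlib.import_module(provider_config['module'])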
@@ -67,45 +68,22 @@ available_node_types:
           imageOffer: {{image_offer}}
           imageSku: "{{image_sku}}"
           imageVersion: {{image_version}}
+          # Community Gallery Image ID
+          communityGalleryImageId: {{community_gallery_image_id}}
           osDiskSizeGB: {{disk_size}}
           osDiskTier: {{disk_tier}}
-          cloudInitSetupCommands: {{cloud_init_setup_commands}}
-          # optionally set priority to use Spot instances
           {%- if use_spot %}
+          # optionally set priority to use Spot instances
           priority: Spot
-          # set a maximum price for spot instances if desired
-          # billingProfile:
-          #     maxPrice: -1
           {%- endif %}
+          cloudInitSetupCommands: |-
+            {%- for cmd in cloud_init_setup_commands %}
+            {{ cmd }}
+            {%- endfor %}
+          {%- if disk_performance_tier is not none %}
+          disk_performance_tier: {{disk_performance_tier}}
+          {%- endif %}
       # TODO: attach disk
-    {% if num_nodes > 1 %}
-    ray.worker.default:
-      min_workers: {{num_nodes - 1}}
-      max_workers: {{num_nodes - 1}}
-      resources: {}
-      node_config:
-        tags:
-          skypilot-user: {{ user }}
-        azure_arm_parameters:
-          adminUsername: skypilot:ssh_user
-          publicKey: |
-            skypilot:ssh_public_key_content
-          vmSize: {{instance_type}}
-          # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
-          imagePublisher: {{image_publisher}}
-          imageOffer: {{image_offer}}
-          imageSku: "{{image_sku}}"
-          imageVersion: {{image_version}}
-          osDiskSizeGB: {{disk_size}}
-          osDiskTier: {{disk_tier}}
-          cloudInitSetupCommands: {{cloud_init_setup_commands}}
-          {%- if use_spot %}
-          priority: Spot
-          # set a maximum price for spot instances if desired
-          # billingProfile:
-          #     maxPrice: -1
-          {%- endif %}
-    {%- endif %}
 
 head_node_type: ray.head.default
 
@@ -115,12 +93,10 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
-rsync_exclude: []
-
-initialization_commands: []
 
 # List of shell commands to run to set up nodes.
 # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
@@ -130,7 +106,7 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1
 setup_commands:
-  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'sudo systemctl stop jupyter ..': stop jupyter service to avoid port conflict on 8888
@@ -151,37 +127,6 @@ setup_commands:
     sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;
     sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;
     {%- endif %}
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
     sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true;
-
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}};
-    {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
@@ -32,6 +32,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -48,13 +49,16 @@ initialization_commands: [ ]
 # current num items (num SSH connections): 1
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'rm ..': there is another installation of pip.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo systemctl stop unattended-upgrades || true;
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
     sudo systemctl disable unattended-upgrades || true;
     sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
     sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -67,5 +71,5 @@ setup_commands:
     touch ~/.sudo_as_admin_successful;
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
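Note the shape of the new `initial_setup_commands` hook: the loop is spliced into the first `setup_commands` item rather than adding new list items, because, per the template's own NOTE, each item costs one SSH connection. A small Jinja2 sketch of the rendered result (illustrative command only):

import jinja2

item = jinja2.Template(
    'setup_commands:\n'
    '  - {%- for initial_setup_command in initial_setup_commands %}\n'
    '    {{ initial_setup_command }}\n'
    '    {%- endfor %}\n'
    '    sudo systemctl stop unattended-upgrades || true;')

# The extra command lands inside the same YAML list item as the stock
# commands, so no additional SSH round trip is introduced.
print(item.render(initial_setup_commands=['echo "extra step";']))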
@@ -0,0 +1,98 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+{%- if docker_image is not none %}
+docker:
+  image: {{docker_image}}
+  container_name: {{docker_container_name}}
+  run_options:
+    - --ulimit nofile=1048576:1048576
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
+{%- endif %}
+
+provider:
+  type: external
+  module: sky.provision.do
+  region: "{{region}}"
+
+auth:
+  ssh_user: root
+  ssh_private_key: {{ssh_private_key}}
+  ssh_public_key: |-
+    skypilot:ssh_public_key_content
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      DiskSize: {{disk_size}}
+      {%- if image_id is not none %}
+      ImageId: {{image_id}}
+      {%- endif %}
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
+    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
+    sudo pkill -9 apt-get;
+    sudo pkill -9 dpkg;
+    sudo dpkg --configure -a;
+    mkdir -p ~/.ssh; touch ~/.ssh/config;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
+
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
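This new-file hunk matches `sky/templates/do-ray.yml.j2` (+98 in the file list), the cluster template backing the new DigitalOcean support (`sky/clouds/do.py`, `sky/provision/do/*`). A hedged sketch of rendering it end to end with made-up values, e.g. to eyeball the generated Ray cluster YAML; this is not SkyPilot's own test code and all values below are illustrative:

import jinja2

with open('sky/templates/do-ray.yml.j2') as f:
    template = jinja2.Template(f.read())

rendered = template.render(
    cluster_name_on_cloud='sky-demo', num_nodes=2, docker_image=None,
    region='nyc3', instance_type='s-4vcpu-8gb', disk_size=100, image_id=None,
    ssh_private_key='~/.ssh/sky-key',
    sky_ray_yaml_remote_path='~/.sky/ray.yml',
    sky_ray_yaml_local_path='ray.yml',
    sky_remote_path='~/.sky/wheels', sky_wheel_hash='abc123',
    sky_local_path='/tmp/wheel', credentials={}, initial_setup_commands=[],
    conda_installation_commands='true;',
    ray_skypilot_installation_commands='true;')

# Print the head of the generated cluster YAML for inspection.
print('\n'.join(rendered.splitlines()[:12]))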
@@ -33,6 +33,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -49,26 +50,28 @@ initialization_commands: []
 # current num items (num SSH connections): 1
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Line 'rm ..': there is another installation of pip.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo systemctl stop unattended-upgrades || true;
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
    sudo systemctl disable unattended-upgrades || true;
    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
    sudo pkill -9 apt-get;
    sudo pkill -9 dpkg;
    sudo dpkg --configure -a;
-   {{ cuda_installation_commands }}
    mkdir -p ~/.ssh; touch ~/.ssh/config;
    {{ conda_installation_commands }}
    {{ ray_skypilot_installation_commands }}
    touch ~/.sudo_as_admin_successful;
    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
 
@@ -12,9 +12,9 @@ docker:
   container_name: {{docker_container_name}}
   run_options:
     - --ulimit nofile=1048576:1048576
-    {%- if gpu is not none %}
-    - --gpus all
-    {%- endif %}
+    {%- for run_option in docker_run_options %}
+    - {{run_option}}
+    {%- endfor %}
   {%- if docker_login_config is not none %}
   docker_login_config:
     username: |-
@@ -29,7 +29,7 @@ docker:
 provider:
   # We use a custom node provider for GCP to create, stop and reuse instances.
   type: external # type: gcp
-  module: sky.skylet.providers.gcp.GCPNodeProvider
+  module: sky.provision.gcp
   region: {{region}}
   availability_zone: {{zones}}
   # Keep (otherwise cannot reuse when re-provisioning).
@@ -47,6 +47,7 @@ provider:
   firewall_rule: {{firewall_rule}}
   {% endif %}
   use_internal_ips: {{use_internal_ips}}
+  force_enable_external_ips: {{force_enable_external_ips}}
   {%- if tpu_vm %}
   _has_tpus: True
   {%- endif %}
@@ -62,6 +63,10 @@ provider:
   # The upper-level SkyPilot code has make sure there will not be resource
   # leakage.
   disable_launch_config_check: true
+  use_managed_instance_group: {{ gcp_use_managed_instance_group }}
+  {%- if enable_gvnic %}
+  enable_gvnic: {{ enable_gvnic }}
+  {%- endif %}
 
 auth:
   ssh_user: gcpuser
@@ -79,6 +84,14 @@ available_node_types:
         {%- for label_key, label_value in labels.items() %}
         {{ label_key }}: {{ label_value|tojson }}
         {%- endfor %}
+        use-managed-instance-group: {{ gcp_use_managed_instance_group_value|tojson }}
+        {%- if gcp_use_managed_instance_group %}
+        managed-instance-group:
+          run_duration: {{ run_duration }}
+          {%- if provision_timeout is defined and provision_timeout is not none %}
+          provision_timeout: {{ provision_timeout }}
+          {%- endif %}
+        {%- endif %}
       {%- if specific_reservations %}
       reservationAffinity:
         consumeReservationType: SPECIFIC_RESERVATION
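Both the label values and the new `use-managed-instance-group` flag go through Jinja2's `|tojson` filter (built into Jinja2 since 2.9), which JSON-quotes the value so that strings containing colons, spaces, or `#` remain valid YAML scalars. A one-line demonstration:

import jinja2

t = jinja2.Template('{{ label_key }}: {{ label_value|tojson }}')
# The value contains both ':' and '#', which would otherwise break YAML:
print(t.render(label_key='owner', label_value='alice: team#1'))
# -> owner: "alice: team#1"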
@@ -114,6 +127,9 @@ available_node_types:
           sourceImage: {{image_id}}
           {%- endif %}
           diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
+          {%- if disk_iops %}
+          provisionedIops: {{disk_iops}}
+          {%- endif %}
       {%- if gpu is not none %}
       guestAccelerators:
         - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
@@ -153,6 +169,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -164,8 +181,8 @@ file_mounts: {
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1 (+1 if tpu_vm)
 setup_commands:
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
-  # Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image.
   # Line 'which conda ..': some images (TPU VM) do not install conda by
   # default. 'source ~/.bashrc' is needed so conda takes effect for the next
   # commands.
@@ -175,6 +192,9 @@ setup_commands:
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
   - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
+    {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
    {%- if docker_image is none %}
    sudo systemctl stop unattended-upgrades || true;
    sudo systemctl disable unattended-upgrades || true;
@@ -203,7 +223,7 @@ setup_commands:
    sudo systemctl stop jupyter > /dev/null 2>&1 || true;
    sudo systemctl disable jupyter > /dev/null 2>&1 || true;
    {%- endif %}
-   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
 
 # Command to start ray clusters are now placed in `sky.provision.instance_setup`.
@@ -69,6 +69,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
 
@@ -84,7 +85,7 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1
 setup_commands:
-  # Create ~/.ssh/config file in case the file does not exist in the custom image.
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # We set auto_activate_base to be false for pre-installed conda.
   # This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
@@ -103,7 +104,7 @@ setup_commands:
    {{ ray_skypilot_installation_commands }}
    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
 
 
@@ -4,10 +4,15 @@ name: {{dag_name}}
 
 file_mounts:
   {{remote_user_yaml_path}}: {{user_yaml_path}}
-  {{remote_user_config_path}}: skypilot:local_skypilot_config_path
+  {%- if local_user_config_path is not none %}
+  {{remote_user_config_path}}: {{local_user_config_path}}
+  {%- endif %}
   {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
   {{remote_catalog_path}}: {{local_catalog_path}}
   {%- endfor %}
+  {%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
+  {{controller_file_mount_path}}: {{local_file_mount_path}}
+  {%- endfor %}
 
 setup: |
   {{ sky_activate_python_env }}
@@ -19,21 +24,51 @@ setup: |
   {{cmd}}
   {%- endfor %}
 
-  {% if is_dev %}
-  # Internal: disable logging for manually logging into the jobs controller for debugging.
-  echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
+  {% if controller_envs.get('SKYPILOT_DEV') != '0' %}
+  grep -q 'export SKYPILOT_DEV=' ~/.bashrc || echo 'export SKYPILOT_DEV=1' >> ~/.bashrc
+  grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
   {% endif %}
 
-  # Dashboard.
-  ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
-  pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null
-  ((ps aux | grep -v nohup | grep -v grep | grep -q -- "-m sky.jobs.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &));
+  # Create systemd service file
+  mkdir -p ~/.config/systemd/user/
+
+  # Create systemd user service file
+  cat << EOF > ~/.config/systemd/user/skypilot-dashboard.service
+  [Unit]
+  Description=SkyPilot Jobs Dashboard
+  After=network.target
+
+  [Service]
+  Environment="PATH={{ sky_python_env_path }}:\$PATH"
+  Environment="SKYPILOT_USER_ID={{controller_envs.SKYPILOT_USER_ID}}"
+  Environment="SKYPILOT_USER={{controller_envs.SKYPILOT_USER}}"
+  Restart=always
+  StandardOutput=append:/home/$USER/.sky/job-dashboard.log
+  StandardError=append:/home/$USER/.sky/job-dashboard.log
+  ExecStart={{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard
+
+  [Install]
+  WantedBy=default.target
+  EOF
+
+  {{ dashboard_setup_cmd }}
 
 run: |
   {{ sky_activate_python_env }}
-  # Start the controller for the current managed job.
-  python -u -m sky.jobs.controller {{remote_user_yaml_path}} \
-    --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %}
+
+  # Write env vars to a file
+  {%- for env_name, env_value in controller_envs.items() %}
+  echo "export {{env_name}}='{{env_value}}'" >> {{remote_env_file_path}}
+  {%- endfor %}
+
+  # Submit the job to the scheduler.
+  # Note: The job is already in the `spot` table, marked as PENDING.
+  # CloudVmRayBackend._exec_code_on_head() calls
+  # managed_job_codegen.set_pending() before we get here.
+  python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
+    --job-id $SKYPILOT_INTERNAL_JOB_ID \
+    --env-file {{remote_env_file_path}}
+
 
 envs:
 {%- for env_name, env_value in controller_envs.items() %}
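The jobs dashboard moves from an ad-hoc `nohup` background process to a systemd user service, which provides restart-on-failure (`Restart=always`) and persistent logging without extra plumbing. The `{{ dashboard_setup_cmd }}` that activates the unit is template-supplied and not shown in this diff; a hedged guess at what such a setup step does, expressed as a Python sketch:

import subprocess

# Assumed sequence (not confirmed by this diff): reload the user systemd
# daemon so the freshly written unit file is seen, then enable and start it.
for cmd in (['systemctl', '--user', 'daemon-reload'],
            ['systemctl', '--user', 'enable', '--now',
             'skypilot-dashboard.service']):
    subprocess.run(cmd, check=True)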