skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,13 @@
1
1
  """RunPod library wrapper for SkyPilot."""
2
2
 
3
+ import base64
3
4
  import time
4
- from typing import Any, Dict, List
5
+ from typing import Any, Dict, List, Optional, Tuple
5
6
 
6
7
  from sky import sky_logging
7
8
  from sky.adaptors import runpod
9
+ from sky.provision import docker_utils
10
+ import sky.provision.runpod.api.commands as runpod_commands
8
11
  from sky.skylet import constants
9
12
  from sky.utils import common_utils
10
13
 
@@ -45,6 +48,11 @@ GPU_NAME_MAP = {
45
48
  }
46
49
 
47
50
 
51
+ def _construct_docker_login_template_name(cluster_name: str) -> str:
52
+ """Constructs the registry auth template name."""
53
+ return f'{cluster_name}-docker-login-template'
54
+
55
+
48
56
  def retry(func):
49
57
  """Decorator to retry a function."""
50
58
 
@@ -64,9 +72,83 @@ def retry(func):
64
72
  return wrapper
65
73
 
66
74
 
75
+ # Adapted from runpod.api.queries.pods.py::QUERY_POD.
76
+ # Adding containerRegistryAuthId to the query.
77
+ _QUERY_POD = """
78
+ query myPods {
79
+ myself {
80
+ pods {
81
+ id
82
+ containerDiskInGb
83
+ containerRegistryAuthId
84
+ costPerHr
85
+ desiredStatus
86
+ dockerArgs
87
+ dockerId
88
+ env
89
+ gpuCount
90
+ imageName
91
+ lastStatusChange
92
+ machineId
93
+ memoryInGb
94
+ name
95
+ podType
96
+ port
97
+ ports
98
+ uptimeSeconds
99
+ vcpuCount
100
+ volumeInGb
101
+ volumeMountPath
102
+ runtime {
103
+ ports{
104
+ ip
105
+ isIpPublic
106
+ privatePort
107
+ publicPort
108
+ type
109
+ }
110
+ }
111
+ machine {
112
+ gpuDisplayName
113
+ }
114
+ }
115
+ }
116
+ }
117
+ """
118
+
119
+
120
+ def _sky_get_pods() -> dict:
121
+ """List all pods with extra registry auth information.
122
+
123
+ Adapted from runpod.get_pods() to include containerRegistryAuthId.
124
+ """
125
+ raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD)
126
+ cleaned_return = raw_return['data']['myself']['pods']
127
+ return cleaned_return
128
+
129
+
130
+ _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """
131
+ query myself {
132
+ myself {
133
+ podTemplates {
134
+ name
135
+ containerRegistryAuthId
136
+ }
137
+ }
138
+ }
139
+ """
140
+
141
+
142
+ def _list_pod_templates_with_container_registry() -> dict:
143
+ """List all pod templates."""
144
+ raw_return = runpod.runpod.api.graphql.run_graphql_query(
145
+ _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH)
146
+ return raw_return['data']['myself']['podTemplates']
147
+
148
+
67
149
  def list_instances() -> Dict[str, Dict[str, Any]]:
68
150
  """Lists instances associated with API key."""
69
- instances = runpod.runpod.get_pods()
151
+ instances = _sky_get_pods()
70
152
 
71
153
  instance_dict: Dict[str, Dict[str, Any]] = {}
72
154
  for instance in instances:
@@ -74,13 +156,23 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
74
156
 
75
157
  info['status'] = instance['desiredStatus']
76
158
  info['name'] = instance['name']
159
+ info['port2endpoint'] = {}
77
160
 
78
- if instance['desiredStatus'] == 'RUNNING' and instance.get('runtime'):
161
+ # Sometimes when the cluster is in the process of being created,
162
+ # the `port` field in the runtime is None and we need to check for it.
163
+ if (instance['desiredStatus'] == 'RUNNING' and
164
+ instance.get('runtime') and
165
+ instance.get('runtime').get('ports')):
79
166
  for port in instance['runtime']['ports']:
80
- if port['privatePort'] == 22 and port['isIpPublic']:
81
- info['external_ip'] = port['ip']
82
- info['ssh_port'] = port['publicPort']
83
- elif not port['isIpPublic']:
167
+ if port['isIpPublic']:
168
+ if port['privatePort'] == 22:
169
+ info['external_ip'] = port['ip']
170
+ info['ssh_port'] = port['publicPort']
171
+ info['port2endpoint'][port['privatePort']] = {
172
+ 'host': port['ip'],
173
+ 'port': port['publicPort']
174
+ }
175
+ else:
84
176
  info['internal_ip'] = port['ip']
85
177
 
86
178
  instance_dict[instance['id']] = info
@@ -88,37 +180,161 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
88
180
  return instance_dict
89
181
 
90
182
 
91
- def launch(name: str, instance_type: str, region: str, disk_size: int) -> str:
183
+ def delete_pod_template(template_name: str) -> None:
184
+ """Deletes a pod template."""
185
+ try:
186
+ runpod.runpod.api.graphql.run_graphql_query(
187
+ f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
188
+ except runpod.runpod.error.QueryError as e:
189
+ logger.warning(f'Failed to delete template {template_name}: {e} '
190
+ 'Please delete it manually.')
191
+
192
+
193
+ def delete_register_auth(registry_auth_id: str) -> None:
194
+ """Deletes a registry auth."""
195
+ try:
196
+ runpod.runpod.delete_container_registry_auth(registry_auth_id)
197
+ except runpod.runpod.error.QueryError as e:
198
+ logger.warning(
199
+ f'Failed to delete registry auth {registry_auth_id}: {e} '
200
+ 'Please delete it manually.')
201
+
202
+
203
+ def _create_template_for_docker_login(
204
+ cluster_name: str,
205
+ image_name: str,
206
+ docker_login_config: Optional[Dict[str, str]],
207
+ ) -> Tuple[str, Optional[str]]:
208
+ """Creates a template for the given image with the docker login config.
209
+
210
+ Returns:
211
+ formatted_image_name: The formatted image name.
212
+ template_id: The template ID. None for no docker login config.
213
+ """
214
+ if docker_login_config is None:
215
+ return image_name, None
216
+ login_config = docker_utils.DockerLoginConfig(**docker_login_config)
217
+ container_registry_auth_name = f'{cluster_name}-registry-auth'
218
+ container_template_name = _construct_docker_login_template_name(
219
+ cluster_name)
220
+ # The `name` argument is only for display purpose and the registry server
221
+ # will be splitted from the docker image name (Tested with AWS ECR).
222
+ # Here we only need the username and password to create the registry auth.
223
+ # TODO(tian): Now we create a template and a registry auth for each cluster.
224
+ # Consider create one for each server and reuse them. Challenges including
225
+ # calculate the reference count and delete them when no longer needed.
226
+ create_auth_resp = runpod.runpod.create_container_registry_auth(
227
+ name=container_registry_auth_name,
228
+ username=login_config.username,
229
+ password=login_config.password,
230
+ )
231
+ registry_auth_id = create_auth_resp['id']
232
+ create_template_resp = runpod.runpod.create_template(
233
+ name=container_template_name,
234
+ image_name=None,
235
+ registry_auth_id=registry_auth_id,
236
+ )
237
+ return login_config.format_image(image_name), create_template_resp['id']
238
+
239
+
240
+ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
241
+ disk_size: int, image_name: str, ports: Optional[List[int]],
242
+ public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
243
+ docker_login_config: Optional[Dict[str, str]]) -> str:
92
244
  """Launches an instance with the given parameters.
93
245
 
94
246
  Converts the instance_type to the RunPod GPU name, finds the specs for the
95
247
  GPU, and launches the instance.
248
+
249
+ Returns:
250
+ instance_id: The instance ID.
96
251
  """
252
+ name = f'{cluster_name}-{node_type}'
97
253
  gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
98
254
  gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
99
255
  cloud_type = instance_type.split('_')[2]
100
256
 
101
257
  gpu_specs = runpod.runpod.get_gpu(gpu_type)
258
+ # TODO(zhwu): keep this align with setups in
259
+ # `provision.kuberunetes.instance.py`
260
+ setup_cmd = (
261
+ 'prefix_cmd() '
262
+ '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
263
+ '$(prefix_cmd) apt update;'
264
+ 'export DEBIAN_FRONTEND=noninteractive;'
265
+ '$(prefix_cmd) apt install openssh-server rsync curl patch -y;'
266
+ '$(prefix_cmd) mkdir -p /var/run/sshd; '
267
+ '$(prefix_cmd) '
268
+ 'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
269
+ '/etc/ssh/sshd_config; '
270
+ '$(prefix_cmd) sed '
271
+ '"s@session\\s*required\\s*pam_loginuid.so@session optional '
272
+ 'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
273
+ 'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
274
+ '$(prefix_cmd) mkdir -p ~/.ssh; '
275
+ '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
276
+ '$(prefix_cmd) chmod 700 ~/.ssh; '
277
+ f'$(prefix_cmd) echo "{public_key}" >> ~/.ssh/authorized_keys; '
278
+ '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
279
+ '$(prefix_cmd) service ssh restart; '
280
+ '[ $(id -u) -eq 0 ] && echo alias sudo="" >> ~/.bashrc;sleep infinity')
281
+ # Use base64 to deal with the tricky quoting issues caused by runpod API.
282
+ encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')
102
283
 
103
- new_instance = runpod.runpod.create_pod(
104
- name=name,
105
- image_name='runpod/base:0.0.2',
106
- gpu_type_id=gpu_type,
107
- cloud_type=cloud_type,
108
- container_disk_in_gb=disk_size,
109
- min_vcpu_count=4 * gpu_quantity,
110
- min_memory_in_gb=gpu_specs['memoryInGb'] * gpu_quantity,
111
- gpu_count=gpu_quantity,
112
- country_code=region,
113
- ports=(f'22/tcp,'
114
- f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
115
- f'{constants.SKY_REMOTE_RAY_PORT}/http'),
116
- support_public_ip=True,
117
- )
284
+ docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
285
+ f'bash init.sh\'')
286
+
287
+ # Port 8081 is occupied for nginx in the base image.
288
+ custom_ports_str = ''
289
+ if ports is not None:
290
+ custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])
291
+ ports_str = (f'22/tcp,'
292
+ f'{custom_ports_str}'
293
+ f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
294
+ f'{constants.SKY_REMOTE_RAY_PORT}/http')
295
+
296
+ image_name_formatted, template_id = _create_template_for_docker_login(
297
+ cluster_name, image_name, docker_login_config)
298
+
299
+ params = {
300
+ 'name': name,
301
+ 'image_name': image_name_formatted,
302
+ 'gpu_type_id': gpu_type,
303
+ 'cloud_type': cloud_type,
304
+ 'container_disk_in_gb': disk_size,
305
+ 'min_vcpu_count': 4 * gpu_quantity,
306
+ 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
307
+ 'gpu_count': gpu_quantity,
308
+ 'country_code': region,
309
+ 'ports': ports_str,
310
+ 'support_public_ip': True,
311
+ 'docker_args': docker_args,
312
+ 'template_id': template_id,
313
+ }
314
+
315
+ if preemptible is None or not preemptible:
316
+ new_instance = runpod.runpod.create_pod(**params)
317
+ else:
318
+ new_instance = runpod_commands.create_spot_pod(
319
+ bid_per_gpu=bid_per_gpu,
320
+ **params,
321
+ )
118
322
 
119
323
  return new_instance['id']
120
324
 
121
325
 
326
+ def get_registry_auth_resources(
327
+ cluster_name: str) -> Tuple[Optional[str], Optional[str]]:
328
+ """Gets the registry auth resources."""
329
+ container_registry_auth_name = _construct_docker_login_template_name(
330
+ cluster_name)
331
+ for template in _list_pod_templates_with_container_registry():
332
+ if template['name'] == container_registry_auth_name:
333
+ return container_registry_auth_name, template[
334
+ 'containerRegistryAuthId']
335
+ return None, None
336
+
337
+
122
338
  def remove(instance_id: str) -> None:
123
339
  """Terminates the given instance."""
124
340
  runpod.runpod.terminate_pod(instance_id)
@@ -0,0 +1,10 @@
1
+ """Vast provisioner for SkyPilot."""
2
+
3
+ from sky.provision.vast.config import bootstrap_instances
4
+ from sky.provision.vast.instance import cleanup_ports
5
+ from sky.provision.vast.instance import get_cluster_info
6
+ from sky.provision.vast.instance import query_instances
7
+ from sky.provision.vast.instance import run_instances
8
+ from sky.provision.vast.instance import stop_instances
9
+ from sky.provision.vast.instance import terminate_instances
10
+ from sky.provision.vast.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Vast configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ """Bootstraps instances for the given cluster."""
10
+ del region, cluster_name # unused
11
+ return config
@@ -0,0 +1,247 @@
1
+ """Vast instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.vast import utils
8
+ from sky.utils import common_utils
9
+ from sky.utils import status_lib
10
+ from sky.utils import ux_utils
11
+
12
+ POLL_INTERVAL = 10
13
+
14
+ logger = sky_logging.init_logger(__name__)
15
+ # a much more convenient method
16
+ status_filter = lambda machine_dict, stat_list: {
17
+ k: v for k, v in machine_dict.items() if v['status'] in stat_list
18
+ }
19
+
20
+
21
+ def _filter_instances(cluster_name_on_cloud: str,
22
+ status_filters: Optional[List[str]],
23
+ head_only: bool = False) -> Dict[str, Any]:
24
+
25
+ instances = utils.list_instances()
26
+ possible_names = [f'{cluster_name_on_cloud}-head']
27
+ if not head_only:
28
+ possible_names.append(f'{cluster_name_on_cloud}-worker')
29
+
30
+ filtered_instances = {}
31
+ for instance_id, instance in instances.items():
32
+ if (status_filters is not None and
33
+ instance['status'] not in status_filters):
34
+ continue
35
+ if instance.get('name') in possible_names:
36
+ filtered_instances[instance_id] = instance
37
+ return filtered_instances
38
+
39
+
40
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
41
+ for inst_id, inst in instances.items():
42
+ if inst['name'].endswith('-head'):
43
+ return inst_id
44
+ return None
45
+
46
+
47
+ def run_instances(region: str, cluster_name_on_cloud: str,
48
+ config: common.ProvisionConfig) -> common.ProvisionRecord:
49
+ """Runs instances for the given cluster."""
50
+ pending_status = ['CREATED', 'RESTARTING']
51
+
52
+ created_instance_ids = []
53
+ instances: Dict[str, Any] = {}
54
+
55
+ while True:
56
+ instances = _filter_instances(cluster_name_on_cloud, None)
57
+ if not status_filter(instances, pending_status):
58
+ break
59
+ logger.info(f'Waiting for {len(instances)} instances to be ready.')
60
+ time.sleep(POLL_INTERVAL)
61
+
62
+ running_instances = status_filter(instances, ['RUNNING'])
63
+ head_instance_id = _get_head_instance_id(running_instances)
64
+ stopped_instances = status_filter(instances, ['EXITED', 'STOPPED'])
65
+
66
+ if config.resume_stopped_nodes and stopped_instances:
67
+ for instance in stopped_instances.values():
68
+ utils.start(instance['id'])
69
+ else:
70
+ to_start_count = config.count - (len(running_instances) +
71
+ len(stopped_instances))
72
+ if to_start_count < 0:
73
+ raise RuntimeError(f'Cluster {cluster_name_on_cloud} already has '
74
+ f'{len(running_instances)} nodes,'
75
+ f'but {config.count} are required.')
76
+ if to_start_count == 0:
77
+ if head_instance_id is None:
78
+ raise RuntimeError(
79
+ f'Cluster {cluster_name_on_cloud} has no head node.')
80
+ logger.info(
81
+ f'Cluster {cluster_name_on_cloud} already has '
82
+ f'{len(running_instances)} nodes, no need to start more.')
83
+ return common.ProvisionRecord(provider_name='vast',
84
+ cluster_name=cluster_name_on_cloud,
85
+ region=region,
86
+ zone=None,
87
+ head_instance_id=head_instance_id,
88
+ resumed_instance_ids=[],
89
+ created_instance_ids=[])
90
+
91
+ for _ in range(to_start_count):
92
+ node_type = 'head' if head_instance_id is None else 'worker'
93
+ try:
94
+ instance_id = utils.launch(
95
+ name=f'{cluster_name_on_cloud}-{node_type}',
96
+ instance_type=config.node_config['InstanceType'],
97
+ region=region,
98
+ disk_size=config.node_config['DiskSize'],
99
+ preemptible=config.node_config['Preemptible'],
100
+ image_name=config.node_config['ImageId'])
101
+ except Exception as e: # pylint: disable=broad-except
102
+ logger.warning(f'run_instances error: {e}')
103
+ raise
104
+ logger.info(f'Launched instance {instance_id}.')
105
+ created_instance_ids.append(instance_id)
106
+ if head_instance_id is None:
107
+ head_instance_id = instance_id
108
+
109
+ # Wait for instances to be ready.
110
+ while True:
111
+ instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
112
+ ready_instance_cnt = 0
113
+ for instance_id, instance in instances.items():
114
+ if instance.get('ssh_port') is not None:
115
+ ready_instance_cnt += 1
116
+ logger.info('Waiting for instances to be ready: '
117
+ f'({ready_instance_cnt}/{config.count}).')
118
+ if ready_instance_cnt == config.count:
119
+ break
120
+
121
+ time.sleep(POLL_INTERVAL)
122
+
123
+ head_instance_id = _get_head_instance_id(utils.list_instances())
124
+ assert head_instance_id is not None, 'head_instance_id should not be None'
125
+ return common.ProvisionRecord(provider_name='vast',
126
+ cluster_name=cluster_name_on_cloud,
127
+ region=region,
128
+ zone=None,
129
+ head_instance_id=head_instance_id,
130
+ resumed_instance_ids=[],
131
+ created_instance_ids=created_instance_ids)
132
+
133
+
134
+ def wait_instances(region: str, cluster_name_on_cloud: str,
135
+ state: Optional[status_lib.ClusterStatus]) -> None:
136
+ del region, cluster_name_on_cloud, state
137
+
138
+
139
+ def stop_instances(
140
+ cluster_name_on_cloud: str,
141
+ provider_config: Optional[Dict[str, Any]] = None,
142
+ worker_only: bool = False,
143
+ ) -> None:
144
+ return action_instances('stop', cluster_name_on_cloud, provider_config,
145
+ worker_only)
146
+
147
+
148
+ def terminate_instances(
149
+ cluster_name_on_cloud: str,
150
+ provider_config: Optional[Dict[str, Any]] = None,
151
+ worker_only: bool = False,
152
+ ) -> None:
153
+ return action_instances('remove', cluster_name_on_cloud, provider_config,
154
+ worker_only)
155
+
156
+
157
+ def action_instances(
158
+ fn: str,
159
+ cluster_name_on_cloud: str,
160
+ provider_config: Optional[Dict[str, Any]] = None,
161
+ worker_only: bool = False,
162
+ ) -> None:
163
+ """See sky/provision/__init__.py"""
164
+ del provider_config # unused
165
+ instances = _filter_instances(cluster_name_on_cloud, None)
166
+ for inst_id, inst in instances.items():
167
+ logger.debug(f'Instance {fn} {inst_id}: {inst}')
168
+ if worker_only and inst['name'].endswith('-head'):
169
+ continue
170
+ try:
171
+ getattr(utils, fn)(inst_id)
172
+ except Exception as e: # pylint: disable=broad-except
173
+ with ux_utils.print_exception_no_traceback():
174
+ raise RuntimeError(
175
+ f'Failed to {fn} instance {inst_id}: '
176
+ f'{common_utils.format_exception(e, use_bracket=False)}'
177
+ ) from e
178
+
179
+
180
+ def get_cluster_info(
181
+ region: str,
182
+ cluster_name_on_cloud: str,
183
+ provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
184
+ del region # unused
185
+ running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
186
+ instances: Dict[str, List[common.InstanceInfo]] = {}
187
+ head_instance_id = None
188
+ for instance_id, instance_info in running_instances.items():
189
+ instances[instance_id] = [
190
+ common.InstanceInfo(
191
+ instance_id=instance_id,
192
+ internal_ip=instance_info['local_ipaddrs'].strip(),
193
+ external_ip=instance_info['public_ipaddr'],
194
+ ssh_port=instance_info['ports']['22/tcp'][0]['HostPort'],
195
+ tags={},
196
+ )
197
+ ]
198
+ if instance_info['name'].endswith('-head'):
199
+ head_instance_id = instance_id
200
+
201
+ return common.ClusterInfo(
202
+ instances=instances,
203
+ head_instance_id=head_instance_id,
204
+ provider_name='vast',
205
+ provider_config=provider_config,
206
+ )
207
+
208
+
209
+ def open_ports(
210
+ cluster_name_on_cloud: str,
211
+ ports: List[str],
212
+ provider_config: Optional[Dict[str, Any]] = None,
213
+ ) -> None:
214
+ raise NotImplementedError('open_ports is not supported for Vast')
215
+
216
+
217
+ def query_instances(
218
+ cluster_name_on_cloud: str,
219
+ provider_config: Optional[Dict[str, Any]] = None,
220
+ non_terminated_only: bool = True,
221
+ ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
222
+ """See sky/provision/__init__.py"""
223
+
224
+ assert provider_config is not None, (cluster_name_on_cloud, provider_config)
225
+ instances = _filter_instances(cluster_name_on_cloud, None)
226
+ # "running", "frozen", "stopped", "unknown", "loading"
227
+ status_map = {
228
+ 'LOADING': status_lib.ClusterStatus.INIT,
229
+ 'EXITED': status_lib.ClusterStatus.STOPPED,
230
+ 'STOPPED': status_lib.ClusterStatus.STOPPED,
231
+ 'RUNNING': status_lib.ClusterStatus.UP,
232
+ }
233
+ statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
234
+ for inst_id, inst in instances.items():
235
+ status = status_map[inst['status']]
236
+ if non_terminated_only and status is None:
237
+ continue
238
+ statuses[inst_id] = status
239
+ return statuses
240
+
241
+
242
+ def cleanup_ports(
243
+ cluster_name_on_cloud: str,
244
+ ports: List[str],
245
+ provider_config: Optional[Dict[str, Any]] = None,
246
+ ) -> None:
247
+ del cluster_name_on_cloud, ports, provider_config # Unused.