skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/common.py CHANGED
@@ -1,9 +1,11 @@
1
1
  """Common data structures for provisioning"""
2
2
  import abc
3
3
  import dataclasses
4
+ import functools
4
5
  import os
5
6
  from typing import Any, Dict, List, Optional, Tuple
6
7
 
8
+ from sky import sky_logging
7
9
  from sky.utils import resources_utils
8
10
 
9
11
  # NOTE: we can use pydantic instead of dataclasses or namedtuples, because
@@ -14,6 +16,10 @@ from sky.utils import resources_utils
14
16
  # -------------------- input data model -------------------- #
15
17
 
16
18
  InstanceId = str
19
+ _START_TITLE = '\n' + '-' * 20 + 'Start: {} ' + '-' * 20
20
+ _END_TITLE = '-' * 20 + 'End: {} ' + '-' * 20 + '\n'
21
+
22
+ logger = sky_logging.init_logger(__name__)
17
23
 
18
24
 
19
25
  class ProvisionerError(RuntimeError):
@@ -46,6 +52,8 @@ class ProvisionConfig:
46
52
  tags: Dict[str, str]
47
53
  # Whether or not to resume stopped instances.
48
54
  resume_stopped_nodes: bool
55
+ # Optional ports to open on launch of the cluster.
56
+ ports_to_open_on_launch: Optional[List[int]]
49
57
 
50
58
 
51
59
  # -------------------- output data model -------------------- #
@@ -123,7 +131,8 @@ class ClusterInfo:
123
131
  if self.head_instance_id is None:
124
132
  return None
125
133
  if self.head_instance_id not in self.instances:
126
- raise ValueError('Head instance ID not in the cluster metadata.')
134
+ raise ValueError('Head instance ID not in the cluster metadata. '
135
+ f'ClusterInfo: {self.__dict__}')
127
136
  return self.instances[self.head_instance_id][0]
128
137
 
129
138
  def get_worker_instances(self) -> List[InstanceInfo]:
@@ -197,8 +206,14 @@ class ClusterInfo:
197
206
  return ip_list
198
207
 
199
208
  def get_feasible_ips(self, force_internal_ips: bool = False) -> List[str]:
200
- """Get external IPs if they exist, otherwise get internal ones."""
201
- return self._get_ips(not self.has_external_ips() or force_internal_ips)
209
+ """Get internal or external IPs depending on the settings."""
210
+ if self.provider_config is not None:
211
+ use_internal_ips = self.provider_config.get('use_internal_ips',
212
+ False)
213
+ else:
214
+ use_internal_ips = False
215
+ return self._get_ips(use_internal_ips or not self.has_external_ips() or
216
+ force_internal_ips)
202
217
 
203
218
  def get_ssh_ports(self) -> List[int]:
204
219
  """Get the SSH port of all the instances."""
@@ -268,3 +283,16 @@ def query_ports_passthrough(
268
283
  for port in ports:
269
284
  result[port] = [SocketEndpoint(port=port, host=head_ip)]
270
285
  return result
286
+
287
+
288
+ def log_function_start_end(func):
289
+
290
+ @functools.wraps(func)
291
+ def wrapper(*args, **kwargs):
292
+ logger.info(_START_TITLE.format(func.__name__))
293
+ try:
294
+ return func(*args, **kwargs)
295
+ finally:
296
+ logger.info(_END_TITLE.format(func.__name__))
297
+
298
+ return wrapper
@@ -0,0 +1,25 @@
1
+ """Constants used in the SkyPilot provisioner."""
2
+
3
+ # Tag uniquely identifying all nodes of a cluster
4
+ TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
5
+ TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
6
+ # Legacy tag for backward compatibility to distinguish head and worker nodes.
7
+ TAG_RAY_NODE_KIND = 'ray-node-type'
8
+ TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'
9
+
10
+ HEAD_NODE_TAGS = {
11
+ TAG_RAY_NODE_KIND: 'head',
12
+ TAG_SKYPILOT_HEAD_NODE: '1',
13
+ }
14
+
15
+ WORKER_NODE_TAGS = {
16
+ TAG_RAY_NODE_KIND: 'worker',
17
+ TAG_SKYPILOT_HEAD_NODE: '0',
18
+ }
19
+
20
+ # Names for Azure Deployments.
21
+ DEPLOYMENT_NAME = 'skypilot-config'
22
+ LEGACY_DEPLOYMENT_NAME = 'ray-config'
23
+ EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = (
24
+ 'skypilot-bootstrap-{cluster_name_on_cloud}')
25
+ EXTERNAL_RG_VM_DEPLOYMENT_NAME = 'skypilot-vm-{cluster_name_on_cloud}'
@@ -3,6 +3,7 @@
3
3
  from sky.provision.cudo.config import bootstrap_instances
4
4
  from sky.provision.cudo.instance import cleanup_ports
5
5
  from sky.provision.cudo.instance import get_cluster_info
6
+ from sky.provision.cudo.instance import open_ports
6
7
  from sky.provision.cudo.instance import query_instances
7
8
  from sky.provision.cudo.instance import run_instances
8
9
  from sky.provision.cudo.instance import stop_instances
@@ -11,4 +12,4 @@ from sky.provision.cudo.instance import wait_instances
11
12
 
12
13
  __all__ = ('bootstrap_instances', 'run_instances', 'stop_instances',
13
14
  'terminate_instances', 'wait_instances', 'get_cluster_info',
14
- 'cleanup_ports', 'query_instances')
15
+ 'cleanup_ports', 'query_instances', 'open_ports')
@@ -0,0 +1,112 @@
1
+ """Cudo catalog helper."""
2
+
3
+ cudo_gpu_model = {
4
+ 'NVIDIA V100': 'V100',
5
+ 'NVIDIA A40': 'A40',
6
+ 'RTX 3080': 'RTX3080',
7
+ 'RTX A4000': 'RTXA4000',
8
+ 'RTX A4500': 'RTXA4500',
9
+ 'RTX A5000': 'RTXA5000',
10
+ 'RTX A6000': 'RTXA6000',
11
+ }
12
+
13
+ cudo_gpu_mem = {
14
+ 'RTX3080': 12,
15
+ 'A40': 48,
16
+ 'RTXA4000': 16,
17
+ 'RTXA4500': 20,
18
+ 'RTXA5000': 24,
19
+ 'RTXA6000': 48,
20
+ 'V100': 16,
21
+ }
22
+
23
+ machine_specs = [
24
+ # Low
25
+ {
26
+ 'vcpu': 2,
27
+ 'mem': 4,
28
+ 'gpu': 1,
29
+ },
30
+ {
31
+ 'vcpu': 4,
32
+ 'mem': 8,
33
+ 'gpu': 1,
34
+ },
35
+ {
36
+ 'vcpu': 8,
37
+ 'mem': 16,
38
+ 'gpu': 2,
39
+ },
40
+ {
41
+ 'vcpu': 16,
42
+ 'mem': 32,
43
+ 'gpu': 2,
44
+ },
45
+ {
46
+ 'vcpu': 32,
47
+ 'mem': 64,
48
+ 'gpu': 4,
49
+ },
50
+ {
51
+ 'vcpu': 64,
52
+ 'mem': 128,
53
+ 'gpu': 8,
54
+ },
55
+ # Mid
56
+ {
57
+ 'vcpu': 96,
58
+ 'mem': 192,
59
+ 'gpu': 8
60
+ },
61
+ {
62
+ 'vcpu': 48,
63
+ 'mem': 96,
64
+ 'gpu': 4
65
+ },
66
+ {
67
+ 'vcpu': 24,
68
+ 'mem': 48,
69
+ 'gpu': 2
70
+ },
71
+ {
72
+ 'vcpu': 12,
73
+ 'mem': 24,
74
+ 'gpu': 1
75
+ },
76
+ # Hi
77
+ {
78
+ 'vcpu': 96,
79
+ 'mem': 192,
80
+ 'gpu': 4
81
+ },
82
+ {
83
+ 'vcpu': 48,
84
+ 'mem': 96,
85
+ 'gpu': 2
86
+ },
87
+ {
88
+ 'vcpu': 24,
89
+ 'mem': 48,
90
+ 'gpu': 1
91
+ },
92
+ ]
93
+
94
+
95
+ def cudo_gpu_to_skypilot_gpu(model):
96
+ if model in cudo_gpu_model:
97
+ return cudo_gpu_model[model]
98
+ else:
99
+ return model
100
+
101
+
102
+ def skypilot_gpu_to_cudo_gpu(model):
103
+ for key, value in cudo_gpu_model.items():
104
+ if value == model:
105
+ return key
106
+ return model
107
+
108
+
109
+ def gpu_exists(model):
110
+ if model in cudo_gpu_model:
111
+ return True
112
+ return False
@@ -4,29 +4,29 @@ from typing import Dict
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.adaptors import cudo
7
+ import sky.provision.cudo.cudo_utils as utils
7
8
 
8
9
  logger = sky_logging.init_logger(__name__)
9
10
 
10
11
 
11
12
  def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
12
- memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
13
+ memory_gib: int, vcpu_count: int, gpu_count: int,
13
14
  tags: Dict[str, str], disk_size: int):
14
15
  """Launches an instance with the given parameters."""
15
- disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
16
- size_gib=disk_size)
17
-
18
- request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
19
- custom_ssh_keys=[ssh_key],
20
- vm_id=name,
21
- machine_type=machine_type,
22
- data_center_id=data_center_id,
23
- boot_disk_image_id='ubuntu-nvidia-docker',
24
- memory_gib=memory_gib,
25
- vcpus=vcpu_count,
26
- gpus=gpu_count,
27
- gpu_model=gpu_model,
28
- boot_disk=disk,
29
- metadata=tags)
16
+
17
+ request = cudo.cudo.CreateVMBody(
18
+ ssh_key_source='SSH_KEY_SOURCE_NONE',
19
+ custom_ssh_keys=[ssh_key],
20
+ vm_id=name,
21
+ machine_type=machine_type,
22
+ data_center_id=data_center_id,
23
+ boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
24
+ memory_gib=memory_gib,
25
+ vcpus=vcpu_count,
26
+ gpus=gpu_count,
27
+ boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
28
+ size_gib=disk_size),
29
+ metadata=tags)
30
30
 
31
31
  try:
32
32
  api = cudo.cudo.cudo_api.virtual_machines()
@@ -121,3 +121,24 @@ def list_instances():
121
121
  return instances
122
122
  except cudo.cudo.rest.ApiException as e:
123
123
  raise e
124
+
125
+
126
+ def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
127
+ cpus):
128
+ try:
129
+ gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
130
+ api = cudo.cudo.cudo_api.virtual_machines()
131
+ types = api.list_vm_machine_types(mem,
132
+ cpus,
133
+ gpu=gpu_count,
134
+ gpu_model=gpu_model,
135
+ data_center_id=data_center_id)
136
+ types_dict = types.to_dict()
137
+ hc = types_dict['host_configs']
138
+ total_count = sum(item['count_vm_available'] for item in hc)
139
+ if total_count < to_start_count:
140
+ raise Exception(
141
+ 'Too many VMs requested, try another gpu type or region')
142
+ return total_count
143
+ except cudo.cudo.rest.ApiException as e:
144
+ raise e
@@ -4,10 +4,10 @@ import time
4
4
  from typing import Any, Dict, List, Optional
5
5
 
6
6
  from sky import sky_logging
7
- from sky import status_lib
8
7
  from sky.provision import common
9
8
  from sky.provision.cudo import cudo_machine_type
10
9
  from sky.provision.cudo import cudo_wrapper
10
+ from sky.utils import status_lib
11
11
 
12
12
  POLL_INTERVAL = 10
13
13
 
@@ -16,7 +16,6 @@ logger = sky_logging.init_logger(__name__)
16
16
 
17
17
  def _filter_instances(cluster_name_on_cloud: str,
18
18
  status_filters: Optional[List[str]]) -> Dict[str, Any]:
19
-
20
19
  instances = cudo_wrapper.list_instances()
21
20
  possible_names = [
22
21
  f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
@@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
77
76
 
78
77
  created_instance_ids = []
79
78
  public_key = config.node_config['AuthorizedKey']
80
-
79
+ instance_type = config.node_config['InstanceType']
80
+ spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
81
+ gpu_count = int(float(spec['gpu_count']))
82
+ vcpu_count = int(spec['vcpu_count'])
83
+ memory_gib = int(spec['mem_gb'])
84
+ gpu_model = spec['gpu_model']
85
+ try:
86
+ cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
87
+ memory_gib, vcpu_count)
88
+ except Exception as e:
89
+ logger.warning(f'run_instances: {e}')
90
+ raise
81
91
  for _ in range(to_start_count):
82
- instance_type = config.node_config['InstanceType']
83
- spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
84
92
 
85
93
  node_type = 'head' if head_instance_id is None else 'worker'
86
94
  try:
@@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
89
97
  ssh_key=public_key,
90
98
  data_center_id=region,
91
99
  machine_type=spec['machine_type'],
92
- memory_gib=int(spec['mem_gb']),
93
- vcpu_count=int(spec['vcpu_count']),
94
- gpu_count=int(float(spec['gpu_count'])),
95
- gpu_model=spec['gpu_model'],
100
+ memory_gib=memory_gib,
101
+ vcpu_count=vcpu_count,
102
+ gpu_count=gpu_count,
96
103
  tags={},
97
104
  disk_size=config.node_config['DiskSize'])
98
105
  except Exception as e: # pylint: disable=broad-except
@@ -150,11 +157,10 @@ def terminate_instances(
150
157
  del provider_config
151
158
  instances = _filter_instances(cluster_name_on_cloud, None)
152
159
  for inst_id, inst in instances.items():
153
- logger.info(f'Terminating instance {inst_id}.'
154
- f'{inst}')
155
160
  if worker_only and inst['name'].endswith('-head'):
156
161
  continue
157
- logger.info(f'Removing {inst_id}: {inst}')
162
+ logger.debug(f'Terminating Cudo instance {inst_id}.'
163
+ f'{inst}')
158
164
  cudo_wrapper.remove(inst_id)
159
165
 
160
166
 
@@ -213,6 +219,16 @@ def query_instances(
213
219
  return statuses
214
220
 
215
221
 
222
+ def open_ports(
223
+ cluster_name_on_cloud: str,
224
+ ports: List[str],
225
+ provider_config: Optional[Dict[str, Any]] = None,
226
+ ) -> None:
227
+ del cluster_name_on_cloud, ports, provider_config
228
+ # Cudo has all ports open by default. Nothing to do here.
229
+ return
230
+
231
+
216
232
  def cleanup_ports(
217
233
  cluster_name_on_cloud: str,
218
234
  ports: List[str],
@@ -0,0 +1,11 @@
1
+ """DO provisioner for SkyPilot."""
2
+
3
+ from sky.provision.do.config import bootstrap_instances
4
+ from sky.provision.do.instance import cleanup_ports
5
+ from sky.provision.do.instance import get_cluster_info
6
+ from sky.provision.do.instance import open_ports
7
+ from sky.provision.do.instance import query_instances
8
+ from sky.provision.do.instance import run_instances
9
+ from sky.provision.do.instance import stop_instances
10
+ from sky.provision.do.instance import terminate_instances
11
+ from sky.provision.do.instance import wait_instances
@@ -0,0 +1,14 @@
1
+ """DO configuration bootstrapping."""
2
+
3
+ from sky import sky_logging
4
+ from sky.provision import common
5
+
6
+ logger = sky_logging.init_logger(__name__)
7
+
8
+
9
+ def bootstrap_instances(
10
+ region: str, cluster_name: str,
11
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
12
+ """Bootstraps instances for the given cluster."""
13
+ del region, cluster_name
14
+ return config
@@ -0,0 +1,10 @@
1
+ """DO cloud constants
2
+ """
3
+
4
+ POLL_INTERVAL = 5
5
+ WAIT_DELETE_VOLUMES = 5
6
+
7
+ GPU_IMAGES = {
8
+ 'gpu-h100x1-80gb': 'gpu-h100x1-base',
9
+ 'gpu-h100x8-640gb': 'gpu-h100x8-base',
10
+ }