skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,11 @@ VMs, GPUs, and TPUs. The script takes about 1-2 minutes to run.
5
5
  """
6
6
 
7
7
  import argparse
8
- import functools
9
8
  import io
10
9
  import multiprocessing
11
10
  import os
12
11
  import textwrap
12
+ import time
13
13
  import typing
14
14
  from typing import Any, Callable, Dict, List, Optional, Set
15
15
 
@@ -19,6 +19,8 @@ import numpy as np
19
19
 
20
20
  from sky.adaptors import common as adaptors_common
21
21
  from sky.adaptors import gcp
22
+ from sky.utils import annotations
23
+ from sky.utils import common_utils
22
24
 
23
25
  if typing.TYPE_CHECKING:
24
26
  import pandas as pd
@@ -38,6 +40,9 @@ TPU_SERVICE_ID = 'E000-3F24-B8AA'
38
40
  # The number of digits to round the price to.
39
41
  PRICE_ROUNDING = 5
40
42
 
43
+ # The number of retries for the TPU API.
44
+ TPU_RETRY_CNT = 3
45
+
41
46
  # This zone is only for TPU v4, and does not appear in the skus yet.
42
47
  TPU_V4_ZONES = ['us-central2-b']
43
48
  # TPU v3 pods are available in us-east1-d, but hidden in the skus.
@@ -54,6 +59,113 @@ HIDDEN_TPU_DF = pd.read_csv(
54
59
  ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
55
60
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
56
61
  """)))
62
+
63
+ # TPU V6e price for the following regions is missing in the SKUs.
64
+ TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
65
+
66
+ # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
67
+ # NOTE(dev): Keep the zones and the df in sync.
68
+ TPU_V5_MISSING_ZONES_DF = {
69
+ 'europe-west4-b': pd.read_csv(
70
+ io.StringIO(
71
+ textwrap.dedent("""\
72
+ AcceleratorName,AcceleratorCount,Region,AvailabilityZone
73
+ tpu-v5p-8,1,europe-west4,europe-west4-b
74
+ tpu-v5p-16,1,europe-west4,europe-west4-b
75
+ tpu-v5p-32,1,europe-west4,europe-west4-b
76
+ tpu-v5p-64,1,europe-west4,europe-west4-b
77
+ tpu-v5p-128,1,europe-west4,europe-west4-b
78
+ tpu-v5p-256,1,europe-west4,europe-west4-b
79
+ tpu-v5p-384,1,europe-west4,europe-west4-b
80
+ tpu-v5p-512,1,europe-west4,europe-west4-b
81
+ tpu-v5p-640,1,europe-west4,europe-west4-b
82
+ tpu-v5p-768,1,europe-west4,europe-west4-b
83
+ tpu-v5p-896,1,europe-west4,europe-west4-b
84
+ tpu-v5p-1024,1,europe-west4,europe-west4-b
85
+ tpu-v5p-1152,1,europe-west4,europe-west4-b
86
+ tpu-v5p-1280,1,europe-west4,europe-west4-b
87
+ tpu-v5p-1408,1,europe-west4,europe-west4-b
88
+ tpu-v5p-1536,1,europe-west4,europe-west4-b
89
+ tpu-v5p-1664,1,europe-west4,europe-west4-b
90
+ tpu-v5p-1792,1,europe-west4,europe-west4-b
91
+ tpu-v5p-1920,1,europe-west4,europe-west4-b
92
+ tpu-v5p-2048,1,europe-west4,europe-west4-b
93
+ tpu-v5p-2176,1,europe-west4,europe-west4-b
94
+ tpu-v5p-2304,1,europe-west4,europe-west4-b
95
+ tpu-v5p-2432,1,europe-west4,europe-west4-b
96
+ tpu-v5p-2560,1,europe-west4,europe-west4-b
97
+ tpu-v5p-2688,1,europe-west4,europe-west4-b
98
+ tpu-v5p-2816,1,europe-west4,europe-west4-b
99
+ tpu-v5p-2944,1,europe-west4,europe-west4-b
100
+ tpu-v5p-3072,1,europe-west4,europe-west4-b
101
+ tpu-v5p-3200,1,europe-west4,europe-west4-b
102
+ tpu-v5p-3328,1,europe-west4,europe-west4-b
103
+ tpu-v5p-3456,1,europe-west4,europe-west4-b
104
+ tpu-v5p-3584,1,europe-west4,europe-west4-b
105
+ tpu-v5p-3712,1,europe-west4,europe-west4-b
106
+ tpu-v5p-3840,1,europe-west4,europe-west4-b
107
+ tpu-v5p-3968,1,europe-west4,europe-west4-b
108
+ tpu-v5p-4096,1,europe-west4,europe-west4-b
109
+ tpu-v5p-4224,1,europe-west4,europe-west4-b
110
+ tpu-v5p-4352,1,europe-west4,europe-west4-b
111
+ tpu-v5p-4480,1,europe-west4,europe-west4-b
112
+ tpu-v5p-4608,1,europe-west4,europe-west4-b
113
+ tpu-v5p-4736,1,europe-west4,europe-west4-b
114
+ tpu-v5p-4864,1,europe-west4,europe-west4-b
115
+ tpu-v5p-4992,1,europe-west4,europe-west4-b
116
+ tpu-v5p-5120,1,europe-west4,europe-west4-b
117
+ tpu-v5p-5248,1,europe-west4,europe-west4-b
118
+ tpu-v5p-5376,1,europe-west4,europe-west4-b
119
+ tpu-v5p-5504,1,europe-west4,europe-west4-b
120
+ tpu-v5p-5632,1,europe-west4,europe-west4-b
121
+ tpu-v5p-5760,1,europe-west4,europe-west4-b
122
+ tpu-v5p-5888,1,europe-west4,europe-west4-b
123
+ tpu-v5p-6016,1,europe-west4,europe-west4-b
124
+ tpu-v5p-6144,1,europe-west4,europe-west4-b
125
+ tpu-v5p-6272,1,europe-west4,europe-west4-b
126
+ tpu-v5p-6400,1,europe-west4,europe-west4-b
127
+ tpu-v5p-6528,1,europe-west4,europe-west4-b
128
+ tpu-v5p-6656,1,europe-west4,europe-west4-b
129
+ tpu-v5p-6784,1,europe-west4,europe-west4-b
130
+ tpu-v5p-6912,1,europe-west4,europe-west4-b
131
+ tpu-v5p-7040,1,europe-west4,europe-west4-b
132
+ tpu-v5p-7168,1,europe-west4,europe-west4-b
133
+ tpu-v5p-7296,1,europe-west4,europe-west4-b
134
+ tpu-v5p-7424,1,europe-west4,europe-west4-b
135
+ tpu-v5p-7552,1,europe-west4,europe-west4-b
136
+ tpu-v5p-7680,1,europe-west4,europe-west4-b
137
+ tpu-v5p-7808,1,europe-west4,europe-west4-b
138
+ tpu-v5p-7936,1,europe-west4,europe-west4-b
139
+ tpu-v5p-8064,1,europe-west4,europe-west4-b
140
+ tpu-v5p-8192,1,europe-west4,europe-west4-b
141
+ tpu-v5p-8320,1,europe-west4,europe-west4-b
142
+ tpu-v5p-8448,1,europe-west4,europe-west4-b
143
+ tpu-v5p-8704,1,europe-west4,europe-west4-b
144
+ tpu-v5p-8832,1,europe-west4,europe-west4-b
145
+ tpu-v5p-8960,1,europe-west4,europe-west4-b
146
+ tpu-v5p-9216,1,europe-west4,europe-west4-b
147
+ tpu-v5p-9472,1,europe-west4,europe-west4-b
148
+ tpu-v5p-9600,1,europe-west4,europe-west4-b
149
+ tpu-v5p-9728,1,europe-west4,europe-west4-b
150
+ tpu-v5p-9856,1,europe-west4,europe-west4-b
151
+ tpu-v5p-9984,1,europe-west4,europe-west4-b
152
+ tpu-v5p-10240,1,europe-west4,europe-west4-b
153
+ tpu-v5p-10368,1,europe-west4,europe-west4-b
154
+ tpu-v5p-10496,1,europe-west4,europe-west4-b
155
+ tpu-v5p-10752,1,europe-west4,europe-west4-b
156
+ tpu-v5p-10880,1,europe-west4,europe-west4-b
157
+ tpu-v5p-11008,1,europe-west4,europe-west4-b
158
+ tpu-v5p-11136,1,europe-west4,europe-west4-b
159
+ tpu-v5p-11264,1,europe-west4,europe-west4-b
160
+ tpu-v5p-11520,1,europe-west4,europe-west4-b
161
+ tpu-v5p-11648,1,europe-west4,europe-west4-b
162
+ tpu-v5p-11776,1,europe-west4,europe-west4-b
163
+ tpu-v5p-11904,1,europe-west4,europe-west4-b
164
+ tpu-v5p-12032,1,europe-west4,europe-west4-b
165
+ tpu-v5p-12160,1,europe-west4,europe-west4-b
166
+ tpu-v5p-12288,1,europe-west4,europe-west4-b
167
+ """)))
168
+ }
57
169
  # FIXME(woosuk): Remove this once the bug is fixed.
58
170
  # See https://github.com/skypilot-org/skypilot/issues/1759#issue-1619614345
59
171
  TPU_V4_HOST_DF = pd.read_csv(
@@ -169,7 +281,7 @@ def filter_zones(func: Callable[[], List[str]]) -> Callable[[], List[str]]:
169
281
 
170
282
 
171
283
  @filter_zones
172
- @functools.lru_cache(maxsize=None)
284
+ @annotations.lru_cache(scope='global', maxsize=None)
173
285
  def _get_all_zones() -> List[str]:
174
286
  zones_request = gcp_client.zones().list(project=project_id)
175
287
  zones = []
@@ -225,7 +337,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
225
337
  df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
226
338
 
227
339
  # TODO(woosuk): Make this more efficient.
228
- def get_vm_price(row: pd.Series, spot: bool) -> float:
340
+ def get_vm_price(row: pd.Series, spot: bool) -> Optional[float]:
229
341
  series = row['InstanceType'].split('-')[0].lower()
230
342
 
231
343
  ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
@@ -276,12 +388,26 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
276
388
  if series in ['f1', 'g1']:
277
389
  memory_price = 0.0
278
390
 
279
- assert cpu_price is not None, row
280
- assert memory_price is not None, row
391
+ # TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
392
+ # skip them in the catalog for now. We should investigate why they are
393
+ # missing and add them back.
394
+ if cpu_price is None or memory_price is None:
395
+ return None
281
396
  return cpu_price + memory_price
282
397
 
283
398
  df['Price'] = df.apply(lambda row: get_vm_price(row, spot=False), axis=1)
284
399
  df['SpotPrice'] = df.apply(lambda row: get_vm_price(row, spot=True), axis=1)
400
+ dropped_rows = df[df['Price'].isna() & df['SpotPrice'].isna()]
401
+ dropped_info = (dropped_rows[['InstanceType',
402
+ 'AvailabilityZone']].drop_duplicates())
403
+ az2missing = dropped_info.groupby('AvailabilityZone').apply(
404
+ lambda x: x['InstanceType'].tolist())
405
+ print('Price not found for the following zones and instance types. '
406
+ 'Dropping them.')
407
+ for az, instances in az2missing.items():
408
+ print('-' * 30, az, '-' * 30)
409
+ print(', '.join(instances))
410
+ df = df.dropna(subset=['Price', 'SpotPrice'], how='all')
285
411
  df = df.reset_index(drop=True)
286
412
  df = df.sort_values(['InstanceType', 'Region', 'AvailabilityZone'])
287
413
  return df
@@ -307,8 +433,10 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
307
433
  gpu_name = gpu_name.upper()
308
434
  if 'H100-80GB' in gpu_name:
309
435
  gpu_name = 'H100'
436
+ if 'H100-MEGA-80GB' in gpu_name:
437
+ gpu_name = 'H100-MEGA'
310
438
  if count != 8:
311
- # H100 only has 8 cards.
439
+ # H100-MEGA only has 8 cards.
312
440
  continue
313
441
  if 'VWS' in gpu_name:
314
442
  continue
@@ -338,6 +466,7 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
338
466
  'A100-80GB': 80 * 1024,
339
467
  'A100': 40 * 1024,
340
468
  'H100': 80 * 1024,
469
+ 'H100-MEGA': 80 * 1024,
341
470
  'P4': 8 * 1024,
342
471
  'T4': 16 * 1024,
343
472
  'V100': 16 * 1024,
@@ -382,12 +511,17 @@ def get_gpu_df(skus: List[Dict[str, Any]],
382
511
  if sku['category']['usageType'] != ondemand_or_spot:
383
512
  continue
384
513
 
385
- gpu_name = row['AcceleratorName']
386
- if gpu_name == 'A100-80GB':
387
- gpu_name = 'A100 80GB'
388
- if gpu_name == 'H100':
389
- gpu_name = 'H100 80GB'
390
- if f'{gpu_name} GPU' not in sku['description']:
514
+ gpu_names = [row['AcceleratorName']]
515
+ if gpu_names[0] == 'A100-80GB':
516
+ gpu_names = ['A100 80GB']
517
+ if gpu_names[0] == 'H100':
518
+ gpu_names = ['H100 80GB']
519
+ if gpu_names[0] == 'H100-MEGA':
520
+ # Seems that H100-MEGA has two different descriptions in SKUs in
521
+ # different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
522
+ gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
523
+ if not any(f'{gpu_name} GPU' in sku['description']
524
+ for gpu_name in gpu_names):
391
525
  continue
392
526
 
393
527
  unit_price = _get_unit_price(sku)
@@ -414,34 +548,55 @@ def get_gpu_df(skus: List[Dict[str, Any]],
414
548
  return df
415
549
 
416
550
 
551
+ def _get_tpu_response_for_zone(zone: str) -> list:
552
+ parent = f'projects/{project_id}/locations/{zone}'
553
+ # Sometimes the response is empty ({}) even for enabled zones. Here we
554
+ # retry the request for a few times.
555
+ backoff = common_utils.Backoff(initial_backoff=1)
556
+ for _ in range(TPU_RETRY_CNT):
557
+ tpus_request = (
558
+ tpu_client.projects().locations().acceleratorTypes().list(
559
+ parent=parent))
560
+ try:
561
+ tpus_response = tpus_request.execute()
562
+ if 'acceleratorTypes' in tpus_response:
563
+ return tpus_response['acceleratorTypes']
564
+ except gcp.http_error_exception() as error:
565
+ if error.resp.status == 403:
566
+ print(' TPU API is not enabled or you don\'t have TPU access '
567
+ f'to zone: {zone!r}.')
568
+ else:
569
+ print(f' An error occurred: {error}')
570
+ # If error happens, fail early.
571
+ return []
572
+ time_to_sleep = backoff.current_backoff()
573
+ print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
574
+ time.sleep(time_to_sleep)
575
+ print(f'ERROR: Failed to fetch TPUs for zone {zone!r}.')
576
+ return []
577
+
578
+
417
579
  def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame':
580
+ # Use hardcoded TPU V5 data as it is invisible in some zones.
581
+ missing_tpus_df = pd.DataFrame(columns=[
582
+ 'AcceleratorName', 'AcceleratorCount', 'Region', 'AvailabilityZone'
583
+ ])
584
+ if zone in TPU_V5_MISSING_ZONES_DF:
585
+ missing_tpus_df = TPU_V5_MISSING_ZONES_DF[zone]
418
586
  tpus = []
419
- parent = f'projects/{project_id}/locations/{zone}'
420
- tpus_request = tpu_client.projects().locations().acceleratorTypes().list(
421
- parent=parent)
422
- try:
423
- tpus_response = tpus_request.execute()
424
- for tpu in tpus_response['acceleratorTypes']:
425
- tpus.append(tpu)
426
- except gcp.http_error_exception() as error:
427
- if error.resp.status == 403:
428
- print(' TPU API is not enabled or you don\'t have TPU access '
429
- f'to zone: {zone!r}.')
430
- else:
431
- print(f' An error occurred: {error}')
587
+ for tpu in _get_tpu_response_for_zone(zone):
588
+ tpus.append(tpu)
432
589
  new_tpus = []
433
590
  for tpu in tpus:
434
591
  tpu_name = tpu['type']
435
- # skip tpu v5 as we currently don't support it
436
- if 'v5' in tpu_name:
437
- continue
438
592
  new_tpus.append({
439
593
  'AcceleratorName': f'tpu-{tpu_name}',
440
594
  'AcceleratorCount': 1,
441
595
  'Region': zone.rpartition('-')[0],
442
596
  'AvailabilityZone': zone,
443
597
  })
444
- return pd.DataFrame(new_tpus).reset_index(drop=True)
598
+ new_tpu_df = pd.DataFrame(new_tpus).reset_index(drop=True)
599
+ return pd.concat([new_tpu_df, missing_tpus_df])
445
600
 
446
601
 
447
602
  def _get_tpus() -> 'pd.DataFrame':
@@ -458,11 +613,24 @@ def _get_tpus() -> 'pd.DataFrame':
458
613
 
459
614
 
460
615
  # TODO: the TPUs fetched fails to contain us-east1
461
- def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
616
+ def get_tpu_df(gce_skus: List[Dict[str, Any]],
617
+ tpu_skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
462
618
  df = _get_tpus()
463
619
  if df.empty:
464
620
  return df
465
621
 
622
+ def _get_tpu_description_str(tpu_version: str) -> str:
623
+ # TPU V5 has a different naming convention since it is contained in
624
+ # the GCE SKUs. v5p -> TpuV5p, v5litepod -> TpuV5e.
625
+ if tpu_version.startswith('v5'):
626
+ if tpu_version == 'v5p':
627
+ return 'TpuV5p'
628
+ assert tpu_version == 'v5litepod', tpu_version
629
+ return 'TpuV5e'
630
+ if tpu_version.startswith('v6e'):
631
+ return 'TpuV6e'
632
+ return f'Tpu-{tpu_version}'
633
+
466
634
  def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
467
635
  assert row['AcceleratorCount'] == 1, row
468
636
  tpu_price = None
@@ -475,9 +643,12 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
475
643
  # whether the TPU is a single device or a pod.
476
644
  # For TPU-v4, the pricing is uniform, and thus the pricing API
477
645
  # only provides the price of TPU-v4 pods.
478
- is_pod = num_cores > 8 or tpu_version == 'v4'
646
+ # The price shown for v5 & v6e TPU is per chip hour, so there is
647
+ # no 'Pod' keyword in the description.
648
+ is_pod = ((num_cores > 8 or tpu_version == 'v4') and
649
+ not tpu_version.startswith('v5') and tpu_version != 'v6e')
479
650
 
480
- for sku in skus:
651
+ for sku in gce_skus + tpu_skus:
481
652
  if tpu_region not in sku['serviceRegions']:
482
653
  continue
483
654
  description = sku['description']
@@ -489,7 +660,7 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
489
660
  if 'Preemptible' in description:
490
661
  continue
491
662
 
492
- if f'Tpu-{tpu_version}' not in description:
663
+ if _get_tpu_description_str(tpu_version) not in description:
493
664
  continue
494
665
  if is_pod:
495
666
  if 'Pod' not in description:
@@ -500,7 +671,17 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
500
671
 
501
672
  unit_price = _get_unit_price(sku)
502
673
  tpu_device_price = unit_price
503
- tpu_core_price = tpu_device_price / 8
674
+ # v5p naming convention is v$VERSION_NUMBERp-$CORES_COUNT, while
675
+ # v5e is v$VERSION_NUMBER-$CHIP_COUNT. In the same time, V5 price
676
+ # is shown as per chip price, which is 2 cores for v5p and 1 core
677
+ # for v5e. Reference here:
678
+ # https://cloud.google.com/tpu/docs/v5p#using-accelerator-type
679
+ # https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config
680
+ # v6e is also per chip price. Reference here:
681
+ # https://cloud.google.com/tpu/docs/v6e#configurations
682
+ core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else
683
+ 2 if tpu_version == 'v5p' else 8)
684
+ tpu_core_price = tpu_device_price / core_per_sku
504
685
  tpu_price = num_cores * tpu_core_price
505
686
  break
506
687
 
@@ -518,7 +699,13 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
518
699
  spot_str = 'spot ' if spot else ''
519
700
  print(f'The {spot_str}price of {tpu_name} in {tpu_region} is '
520
701
  'not found in SKUs or hidden TPU price DF.')
521
- assert spot or tpu_price is not None, (row, hidden_tpu, HIDDEN_TPU_DF)
702
+ if (tpu_name.startswith('tpu-v6e') and
703
+ tpu_region in TPU_V6E_MISSING_REGIONS):
704
+ if not spot:
705
+ tpu_price = 0.0
706
+ else:
707
+ assert spot or tpu_price is not None, (row, hidden_tpu,
708
+ HIDDEN_TPU_DF)
522
709
  return tpu_price
523
710
 
524
711
  df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
@@ -546,7 +733,8 @@ def get_catalog_df(region_prefix: str) -> 'pd.DataFrame':
546
733
  region_prefix)] if not gpu_df.empty else gpu_df
547
734
 
548
735
  gcp_tpu_skus = get_skus(TPU_SERVICE_ID)
549
- tpu_df = get_tpu_df(gcp_tpu_skus)
736
+ # TPU V5 SKU is not included in the TPU SKUs but in the GCE SKUs.
737
+ tpu_df = get_tpu_df(gcp_skus, gcp_tpu_skus)
550
738
 
551
739
  # Merge the dataframes.
552
740
  df = pd.concat([vm_df, gpu_df, tpu_df, TPU_V4_HOST_DF])
@@ -11,6 +11,7 @@ import argparse
11
11
  import csv
12
12
  import json
13
13
  import os
14
+ from typing import Optional, Tuple
14
15
 
15
16
  import requests
16
17
 
@@ -19,17 +20,21 @@ DEFAULT_LAMBDA_KEYS_PATH = os.path.expanduser('~/.lambda_cloud/lambda_keys')
19
20
 
20
21
  # List of all possible regions.
21
22
  REGIONS = [
22
- 'australia-southeast-1',
23
23
  'europe-central-1',
24
24
  'asia-south-1',
25
25
  'me-west-1',
26
26
  'europe-south-1',
27
27
  'asia-northeast-1',
28
28
  'asia-northeast-2',
29
+ 'australia-east-1',
29
30
  'us-east-1',
31
+ 'us-east-2',
32
+ 'us-east-3',
30
33
  'us-west-2',
31
34
  'us-west-1',
32
35
  'us-south-1',
36
+ 'us-south-2',
37
+ 'us-south-3',
33
38
  'us-west-3',
34
39
  'us-midwest-1',
35
40
  ]
@@ -43,18 +48,25 @@ GPU_TO_MEMORY = {
43
48
  'RTX6000': 24576,
44
49
  'V100': 16384,
45
50
  'H100': 81920,
51
+ 'GH200': 98304,
52
+ 'GENERAL': None
46
53
  }
47
54
 
48
55
 
49
- def name_to_gpu(name: str) -> str:
56
+ def name_to_gpu_and_cnt(name: str) -> Optional[Tuple[str, int]]:
57
+ """Extract GPU and count from instance type name.
58
+
59
+ The instance type name is in the format:
60
+ 'gpu_{gpu_count}x_{gpu_name}_<suffix>'.
61
+ """
50
62
  # Edge case
51
63
  if name == 'gpu_8x_a100_80gb_sxm4':
52
- return 'A100-80GB'
53
- return name.split('_')[2].upper()
54
-
55
-
56
- def name_to_gpu_cnt(name: str) -> int:
57
- return int(name.split('_')[1].replace('x', ''))
64
+ return 'A100-80GB', 8
65
+ gpu = name.split('_')[2].upper()
66
+ if gpu == 'GENERAL':
67
+ return None
68
+ gpu_cnt = int(name.split('_')[1].replace('x', ''))
69
+ return gpu, gpu_cnt
58
70
 
59
71
 
60
72
  def create_catalog(api_key: str, output_path: str) -> None:
@@ -71,24 +83,32 @@ def create_catalog(api_key: str, output_path: str) -> None:
71
83
  # We parse info.keys() in reverse order so gpu_1x_a100_sxm4 comes before
72
84
  # gpu_1x_a100 in the catalog (gpu_1x_a100_sxm4 has more availability).
73
85
  for vm in reversed(list(info.keys())):
74
- gpu = name_to_gpu(vm)
75
- gpu_cnt = float(name_to_gpu_cnt(vm))
86
+ gpu_and_cnt = name_to_gpu_and_cnt(vm)
87
+ gpu: Optional[str]
88
+ gpu_cnt: Optional[float]
89
+ if gpu_and_cnt is None:
90
+ gpu, gpu_cnt = None, None
91
+ else:
92
+ gpu = gpu_and_cnt[0]
93
+ gpu_cnt = float(gpu_and_cnt[1])
76
94
  vcpus = float(info[vm]['instance_type']['specs']['vcpus'])
77
95
  mem = float(info[vm]['instance_type']['specs']['memory_gib'])
78
- price = float(info[vm]['instance_type']\
79
- ['price_cents_per_hour']) / 100
80
- gpuinfo = {
81
- 'Gpus': [{
82
- 'Name': gpu,
83
- 'Manufacturer': 'NVIDIA',
84
- 'Count': gpu_cnt,
85
- 'MemoryInfo': {
86
- 'SizeInMiB': GPU_TO_MEMORY[gpu]
87
- },
88
- }],
89
- 'TotalGpuMemoryInMiB': GPU_TO_MEMORY[gpu]
90
- }
91
- gpuinfo = json.dumps(gpuinfo).replace('"', "'") # pylint: disable=invalid-string-quote
96
+ price = (float(info[vm]['instance_type']['price_cents_per_hour']) /
97
+ 100)
98
+ gpuinfo: Optional[str] = None
99
+ if gpu is not None:
100
+ gpuinfo_dict = {
101
+ 'Gpus': [{
102
+ 'Name': gpu,
103
+ 'Manufacturer': 'NVIDIA',
104
+ 'Count': gpu_cnt,
105
+ 'MemoryInfo': {
106
+ 'SizeInMiB': GPU_TO_MEMORY[gpu]
107
+ },
108
+ }],
109
+ 'TotalGpuMemoryInMiB': GPU_TO_MEMORY[gpu]
110
+ }
111
+ gpuinfo = json.dumps(gpuinfo_dict).replace('"', "'") # pylint: disable=invalid-string-quote
92
112
  for r in REGIONS:
93
113
  writer.writerow(
94
114
  [vm, gpu, gpu_cnt, vcpus, mem, price, r, gpuinfo, ''])
@@ -0,0 +1,147 @@
1
+ """A script that generates the Vast Cloud catalog. """
2
+
3
+ #
4
+ # Due to the design of the sdk, pylint has a false
5
+ # positive for the functions.
6
+ #
7
+ # pylint: disable=assignment-from-no-return
8
+ import collections
9
+ import csv
10
+ import json
11
+ import math
12
+ import re
13
+ import sys
14
+ from typing import Any, Dict, List
15
+
16
+ from sky.adaptors import vast
17
+
18
+ _map = {
19
+ 'TeslaV100': 'V100',
20
+ 'TeslaT4': 'T4',
21
+ 'TeslaP100': 'P100',
22
+ 'QRTX6000': 'RTX6000',
23
+ 'QRTX8000': 'RTX8000'
24
+ }
25
+
26
+
27
+ def create_instance_type(obj: Dict[str, Any]) -> str:
28
+ stubify = lambda x: re.sub(r'\s', '_', x)
29
+ return '{}x-{}-{}-{}'.format(obj['num_gpus'], stubify(obj['gpu_name']),
30
+ obj['cpu_cores'], obj['cpu_ram'])
31
+
32
+
33
+ def dot_get(d: dict, key: str) -> Any:
34
+ for k in key.split('.'):
35
+ d = d[k]
36
+ return d
37
+
38
+
39
+ if __name__ == '__main__':
40
+ seen = set()
41
+ # InstanceType and gpuInfo are basically just stubs
42
+ # so that the dictwriter is happy without weird
43
+ # code.
44
+ mapped_keys = (('gpu_name', 'InstanceType'), ('gpu_name',
45
+ 'AcceleratorName'),
46
+ ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
47
+ ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
48
+ ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
49
+ ('geolocation', 'Region'))
50
+ writer = csv.DictWriter(sys.stdout, fieldnames=[x[1] for x in mapped_keys])
51
+ writer.writeheader()
52
+
53
+ # Vast has a wide variety of machines, some of
54
+ # which will have less diskspace and network
55
+ # bandwidth than others.
56
+ #
57
+ # The machine normally have high specificity
58
+ # in the vast catalog - this is fairly unique
59
+ # to Vast and can make bucketing them into
60
+ # instance types difficult.
61
+ #
62
+ # The flags
63
+ #
64
+ # * georegion consolidates geographic areas
65
+ #
66
+ # * chunked rounds down specifications (such
67
+ # as 1025GB to 1024GB disk) in order to
68
+ # make machine specifications look more
69
+ # consistent
70
+ #
71
+ # * inet_down makes sure that only machines
72
+ # with "reasonable" downlink speed are
73
+ # considered
74
+ #
75
+ # * disk_space sets a lower limit of how
76
+ # much space is available to be allocated
77
+ # in order to ensure that machines with
78
+ # small disk pools aren't listed
79
+ #
80
+ offerList = vast.vast().search_offers(
81
+ query=('georegion = true chunked = true '
82
+ 'inet_down >= 100 disk_space >= 80'),
83
+ limit=10000)
84
+
85
+ priceMap: Dict[str, List] = collections.defaultdict(list)
86
+ for offer in offerList:
87
+ entry = {}
88
+ for ours, theirs in mapped_keys:
89
+ field = dot_get(offer, ours)
90
+ entry[theirs] = field
91
+
92
+ instance_type = create_instance_type(offer)
93
+ entry['InstanceType'] = instance_type
94
+
95
+ # the documentation says
96
+ # "{'gpus': [{
97
+ # 'name': 'v100',
98
+ # 'manufacturer': 'nvidia',
99
+ # 'count': 8.0,
100
+ # 'memoryinfo': {'sizeinmib': 16384}
101
+ # }],
102
+ # 'totalgpumemoryinmib': 16384}",
103
+ # we can do that.
104
+ entry['MemoryGiB'] /= 1024
105
+
106
+ gpu = re.sub('Ada', '-Ada', re.sub(r'\s', '', offer['gpu_name']))
107
+ gpu = re.sub(r'(Ti|PCIE|SXM4|SXM|NVL)$', '', gpu)
108
+ gpu = re.sub(r'(RTX\d0\d0)(S|D)$', r'\1', gpu)
109
+
110
+ if gpu in _map:
111
+ gpu = _map[gpu]
112
+
113
+ entry['AcceleratorName'] = gpu
114
+ entry['GpuInfo'] = json.dumps({
115
+ 'Gpus': [{
116
+ 'Name': gpu,
117
+ 'Count': offer['num_gpus'],
118
+ 'MemoryInfo': {
119
+ 'SizeInMiB': offer['gpu_total_ram']
120
+ }
121
+ }],
122
+ 'TotalGpuMemoryInMiB': offer['gpu_total_ram']
123
+ }).replace('"', '\'')
124
+
125
+ priceMap[instance_type].append(entry)
126
+
127
+ for instanceList in priceMap.values():
128
+ priceList = sorted([x['Price'] for x in instanceList])
129
+ index = math.ceil(0.5 * len(priceList)) - 1
130
+ priceTarget = priceList[index]
131
+ toList: List = []
132
+ for instance in instanceList:
133
+ if instance['Price'] <= priceTarget:
134
+ instance['Price'] = '{:.2f}'.format(priceTarget)
135
+ toList.append(instance)
136
+
137
+ maxBid = max([x.get('SpotPrice') for x in toList])
138
+ for instance in toList:
139
+ stub = f'{instance["InstanceType"]} {instance["Region"][-2:]}'
140
+ if stub in seen:
141
+ printstub = f'{stub}#print'
142
+ if printstub not in seen:
143
+ instance['SpotPrice'] = f'{maxBid:.2f}'
144
+ writer.writerow(instance)
145
+ seen.add(printstub)
146
+ else:
147
+ seen.add(stub)
@@ -534,7 +534,7 @@ def initialize_images_csv(csv_saving_path: str, vc_object,
534
534
  gpu_name = tag_name.split('-')[1]
535
535
  if gpu_name not in gpu_tags:
536
536
  gpu_tags.append(gpu_name)
537
- if len(gpu_tags) > 0:
537
+ if gpu_tags:
538
538
  gpu_tags_str = str(gpu_tags).replace('\'', '\"')
539
539
  f.write(f'{item.id},{vcenter_name},{item_cpu},{item_memory}'
540
540
  f',,,\'{gpu_tags_str}\'\n')