skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,11 @@ VMs, GPUs, and TPUs. The script takes about 1-2 minutes to run.
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import argparse
|
8
|
-
import functools
|
9
8
|
import io
|
10
9
|
import multiprocessing
|
11
10
|
import os
|
12
11
|
import textwrap
|
12
|
+
import time
|
13
13
|
import typing
|
14
14
|
from typing import Any, Callable, Dict, List, Optional, Set
|
15
15
|
|
@@ -19,6 +19,8 @@ import numpy as np
|
|
19
19
|
|
20
20
|
from sky.adaptors import common as adaptors_common
|
21
21
|
from sky.adaptors import gcp
|
22
|
+
from sky.utils import annotations
|
23
|
+
from sky.utils import common_utils
|
22
24
|
|
23
25
|
if typing.TYPE_CHECKING:
|
24
26
|
import pandas as pd
|
@@ -38,6 +40,9 @@ TPU_SERVICE_ID = 'E000-3F24-B8AA'
|
|
38
40
|
# The number of digits to round the price to.
|
39
41
|
PRICE_ROUNDING = 5
|
40
42
|
|
43
|
+
# The number of retries for the TPU API.
|
44
|
+
TPU_RETRY_CNT = 3
|
45
|
+
|
41
46
|
# This zone is only for TPU v4, and does not appear in the skus yet.
|
42
47
|
TPU_V4_ZONES = ['us-central2-b']
|
43
48
|
# TPU v3 pods are available in us-east1-d, but hidden in the skus.
|
@@ -54,6 +59,113 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
54
59
|
,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
|
55
60
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
56
61
|
""")))
|
62
|
+
|
63
|
+
# TPU V6e price for the following regions is missing in the SKUs.
|
64
|
+
TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
|
65
|
+
|
66
|
+
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
67
|
+
# NOTE(dev): Keep the zones and the df in sync.
|
68
|
+
TPU_V5_MISSING_ZONES_DF = {
|
69
|
+
'europe-west4-b': pd.read_csv(
|
70
|
+
io.StringIO(
|
71
|
+
textwrap.dedent("""\
|
72
|
+
AcceleratorName,AcceleratorCount,Region,AvailabilityZone
|
73
|
+
tpu-v5p-8,1,europe-west4,europe-west4-b
|
74
|
+
tpu-v5p-16,1,europe-west4,europe-west4-b
|
75
|
+
tpu-v5p-32,1,europe-west4,europe-west4-b
|
76
|
+
tpu-v5p-64,1,europe-west4,europe-west4-b
|
77
|
+
tpu-v5p-128,1,europe-west4,europe-west4-b
|
78
|
+
tpu-v5p-256,1,europe-west4,europe-west4-b
|
79
|
+
tpu-v5p-384,1,europe-west4,europe-west4-b
|
80
|
+
tpu-v5p-512,1,europe-west4,europe-west4-b
|
81
|
+
tpu-v5p-640,1,europe-west4,europe-west4-b
|
82
|
+
tpu-v5p-768,1,europe-west4,europe-west4-b
|
83
|
+
tpu-v5p-896,1,europe-west4,europe-west4-b
|
84
|
+
tpu-v5p-1024,1,europe-west4,europe-west4-b
|
85
|
+
tpu-v5p-1152,1,europe-west4,europe-west4-b
|
86
|
+
tpu-v5p-1280,1,europe-west4,europe-west4-b
|
87
|
+
tpu-v5p-1408,1,europe-west4,europe-west4-b
|
88
|
+
tpu-v5p-1536,1,europe-west4,europe-west4-b
|
89
|
+
tpu-v5p-1664,1,europe-west4,europe-west4-b
|
90
|
+
tpu-v5p-1792,1,europe-west4,europe-west4-b
|
91
|
+
tpu-v5p-1920,1,europe-west4,europe-west4-b
|
92
|
+
tpu-v5p-2048,1,europe-west4,europe-west4-b
|
93
|
+
tpu-v5p-2176,1,europe-west4,europe-west4-b
|
94
|
+
tpu-v5p-2304,1,europe-west4,europe-west4-b
|
95
|
+
tpu-v5p-2432,1,europe-west4,europe-west4-b
|
96
|
+
tpu-v5p-2560,1,europe-west4,europe-west4-b
|
97
|
+
tpu-v5p-2688,1,europe-west4,europe-west4-b
|
98
|
+
tpu-v5p-2816,1,europe-west4,europe-west4-b
|
99
|
+
tpu-v5p-2944,1,europe-west4,europe-west4-b
|
100
|
+
tpu-v5p-3072,1,europe-west4,europe-west4-b
|
101
|
+
tpu-v5p-3200,1,europe-west4,europe-west4-b
|
102
|
+
tpu-v5p-3328,1,europe-west4,europe-west4-b
|
103
|
+
tpu-v5p-3456,1,europe-west4,europe-west4-b
|
104
|
+
tpu-v5p-3584,1,europe-west4,europe-west4-b
|
105
|
+
tpu-v5p-3712,1,europe-west4,europe-west4-b
|
106
|
+
tpu-v5p-3840,1,europe-west4,europe-west4-b
|
107
|
+
tpu-v5p-3968,1,europe-west4,europe-west4-b
|
108
|
+
tpu-v5p-4096,1,europe-west4,europe-west4-b
|
109
|
+
tpu-v5p-4224,1,europe-west4,europe-west4-b
|
110
|
+
tpu-v5p-4352,1,europe-west4,europe-west4-b
|
111
|
+
tpu-v5p-4480,1,europe-west4,europe-west4-b
|
112
|
+
tpu-v5p-4608,1,europe-west4,europe-west4-b
|
113
|
+
tpu-v5p-4736,1,europe-west4,europe-west4-b
|
114
|
+
tpu-v5p-4864,1,europe-west4,europe-west4-b
|
115
|
+
tpu-v5p-4992,1,europe-west4,europe-west4-b
|
116
|
+
tpu-v5p-5120,1,europe-west4,europe-west4-b
|
117
|
+
tpu-v5p-5248,1,europe-west4,europe-west4-b
|
118
|
+
tpu-v5p-5376,1,europe-west4,europe-west4-b
|
119
|
+
tpu-v5p-5504,1,europe-west4,europe-west4-b
|
120
|
+
tpu-v5p-5632,1,europe-west4,europe-west4-b
|
121
|
+
tpu-v5p-5760,1,europe-west4,europe-west4-b
|
122
|
+
tpu-v5p-5888,1,europe-west4,europe-west4-b
|
123
|
+
tpu-v5p-6016,1,europe-west4,europe-west4-b
|
124
|
+
tpu-v5p-6144,1,europe-west4,europe-west4-b
|
125
|
+
tpu-v5p-6272,1,europe-west4,europe-west4-b
|
126
|
+
tpu-v5p-6400,1,europe-west4,europe-west4-b
|
127
|
+
tpu-v5p-6528,1,europe-west4,europe-west4-b
|
128
|
+
tpu-v5p-6656,1,europe-west4,europe-west4-b
|
129
|
+
tpu-v5p-6784,1,europe-west4,europe-west4-b
|
130
|
+
tpu-v5p-6912,1,europe-west4,europe-west4-b
|
131
|
+
tpu-v5p-7040,1,europe-west4,europe-west4-b
|
132
|
+
tpu-v5p-7168,1,europe-west4,europe-west4-b
|
133
|
+
tpu-v5p-7296,1,europe-west4,europe-west4-b
|
134
|
+
tpu-v5p-7424,1,europe-west4,europe-west4-b
|
135
|
+
tpu-v5p-7552,1,europe-west4,europe-west4-b
|
136
|
+
tpu-v5p-7680,1,europe-west4,europe-west4-b
|
137
|
+
tpu-v5p-7808,1,europe-west4,europe-west4-b
|
138
|
+
tpu-v5p-7936,1,europe-west4,europe-west4-b
|
139
|
+
tpu-v5p-8064,1,europe-west4,europe-west4-b
|
140
|
+
tpu-v5p-8192,1,europe-west4,europe-west4-b
|
141
|
+
tpu-v5p-8320,1,europe-west4,europe-west4-b
|
142
|
+
tpu-v5p-8448,1,europe-west4,europe-west4-b
|
143
|
+
tpu-v5p-8704,1,europe-west4,europe-west4-b
|
144
|
+
tpu-v5p-8832,1,europe-west4,europe-west4-b
|
145
|
+
tpu-v5p-8960,1,europe-west4,europe-west4-b
|
146
|
+
tpu-v5p-9216,1,europe-west4,europe-west4-b
|
147
|
+
tpu-v5p-9472,1,europe-west4,europe-west4-b
|
148
|
+
tpu-v5p-9600,1,europe-west4,europe-west4-b
|
149
|
+
tpu-v5p-9728,1,europe-west4,europe-west4-b
|
150
|
+
tpu-v5p-9856,1,europe-west4,europe-west4-b
|
151
|
+
tpu-v5p-9984,1,europe-west4,europe-west4-b
|
152
|
+
tpu-v5p-10240,1,europe-west4,europe-west4-b
|
153
|
+
tpu-v5p-10368,1,europe-west4,europe-west4-b
|
154
|
+
tpu-v5p-10496,1,europe-west4,europe-west4-b
|
155
|
+
tpu-v5p-10752,1,europe-west4,europe-west4-b
|
156
|
+
tpu-v5p-10880,1,europe-west4,europe-west4-b
|
157
|
+
tpu-v5p-11008,1,europe-west4,europe-west4-b
|
158
|
+
tpu-v5p-11136,1,europe-west4,europe-west4-b
|
159
|
+
tpu-v5p-11264,1,europe-west4,europe-west4-b
|
160
|
+
tpu-v5p-11520,1,europe-west4,europe-west4-b
|
161
|
+
tpu-v5p-11648,1,europe-west4,europe-west4-b
|
162
|
+
tpu-v5p-11776,1,europe-west4,europe-west4-b
|
163
|
+
tpu-v5p-11904,1,europe-west4,europe-west4-b
|
164
|
+
tpu-v5p-12032,1,europe-west4,europe-west4-b
|
165
|
+
tpu-v5p-12160,1,europe-west4,europe-west4-b
|
166
|
+
tpu-v5p-12288,1,europe-west4,europe-west4-b
|
167
|
+
""")))
|
168
|
+
}
|
57
169
|
# FIXME(woosuk): Remove this once the bug is fixed.
|
58
170
|
# See https://github.com/skypilot-org/skypilot/issues/1759#issue-1619614345
|
59
171
|
TPU_V4_HOST_DF = pd.read_csv(
|
@@ -169,7 +281,7 @@ def filter_zones(func: Callable[[], List[str]]) -> Callable[[], List[str]]:
|
|
169
281
|
|
170
282
|
|
171
283
|
@filter_zones
|
172
|
-
@
|
284
|
+
@annotations.lru_cache(scope='global', maxsize=None)
|
173
285
|
def _get_all_zones() -> List[str]:
|
174
286
|
zones_request = gcp_client.zones().list(project=project_id)
|
175
287
|
zones = []
|
@@ -225,7 +337,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
225
337
|
df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
|
226
338
|
|
227
339
|
# TODO(woosuk): Make this more efficient.
|
228
|
-
def get_vm_price(row: pd.Series, spot: bool) -> float:
|
340
|
+
def get_vm_price(row: pd.Series, spot: bool) -> Optional[float]:
|
229
341
|
series = row['InstanceType'].split('-')[0].lower()
|
230
342
|
|
231
343
|
ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
|
@@ -276,12 +388,26 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
|
|
276
388
|
if series in ['f1', 'g1']:
|
277
389
|
memory_price = 0.0
|
278
390
|
|
279
|
-
|
280
|
-
|
391
|
+
# TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
|
392
|
+
# skip them in the catalog for now. We should investigate why they are
|
393
|
+
# missing and add them back.
|
394
|
+
if cpu_price is None or memory_price is None:
|
395
|
+
return None
|
281
396
|
return cpu_price + memory_price
|
282
397
|
|
283
398
|
df['Price'] = df.apply(lambda row: get_vm_price(row, spot=False), axis=1)
|
284
399
|
df['SpotPrice'] = df.apply(lambda row: get_vm_price(row, spot=True), axis=1)
|
400
|
+
dropped_rows = df[df['Price'].isna() & df['SpotPrice'].isna()]
|
401
|
+
dropped_info = (dropped_rows[['InstanceType',
|
402
|
+
'AvailabilityZone']].drop_duplicates())
|
403
|
+
az2missing = dropped_info.groupby('AvailabilityZone').apply(
|
404
|
+
lambda x: x['InstanceType'].tolist())
|
405
|
+
print('Price not found for the following zones and instance types. '
|
406
|
+
'Dropping them.')
|
407
|
+
for az, instances in az2missing.items():
|
408
|
+
print('-' * 30, az, '-' * 30)
|
409
|
+
print(', '.join(instances))
|
410
|
+
df = df.dropna(subset=['Price', 'SpotPrice'], how='all')
|
285
411
|
df = df.reset_index(drop=True)
|
286
412
|
df = df.sort_values(['InstanceType', 'Region', 'AvailabilityZone'])
|
287
413
|
return df
|
@@ -307,8 +433,10 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
|
|
307
433
|
gpu_name = gpu_name.upper()
|
308
434
|
if 'H100-80GB' in gpu_name:
|
309
435
|
gpu_name = 'H100'
|
436
|
+
if 'H100-MEGA-80GB' in gpu_name:
|
437
|
+
gpu_name = 'H100-MEGA'
|
310
438
|
if count != 8:
|
311
|
-
# H100 only has 8 cards.
|
439
|
+
# H100-MEGA only has 8 cards.
|
312
440
|
continue
|
313
441
|
if 'VWS' in gpu_name:
|
314
442
|
continue
|
@@ -338,6 +466,7 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
|
|
338
466
|
'A100-80GB': 80 * 1024,
|
339
467
|
'A100': 40 * 1024,
|
340
468
|
'H100': 80 * 1024,
|
469
|
+
'H100-MEGA': 80 * 1024,
|
341
470
|
'P4': 8 * 1024,
|
342
471
|
'T4': 16 * 1024,
|
343
472
|
'V100': 16 * 1024,
|
@@ -382,12 +511,17 @@ def get_gpu_df(skus: List[Dict[str, Any]],
|
|
382
511
|
if sku['category']['usageType'] != ondemand_or_spot:
|
383
512
|
continue
|
384
513
|
|
385
|
-
|
386
|
-
if
|
387
|
-
|
388
|
-
if
|
389
|
-
|
390
|
-
if
|
514
|
+
gpu_names = [row['AcceleratorName']]
|
515
|
+
if gpu_names[0] == 'A100-80GB':
|
516
|
+
gpu_names = ['A100 80GB']
|
517
|
+
if gpu_names[0] == 'H100':
|
518
|
+
gpu_names = ['H100 80GB']
|
519
|
+
if gpu_names[0] == 'H100-MEGA':
|
520
|
+
# Seems that H100-MEGA has two different descriptions in SKUs in
|
521
|
+
# different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
|
522
|
+
gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
|
523
|
+
if not any(f'{gpu_name} GPU' in sku['description']
|
524
|
+
for gpu_name in gpu_names):
|
391
525
|
continue
|
392
526
|
|
393
527
|
unit_price = _get_unit_price(sku)
|
@@ -414,34 +548,55 @@ def get_gpu_df(skus: List[Dict[str, Any]],
|
|
414
548
|
return df
|
415
549
|
|
416
550
|
|
551
|
+
def _get_tpu_response_for_zone(zone: str) -> list:
|
552
|
+
parent = f'projects/{project_id}/locations/{zone}'
|
553
|
+
# Sometimes the response is empty ({}) even for enabled zones. Here we
|
554
|
+
# retry the request for a few times.
|
555
|
+
backoff = common_utils.Backoff(initial_backoff=1)
|
556
|
+
for _ in range(TPU_RETRY_CNT):
|
557
|
+
tpus_request = (
|
558
|
+
tpu_client.projects().locations().acceleratorTypes().list(
|
559
|
+
parent=parent))
|
560
|
+
try:
|
561
|
+
tpus_response = tpus_request.execute()
|
562
|
+
if 'acceleratorTypes' in tpus_response:
|
563
|
+
return tpus_response['acceleratorTypes']
|
564
|
+
except gcp.http_error_exception() as error:
|
565
|
+
if error.resp.status == 403:
|
566
|
+
print(' TPU API is not enabled or you don\'t have TPU access '
|
567
|
+
f'to zone: {zone!r}.')
|
568
|
+
else:
|
569
|
+
print(f' An error occurred: {error}')
|
570
|
+
# If error happens, fail early.
|
571
|
+
return []
|
572
|
+
time_to_sleep = backoff.current_backoff()
|
573
|
+
print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
|
574
|
+
time.sleep(time_to_sleep)
|
575
|
+
print(f'ERROR: Failed to fetch TPUs for zone {zone!r}.')
|
576
|
+
return []
|
577
|
+
|
578
|
+
|
417
579
|
def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame':
|
580
|
+
# Use hardcoded TPU V5 data as it is invisible in some zones.
|
581
|
+
missing_tpus_df = pd.DataFrame(columns=[
|
582
|
+
'AcceleratorName', 'AcceleratorCount', 'Region', 'AvailabilityZone'
|
583
|
+
])
|
584
|
+
if zone in TPU_V5_MISSING_ZONES_DF:
|
585
|
+
missing_tpus_df = TPU_V5_MISSING_ZONES_DF[zone]
|
418
586
|
tpus = []
|
419
|
-
|
420
|
-
|
421
|
-
parent=parent)
|
422
|
-
try:
|
423
|
-
tpus_response = tpus_request.execute()
|
424
|
-
for tpu in tpus_response['acceleratorTypes']:
|
425
|
-
tpus.append(tpu)
|
426
|
-
except gcp.http_error_exception() as error:
|
427
|
-
if error.resp.status == 403:
|
428
|
-
print(' TPU API is not enabled or you don\'t have TPU access '
|
429
|
-
f'to zone: {zone!r}.')
|
430
|
-
else:
|
431
|
-
print(f' An error occurred: {error}')
|
587
|
+
for tpu in _get_tpu_response_for_zone(zone):
|
588
|
+
tpus.append(tpu)
|
432
589
|
new_tpus = []
|
433
590
|
for tpu in tpus:
|
434
591
|
tpu_name = tpu['type']
|
435
|
-
# skip tpu v5 as we currently don't support it
|
436
|
-
if 'v5' in tpu_name:
|
437
|
-
continue
|
438
592
|
new_tpus.append({
|
439
593
|
'AcceleratorName': f'tpu-{tpu_name}',
|
440
594
|
'AcceleratorCount': 1,
|
441
595
|
'Region': zone.rpartition('-')[0],
|
442
596
|
'AvailabilityZone': zone,
|
443
597
|
})
|
444
|
-
|
598
|
+
new_tpu_df = pd.DataFrame(new_tpus).reset_index(drop=True)
|
599
|
+
return pd.concat([new_tpu_df, missing_tpus_df])
|
445
600
|
|
446
601
|
|
447
602
|
def _get_tpus() -> 'pd.DataFrame':
|
@@ -458,11 +613,24 @@ def _get_tpus() -> 'pd.DataFrame':
|
|
458
613
|
|
459
614
|
|
460
615
|
# TODO: the TPUs fetched fails to contain us-east1
|
461
|
-
def get_tpu_df(
|
616
|
+
def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
617
|
+
tpu_skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
|
462
618
|
df = _get_tpus()
|
463
619
|
if df.empty:
|
464
620
|
return df
|
465
621
|
|
622
|
+
def _get_tpu_description_str(tpu_version: str) -> str:
|
623
|
+
# TPU V5 has a different naming convention since it is contained in
|
624
|
+
# the GCE SKUs. v5p -> TpuV5p, v5litepod -> TpuV5e.
|
625
|
+
if tpu_version.startswith('v5'):
|
626
|
+
if tpu_version == 'v5p':
|
627
|
+
return 'TpuV5p'
|
628
|
+
assert tpu_version == 'v5litepod', tpu_version
|
629
|
+
return 'TpuV5e'
|
630
|
+
if tpu_version.startswith('v6e'):
|
631
|
+
return 'TpuV6e'
|
632
|
+
return f'Tpu-{tpu_version}'
|
633
|
+
|
466
634
|
def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
|
467
635
|
assert row['AcceleratorCount'] == 1, row
|
468
636
|
tpu_price = None
|
@@ -475,9 +643,12 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
|
|
475
643
|
# whether the TPU is a single device or a pod.
|
476
644
|
# For TPU-v4, the pricing is uniform, and thus the pricing API
|
477
645
|
# only provides the price of TPU-v4 pods.
|
478
|
-
|
646
|
+
# The price shown for v5 & v6e TPU is per chip hour, so there is
|
647
|
+
# no 'Pod' keyword in the description.
|
648
|
+
is_pod = ((num_cores > 8 or tpu_version == 'v4') and
|
649
|
+
not tpu_version.startswith('v5') and tpu_version != 'v6e')
|
479
650
|
|
480
|
-
for sku in
|
651
|
+
for sku in gce_skus + tpu_skus:
|
481
652
|
if tpu_region not in sku['serviceRegions']:
|
482
653
|
continue
|
483
654
|
description = sku['description']
|
@@ -489,7 +660,7 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
|
|
489
660
|
if 'Preemptible' in description:
|
490
661
|
continue
|
491
662
|
|
492
|
-
if
|
663
|
+
if _get_tpu_description_str(tpu_version) not in description:
|
493
664
|
continue
|
494
665
|
if is_pod:
|
495
666
|
if 'Pod' not in description:
|
@@ -500,7 +671,17 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
|
|
500
671
|
|
501
672
|
unit_price = _get_unit_price(sku)
|
502
673
|
tpu_device_price = unit_price
|
503
|
-
|
674
|
+
# v5p naming convention is v$VERSION_NUMBERp-$CORES_COUNT, while
|
675
|
+
# v5e is v$VERSION_NUMBER-$CHIP_COUNT. In the same time, V5 price
|
676
|
+
# is shown as per chip price, which is 2 cores for v5p and 1 core
|
677
|
+
# for v5e. Reference here:
|
678
|
+
# https://cloud.google.com/tpu/docs/v5p#using-accelerator-type
|
679
|
+
# https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config
|
680
|
+
# v6e is also per chip price. Reference here:
|
681
|
+
# https://cloud.google.com/tpu/docs/v6e#configurations
|
682
|
+
core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else
|
683
|
+
2 if tpu_version == 'v5p' else 8)
|
684
|
+
tpu_core_price = tpu_device_price / core_per_sku
|
504
685
|
tpu_price = num_cores * tpu_core_price
|
505
686
|
break
|
506
687
|
|
@@ -518,7 +699,13 @@ def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame':
|
|
518
699
|
spot_str = 'spot ' if spot else ''
|
519
700
|
print(f'The {spot_str}price of {tpu_name} in {tpu_region} is '
|
520
701
|
'not found in SKUs or hidden TPU price DF.')
|
521
|
-
|
702
|
+
if (tpu_name.startswith('tpu-v6e') and
|
703
|
+
tpu_region in TPU_V6E_MISSING_REGIONS):
|
704
|
+
if not spot:
|
705
|
+
tpu_price = 0.0
|
706
|
+
else:
|
707
|
+
assert spot or tpu_price is not None, (row, hidden_tpu,
|
708
|
+
HIDDEN_TPU_DF)
|
522
709
|
return tpu_price
|
523
710
|
|
524
711
|
df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
|
@@ -546,7 +733,8 @@ def get_catalog_df(region_prefix: str) -> 'pd.DataFrame':
|
|
546
733
|
region_prefix)] if not gpu_df.empty else gpu_df
|
547
734
|
|
548
735
|
gcp_tpu_skus = get_skus(TPU_SERVICE_ID)
|
549
|
-
|
736
|
+
# TPU V5 SKU is not included in the TPU SKUs but in the GCE SKUs.
|
737
|
+
tpu_df = get_tpu_df(gcp_skus, gcp_tpu_skus)
|
550
738
|
|
551
739
|
# Merge the dataframes.
|
552
740
|
df = pd.concat([vm_df, gpu_df, tpu_df, TPU_V4_HOST_DF])
|
@@ -11,6 +11,7 @@ import argparse
|
|
11
11
|
import csv
|
12
12
|
import json
|
13
13
|
import os
|
14
|
+
from typing import Optional, Tuple
|
14
15
|
|
15
16
|
import requests
|
16
17
|
|
@@ -19,17 +20,21 @@ DEFAULT_LAMBDA_KEYS_PATH = os.path.expanduser('~/.lambda_cloud/lambda_keys')
|
|
19
20
|
|
20
21
|
# List of all possible regions.
|
21
22
|
REGIONS = [
|
22
|
-
'australia-southeast-1',
|
23
23
|
'europe-central-1',
|
24
24
|
'asia-south-1',
|
25
25
|
'me-west-1',
|
26
26
|
'europe-south-1',
|
27
27
|
'asia-northeast-1',
|
28
28
|
'asia-northeast-2',
|
29
|
+
'australia-east-1',
|
29
30
|
'us-east-1',
|
31
|
+
'us-east-2',
|
32
|
+
'us-east-3',
|
30
33
|
'us-west-2',
|
31
34
|
'us-west-1',
|
32
35
|
'us-south-1',
|
36
|
+
'us-south-2',
|
37
|
+
'us-south-3',
|
33
38
|
'us-west-3',
|
34
39
|
'us-midwest-1',
|
35
40
|
]
|
@@ -43,18 +48,25 @@ GPU_TO_MEMORY = {
|
|
43
48
|
'RTX6000': 24576,
|
44
49
|
'V100': 16384,
|
45
50
|
'H100': 81920,
|
51
|
+
'GH200': 98304,
|
52
|
+
'GENERAL': None
|
46
53
|
}
|
47
54
|
|
48
55
|
|
49
|
-
def
|
56
|
+
def name_to_gpu_and_cnt(name: str) -> Optional[Tuple[str, int]]:
    """Extract the GPU name and GPU count from an instance type name.

    The instance type name is in the format:
      '{kind}_{gpu_count}x_{gpu_name}[_<suffix>...]'
    for example 'gpu_1x_a100_sxm4', 'gpu_8x_a100_80gb_sxm4' or
    'cpu_4x_general'.

    Args:
        name: The instance type name.

    Returns:
        A (gpu_name, gpu_count) tuple with the GPU name upper-cased, or None
        when the GPU field is 'general' (no accelerator attached). An A100
        whose first suffix is '80gb' is reported as 'A100-80GB'.

    Raises:
        ValueError: If the name has fewer than three '_'-separated fields or
            the count field is not of the form '<int>x'.
    """
    parts = name.split('_')
    if len(parts) < 3:
        raise ValueError(
            f'Unexpected instance type name {name!r}: expected the format '
            '\'gpu_{gpu_count}x_{gpu_name}_<suffix>\'.')

    gpu = parts[2].upper()
    if gpu == 'GENERAL':
        return None

    try:
        gpu_cnt = int(parts[1].replace('x', ''))
    except ValueError as e:
        raise ValueError(
            f'Unexpected GPU count {parts[1]!r} in instance type name '
            f'{name!r}.') from e

    # Edge case: the 80GB A100 variant is only distinguishable by the first
    # suffix (e.g. 'gpu_8x_a100_80gb_sxm4'), so fold it into the GPU name.
    if gpu == 'A100' and len(parts) > 3 and parts[3].lower() == '80gb':
        gpu = 'A100-80GB'
    return gpu, gpu_cnt
|
58
70
|
|
59
71
|
|
60
72
|
def create_catalog(api_key: str, output_path: str) -> None:
|
@@ -71,24 +83,32 @@ def create_catalog(api_key: str, output_path: str) -> None:
|
|
71
83
|
# We parse info.keys() in reverse order so gpu_1x_a100_sxm4 comes before
|
72
84
|
# gpu_1x_a100 in the catalog (gpu_1x_a100_sxm4 has more availability).
|
73
85
|
for vm in reversed(list(info.keys())):
|
74
|
-
|
75
|
-
|
86
|
+
gpu_and_cnt = name_to_gpu_and_cnt(vm)
|
87
|
+
gpu: Optional[str]
|
88
|
+
gpu_cnt: Optional[float]
|
89
|
+
if gpu_and_cnt is None:
|
90
|
+
gpu, gpu_cnt = None, None
|
91
|
+
else:
|
92
|
+
gpu = gpu_and_cnt[0]
|
93
|
+
gpu_cnt = float(gpu_and_cnt[1])
|
76
94
|
vcpus = float(info[vm]['instance_type']['specs']['vcpus'])
|
77
95
|
mem = float(info[vm]['instance_type']['specs']['memory_gib'])
|
78
|
-
price = float(info[vm]['instance_type']
|
79
|
-
|
80
|
-
gpuinfo =
|
81
|
-
|
82
|
-
|
83
|
-
'
|
84
|
-
|
85
|
-
|
86
|
-
'
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
96
|
+
price = (float(info[vm]['instance_type']['price_cents_per_hour']) /
|
97
|
+
100)
|
98
|
+
gpuinfo: Optional[str] = None
|
99
|
+
if gpu is not None:
|
100
|
+
gpuinfo_dict = {
|
101
|
+
'Gpus': [{
|
102
|
+
'Name': gpu,
|
103
|
+
'Manufacturer': 'NVIDIA',
|
104
|
+
'Count': gpu_cnt,
|
105
|
+
'MemoryInfo': {
|
106
|
+
'SizeInMiB': GPU_TO_MEMORY[gpu]
|
107
|
+
},
|
108
|
+
}],
|
109
|
+
'TotalGpuMemoryInMiB': GPU_TO_MEMORY[gpu]
|
110
|
+
}
|
111
|
+
gpuinfo = json.dumps(gpuinfo_dict).replace('"', "'") # pylint: disable=invalid-string-quote
|
92
112
|
for r in REGIONS:
|
93
113
|
writer.writerow(
|
94
114
|
[vm, gpu, gpu_cnt, vcpus, mem, price, r, gpuinfo, ''])
|
@@ -0,0 +1,147 @@
|
|
1
|
+
"""A script that generates the Vast Cloud catalog. """
|
2
|
+
|
3
|
+
#
|
4
|
+
# Due to the design of the sdk, pylint has a false
|
5
|
+
# positive for the functions.
|
6
|
+
#
|
7
|
+
# pylint: disable=assignment-from-no-return
|
8
|
+
import collections
|
9
|
+
import csv
|
10
|
+
import json
|
11
|
+
import math
|
12
|
+
import re
|
13
|
+
import sys
|
14
|
+
from typing import Any, Dict, List
|
15
|
+
|
16
|
+
from sky.adaptors import vast
|
17
|
+
|
18
|
+
_map = {
|
19
|
+
'TeslaV100': 'V100',
|
20
|
+
'TeslaT4': 'T4',
|
21
|
+
'TeslaP100': 'P100',
|
22
|
+
'QRTX6000': 'RTX6000',
|
23
|
+
'QRTX8000': 'RTX8000'
|
24
|
+
}
|
25
|
+
|
26
|
+
|
27
|
+
def create_instance_type(obj: Dict[str, Any]) -> str:
    """Build the instance type name for an offer.

    The name has the form '{num_gpus}x-{gpu_name}-{cpu_cores}-{cpu_ram}',
    with every whitespace character in the GPU name replaced by '_'.

    Args:
        obj: The offer; must provide the 'num_gpus', 'gpu_name', 'cpu_cores'
            and 'cpu_ram' keys.

    Returns:
        The instance type name.
    """
    num_gpus = obj['num_gpus']
    gpu_name = re.sub(r'\s', '_', obj['gpu_name'])
    return f'{num_gpus}x-{gpu_name}-{obj["cpu_cores"]}-{obj["cpu_ram"]}'
|
31
|
+
|
32
|
+
|
33
|
+
def dot_get(d: dict, key: str) -> Any:
    """Look up a dot-separated key path in a nested dictionary.

    For example, the key 'search.totalHour' yields d['search']['totalHour'].
    """
    current: Any = d
    for part in key.split('.'):
        current = current[part]
    return current
|
37
|
+
|
38
|
+
|
39
|
+
if __name__ == '__main__':
|
40
|
+
seen = set()
|
41
|
+
# InstanceType and gpuInfo are basically just stubs
|
42
|
+
# so that the dictwriter is happy without weird
|
43
|
+
# code.
|
44
|
+
mapped_keys = (('gpu_name', 'InstanceType'), ('gpu_name',
|
45
|
+
'AcceleratorName'),
|
46
|
+
('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
|
47
|
+
('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
|
48
|
+
('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
|
49
|
+
('geolocation', 'Region'))
|
50
|
+
writer = csv.DictWriter(sys.stdout, fieldnames=[x[1] for x in mapped_keys])
|
51
|
+
writer.writeheader()
|
52
|
+
|
53
|
+
# Vast has a wide variety of machines, some of
|
54
|
+
# which will have less diskspace and network
|
55
|
+
# bandwidth than others.
|
56
|
+
#
|
57
|
+
# The machines normally have high specificity
|
58
|
+
# in the vast catalog - this is fairly unique
|
59
|
+
# to Vast and can make bucketing them into
|
60
|
+
# instance types difficult.
|
61
|
+
#
|
62
|
+
# The flags
|
63
|
+
#
|
64
|
+
# * georegion consolidates geographic areas
|
65
|
+
#
|
66
|
+
# * chunked rounds down specifications (such
|
67
|
+
# as 1025GB to 1024GB disk) in order to
|
68
|
+
# make machine specifications look more
|
69
|
+
# consistent
|
70
|
+
#
|
71
|
+
# * inet_down makes sure that only machines
|
72
|
+
# with "reasonable" downlink speed are
|
73
|
+
# considered
|
74
|
+
#
|
75
|
+
# * disk_space sets a lower limit of how
|
76
|
+
# much space is available to be allocated
|
77
|
+
# in order to ensure that machines with
|
78
|
+
# small disk pools aren't listed
|
79
|
+
#
|
80
|
+
offerList = vast.vast().search_offers(
|
81
|
+
query=('georegion = true chunked = true '
|
82
|
+
'inet_down >= 100 disk_space >= 80'),
|
83
|
+
limit=10000)
|
84
|
+
|
85
|
+
priceMap: Dict[str, List] = collections.defaultdict(list)
|
86
|
+
for offer in offerList:
|
87
|
+
entry = {}
|
88
|
+
for ours, theirs in mapped_keys:
|
89
|
+
field = dot_get(offer, ours)
|
90
|
+
entry[theirs] = field
|
91
|
+
|
92
|
+
instance_type = create_instance_type(offer)
|
93
|
+
entry['InstanceType'] = instance_type
|
94
|
+
|
95
|
+
# the documentation says
|
96
|
+
# "{'gpus': [{
|
97
|
+
# 'name': 'v100',
|
98
|
+
# 'manufacturer': 'nvidia',
|
99
|
+
# 'count': 8.0,
|
100
|
+
# 'memoryinfo': {'sizeinmib': 16384}
|
101
|
+
# }],
|
102
|
+
# 'totalgpumemoryinmib': 16384}",
|
103
|
+
# we can do that.
|
104
|
+
entry['MemoryGiB'] /= 1024
|
105
|
+
|
106
|
+
gpu = re.sub('Ada', '-Ada', re.sub(r'\s', '', offer['gpu_name']))
|
107
|
+
gpu = re.sub(r'(Ti|PCIE|SXM4|SXM|NVL)$', '', gpu)
|
108
|
+
gpu = re.sub(r'(RTX\d0\d0)(S|D)$', r'\1', gpu)
|
109
|
+
|
110
|
+
if gpu in _map:
|
111
|
+
gpu = _map[gpu]
|
112
|
+
|
113
|
+
entry['AcceleratorName'] = gpu
|
114
|
+
entry['GpuInfo'] = json.dumps({
|
115
|
+
'Gpus': [{
|
116
|
+
'Name': gpu,
|
117
|
+
'Count': offer['num_gpus'],
|
118
|
+
'MemoryInfo': {
|
119
|
+
'SizeInMiB': offer['gpu_total_ram']
|
120
|
+
}
|
121
|
+
}],
|
122
|
+
'TotalGpuMemoryInMiB': offer['gpu_total_ram']
|
123
|
+
}).replace('"', '\'')
|
124
|
+
|
125
|
+
priceMap[instance_type].append(entry)
|
126
|
+
|
127
|
+
for instanceList in priceMap.values():
|
128
|
+
priceList = sorted([x['Price'] for x in instanceList])
|
129
|
+
index = math.ceil(0.5 * len(priceList)) - 1
|
130
|
+
priceTarget = priceList[index]
|
131
|
+
toList: List = []
|
132
|
+
for instance in instanceList:
|
133
|
+
if instance['Price'] <= priceTarget:
|
134
|
+
instance['Price'] = '{:.2f}'.format(priceTarget)
|
135
|
+
toList.append(instance)
|
136
|
+
|
137
|
+
maxBid = max([x.get('SpotPrice') for x in toList])
|
138
|
+
for instance in toList:
|
139
|
+
stub = f'{instance["InstanceType"]} {instance["Region"][-2:]}'
|
140
|
+
if stub in seen:
|
141
|
+
printstub = f'{stub}#print'
|
142
|
+
if printstub not in seen:
|
143
|
+
instance['SpotPrice'] = f'{maxBid:.2f}'
|
144
|
+
writer.writerow(instance)
|
145
|
+
seen.add(printstub)
|
146
|
+
else:
|
147
|
+
seen.add(stub)
|
@@ -534,7 +534,7 @@ def initialize_images_csv(csv_saving_path: str, vc_object,
|
|
534
534
|
gpu_name = tag_name.split('-')[1]
|
535
535
|
if gpu_name not in gpu_tags:
|
536
536
|
gpu_tags.append(gpu_name)
|
537
|
-
if
|
537
|
+
if gpu_tags:
|
538
538
|
gpu_tags_str = str(gpu_tags).replace('\'', '\"')
|
539
539
|
f.write(f'{item.id},{vcenter_name},{item_cpu},{item_memory}'
|
540
540
|
f',,,\'{gpu_tags_str}\'\n')
|