skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
"""Digital ocean service catalog.
|
2
|
+
|
3
|
+
This module loads the service catalog file and can be used to
|
4
|
+
query instance types and pricing information for digital ocean.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import typing
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
9
|
+
|
10
|
+
from sky.clouds.service_catalog import common
|
11
|
+
from sky.utils import ux_utils
|
12
|
+
|
13
|
+
if typing.TYPE_CHECKING:
|
14
|
+
from sky.clouds import cloud
|
15
|
+
|
16
|
+
_df = common.read_catalog('do/vms.csv')
|
17
|
+
|
18
|
+
|
19
|
+
def instance_type_exists(instance_type: str) -> bool:
|
20
|
+
return common.instance_type_exists_impl(_df, instance_type)
|
21
|
+
|
22
|
+
|
23
|
+
def validate_region_zone(
|
24
|
+
region: Optional[str],
|
25
|
+
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
26
|
+
if zone is not None:
|
27
|
+
with ux_utils.print_exception_no_traceback():
|
28
|
+
raise ValueError('DO does not support zones.')
|
29
|
+
return common.validate_region_zone_impl('DO', _df, region, zone)
|
30
|
+
|
31
|
+
|
32
|
+
def get_hourly_cost(
|
33
|
+
instance_type: str,
|
34
|
+
use_spot: bool = False,
|
35
|
+
region: Optional[str] = None,
|
36
|
+
zone: Optional[str] = None,
|
37
|
+
) -> float:
|
38
|
+
"""Returns the cost, or the cheapest cost among all zones for spot."""
|
39
|
+
if zone is not None:
|
40
|
+
with ux_utils.print_exception_no_traceback():
|
41
|
+
raise ValueError('DO does not support zones.')
|
42
|
+
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
|
43
|
+
zone)
|
44
|
+
|
45
|
+
|
46
|
+
def get_vcpus_mem_from_instance_type(
|
47
|
+
instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
|
48
|
+
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
49
|
+
|
50
|
+
|
51
|
+
def get_default_instance_type(
|
52
|
+
cpus: Optional[str] = None,
|
53
|
+
memory: Optional[str] = None,
|
54
|
+
disk_tier: Optional[str] = None,
|
55
|
+
) -> Optional[str]:
|
56
|
+
# NOTE: After expanding catalog to multiple entries, you may
|
57
|
+
# want to specify a default instance type or family.
|
58
|
+
del disk_tier # unused
|
59
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
|
60
|
+
|
61
|
+
|
62
|
+
def get_accelerators_from_instance_type(
|
63
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
64
|
+
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
65
|
+
|
66
|
+
|
67
|
+
def get_instance_type_for_accelerator(
|
68
|
+
acc_name: str,
|
69
|
+
acc_count: int,
|
70
|
+
cpus: Optional[str] = None,
|
71
|
+
memory: Optional[str] = None,
|
72
|
+
use_spot: bool = False,
|
73
|
+
region: Optional[str] = None,
|
74
|
+
zone: Optional[str] = None,
|
75
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
76
|
+
"""Returns a list of instance types that have the given accelerator."""
|
77
|
+
if zone is not None:
|
78
|
+
with ux_utils.print_exception_no_traceback():
|
79
|
+
raise ValueError('DO does not support zones.')
|
80
|
+
return common.get_instance_type_for_accelerator_impl(
|
81
|
+
df=_df,
|
82
|
+
acc_name=acc_name,
|
83
|
+
acc_count=acc_count,
|
84
|
+
cpus=cpus,
|
85
|
+
memory=memory,
|
86
|
+
use_spot=use_spot,
|
87
|
+
region=region,
|
88
|
+
zone=zone,
|
89
|
+
)
|
90
|
+
|
91
|
+
|
92
|
+
def get_region_zones_for_instance_type(instance_type: str,
|
93
|
+
use_spot: bool) -> List['cloud.Region']:
|
94
|
+
df = _df[_df['InstanceType'] == instance_type]
|
95
|
+
return common.get_region_zones(df, use_spot)
|
96
|
+
|
97
|
+
|
98
|
+
def list_accelerators(
|
99
|
+
gpus_only: bool,
|
100
|
+
name_filter: Optional[str],
|
101
|
+
region_filter: Optional[str],
|
102
|
+
quantity_filter: Optional[int],
|
103
|
+
case_sensitive: bool = True,
|
104
|
+
all_regions: bool = False,
|
105
|
+
require_price: bool = True,
|
106
|
+
) -> Dict[str, List[common.InstanceTypeInfo]]:
|
107
|
+
"""Returns all instance types in DO offering GPUs."""
|
108
|
+
del require_price # unused
|
109
|
+
return common.list_accelerators_impl('DO', _df, gpus_only, name_filter,
|
110
|
+
region_filter, quantity_filter,
|
111
|
+
case_sensitive, all_regions)
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for FluidStack.
|
5
5
|
"""
|
6
6
|
import typing
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky.clouds.service_catalog import common
|
10
10
|
from sky.utils import ux_utils
|
@@ -65,7 +65,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
|
|
65
65
|
|
66
66
|
|
67
67
|
def get_accelerators_from_instance_type(
|
68
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
68
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
69
69
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
70
70
|
|
71
71
|
|
@@ -9,6 +9,7 @@ from typing import Dict, List, Optional, Tuple
|
|
9
9
|
from sky import exceptions
|
10
10
|
from sky import sky_logging
|
11
11
|
from sky.adaptors import common as adaptors_common
|
12
|
+
from sky.clouds import GCP
|
12
13
|
from sky.clouds.service_catalog import common
|
13
14
|
from sky.utils import resources_utils
|
14
15
|
from sky.utils import ux_utils
|
@@ -96,7 +97,13 @@ _ACC_INSTANCE_TYPE_DICTS = {
|
|
96
97
|
8: ['g2-standard-96'],
|
97
98
|
},
|
98
99
|
'H100': {
|
100
|
+
1: ['a3-highgpu-1g'],
|
101
|
+
2: ['a3-highgpu-2g'],
|
102
|
+
4: ['a3-highgpu-4g'],
|
99
103
|
8: ['a3-highgpu-8g'],
|
104
|
+
},
|
105
|
+
'H100-MEGA': {
|
106
|
+
8: ['a3-megagpu-8g'],
|
100
107
|
}
|
101
108
|
}
|
102
109
|
|
@@ -243,7 +250,6 @@ def get_default_instance_type(
|
|
243
250
|
cpus: Optional[str] = None,
|
244
251
|
memory: Optional[str] = None,
|
245
252
|
disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
|
246
|
-
del disk_tier # unused
|
247
253
|
if cpus is None and memory is None:
|
248
254
|
cpus = f'{_DEFAULT_NUM_VCPUS}+'
|
249
255
|
if memory is None:
|
@@ -254,6 +260,12 @@ def get_default_instance_type(
|
|
254
260
|
f'{family}-' for family in _DEFAULT_INSTANCE_FAMILY)
|
255
261
|
df = _df[_df['InstanceType'].notna()]
|
256
262
|
df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
|
263
|
+
|
264
|
+
def _filter_disk_type(instance_type: str) -> bool:
|
265
|
+
valid, _ = GCP.check_disk_tier(instance_type, disk_tier)
|
266
|
+
return valid
|
267
|
+
|
268
|
+
df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
|
257
269
|
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
|
258
270
|
memory_gb_or_ratio)
|
259
271
|
|
@@ -280,7 +292,9 @@ def get_instance_type_for_accelerator(
|
|
280
292
|
|
281
293
|
if acc_name in _ACC_INSTANCE_TYPE_DICTS:
|
282
294
|
df = _df[_df['InstanceType'].notna()]
|
283
|
-
instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name]
|
295
|
+
instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name].get(acc_count, None)
|
296
|
+
if instance_types is None:
|
297
|
+
return None, []
|
284
298
|
df = df[df['InstanceType'].isin(instance_types)]
|
285
299
|
|
286
300
|
# Check the cpus and memory specified by the user.
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for IBM.
|
5
5
|
"""
|
6
6
|
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky import sky_logging
|
10
10
|
from sky.adaptors import ibm
|
@@ -43,7 +43,7 @@ def get_vcpus_mem_from_instance_type(
|
|
43
43
|
|
44
44
|
|
45
45
|
def get_accelerators_from_instance_type(
|
46
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
46
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
47
47
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
48
48
|
|
49
49
|
|
@@ -8,17 +8,23 @@ import typing
|
|
8
8
|
from typing import Dict, List, Optional, Set, Tuple
|
9
9
|
|
10
10
|
from sky import check as sky_check
|
11
|
+
from sky import clouds as sky_clouds
|
12
|
+
from sky import sky_logging
|
11
13
|
from sky.adaptors import common as adaptors_common
|
12
|
-
from sky.
|
14
|
+
from sky.adaptors import kubernetes
|
13
15
|
from sky.clouds.service_catalog import CloudFilter
|
14
16
|
from sky.clouds.service_catalog import common
|
15
17
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
16
18
|
|
19
|
+
logger = sky_logging.init_logger(__name__)
|
20
|
+
|
17
21
|
if typing.TYPE_CHECKING:
|
18
22
|
import pandas as pd
|
19
23
|
else:
|
20
24
|
pd = adaptors_common.LazyImport('pandas')
|
21
25
|
|
26
|
+
logger = sky_logging.init_logger(__name__)
|
27
|
+
|
22
28
|
_PULL_FREQUENCY_HOURS = 7
|
23
29
|
|
24
30
|
# We keep pull_frequency_hours so we can remotely update the default image paths
|
@@ -31,7 +37,16 @@ _image_df = common.read_catalog('kubernetes/images.csv',
|
|
31
37
|
|
32
38
|
def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
|
33
39
|
"""Returns the image id from the tag."""
|
34
|
-
|
40
|
+
global _image_df
|
41
|
+
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
|
42
|
+
if image_id is None:
|
43
|
+
# Refresh the image catalog and try again, if the image tag is not
|
44
|
+
# found.
|
45
|
+
logger.debug('Refreshing the image catalog and trying again.')
|
46
|
+
_image_df = common.read_catalog('kubernetes/images.csv',
|
47
|
+
pull_frequency_hours=0)
|
48
|
+
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
|
49
|
+
return image_id
|
35
50
|
|
36
51
|
|
37
52
|
def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
|
@@ -50,9 +65,14 @@ def list_accelerators(
|
|
50
65
|
# TODO(romilb): We should consider putting a lru_cache() with TTL to
|
51
66
|
# avoid multiple calls to kubernetes API in a short period of time (e.g.,
|
52
67
|
# from the optimizer).
|
53
|
-
return
|
54
|
-
|
55
|
-
|
68
|
+
return _list_accelerators(gpus_only,
|
69
|
+
name_filter,
|
70
|
+
region_filter,
|
71
|
+
quantity_filter,
|
72
|
+
case_sensitive,
|
73
|
+
all_regions,
|
74
|
+
require_price,
|
75
|
+
realtime=False)[0]
|
56
76
|
|
57
77
|
|
58
78
|
def list_accelerators_realtime(
|
@@ -65,27 +85,100 @@ def list_accelerators_realtime(
|
|
65
85
|
require_price: bool = True
|
66
86
|
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
|
67
87
|
int]]:
|
88
|
+
return _list_accelerators(gpus_only,
|
89
|
+
name_filter,
|
90
|
+
region_filter,
|
91
|
+
quantity_filter,
|
92
|
+
case_sensitive,
|
93
|
+
all_regions,
|
94
|
+
require_price,
|
95
|
+
realtime=True)
|
96
|
+
|
97
|
+
|
98
|
+
def _list_accelerators(
|
99
|
+
gpus_only: bool,
|
100
|
+
name_filter: Optional[str],
|
101
|
+
region_filter: Optional[str],
|
102
|
+
quantity_filter: Optional[int],
|
103
|
+
case_sensitive: bool = True,
|
104
|
+
all_regions: bool = False,
|
105
|
+
require_price: bool = True,
|
106
|
+
realtime: bool = False
|
107
|
+
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
|
108
|
+
int]]:
|
109
|
+
"""List accelerators in the Kubernetes cluster.
|
110
|
+
|
111
|
+
If realtime is True, the function will query the cluster to fetch real-time
|
112
|
+
GPU usage, which is returned in total_accelerators_available. Note that
|
113
|
+
this may require an expensive list_pod_for_all_namespaces call, which
|
114
|
+
requires cluster-wide pod read permissions.
|
115
|
+
|
116
|
+
If the user does not have sufficient permissions to list pods in all
|
117
|
+
namespaces, the function will return free GPUs as -1.
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
A tuple of three dictionaries:
|
121
|
+
- qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
|
122
|
+
objects with quantity information.
|
123
|
+
- total_accelerators_capacity: Dict mapping accelerator names to their
|
124
|
+
total capacity in the cluster.
|
125
|
+
- total_accelerators_available: Dict mapping accelerator names to their
|
126
|
+
current availability. Returns -1 for each accelerator if
|
127
|
+
realtime=False or if insufficient permissions.
|
128
|
+
"""
|
129
|
+
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
|
130
|
+
# function from kubernetes_utils.
|
68
131
|
del all_regions, require_price # Unused.
|
69
|
-
|
70
|
-
if
|
71
|
-
|
72
|
-
|
73
|
-
|
132
|
+
|
133
|
+
# First check if Kubernetes is enabled. This ensures k8s python client is
|
134
|
+
# installed. Do not put any k8s-specific logic before this check.
|
135
|
+
enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
|
136
|
+
if not sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(),
|
137
|
+
enabled_clouds):
|
138
|
+
return {}, {}, {}
|
139
|
+
|
140
|
+
# TODO(zhwu): this should return all accelerators in multiple kubernetes
|
141
|
+
# clusters defined by allowed_contexts.
|
142
|
+
if region_filter is None:
|
143
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
144
|
+
if context is None and kubernetes_utils.is_incluster_config_available():
|
145
|
+
# If context is None and we are running in a kubernetes pod, use the
|
146
|
+
# in-cluster context as the current context.
|
147
|
+
context = kubernetes.in_cluster_context_name()
|
148
|
+
else:
|
149
|
+
context = region_filter
|
150
|
+
if context is None:
|
74
151
|
return {}, {}, {}
|
75
152
|
|
76
|
-
|
153
|
+
# Verify that the credentials are still valid.
|
154
|
+
if not kubernetes_utils.check_credentials(context)[0]:
|
155
|
+
return {}, {}, {}
|
156
|
+
|
157
|
+
has_gpu = kubernetes_utils.detect_accelerator_resource(context)
|
77
158
|
if not has_gpu:
|
78
159
|
return {}, {}, {}
|
79
160
|
|
80
|
-
|
81
|
-
if not
|
161
|
+
lf, _ = kubernetes_utils.detect_gpu_label_formatter(context)
|
162
|
+
if not lf:
|
82
163
|
return {}, {}, {}
|
83
164
|
|
84
165
|
accelerators_qtys: Set[Tuple[str, int]] = set()
|
85
|
-
|
86
|
-
nodes = kubernetes_utils.get_kubernetes_nodes()
|
87
|
-
|
88
|
-
|
166
|
+
keys = lf.get_label_keys()
|
167
|
+
nodes = kubernetes_utils.get_kubernetes_nodes(context)
|
168
|
+
pods = None
|
169
|
+
if realtime:
|
170
|
+
# Get the pods to get the real-time GPU usage
|
171
|
+
try:
|
172
|
+
pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
|
173
|
+
except kubernetes.api_exception() as e:
|
174
|
+
if e.status == 403:
|
175
|
+
logger.warning(
|
176
|
+
'Failed to get pods in the Kubernetes cluster '
|
177
|
+
'(forbidden). Please check if your account has '
|
178
|
+
'necessary permissions to list pods. Realtime GPU '
|
179
|
+
'availability information may be incorrect.')
|
180
|
+
else:
|
181
|
+
raise
|
89
182
|
# Total number of GPUs in the cluster
|
90
183
|
total_accelerators_capacity: Dict[str, int] = {}
|
91
184
|
# Total number of GPUs currently available in the cluster
|
@@ -93,58 +186,88 @@ def list_accelerators_realtime(
|
|
93
186
|
min_quantity_filter = quantity_filter if quantity_filter else 1
|
94
187
|
|
95
188
|
for node in nodes:
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
#
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
189
|
+
for key in keys:
|
190
|
+
if key in node.metadata.labels:
|
191
|
+
allocated_qty = 0
|
192
|
+
accelerator_name = lf.get_accelerator_from_label_value(
|
193
|
+
node.metadata.labels.get(key))
|
194
|
+
|
195
|
+
# Exclude multi-host TPUs from being processed.
|
196
|
+
# TODO(Doyoung): Remove the logic when adding support for
|
197
|
+
# multi-host TPUs.
|
198
|
+
if kubernetes_utils.is_multi_host_tpu(node.metadata.labels):
|
199
|
+
continue
|
200
|
+
|
201
|
+
# Check if name_filter regex matches the accelerator_name
|
202
|
+
regex_flags = 0 if case_sensitive else re.IGNORECASE
|
203
|
+
if name_filter and not re.match(
|
204
|
+
name_filter, accelerator_name, flags=regex_flags):
|
205
|
+
continue
|
206
|
+
|
207
|
+
# Generate the accelerator quantities
|
208
|
+
accelerator_count = (
|
209
|
+
kubernetes_utils.get_node_accelerator_count(
|
210
|
+
node.status.allocatable))
|
211
|
+
|
212
|
+
if accelerator_name and accelerator_count > 0:
|
213
|
+
# TPUs are counted in a different way compared to GPUs.
|
214
|
+
# Multi-node GPUs can be split into smaller units and be
|
215
|
+
# provisioned, but TPUs are considered as an atomic unit.
|
216
|
+
if kubernetes_utils.is_tpu_on_gke(accelerator_name):
|
217
|
+
accelerators_qtys.add(
|
218
|
+
(accelerator_name, accelerator_count))
|
219
|
+
else:
|
220
|
+
count = 1
|
221
|
+
while count <= accelerator_count:
|
222
|
+
accelerators_qtys.add((accelerator_name, count))
|
223
|
+
count *= 2
|
224
|
+
# Add the accelerator count if it's not already in the
|
225
|
+
# set (e.g., if there's 12 GPUs, we should have qtys 1,
|
226
|
+
# 2, 4, 8, 12)
|
227
|
+
if accelerator_count not in accelerators_qtys:
|
228
|
+
accelerators_qtys.add(
|
229
|
+
(accelerator_name, accelerator_count))
|
230
|
+
|
231
|
+
if accelerator_count >= min_quantity_filter:
|
232
|
+
quantized_count = (
|
233
|
+
min_quantity_filter *
|
234
|
+
(accelerator_count // min_quantity_filter))
|
235
|
+
if accelerator_name not in total_accelerators_capacity:
|
236
|
+
total_accelerators_capacity[
|
237
|
+
accelerator_name] = quantized_count
|
238
|
+
else:
|
239
|
+
total_accelerators_capacity[
|
240
|
+
accelerator_name] += quantized_count
|
241
|
+
|
242
|
+
if pods is None:
|
243
|
+
# If we can't get the pods, we can't get the GPU usage
|
244
|
+
total_accelerators_available[accelerator_name] = -1
|
245
|
+
continue
|
246
|
+
|
247
|
+
for pod in pods:
|
248
|
+
# Get all the pods running on the node
|
249
|
+
if (pod.spec.node_name == node.metadata.name and
|
250
|
+
pod.status.phase in ['Running', 'Pending']):
|
251
|
+
# Iterate over all the containers in the pod and sum
|
252
|
+
# the GPU requests
|
253
|
+
for container in pod.spec.containers:
|
254
|
+
if container.resources.requests:
|
255
|
+
allocated_qty += (
|
256
|
+
kubernetes_utils.get_node_accelerator_count(
|
257
|
+
container.resources.requests))
|
258
|
+
|
259
|
+
accelerators_available = accelerator_count - allocated_qty
|
260
|
+
|
261
|
+
# Initialize the entry if it doesn't exist yet
|
142
262
|
if accelerator_name not in total_accelerators_available:
|
143
|
-
total_accelerators_available[
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
263
|
+
total_accelerators_available[accelerator_name] = 0
|
264
|
+
|
265
|
+
if accelerators_available >= min_quantity_filter:
|
266
|
+
quantized_availability = min_quantity_filter * (
|
267
|
+
accelerators_available // min_quantity_filter)
|
268
|
+
total_accelerators_available[accelerator_name] = (
|
269
|
+
total_accelerators_available.get(accelerator_name, 0) +
|
270
|
+
quantized_availability)
|
148
271
|
|
149
272
|
result = []
|
150
273
|
|
@@ -160,7 +283,7 @@ def list_accelerators_realtime(
|
|
160
283
|
memory=None,
|
161
284
|
price=0.0,
|
162
285
|
spot_price=0.0,
|
163
|
-
region=
|
286
|
+
region=context))
|
164
287
|
|
165
288
|
df = pd.DataFrame(result,
|
166
289
|
columns=[
|
@@ -175,7 +298,6 @@ def list_accelerators_realtime(
|
|
175
298
|
qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only,
|
176
299
|
name_filter, region_filter,
|
177
300
|
quantity_filter, case_sensitive)
|
178
|
-
|
179
301
|
return qtys_map, total_accelerators_capacity, total_accelerators_available
|
180
302
|
|
181
303
|
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for Lambda.
|
5
5
|
"""
|
6
6
|
import typing
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky.clouds.service_catalog import common
|
10
10
|
from sky.utils import resources_utils
|
@@ -13,7 +13,12 @@ from sky.utils import ux_utils
|
|
13
13
|
if typing.TYPE_CHECKING:
|
14
14
|
from sky.clouds import cloud
|
15
15
|
|
16
|
-
|
16
|
+
# Keep it synced with the frequency in
|
17
|
+
# skypilot-catalog/.github/workflows/update-lambda-catalog.yml
|
18
|
+
_PULL_FREQUENCY_HOURS = 7
|
19
|
+
|
20
|
+
_df = common.read_catalog('lambda/vms.csv',
|
21
|
+
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
|
17
22
|
|
18
23
|
# Number of vCPUS for gpu_1x_a10
|
19
24
|
_DEFAULT_NUM_VCPUS = 30
|
@@ -67,7 +72,7 @@ def get_default_instance_type(
|
|
67
72
|
|
68
73
|
|
69
74
|
def get_accelerators_from_instance_type(
|
70
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
75
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
71
76
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
72
77
|
|
73
78
|
|