skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/common.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
"""Common data structures for provisioning"""
|
2
2
|
import abc
|
3
3
|
import dataclasses
|
4
|
+
import functools
|
4
5
|
import os
|
5
6
|
from typing import Any, Dict, List, Optional, Tuple
|
6
7
|
|
8
|
+
from sky import sky_logging
|
7
9
|
from sky.utils import resources_utils
|
8
10
|
|
9
11
|
# NOTE: we can use pydantic instead of dataclasses or namedtuples, because
|
@@ -14,6 +16,10 @@ from sky.utils import resources_utils
|
|
14
16
|
# -------------------- input data model -------------------- #
|
15
17
|
|
16
18
|
InstanceId = str
|
19
|
+
_START_TITLE = '\n' + '-' * 20 + 'Start: {} ' + '-' * 20
|
20
|
+
_END_TITLE = '-' * 20 + 'End: {} ' + '-' * 20 + '\n'
|
21
|
+
|
22
|
+
logger = sky_logging.init_logger(__name__)
|
17
23
|
|
18
24
|
|
19
25
|
class ProvisionerError(RuntimeError):
|
@@ -46,6 +52,8 @@ class ProvisionConfig:
|
|
46
52
|
tags: Dict[str, str]
|
47
53
|
# Whether or not to resume stopped instances.
|
48
54
|
resume_stopped_nodes: bool
|
55
|
+
# Optional ports to open on launch of the cluster.
|
56
|
+
ports_to_open_on_launch: Optional[List[int]]
|
49
57
|
|
50
58
|
|
51
59
|
# -------------------- output data model -------------------- #
|
@@ -123,7 +131,8 @@ class ClusterInfo:
|
|
123
131
|
if self.head_instance_id is None:
|
124
132
|
return None
|
125
133
|
if self.head_instance_id not in self.instances:
|
126
|
-
raise ValueError('Head instance ID not in the cluster metadata.'
|
134
|
+
raise ValueError('Head instance ID not in the cluster metadata. '
|
135
|
+
f'ClusterInfo: {self.__dict__}')
|
127
136
|
return self.instances[self.head_instance_id][0]
|
128
137
|
|
129
138
|
def get_worker_instances(self) -> List[InstanceInfo]:
|
@@ -197,8 +206,14 @@ class ClusterInfo:
|
|
197
206
|
return ip_list
|
198
207
|
|
199
208
|
def get_feasible_ips(self, force_internal_ips: bool = False) -> List[str]:
    """Return the IP list chosen according to the cluster settings.

    The flag handed to ``self._get_ips`` is truthy when any of the
    following holds:

    * ``self.provider_config`` is set and its ``use_internal_ips`` entry
      is truthy;
    * ``self.has_external_ips()`` reports False;
    * the caller passes ``force_internal_ips=True``.

    NOTE(review): a truthy flag presumably selects internal IPs and a
    falsy one external IPs; confirm against ``_get_ips``.
    """
    provider_config = self.provider_config
    # A missing provider config is treated the same as an unset option.
    prefers_internal = (False if provider_config is None else
                        provider_config.get('use_internal_ips', False))
    return self._get_ips(prefers_internal or not self.has_external_ips() or
                         force_internal_ips)
|
202
217
|
|
203
218
|
def get_ssh_ports(self) -> List[int]:
|
204
219
|
"""Get the SSH port of all the instances."""
|
@@ -268,3 +283,16 @@ def query_ports_passthrough(
|
|
268
283
|
for port in ports:
|
269
284
|
result[port] = [SocketEndpoint(port=port, host=head_ip)]
|
270
285
|
return result
|
286
|
+
|
287
|
+
|
288
|
+
def log_function_start_end(func):
    """Wrap ``func`` so that a start and an end banner are logged.

    The banners are built from ``_START_TITLE`` / ``_END_TITLE`` formatted
    with ``func.__name__`` and written with ``logger.info``. The wrapped
    function's return value (or exception) is passed through unchanged.
    """

    @functools.wraps(func)
    def _with_banners(*call_args, **call_kwargs):
        logger.info(_START_TITLE.format(func.__name__))
        try:
            result = func(*call_args, **call_kwargs)
        finally:
            # The end banner is logged even when ``func`` raises.
            logger.info(_END_TITLE.format(func.__name__))
        return result

    return _with_banners
|
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Constants used in the SkyPilot provisioner."""
|
2
|
+
|
3
|
+
# Tags uniquely identifying all nodes of a cluster.
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'

# Tags used to tell head and worker nodes apart. The ray one is a legacy
# tag kept for backward compatibility.
TAG_RAY_NODE_KIND = 'ray-node-type'
TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'

# Complete tag sets for each node role; both tag keys are always present.
HEAD_NODE_TAGS = {TAG_RAY_NODE_KIND: 'head', TAG_SKYPILOT_HEAD_NODE: '1'}
WORKER_NODE_TAGS = {TAG_RAY_NODE_KIND: 'worker', TAG_SKYPILOT_HEAD_NODE: '0'}

# Names for Azure Deployments.
DEPLOYMENT_NAME = 'skypilot-config'
LEGACY_DEPLOYMENT_NAME = 'ray-config'
# The two names below contain a ``{cluster_name_on_cloud}`` placeholder.
# NOTE(review): presumably filled in with ``str.format`` by the caller.
EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = (
    'skypilot-bootstrap-{cluster_name_on_cloud}')
EXTERNAL_RG_VM_DEPLOYMENT_NAME = 'skypilot-vm-{cluster_name_on_cloud}'
|
sky/provision/cudo/__init__.py
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
from sky.provision.cudo.config import bootstrap_instances
|
4
4
|
from sky.provision.cudo.instance import cleanup_ports
|
5
5
|
from sky.provision.cudo.instance import get_cluster_info
|
6
|
+
from sky.provision.cudo.instance import open_ports
|
6
7
|
from sky.provision.cudo.instance import query_instances
|
7
8
|
from sky.provision.cudo.instance import run_instances
|
8
9
|
from sky.provision.cudo.instance import stop_instances
|
@@ -11,4 +12,4 @@ from sky.provision.cudo.instance import wait_instances
|
|
11
12
|
|
12
13
|
__all__ = ('bootstrap_instances', 'run_instances', 'stop_instances',
|
13
14
|
'terminate_instances', 'wait_instances', 'get_cluster_info',
|
14
|
-
'cleanup_ports', 'query_instances')
|
15
|
+
'cleanup_ports', 'query_instances', 'open_ports')
|
@@ -0,0 +1,112 @@
|
|
1
|
+
"""Cudo catalog helper."""
|
2
|
+
|
3
|
+
# Cudo GPU model name -> the short name it is translated to.
cudo_gpu_model = {
    'NVIDIA V100': 'V100',
    'NVIDIA A40': 'A40',
    'RTX 3080': 'RTX3080',
    'RTX A4000': 'RTXA4000',
    'RTX A4500': 'RTXA4500',
    'RTX A5000': 'RTXA5000',
    'RTX A6000': 'RTXA6000',
}

# Short GPU name -> integer memory figure.
# NOTE(review): presumably GPU memory in GiB; confirm against the catalog.
cudo_gpu_mem = {
    'RTX3080': 12,
    'A40': 48,
    'RTXA4000': 16,
    'RTXA4500': 20,
    'RTXA5000': 24,
    'RTXA6000': 48,
    'V100': 16,
}

# One dict per machine shape with keys 'vcpu', 'mem' and 'gpu', expanded
# from (vcpu, mem, gpu) triples grouped by tier.
machine_specs = [
    {
        'vcpu': vcpu,
        'mem': mem,
        'gpu': gpu,
    } for vcpu, mem, gpu in (
        # Low
        (2, 4, 1),
        (4, 8, 1),
        (8, 16, 2),
        (16, 32, 2),
        (32, 64, 4),
        (64, 128, 8),
        # Mid
        (96, 192, 8),
        (48, 96, 4),
        (24, 48, 2),
        (12, 24, 1),
        # Hi
        (96, 192, 4),
        (48, 96, 2),
        (24, 48, 1),
    )
]
|
93
|
+
|
94
|
+
|
95
|
+
def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name via ``cudo_gpu_model``.

    Returns the value mapped to ``model`` in ``cudo_gpu_model``; a name
    that is not a key of that table is returned unchanged.
    """
    return cudo_gpu_model.get(model, model)
|
100
|
+
|
101
|
+
|
102
|
+
def skypilot_gpu_to_cudo_gpu(model):
    """Reverse lookup in ``cudo_gpu_model``: mapped name -> Cudo name.

    Returns the first key of ``cudo_gpu_model`` whose value equals
    ``model``; when no value matches, ``model`` is returned unchanged.
    """
    matching_keys = (cudo_name
                     for cudo_name, short_name in cudo_gpu_model.items()
                     if short_name == model)
    return next(matching_keys, model)
|
107
|
+
|
108
|
+
|
109
|
+
def gpu_exists(model):
    """Return True when ``model`` is a key of ``cudo_gpu_model``."""
    return model in cudo_gpu_model
|
@@ -4,29 +4,29 @@ from typing import Dict
|
|
4
4
|
|
5
5
|
from sky import sky_logging
|
6
6
|
from sky.adaptors import cudo
|
7
|
+
import sky.provision.cudo.cudo_utils as utils
|
7
8
|
|
8
9
|
logger = sky_logging.init_logger(__name__)
|
9
10
|
|
10
11
|
|
11
12
|
def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
|
12
|
-
memory_gib: int, vcpu_count: int, gpu_count: int,
|
13
|
+
memory_gib: int, vcpu_count: int, gpu_count: int,
|
13
14
|
tags: Dict[str, str], disk_size: int):
|
14
15
|
"""Launches an instance with the given parameters."""
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
metadata=tags)
|
16
|
+
|
17
|
+
request = cudo.cudo.CreateVMBody(
|
18
|
+
ssh_key_source='SSH_KEY_SOURCE_NONE',
|
19
|
+
custom_ssh_keys=[ssh_key],
|
20
|
+
vm_id=name,
|
21
|
+
machine_type=machine_type,
|
22
|
+
data_center_id=data_center_id,
|
23
|
+
boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
|
24
|
+
memory_gib=memory_gib,
|
25
|
+
vcpus=vcpu_count,
|
26
|
+
gpus=gpu_count,
|
27
|
+
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
|
28
|
+
size_gib=disk_size),
|
29
|
+
metadata=tags)
|
30
30
|
|
31
31
|
try:
|
32
32
|
api = cudo.cudo.cudo_api.virtual_machines()
|
@@ -121,3 +121,24 @@ def list_instances():
|
|
121
121
|
return instances
|
122
122
|
except cudo.cudo.rest.ApiException as e:
|
123
123
|
raise e
|
124
|
+
|
125
|
+
|
126
|
+
def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
                 cpus):
    """Return how many VMs are available, failing if fewer than requested.

    ``gpu_model`` is first translated with
    ``utils.skypilot_gpu_to_cudo_gpu``. The machine types returned by the
    Cudo virtual-machines API for ``mem``, ``cpus``, ``gpu_count``, the
    translated model and ``data_center_id`` are then inspected, and the
    ``count_vm_available`` values of all ``host_configs`` entries are
    added up.

    Raises:
        Exception: if the summed availability is below ``to_start_count``.
        cudo.cudo.rest.ApiException: re-raised as-is from the API call.
    """
    try:
        cudo_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
        vm_api = cudo.cudo.cudo_api.virtual_machines()
        machine_types = vm_api.list_vm_machine_types(
            mem,
            cpus,
            gpu=gpu_count,
            gpu_model=cudo_model,
            data_center_id=data_center_id)
        host_configs = machine_types.to_dict()['host_configs']
        available = 0
        for host_config in host_configs:
            available += host_config['count_vm_available']
        if to_start_count > available:
            raise Exception(
                'Too many VMs requested, try another gpu type or region')
        return available
    except cudo.cudo.rest.ApiException as e:
        raise e
|
sky/provision/cudo/instance.py
CHANGED
@@ -4,10 +4,10 @@ import time
|
|
4
4
|
from typing import Any, Dict, List, Optional
|
5
5
|
|
6
6
|
from sky import sky_logging
|
7
|
-
from sky import status_lib
|
8
7
|
from sky.provision import common
|
9
8
|
from sky.provision.cudo import cudo_machine_type
|
10
9
|
from sky.provision.cudo import cudo_wrapper
|
10
|
+
from sky.utils import status_lib
|
11
11
|
|
12
12
|
POLL_INTERVAL = 10
|
13
13
|
|
@@ -16,7 +16,6 @@ logger = sky_logging.init_logger(__name__)
|
|
16
16
|
|
17
17
|
def _filter_instances(cluster_name_on_cloud: str,
|
18
18
|
status_filters: Optional[List[str]]) -> Dict[str, Any]:
|
19
|
-
|
20
19
|
instances = cudo_wrapper.list_instances()
|
21
20
|
possible_names = [
|
22
21
|
f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
|
@@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
77
76
|
|
78
77
|
created_instance_ids = []
|
79
78
|
public_key = config.node_config['AuthorizedKey']
|
80
|
-
|
79
|
+
instance_type = config.node_config['InstanceType']
|
80
|
+
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
|
81
|
+
gpu_count = int(float(spec['gpu_count']))
|
82
|
+
vcpu_count = int(spec['vcpu_count'])
|
83
|
+
memory_gib = int(spec['mem_gb'])
|
84
|
+
gpu_model = spec['gpu_model']
|
85
|
+
try:
|
86
|
+
cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
|
87
|
+
memory_gib, vcpu_count)
|
88
|
+
except Exception as e:
|
89
|
+
logger.warning(f'run_instances: {e}')
|
90
|
+
raise
|
81
91
|
for _ in range(to_start_count):
|
82
|
-
instance_type = config.node_config['InstanceType']
|
83
|
-
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
|
84
92
|
|
85
93
|
node_type = 'head' if head_instance_id is None else 'worker'
|
86
94
|
try:
|
@@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
89
97
|
ssh_key=public_key,
|
90
98
|
data_center_id=region,
|
91
99
|
machine_type=spec['machine_type'],
|
92
|
-
memory_gib=
|
93
|
-
vcpu_count=
|
94
|
-
gpu_count=
|
95
|
-
gpu_model=spec['gpu_model'],
|
100
|
+
memory_gib=memory_gib,
|
101
|
+
vcpu_count=vcpu_count,
|
102
|
+
gpu_count=gpu_count,
|
96
103
|
tags={},
|
97
104
|
disk_size=config.node_config['DiskSize'])
|
98
105
|
except Exception as e: # pylint: disable=broad-except
|
@@ -150,11 +157,10 @@ def terminate_instances(
|
|
150
157
|
del provider_config
|
151
158
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
152
159
|
for inst_id, inst in instances.items():
|
153
|
-
logger.info(f'Terminating instance {inst_id}.'
|
154
|
-
f'{inst}')
|
155
160
|
if worker_only and inst['name'].endswith('-head'):
|
156
161
|
continue
|
157
|
-
logger.
|
162
|
+
logger.debug(f'Terminating Cudo instance {inst_id}.'
|
163
|
+
f'{inst}')
|
158
164
|
cudo_wrapper.remove(inst_id)
|
159
165
|
|
160
166
|
|
@@ -213,6 +219,16 @@ def query_instances(
|
|
213
219
|
return statuses
|
214
220
|
|
215
221
|
|
222
|
+
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Do nothing; the arguments are accepted and discarded."""
    # Cudo has all ports open by default. Nothing to do here.
    del cluster_name_on_cloud, ports, provider_config
|
230
|
+
|
231
|
+
|
216
232
|
def cleanup_ports(
|
217
233
|
cluster_name_on_cloud: str,
|
218
234
|
ports: List[str],
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"""DO provisioner for SkyPilot."""
|
2
|
+
|
3
|
+
from sky.provision.do.config import bootstrap_instances
|
4
|
+
from sky.provision.do.instance import cleanup_ports
|
5
|
+
from sky.provision.do.instance import get_cluster_info
|
6
|
+
from sky.provision.do.instance import open_ports
|
7
|
+
from sky.provision.do.instance import query_instances
|
8
|
+
from sky.provision.do.instance import run_instances
|
9
|
+
from sky.provision.do.instance import stop_instances
|
10
|
+
from sky.provision.do.instance import terminate_instances
|
11
|
+
from sky.provision.do.instance import wait_instances
|
@@ -0,0 +1,14 @@
|
|
1
|
+
"""DO configuration bootstrapping."""
|
2
|
+
|
3
|
+
from sky import sky_logging
|
4
|
+
from sky.provision import common
|
5
|
+
|
6
|
+
logger = sky_logging.init_logger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return ``config`` unchanged.

    ``region`` and ``cluster_name`` are accepted but not used.
    """
    # Nothing is bootstrapped here; drop the unused arguments explicitly.
    del cluster_name, region
    return config
|