skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/runpod/utils.py
CHANGED
@@ -1,10 +1,13 @@
 """RunPod library wrapper for SkyPilot."""
 
+import base64
 import time
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.adaptors import runpod
+from sky.provision import docker_utils
+import sky.provision.runpod.api.commands as runpod_commands
 from sky.skylet import constants
 from sky.utils import common_utils
 
@@ -45,6 +48,11 @@ GPU_NAME_MAP = {
 }
 
 
+def _construct_docker_login_template_name(cluster_name: str) -> str:
+    """Constructs the registry auth template name."""
+    return f'{cluster_name}-docker-login-template'
+
+
 def retry(func):
     """Decorator to retry a function."""
 
@@ -64,9 +72,83 @@ def retry(func):
     return wrapper
 
 
+# Adapted from runpod.api.queries.pods.py::QUERY_POD.
+# Adding containerRegistryAuthId to the query.
+_QUERY_POD = """
+query myPods {
+    myself {
+        pods {
+            id
+            containerDiskInGb
+            containerRegistryAuthId
+            costPerHr
+            desiredStatus
+            dockerArgs
+            dockerId
+            env
+            gpuCount
+            imageName
+            lastStatusChange
+            machineId
+            memoryInGb
+            name
+            podType
+            port
+            ports
+            uptimeSeconds
+            vcpuCount
+            volumeInGb
+            volumeMountPath
+            runtime {
+                ports{
+                    ip
+                    isIpPublic
+                    privatePort
+                    publicPort
+                    type
+                }
+            }
+            machine {
+                gpuDisplayName
+            }
+        }
+    }
+}
+"""
+
+
+def _sky_get_pods() -> dict:
+    """List all pods with extra registry auth information.
+
+    Adapted from runpod.get_pods() to include containerRegistryAuthId.
+    """
+    raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD)
+    cleaned_return = raw_return['data']['myself']['pods']
+    return cleaned_return
+
+
+_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """
+query myself {
+    myself {
+        podTemplates {
+            name
+            containerRegistryAuthId
+        }
+    }
+}
+"""
+
+
+def _list_pod_templates_with_container_registry() -> dict:
+    """List all pod templates."""
+    raw_return = runpod.runpod.api.graphql.run_graphql_query(
+        _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH)
+    return raw_return['data']['myself']['podTemplates']
+
+
 def list_instances() -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
-    instances =
+    instances = _sky_get_pods()
 
     instance_dict: Dict[str, Dict[str, Any]] = {}
     for instance in instances:
@@ -74,13 +156,23 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
 
         info['status'] = instance['desiredStatus']
         info['name'] = instance['name']
+        info['port2endpoint'] = {}
 
-
+        # Sometimes when the cluster is in the process of being created,
+        # the `port` field in the runtime is None and we need to check for it.
+        if (instance['desiredStatus'] == 'RUNNING' and
+                instance.get('runtime') and
+                instance.get('runtime').get('ports')):
             for port in instance['runtime']['ports']:
-                if port['
-
-
-
+                if port['isIpPublic']:
+                    if port['privatePort'] == 22:
+                        info['external_ip'] = port['ip']
+                        info['ssh_port'] = port['publicPort']
+                    info['port2endpoint'][port['privatePort']] = {
+                        'host': port['ip'],
+                        'port': port['publicPort']
+                    }
+                else:
                     info['internal_ip'] = port['ip']
 
         instance_dict[instance['id']] = info
@@ -88,37 +180,161 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
     return instance_dict
 
 
-def
+def delete_pod_template(template_name: str) -> None:
+    """Deletes a pod template."""
+    try:
+        runpod.runpod.api.graphql.run_graphql_query(
+            f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
+    except runpod.runpod.error.QueryError as e:
+        logger.warning(f'Failed to delete template {template_name}: {e} '
+                       'Please delete it manually.')
+
+
+def delete_register_auth(registry_auth_id: str) -> None:
+    """Deletes a registry auth."""
+    try:
+        runpod.runpod.delete_container_registry_auth(registry_auth_id)
+    except runpod.runpod.error.QueryError as e:
+        logger.warning(
+            f'Failed to delete registry auth {registry_auth_id}: {e} '
+            'Please delete it manually.')
+
+
+def _create_template_for_docker_login(
+    cluster_name: str,
+    image_name: str,
+    docker_login_config: Optional[Dict[str, str]],
+) -> Tuple[str, Optional[str]]:
+    """Creates a template for the given image with the docker login config.
+
+    Returns:
+        formatted_image_name: The formatted image name.
+        template_id: The template ID. None for no docker login config.
+    """
+    if docker_login_config is None:
+        return image_name, None
+    login_config = docker_utils.DockerLoginConfig(**docker_login_config)
+    container_registry_auth_name = f'{cluster_name}-registry-auth'
+    container_template_name = _construct_docker_login_template_name(
+        cluster_name)
+    # The `name` argument is only for display purpose and the registry server
+    # will be splitted from the docker image name (Tested with AWS ECR).
+    # Here we only need the username and password to create the registry auth.
+    # TODO(tian): Now we create a template and a registry auth for each cluster.
+    # Consider create one for each server and reuse them. Challenges including
+    # calculate the reference count and delete them when no longer needed.
+    create_auth_resp = runpod.runpod.create_container_registry_auth(
+        name=container_registry_auth_name,
+        username=login_config.username,
+        password=login_config.password,
+    )
+    registry_auth_id = create_auth_resp['id']
+    create_template_resp = runpod.runpod.create_template(
+        name=container_template_name,
+        image_name=None,
+        registry_auth_id=registry_auth_id,
+    )
+    return login_config.format_image(image_name), create_template_resp['id']
+
+
+def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
+           disk_size: int, image_name: str, ports: Optional[List[int]],
+           public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
+           docker_login_config: Optional[Dict[str, str]]) -> str:
     """Launches an instance with the given parameters.
 
     Converts the instance_type to the RunPod GPU name, finds the specs for the
     GPU, and launches the instance.
+
+    Returns:
+        instance_id: The instance ID.
     """
+    name = f'{cluster_name}-{node_type}'
     gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
     gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
     cloud_type = instance_type.split('_')[2]
 
     gpu_specs = runpod.runpod.get_gpu(gpu_type)
+    # TODO(zhwu): keep this align with setups in
+    # `provision.kuberunetes.instance.py`
+    setup_cmd = (
+        'prefix_cmd() '
+        '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
+        '$(prefix_cmd) apt update;'
+        'export DEBIAN_FRONTEND=noninteractive;'
+        '$(prefix_cmd) apt install openssh-server rsync curl patch -y;'
+        '$(prefix_cmd) mkdir -p /var/run/sshd; '
+        '$(prefix_cmd) '
+        'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
+        '/etc/ssh/sshd_config; '
+        '$(prefix_cmd) sed '
+        '"s@session\\s*required\\s*pam_loginuid.so@session optional '
+        'pam_loginuid.so@g" -i /etc/pam.d/sshd; '
+        'cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A; '
+        '$(prefix_cmd) mkdir -p ~/.ssh; '
+        '$(prefix_cmd) chown -R $(whoami) ~/.ssh;'
+        '$(prefix_cmd) chmod 700 ~/.ssh; '
+        f'$(prefix_cmd) echo "{public_key}" >> ~/.ssh/authorized_keys; '
+        '$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys; '
+        '$(prefix_cmd) service ssh restart; '
+        '[ $(id -u) -eq 0 ] && echo alias sudo="" >> ~/.bashrc;sleep infinity')
+    # Use base64 to deal with the tricky quoting issues caused by runpod API.
+    encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
+                   f'bash init.sh\'')
+
+    # Port 8081 is occupied for nginx in the base image.
+    custom_ports_str = ''
+    if ports is not None:
+        custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])
+    ports_str = (f'22/tcp,'
+                 f'{custom_ports_str}'
+                 f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
+                 f'{constants.SKY_REMOTE_RAY_PORT}/http')
+
+    image_name_formatted, template_id = _create_template_for_docker_login(
+        cluster_name, image_name, docker_login_config)
+
+    params = {
+        'name': name,
+        'image_name': image_name_formatted,
+        'gpu_type_id': gpu_type,
+        'cloud_type': cloud_type,
+        'container_disk_in_gb': disk_size,
+        'min_vcpu_count': 4 * gpu_quantity,
+        'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
+        'gpu_count': gpu_quantity,
+        'country_code': region,
+        'ports': ports_str,
+        'support_public_ip': True,
+        'docker_args': docker_args,
+        'template_id': template_id,
+    }
+
+    if preemptible is None or not preemptible:
+        new_instance = runpod.runpod.create_pod(**params)
+    else:
+        new_instance = runpod_commands.create_spot_pod(
+            bid_per_gpu=bid_per_gpu,
+            **params,
+        )
 
     return new_instance['id']
 
 
+def get_registry_auth_resources(
+        cluster_name: str) -> Tuple[Optional[str], Optional[str]]:
+    """Gets the registry auth resources."""
+    container_registry_auth_name = _construct_docker_login_template_name(
+        cluster_name)
+    for template in _list_pod_templates_with_container_registry():
+        if template['name'] == container_registry_auth_name:
+            return container_registry_auth_name, template[
+                'containerRegistryAuthId']
+    return None, None
+
+
 def remove(instance_id: str) -> None:
     """Terminates the given instance."""
     runpod.runpod.terminate_pod(instance_id)
sky/provision/vast/__init__.py
ADDED
@@ -0,0 +1,10 @@
+"""Vast provisioner for SkyPilot."""
+
+from sky.provision.vast.config import bootstrap_instances
+from sky.provision.vast.instance import cleanup_ports
+from sky.provision.vast.instance import get_cluster_info
+from sky.provision.vast.instance import query_instances
+from sky.provision.vast.instance import run_instances
+from sky.provision.vast.instance import stop_instances
+from sky.provision.vast.instance import terminate_instances
+from sky.provision.vast.instance import wait_instances
sky/provision/vast/config.py
ADDED
@@ -0,0 +1,11 @@
+"""Vast configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+    return config
sky/provision/vast/instance.py
ADDED
@@ -0,0 +1,247 @@
+"""Vast instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.vast import utils
+from sky.utils import common_utils
+from sky.utils import status_lib
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 10
+
+logger = sky_logging.init_logger(__name__)
+# a much more convenient method
+status_filter = lambda machine_dict, stat_list: {
+    k: v for k, v in machine_dict.items() if v['status'] in stat_list
+}
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]],
+                      head_only: bool = False) -> Dict[str, Any]:
+
+    instances = utils.list_instances()
+    possible_names = [f'{cluster_name_on_cloud}-head']
+    if not head_only:
+        possible_names.append(f'{cluster_name_on_cloud}-worker')
+
+    filtered_instances = {}
+    for instance_id, instance in instances.items():
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            return inst_id
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    pending_status = ['CREATED', 'RESTARTING']
+
+    created_instance_ids = []
+    instances: Dict[str, Any] = {}
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, None)
+        if not status_filter(instances, pending_status):
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+
+    running_instances = status_filter(instances, ['RUNNING'])
+    head_instance_id = _get_head_instance_id(running_instances)
+    stopped_instances = status_filter(instances, ['EXITED', 'STOPPED'])
+
+    if config.resume_stopped_nodes and stopped_instances:
+        for instance in stopped_instances.values():
+            utils.start(instance['id'])
+    else:
+        to_start_count = config.count - (len(running_instances) +
+                                         len(stopped_instances))
+        if to_start_count < 0:
+            raise RuntimeError(f'Cluster {cluster_name_on_cloud} already has '
+                               f'{len(running_instances)} nodes,'
+                               f'but {config.count} are required.')
+        if to_start_count == 0:
+            if head_instance_id is None:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} has no head node.')
+            logger.info(
+                f'Cluster {cluster_name_on_cloud} already has '
+                f'{len(running_instances)} nodes, no need to start more.')
+            return common.ProvisionRecord(provider_name='vast',
+                                          cluster_name=cluster_name_on_cloud,
+                                          region=region,
+                                          zone=None,
+                                          head_instance_id=head_instance_id,
+                                          resumed_instance_ids=[],
+                                          created_instance_ids=[])
+
+        for _ in range(to_start_count):
+            node_type = 'head' if head_instance_id is None else 'worker'
+            try:
+                instance_id = utils.launch(
+                    name=f'{cluster_name_on_cloud}-{node_type}',
+                    instance_type=config.node_config['InstanceType'],
+                    region=region,
+                    disk_size=config.node_config['DiskSize'],
+                    preemptible=config.node_config['Preemptible'],
+                    image_name=config.node_config['ImageId'])
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f'run_instances error: {e}')
+                raise
+            logger.info(f'Launched instance {instance_id}.')
+            created_instance_ids.append(instance_id)
+            if head_instance_id is None:
+                head_instance_id = instance_id
+
+    # Wait for instances to be ready.
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+        ready_instance_cnt = 0
+        for instance_id, instance in instances.items():
+            if instance.get('ssh_port') is not None:
+                ready_instance_cnt += 1
+        logger.info('Waiting for instances to be ready: '
+                    f'({ready_instance_cnt}/{config.count}).')
+        if ready_instance_cnt == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    head_instance_id = _get_head_instance_id(utils.list_instances())
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(provider_name='vast',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=None,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=[],
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('stop', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    return action_instances('remove', cluster_name_on_cloud, provider_config,
+                            worker_only)
+
+
+def action_instances(
+    fn: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    for inst_id, inst in instances.items():
+        logger.debug(f'Instance {fn} {inst_id}: {inst}')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            getattr(utils, fn)(inst_id)
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to {fn} instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+
+
+def get_cluster_info(
+        region: str,
+        cluster_name_on_cloud: str,
+        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['local_ipaddrs'].strip(),
+                external_ip=instance_info['public_ipaddr'],
+                ssh_port=instance_info['ports']['22/tcp'][0]['HostPort'],
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='vast',
+        provider_config=provider_config,
+    )
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError('open_ports is not supported for Vast')
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+    # "running", "frozen", "stopped", "unknown", "loading"
+    status_map = {
+        'LOADING': status_lib.ClusterStatus.INIT,
+        'EXITED': status_lib.ClusterStatus.STOPPED,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'RUNNING': status_lib.ClusterStatus.UP,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = status
+    return statuses
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.