dstack 0.19.30__py3-none-any.whl → 0.19.31__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/main.py +1 -3
- dstack/_internal/core/backends/aws/compute.py +2 -0
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +32 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +74 -34
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +2 -0
- dstack/_internal/core/backends/kubernetes/compute.py +466 -213
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +2 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_instances.py +2 -2
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/version.py +1 -1
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/METADATA +11 -9
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/RECORD +39 -39
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/WHEEL +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/licenses/LICENSE.md +0 -0
The rest of this diff shows dstack/_internal/core/backends/kubernetes/compute.py (+466 -213). Note that the registry's side-by-side viewer truncates the content of most removed lines; removals rendered blank or cut short below were elided at the source.

```diff
@@ -2,7 +2,7 @@ import subprocess
 import tempfile
 import threading
 import time
-from typing import
+from typing import List, Optional, Tuple
 
 from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
```
```diff
@@ -11,19 +11,24 @@ from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithFilteredOffersCached,
     ComputeWithGatewaySupport,
+    ComputeWithMultinodeSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
+    normalize_arch,
 )
 from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
 from dstack._internal.core.backends.kubernetes.models import (
     KubernetesConfig,
-
+    KubernetesProxyJumpConfig,
 )
 from dstack._internal.core.backends.kubernetes.utils import (
+    call_api_method,
     get_api_from_config_data,
     get_cluster_public_ip,
+    get_value,
 )
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import ComputeError
```
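
The two helpers newly imported from `dstack._internal.core.backends.kubernetes.utils` carry most of this refactoring: `utils.py` grows by 145 lines in this release, but its implementation is not expanded in this view. Judging from the call sites below, `call_api_method` wraps raw `kubernetes`-client calls with expected-HTTP-status handling, and `get_value` performs typed, path-based lookups on client model objects. A minimal sketch assuming those semantics, not the actual dstack code:

```python
import re
from typing import Any, Callable, Optional

from kubernetes.client.rest import ApiException

# Tokens accepted by the paths seen below: ".attr", "['key']", "[0]"
_PATH_TOKEN = re.compile(r"\.(\w+)|\['([^']+)'\]|\[(\d+)\]")


def call_api_method(method: Callable[..., Any], expected_type: Any, *,
                    expected: Optional[int] = None, **kwargs: Any) -> Any:
    # Call a kubernetes-client method; if it fails with the `expected` HTTP
    # status (e.g. 404), return None instead of raising.
    try:
        return method(**kwargs)
    except ApiException as e:
        if expected is not None and e.status == expected:
            return None
        raise


def get_value(obj: Any, path: str, expected_type: Any, *, required: bool = False) -> Any:
    # Resolve a ".attr['key'][0]"-style path against a kubernetes model object.
    # The real helper presumably also validates `expected_type`; omitted here.
    value = obj
    for attr, key, index in (m.groups() for m in _PATH_TOKEN.finditer(path)):
        if value is None:
            break
        if attr is not None:
            value = getattr(value, attr)
        elif key is not None:
            value = value[key]
        else:
            value = value[int(index)]
    if value is None and required:
        raise ValueError(f"missing required value at {path!r}")
    return value
```

The `expected=404` convention makes reads and deletes tolerant of missing objects, which the new termination and polling code below leans on.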
```diff
@@ -44,6 +49,7 @@ from dstack._internal.core.models.instances import (
     Resources,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.resources import CPUSpec, Memory
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.utils.common import parse_memory
```
```diff
@@ -52,52 +58,73 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-DEFAULT_NAMESPACE = "default"
 
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
+DUMMY_REGION = "-"
+
 
 class KubernetesCompute(
     ComputeWithFilteredOffersCached,
+    ComputeWithPrivilegedSupport,
     ComputeWithGatewaySupport,
+    ComputeWithMultinodeSupport,
     Compute,
 ):
     def __init__(self, config: KubernetesConfig):
         super().__init__()
         self.config = config.copy()
-
-        if
-
-        self.
+        proxy_jump = self.config.proxy_jump
+        if proxy_jump is None:
+            proxy_jump = KubernetesProxyJumpConfig()
+        self.proxy_jump = proxy_jump
         self.api = get_api_from_config_data(config.kubeconfig.data)
 
     def get_offers_by_requirements(
         self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
-
-
-
+        instance_offers: list[InstanceOfferWithAvailability] = []
+        node_list = call_api_method(
+            self.api.list_node,
+            client.V1NodeList,
+        )
+        nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+        for node in nodes:
+            try:
+                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+                name = get_value(node, ".metadata.name", str, required=True)
+                cpus = _parse_cpu(
+                    get_value(node, ".status.allocatable['cpu']", str, required=True)
+                )
+                cpu_arch = normalize_arch(
+                    get_value(node, ".status.node_info.architecture", str)
+                ).to_cpu_architecture()
+                memory_mib = _parse_memory(
+                    get_value(node, ".status.allocatable['memory']", str, required=True)
+                )
+                gpus, _ = _get_gpus_from_node_labels(labels)
+                disk_size_mib = _parse_memory(
+                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
+                )
+            except (AttributeError, KeyError, ValueError) as e:
+                logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
+                continue
             instance_offer = InstanceOfferWithAvailability(
                 backend=BackendType.KUBERNETES,
                 instance=InstanceType(
-                    name=
+                    name=name,
                     resources=Resources(
-                        cpus=
-
-
+                        cpus=cpus,
+                        cpu_arch=cpu_arch,
+                        memory_mib=memory_mib,
+                        gpus=gpus,
                         spot=False,
-                        disk=Disk(
-                            size_mib=int(
-                                parse_memory(
-                                    node.status.capacity["ephemeral-storage"], as_untis="M"
-                                )
-                            )
-                        ),
+                        disk=Disk(size_mib=disk_size_mib),
                     ),
                 ),
                 price=0,
-                region=
+                region=DUMMY_REGION,
                 availability=InstanceAvailability.AVAILABLE,
                 instance_runtime=InstanceRuntime.RUNNER,
             )
```
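
To make the new typed reads concrete, here is how they behave against a hand-built node object. The node values are made up, and `get_value` refers to the sketch shown earlier in this diff:

```python
from kubernetes import client

node = client.V1Node(
    metadata=client.V1ObjectMeta(
        name="gpu-node-1",
        labels={"nvidia.com/gpu.count": "1", "nvidia.com/gpu.product": "Tesla-T4"},
    ),
    status=client.V1NodeStatus(
        allocatable={"cpu": "7900m", "memory": "64250Mi", "ephemeral-storage": "94Gi"},
    ),
)

assert get_value(node, ".metadata.name", str, required=True) == "gpu-node-1"
assert get_value(node, ".status.allocatable['cpu']", str, required=True) == "7900m"
# _parse_cpu("7900m") -> 7 whole CPUs, and 94Gi is 94 * 1024 = 96256 MiB of
# ephemeral storage; see the parsing helpers added at the bottom of this file.
```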
```diff
@@ -122,7 +149,7 @@ class KubernetesCompute(
         # as an ssh proxy jump to connect to all other services in Kubernetes.
         # Setup jump pod in a separate thread to avoid long-running run_job.
         # In case the thread fails, the job will be failed and resubmitted.
-        jump_pod_hostname = self.
+        jump_pod_hostname = self.proxy_jump.hostname
         if jump_pod_hostname is None:
             jump_pod_hostname = get_cluster_public_ip(self.api)
         if jump_pod_hostname is None:
```
```diff
@@ -132,15 +159,17 @@ class KubernetesCompute(
             )
         jump_pod_port, created = _create_jump_pod_service_if_not_exists(
             api=self.api,
+            namespace=self.config.namespace,
             project_name=run.project_name,
             ssh_public_keys=[project_ssh_public_key.strip(), run.run_spec.ssh_key_pub.strip()],
-            jump_pod_port=self.
+            jump_pod_port=self.proxy_jump.port,
         )
         if not created:
             threading.Thread(
                 target=_continue_setup_jump_pod,
                 kwargs={
                     "api": self.api,
+                    "namespace": self.config.namespace,
                     "project_name": run.project_name,
                     "project_ssh_private_key": project_ssh_private_key.strip(),
                     "user_ssh_public_key": run.run_spec.ssh_key_pub.strip(),
```
```diff
@@ -148,41 +177,146 @@ class KubernetesCompute(
                     "jump_pod_port": jump_pod_port,
                 },
             ).start()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        resources_requests: dict[str, str] = {}
+        resources_limits: dict[str, str] = {}
+        node_affinity: Optional[client.V1NodeAffinity] = None
+        volumes_: list[client.V1Volume] = []
+        volume_mounts: list[client.V1VolumeMount] = []
+
+        resources_spec = job.job_spec.requirements.resources
+        assert isinstance(resources_spec.cpu, CPUSpec)
+        if (cpu_min := resources_spec.cpu.count.min) is not None:
+            resources_requests["cpu"] = str(cpu_min)
+        if (gpu_spec := resources_spec.gpu) is not None:
+            gpu_min = gpu_spec.count.min
+            if gpu_min is not None and gpu_min > 0:
+                if not (offer_gpus := instance_offer.instance.resources.gpus):
+                    raise ComputeError(
+                        "GPU is requested but the offer has no GPUs:"
+                        f" {gpu_spec=} {instance_offer=}",
+                    )
+                offer_gpu = offer_gpus[0]
+                matching_gpu_label_values: set[str] = set()
+                # We cannot generate an expected GPU label value from the Gpu model instance
+                # as the actual values may have additional components (socket, memory type, etc.)
+                # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+                # Moreover, a single Gpu may match multiple label values.
+                # As a workaround, we iterate and process all node labels once again (we already
+                # processed them in `get_offers_by_requirements()`).
+                node_list = call_api_method(
+                    self.api.list_node,
+                    client.V1NodeList,
+                )
+                nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+                for node in nodes:
+                    labels = get_value(node, ".metadata.labels", dict[str, str])
+                    if not labels:
+                        continue
+                    gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
+                    if not gpus or gpu_label_value is None:
+                        continue
+                    if gpus[0] == offer_gpu:
+                        matching_gpu_label_values.add(gpu_label_value)
+                if not matching_gpu_label_values:
+                    raise ComputeError(
+                        f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
+                    )
+                logger.debug(
+                    "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
+                )
+                # TODO: support other GPU vendors
+                resources_requests["nvidia.com/gpu"] = str(gpu_min)
+                resources_limits["nvidia.com/gpu"] = str(gpu_min)
+                node_affinity = client.V1NodeAffinity(
+                    required_during_scheduling_ignored_during_execution=[
+                        client.V1NodeSelectorTerm(
+                            match_expressions=[
+                                client.V1NodeSelectorRequirement(
+                                    key="nvidia.com/gpu.product",
+                                    operator="In",
+                                    values=list(matching_gpu_label_values),
+                                ),
                             ],
-
-
-
-
+                        ),
+                    ],
+                )
+
+        if (memory_min := resources_spec.memory.min) is not None:
+            resources_requests["memory"] = _render_memory(memory_min)
+        if (
+            resources_spec.disk is not None
+            and (disk_min := resources_spec.disk.size.min) is not None
+        ):
+            resources_requests["ephemeral-storage"] = _render_memory(disk_min)
+        if (shm_size := resources_spec.shm_size) is not None:
+            shm_volume_name = "dev-shm"
+            volumes_.append(
+                client.V1Volume(
+                    name=shm_volume_name,
+                    empty_dir=client.V1EmptyDirVolumeSource(
+                        medium="Memory",
+                        size_limit=_render_memory(shm_size),
+                    ),
+                )
+            )
+            volume_mounts.append(
+                client.V1VolumeMount(
+                    name=shm_volume_name,
+                    mount_path="/dev/shm",
+                )
+            )
+
+        pod = client.V1Pod(
+            metadata=client.V1ObjectMeta(
+                name=instance_name,
+                labels={"app.kubernetes.io/name": instance_name},
+            ),
+            spec=client.V1PodSpec(
+                containers=[
+                    client.V1Container(
+                        name=f"{instance_name}-container",
+                        image=job.job_spec.image_name,
+                        command=["/bin/sh"],
+                        args=["-c", " && ".join(commands)],
+                        ports=[
+                            client.V1ContainerPort(
+                                container_port=DSTACK_RUNNER_SSH_PORT,
+                            )
+                        ],
+                        security_context=client.V1SecurityContext(
+                            # TODO(#1535): support non-root images properly
+                            run_as_user=0,
+                            run_as_group=0,
+                            privileged=job.job_spec.privileged,
+                            capabilities=client.V1Capabilities(
+                                add=[
+                                    # Allow to increase hard resource limits, see getrlimit(2)
+                                    "SYS_RESOURCE",
+                                ],
                             ),
-
-
-
-
-                        )
-
-
+                        ),
+                        resources=client.V1ResourceRequirements(
+                            requests=resources_requests,
+                            limits=resources_limits,
+                        ),
+                        volume_mounts=volume_mounts,
+                    )
+                ],
+                affinity=node_affinity,
+                volumes=volumes_,
             ),
         )
-
-
+        call_api_method(
+            self.api.create_namespaced_pod,
+            client.V1Pod,
+            namespace=self.config.namespace,
+            body=pod,
+        )
+        call_api_method(
+            self.api.create_namespaced_service,
+            client.V1Service,
+            namespace=self.config.namespace,
+            body=client.V1Service(
                 metadata=client.V1ObjectMeta(name=_get_pod_service_name(instance_name)),
                 spec=client.V1ServiceSpec(
```
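
Two scheduling details in the hunk above are worth unpacking. GPU placement is enforced with a node-affinity `In` expression over the `nvidia.com/gpu.product` labels applied by NVIDIA's gpu-feature-discovery. And because Kubernetes containers get a 64Mi `/dev/shm` by default, which is too small for PyTorch DataLoader workers or NCCL, a requested `shm_size` is honored by mounting a Memory-medium `emptyDir` over `/dev/shm`. A standalone equivalent of that volume (the size value is illustrative):

```python
from kubernetes import client

# A tmpfs-backed volume capped at the requested shm size...
shm_volume = client.V1Volume(
    name="dev-shm",
    empty_dir=client.V1EmptyDirVolumeSource(medium="Memory", size_limit="16.0Gi"),
)
# ...mounted over the container's default (64Mi) /dev/shm.
shm_mount = client.V1VolumeMount(name="dev-shm", mount_path="/dev/shm")
```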
```diff
@@ -192,14 +326,16 @@ class KubernetesCompute(
                 ),
             ),
         )
-        service_ip = service_response.spec.cluster_ip
         return JobProvisioningData(
             backend=instance_offer.backend,
             instance_type=instance_offer.instance,
             instance_id=instance_name,
-
+            # Although we can already get Service's ClusterIP from the `V1Service` object returned
+            # by the `create_namespaced_service` method, we still need PodIP for multinode runs.
+            # We'll update both hostname and internal_ip once the pod is assigned to the node.
+            hostname=None,
             internal_ip=None,
-            region=
+            region=instance_offer.region,
             price=instance_offer.price,
             username="root",
             ssh_port=DSTACK_RUNNER_SSH_PORT,
```
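
The provisioning data deliberately leaves `hostname` and `internal_ip` unset: per the comments above, the Service's `spec.cluster_ip` (a stable virtual IP) becomes the hostname, while `status.pod_ip` (the pod-to-pod address needed for multinode runs) only exists once the pod is scheduled, hence the new `update_provisioning_data` hook in the next hunk. The raw-client reads it performs boil down to this sketch (helper name and service-name parameter are illustrative):

```python
from typing import Optional

from kubernetes import client


def pod_and_service_ips(
    api: client.CoreV1Api, pod_name: str, service_name: str, namespace: str
) -> tuple[Optional[str], Optional[str]]:
    # status.pod_ip stays empty until the scheduler assigns the pod to a node.
    pod_ip = api.read_namespaced_pod(name=pod_name, namespace=namespace).status.pod_ip
    # spec.cluster_ip is the stable virtual IP fronting the pod.
    cluster_ip = api.read_namespaced_service(
        name=service_name, namespace=namespace
    ).spec.cluster_ip
    return pod_ip, cluster_ip
```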
```diff
@@ -212,25 +348,49 @@ class KubernetesCompute(
             backend_data=None,
         )
 
+    def update_provisioning_data(
+        self,
+        provisioning_data: JobProvisioningData,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ):
+        pod = call_api_method(
+            self.api.read_namespaced_pod,
+            client.V1Pod,
+            name=provisioning_data.instance_id,
+            namespace=self.config.namespace,
+        )
+        pod_ip = get_value(pod, ".status.pod_ip", str)
+        if not pod_ip:
+            return
+        provisioning_data.internal_ip = pod_ip
+        service = call_api_method(
+            self.api.read_namespaced_service,
+            client.V1Service,
+            name=_get_pod_service_name(provisioning_data.instance_id),
+            namespace=self.config.namespace,
+        )
+        provisioning_data.hostname = get_value(service, ".spec.cluster_ip", str, required=True)
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
     ):
-
-        self.api.delete_namespaced_service
-
-
-
-
-
-
-
-
-
-
-
-
-
+        call_api_method(
+            self.api.delete_namespaced_service,
+            client.V1Service,
+            expected=404,
+            name=_get_pod_service_name(instance_id),
+            namespace=self.config.namespace,
+            body=client.V1DeleteOptions(),
+        )
+        call_api_method(
+            self.api.delete_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=instance_id,
+            namespace=self.config.namespace,
+            body=client.V1DeleteOptions(),
+        )
 
     def create_gateway(
         self,
```
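
`terminate_instance` is now idempotent: both deletes pass `expected=404` through `call_api_method`, so tearing down an instance whose Service or Pod is already gone is a no-op rather than an error. With the raw client, the same pattern would look like this (a sketch, not dstack code):

```python
from kubernetes import client
from kubernetes.client.rest import ApiException


def delete_pod_if_exists(api: client.CoreV1Api, name: str, namespace: str) -> None:
    try:
        api.delete_namespaced_pod(
            name=name, namespace=namespace, body=client.V1DeleteOptions()
        )
    except ApiException as e:
        if e.status != 404:  # already deleted: nothing to do
            raise
```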
```diff
@@ -247,70 +407,79 @@ class KubernetesCompute(
         # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html
         instance_name = generate_unique_gateway_instance_name(configuration)
         commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub])
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    )
-                ]
-            ),
+        pod = client.V1Pod(
+            metadata=client.V1ObjectMeta(
+                name=instance_name,
+                labels={"app.kubernetes.io/name": instance_name},
+            ),
+            spec=client.V1PodSpec(
+                containers=[
+                    client.V1Container(
+                        name=f"{instance_name}-container",
+                        image="ubuntu:22.04",
+                        command=["/bin/sh"],
+                        args=["-c", " && ".join(commands)],
+                        ports=[
+                            client.V1ContainerPort(
+                                container_port=22,
+                            ),
+                            client.V1ContainerPort(
+                                container_port=80,
+                            ),
+                            client.V1ContainerPort(
+                                container_port=443,
+                            ),
+                        ],
+                    )
+                ]
             ),
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        call_api_method(
+            self.api.create_namespaced_pod,
+            client.V1Pod,
+            namespace=self.config.namespace,
+            body=pod,
+        )
+        service = client.V1Service(
+            metadata=client.V1ObjectMeta(
+                name=_get_pod_service_name(instance_name),
+            ),
+            spec=client.V1ServiceSpec(
+                type="LoadBalancer",
+                selector={"app.kubernetes.io/name": instance_name},
+                ports=[
+                    client.V1ServicePort(
+                        name="ssh",
+                        port=22,
+                        target_port=22,
+                    ),
+                    client.V1ServicePort(
+                        name="http",
+                        port=80,
+                        target_port=80,
+                    ),
+                    client.V1ServicePort(
+                        name="https",
+                        port=443,
+                        target_port=443,
+                    ),
+                ],
             ),
         )
+        call_api_method(
+            self.api.create_namespaced_service,
+            client.V1Service,
+            namespace=self.config.namespace,
+            body=service,
+        )
         hostname = _wait_for_load_balancer_hostname(
-            api=self.api,
+            api=self.api,
+            namespace=self.config.namespace,
+            service_name=_get_pod_service_name(instance_name),
         )
+        region = DUMMY_REGION
         if hostname is None:
-            self.terminate_instance(instance_name, region=
+            self.terminate_instance(instance_name, region=region)
             raise ComputeError(
                 "Failed to get gateway hostname. "
                 "Ensure the Kubernetes cluster supports Load Balancer services."
```
```diff
@@ -318,7 +487,7 @@ class KubernetesCompute(
         return GatewayProvisioningData(
             instance_id=instance_name,
             ip_address=hostname,
-            region=
+            region=region,
         )
 
     def terminate_gateway(
```
```diff
@@ -334,15 +503,34 @@ class KubernetesCompute(
         )
 
 
-def
-
-
-
+def _parse_cpu(cpu: str) -> int:
+    if cpu.endswith("m"):
+        # "m" means millicpu (1/1000 CPU), e.g., 7900m -> 7.9 -> 7
+        return int(float(cpu[:-1]) / 1000)
+    return int(cpu)
+
+
+def _parse_memory(memory: str) -> int:
+    if memory.isdigit():
+        # no suffix means that the value is in bytes
+        return int(memory) // 2**20
+    return int(parse_memory(memory, as_untis="M"))
+
+
+def _render_memory(memory: Memory) -> str:
+    return f"{float(memory)}Gi"
+
+
+def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optional[str]]:
+    # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
+    # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
+    # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
+    # TODO: support other GPU vendors
     gpu_count = labels.get("nvidia.com/gpu.count")
     gpu_product = labels.get("nvidia.com/gpu.product")
     if gpu_count is None or gpu_product is None:
-        return []
+        return [], None
     gpu_count = int(gpu_count)
     gpu_name = None
     for known_gpu_name in NVIDIA_GPU_NAMES:
```
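
The new parsing helpers translate Kubernetes resource quantities into the integer units dstack's `Resources` model expects. A quick self-contained rehearsal of the intended behavior (standalone re-implementations for illustration; dstack's own `parse_memory` handles more unit suffixes than shown here):

```python
def parse_cpu(cpu: str) -> int:
    # "7900m" is 7.9 CPUs in millicores; offers report whole CPUs.
    if cpu.endswith("m"):
        return int(float(cpu[:-1]) / 1000)
    return int(cpu)


_BINARY_UNITS = {"Ki": 2**10, "Mi": 2**20, "Gi": 2**30}


def parse_memory_mib(quantity: str) -> int:
    if quantity.isdigit():  # a bare number is bytes
        return int(quantity) // 2**20
    number, unit = quantity[:-2], quantity[-2:]
    return int(number) * _BINARY_UNITS[unit] // 2**20


assert parse_cpu("7900m") == 7
assert parse_cpu("8") == 8
assert parse_memory_mib("94Gi") == 96256
assert parse_memory_mib("16777216") == 16  # 16 MiB expressed in bytes
```

`_render_memory` goes the other way for requests and limits, rendering a `Memory` value as a binary-suffix quantity such as `16.0Gi`.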
```diff
@@ -350,20 +538,22 @@ def _get_gpus_from_node_labels(labels: Dict) -> List[Gpu]:
             gpu_name = known_gpu_name
             break
     if gpu_name is None:
-        return []
+        return [], None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
+    gpus = [
         Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
         for _ in range(gpu_count)
     ]
+    return gpus, gpu_product
 
 
 def _continue_setup_jump_pod(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     project_ssh_private_key: str,
     user_ssh_public_key: str,
```
```diff
@@ -372,6 +562,7 @@ def _continue_setup_jump_pod(
 ):
     _wait_for_pod_ready(
         api=api,
+        namespace=namespace,
         pod_name=_get_jump_pod_name(project_name),
     )
     _add_authorized_key_to_jump_pod(
```
```diff
@@ -384,82 +575,135 @@ def _continue_setup_jump_pod(
 
 def _create_jump_pod_service_if_not_exists(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     ssh_public_keys: List[str],
     jump_pod_port: Optional[int],
 ) -> Tuple[int, bool]:
     created = False
-
-
+    service: Optional[client.V1Service] = None
+    pod: Optional[client.V1Pod] = None
+    _namespace = call_api_method(
+        api.read_namespace,
+        client.V1Namespace,
+        expected=404,
+        name=namespace,
+    )
+    if _namespace is None:
+        _namespace = client.V1Namespace(
+            metadata=client.V1ObjectMeta(
+                name=namespace,
+                labels={"app.kubernetes.io/name": namespace},
+            ),
+        )
+        call_api_method(
+            api.create_namespace,
+            client.V1Namespace,
+            body=_namespace,
+        )
+    else:
+        service = call_api_method(
+            api.read_namespaced_service,
+            client.V1Service,
+            expected=404,
             name=_get_jump_pod_service_name(project_name),
-            namespace=
+            namespace=namespace,
         )
-
-
-
-
-
-
-
-
-
-
-
-
+        pod = call_api_method(
+            api.read_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=_get_jump_pod_name(project_name),
+            namespace=namespace,
+        )
+    # The service may exist without the pod if the node on which the jump pod was running
+    # has been deleted.
+    if service is None or pod is None:
+        service = _create_jump_pod_service(
+            api=api,
+            namespace=namespace,
+            project_name=project_name,
+            ssh_public_keys=ssh_public_keys,
+            jump_pod_port=jump_pod_port,
+        )
+        created = True
+    port = get_value(service, ".spec.ports[0].node_port", int, required=True)
+    return port, created
 
 
 def _create_jump_pod_service(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     ssh_public_keys: List[str],
     jump_pod_port: Optional[int],
 ) -> client.V1Service:
     # TODO use restricted ssh-forwarding-only user for jump pod instead of root.
-    commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod_name = _get_jump_pod_name(project_name)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-
+    call_api_method(
+        api.delete_namespaced_pod,
+        client.V1Pod,
+        expected=404,
+        namespace=namespace,
+        name=pod_name,
+    )
+    commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
+    pod = client.V1Pod(
+        metadata=client.V1ObjectMeta(
+            name=pod_name,
+            labels={"app.kubernetes.io/name": pod_name},
+        ),
+        spec=client.V1PodSpec(
+            containers=[
+                client.V1Container(
+                    name=f"{pod_name}-container",
+                    # TODO: Choose appropriate image for jump pod
+                    image="dstackai/base:py3.11-0.4rc4",
+                    command=["/bin/sh"],
+                    args=["-c", " && ".join(commands)],
+                    ports=[
+                        client.V1ContainerPort(
+                            container_port=JUMP_POD_SSH_PORT,
+                        )
+                    ],
+                )
+            ]
         ),
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    call_api_method(
+        api.create_namespaced_pod,
+        client.V1Pod,
+        namespace=namespace,
+        body=pod,
+    )
+    service_name = _get_jump_pod_service_name(project_name)
+    call_api_method(
+        api.delete_namespaced_service,
+        client.V1Service,
+        expected=404,
+        namespace=namespace,
+        name=service_name,
+    )
+    service = client.V1Service(
+        metadata=client.V1ObjectMeta(name=service_name),
+        spec=client.V1ServiceSpec(
+            type="NodePort",
+            selector={"app.kubernetes.io/name": pod_name},
+            ports=[
+                client.V1ServicePort(
+                    port=JUMP_POD_SSH_PORT,
+                    target_port=JUMP_POD_SSH_PORT,
+                    node_port=jump_pod_port,
+                )
+            ],
         ),
     )
-    return
+    return call_api_method(
+        api.create_namespaced_service,
+        client.V1Service,
+        namespace=namespace,
+        body=service,
+    )
 
 
 def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]:
```
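
The jump pod is exposed through a `NodePort` service: if `proxy_jump.port` is configured it is pinned via `node_port`, otherwise Kubernetes assigns one from the cluster's NodePort range (30000-32767 by default), and the caller reads the result back through `.spec.ports[0].node_port`. The raw-client equivalent of that read-back:

```python
from kubernetes import client


def get_assigned_node_port(api: client.CoreV1Api, name: str, namespace: str) -> int:
    # Kubernetes fills in spec.ports[0].node_port once the NodePort
    # service has been created.
    service = api.read_namespaced_service(name=name, namespace=namespace)
    return service.spec.ports[0].node_port
```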
```diff
@@ -484,20 +728,25 @@ def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]:
 
 def _wait_for_pod_ready(
     api: client.CoreV1Api,
+    namespace: str,
     pod_name: str,
     timeout_seconds: int = 300,
 ):
     start_time = time.time()
     while True:
-
-
-
-
-
-
-
-
-        )
+        pod = call_api_method(
+            api.read_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=pod_name,
+            namespace=namespace,
+        )
+        if pod is not None:
+            phase = get_value(pod, ".status.phase", str, required=True)
+            container_statuses = get_value(
+                pod, ".status.container_statuses", list[client.V1ContainerStatus], required=True
+            )
+            if phase == "Running" and all(status.ready for status in container_statuses):
                 return True
         elapsed_time = time.time() - start_time
         if elapsed_time >= timeout_seconds:
```
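
`_wait_for_pod_ready` above and `_wait_for_load_balancer_hostname` below share the same poll-until-deadline shape. Factored out, the pattern looks like this (a generic sketch; the real loops also log on timeout and sleep between probes outside the hunks shown):

```python
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def poll(probe: Callable[[], Optional[T]], timeout_seconds: float,
         interval: float = 1.0) -> Optional[T]:
    # Call probe() until it yields a value or the deadline passes.
    deadline = time.monotonic() + timeout_seconds
    while True:
        result = probe()
        if result is not None:
            return result
        if time.monotonic() >= deadline:
            return None
        time.sleep(interval)
```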
```diff
@@ -508,19 +757,23 @@ def _wait_for_pod_ready(
 
 def _wait_for_load_balancer_hostname(
     api: client.CoreV1Api,
+    namespace: str,
     service_name: str,
     timeout_seconds: int = 120,
 ) -> Optional[str]:
     start_time = time.time()
     while True:
-
-
-
-
-
-
-
-
+        service = call_api_method(
+            api.read_namespaced_service,
+            client.V1Service,
+            expected=404,
+            name=service_name,
+            namespace=namespace,
+        )
+        if service is not None:
+            hostname = get_value(service, ".status.load_balancer.ingress[0].hostname", str)
+            if hostname is not None:
+                return hostname
         elapsed_time = time.time() - start_time
         if elapsed_time >= timeout_seconds:
             logger.warning("Timeout waiting for load balancer %s to get ip", service_name)
```
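
Note that this helper only reads `.status.load_balancer.ingress[0].hostname`, which matches AWS load balancers (and the EKS document linked in `create_gateway`); providers such as GKE publish an `ip` there instead. A more general probe, sketched with the raw client objects, would fall back to the IP:

```python
from typing import Optional

from kubernetes import client


def load_balancer_address(service: client.V1Service) -> Optional[str]:
    ingress_list = service.status.load_balancer.ingress or []
    if not ingress_list:
        return None  # the cloud controller has not provisioned the LB yet
    ingress = ingress_list[0]
    return ingress.hostname or ingress.ip
```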
```diff
@@ -607,11 +860,11 @@ def _run_ssh_command(hostname: str, port: int, ssh_private_key: str, command: str
 
 
 def _get_jump_pod_name(project_name: str) -> str:
-    return f"{project_name}-ssh-jump-pod"
+    return f"dstack-{project_name}-ssh-jump-pod"
 
 
 def _get_jump_pod_service_name(project_name: str) -> str:
-    return f"{project_name}-ssh-jump-pod-service"
+    return f"dstack-{project_name}-ssh-jump-pod-service"
 
 
 def _get_pod_service_name(pod_name: str) -> str:
```