dstack 0.19.30__py3-none-any.whl → 0.19.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (39)
  1. dstack/_internal/cli/commands/__init__.py +8 -0
  2. dstack/_internal/cli/commands/project.py +27 -20
  3. dstack/_internal/cli/commands/server.py +5 -0
  4. dstack/_internal/cli/main.py +1 -3
  5. dstack/_internal/core/backends/aws/compute.py +2 -0
  6. dstack/_internal/core/backends/azure/compute.py +2 -0
  7. dstack/_internal/core/backends/base/compute.py +32 -9
  8. dstack/_internal/core/backends/base/offers.py +1 -0
  9. dstack/_internal/core/backends/cloudrift/compute.py +2 -0
  10. dstack/_internal/core/backends/cudo/compute.py +2 -0
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
  13. dstack/_internal/core/backends/features.py +5 -0
  14. dstack/_internal/core/backends/gcp/compute.py +74 -34
  15. dstack/_internal/core/backends/gcp/configurator.py +1 -1
  16. dstack/_internal/core/backends/gcp/models.py +14 -1
  17. dstack/_internal/core/backends/gcp/resources.py +35 -12
  18. dstack/_internal/core/backends/hotaisle/compute.py +2 -0
  19. dstack/_internal/core/backends/kubernetes/compute.py +466 -213
  20. dstack/_internal/core/backends/kubernetes/models.py +13 -16
  21. dstack/_internal/core/backends/kubernetes/utils.py +145 -8
  22. dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
  23. dstack/_internal/core/backends/local/compute.py +2 -0
  24. dstack/_internal/core/backends/nebius/compute.py +2 -0
  25. dstack/_internal/core/backends/oci/compute.py +2 -0
  26. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  27. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  28. dstack/_internal/core/backends/vultr/compute.py +2 -0
  29. dstack/_internal/server/background/tasks/common.py +2 -0
  30. dstack/_internal/server/background/tasks/process_instances.py +2 -2
  31. dstack/_internal/server/services/offers.py +7 -1
  32. dstack/_internal/server/testing/common.py +2 -0
  33. dstack/_internal/server/utils/provisioning.py +3 -10
  34. dstack/version.py +1 -1
  35. {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/METADATA +11 -9
  36. {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/RECORD +39 -39
  37. {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/WHEEL +0 -0
  38. {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/entry_points.txt +0 -0
  39. {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/licenses/LICENSE.md +0 -0
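
The diff below covers only item 19, dstack/_internal/core/backends/kubernetes/compute.py, the largest change in this release. Most of the mechanical churn replaces raw kubernetes-client calls wrapped in try/except client.ApiException with two helpers newly imported from dstack/_internal/core/backends/kubernetes/utils.py: call_api_method and get_value. Their implementations are not part of this diff, so the following is only a sketch inferred from the call sites; the signatures and the path-walking behavior are assumptions, not the actual utils.py source.

    # Hypothetical sketch of the new helpers, inferred from their call sites in
    # this diff; the real implementations in kubernetes/utils.py may differ.
    import re
    from typing import Any, Callable, Optional

    from kubernetes import client


    def call_api_method(
        method: Callable[..., Any],
        expected_type: type,             # the V1* model the call should return
        expected: Optional[int] = None,  # an HTTP status to tolerate, e.g., 404
        **kwargs: Any,
    ) -> Optional[Any]:
        """Invoke a kubernetes-client method; return None on the `expected` status."""
        try:
            return method(**kwargs)
        except client.ApiException as e:
            if expected is not None and e.status == expected:
                return None
            raise


    def get_value(obj: Any, path: str, type_: Any, required: bool = False) -> Any:
        """Walk a path like ".status.allocatable['cpu']" or ".items[0]" over a model."""
        value = obj
        for attr, key, idx in re.findall(r"\.(\w+)|\['([^']+)'\]|\[(\d+)\]", path):
            if value is None:
                break
            if attr:
                value = getattr(value, attr, None)
            elif key:
                value = value.get(key)
            else:
                value = value[int(idx)] if int(idx) < len(value) else None
        if value is None and required:
            raise ValueError(f"Missing required value at {path!r}")
        return value  # `type_` presumably drives validation in the real helper

Under these assumptions, call_api_method(self.api.delete_namespaced_pod, client.V1Pod, expected=404, ...) reads as "delete the pod, treating 404 as success", which is exactly how the new terminate_instance() and jump-pod code paths use it.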
@@ -2,7 +2,7 @@ import subprocess
 import tempfile
 import threading
 import time
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
@@ -11,19 +11,24 @@ from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithFilteredOffersCached,
     ComputeWithGatewaySupport,
+    ComputeWithMultinodeSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
+    normalize_arch,
 )
 from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
 from dstack._internal.core.backends.kubernetes.models import (
     KubernetesConfig,
-    KubernetesNetworkingConfig,
+    KubernetesProxyJumpConfig,
 )
 from dstack._internal.core.backends.kubernetes.utils import (
+    call_api_method,
     get_api_from_config_data,
     get_cluster_public_ip,
+    get_value,
 )
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import ComputeError
@@ -44,6 +49,7 @@ from dstack._internal.core.models.instances import (
     Resources,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.resources import CPUSpec, Memory
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.utils.common import parse_memory
@@ -52,52 +58,73 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-DEFAULT_NAMESPACE = "default"
 
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
+DUMMY_REGION = "-"
+
 
 class KubernetesCompute(
     ComputeWithFilteredOffersCached,
+    ComputeWithPrivilegedSupport,
     ComputeWithGatewaySupport,
+    ComputeWithMultinodeSupport,
     Compute,
 ):
     def __init__(self, config: KubernetesConfig):
         super().__init__()
         self.config = config.copy()
-        networking_config = self.config.networking
-        if networking_config is None:
-            networking_config = KubernetesNetworkingConfig()
-        self.networking_config = networking_config
+        proxy_jump = self.config.proxy_jump
+        if proxy_jump is None:
+            proxy_jump = KubernetesProxyJumpConfig()
+        self.proxy_jump = proxy_jump
         self.api = get_api_from_config_data(config.kubeconfig.data)
 
     def get_offers_by_requirements(
         self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
-        nodes = self.api.list_node()
-        instance_offers = []
-        for node in nodes.items:
+        instance_offers: list[InstanceOfferWithAvailability] = []
+        node_list = call_api_method(
+            self.api.list_node,
+            client.V1NodeList,
+        )
+        nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+        for node in nodes:
+            try:
+                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+                name = get_value(node, ".metadata.name", str, required=True)
+                cpus = _parse_cpu(
+                    get_value(node, ".status.allocatable['cpu']", str, required=True)
+                )
+                cpu_arch = normalize_arch(
+                    get_value(node, ".status.node_info.architecture", str)
+                ).to_cpu_architecture()
+                memory_mib = _parse_memory(
+                    get_value(node, ".status.allocatable['memory']", str, required=True)
+                )
+                gpus, _ = _get_gpus_from_node_labels(labels)
+                disk_size_mib = _parse_memory(
+                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
+                )
+            except (AttributeError, KeyError, ValueError) as e:
+                logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
+                continue
             instance_offer = InstanceOfferWithAvailability(
                 backend=BackendType.KUBERNETES,
                 instance=InstanceType(
-                    name=node.metadata.name,
+                    name=name,
                     resources=Resources(
-                        cpus=node.status.capacity["cpu"],
-                        memory_mib=int(parse_memory(node.status.capacity["memory"], as_untis="M")),
-                        gpus=_get_gpus_from_node_labels(node.metadata.labels),
+                        cpus=cpus,
+                        cpu_arch=cpu_arch,
+                        memory_mib=memory_mib,
+                        gpus=gpus,
                         spot=False,
-                        disk=Disk(
-                            size_mib=int(
-                                parse_memory(
-                                    node.status.capacity["ephemeral-storage"], as_untis="M"
-                                )
-                            )
-                        ),
+                        disk=Disk(size_mib=disk_size_mib),
                     ),
                 ),
                 price=0,
-                region="-",
+                region=DUMMY_REGION,
                 availability=InstanceAvailability.AVAILABLE,
                 instance_runtime=InstanceRuntime.RUNNER,
             )
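
To make the new node-to-offer mapping concrete: a node advertising allocatable cpu="7900m", memory="16Gi", and ephemeral-storage="95Gi" with no GPU labels (all values hypothetical) would surface roughly as the offer below. Note also the switch from .status.capacity to .status.allocatable, which reports what can actually be requested after system reservations.

    # Illustrative offer for a hypothetical node named "worker-1" with
    # allocatable cpu="7900m", memory="16Gi", ephemeral-storage="95Gi":
    InstanceOfferWithAvailability(
        backend=BackendType.KUBERNETES,
        instance=InstanceType(
            name="worker-1",
            resources=Resources(
                cpus=7,                     # _parse_cpu("7900m") truncates 7.9 to 7
                cpu_arch=...,               # normalize_arch(...).to_cpu_architecture()
                memory_mib=16384,           # _parse_memory("16Gi"), assuming Gi -> MiB
                gpus=[],
                spot=False,
                disk=Disk(size_mib=97280),  # _parse_memory("95Gi") = 95 * 1024
            ),
        ),
        price=0,
        region=DUMMY_REGION,
        availability=InstanceAvailability.AVAILABLE,
        instance_runtime=InstanceRuntime.RUNNER,
    )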
@@ -122,7 +149,7 @@ class KubernetesCompute(
         # as an ssh proxy jump to connect to all other services in Kubernetes.
         # Setup jump pod in a separate thread to avoid long-running run_job.
         # In case the thread fails, the job will be failed and resubmitted.
-        jump_pod_hostname = self.networking_config.ssh_host
+        jump_pod_hostname = self.proxy_jump.hostname
         if jump_pod_hostname is None:
             jump_pod_hostname = get_cluster_public_ip(self.api)
             if jump_pod_hostname is None:
@@ -132,15 +159,17 @@
                 )
         jump_pod_port, created = _create_jump_pod_service_if_not_exists(
             api=self.api,
+            namespace=self.config.namespace,
             project_name=run.project_name,
             ssh_public_keys=[project_ssh_public_key.strip(), run.run_spec.ssh_key_pub.strip()],
-            jump_pod_port=self.networking_config.ssh_port,
+            jump_pod_port=self.proxy_jump.port,
         )
         if not created:
             threading.Thread(
                 target=_continue_setup_jump_pod,
                 kwargs={
                     "api": self.api,
+                    "namespace": self.config.namespace,
                     "project_name": run.project_name,
                     "project_ssh_private_key": project_ssh_private_key.strip(),
                     "user_ssh_public_key": run.run_spec.ssh_key_pub.strip(),
@@ -148,41 +177,146 @@ class KubernetesCompute(
                     "jump_pod_port": jump_pod_port,
                 },
             ).start()
-        self.api.create_namespaced_pod(
-            namespace=DEFAULT_NAMESPACE,
-            body=client.V1Pod(
-                metadata=client.V1ObjectMeta(
-                    name=instance_name,
-                    labels={"app.kubernetes.io/name": instance_name},
-                ),
-                spec=client.V1PodSpec(
-                    containers=[
-                        client.V1Container(
-                            name=f"{instance_name}-container",
-                            image=job.job_spec.image_name,
-                            command=["/bin/sh"],
-                            args=["-c", " && ".join(commands)],
-                            ports=[
-                                client.V1ContainerPort(
-                                    container_port=DSTACK_RUNNER_SSH_PORT,
-                                )
+
+        resources_requests: dict[str, str] = {}
+        resources_limits: dict[str, str] = {}
+        node_affinity: Optional[client.V1NodeAffinity] = None
+        volumes_: list[client.V1Volume] = []
+        volume_mounts: list[client.V1VolumeMount] = []
+
+        resources_spec = job.job_spec.requirements.resources
+        assert isinstance(resources_spec.cpu, CPUSpec)
+        if (cpu_min := resources_spec.cpu.count.min) is not None:
+            resources_requests["cpu"] = str(cpu_min)
+        if (gpu_spec := resources_spec.gpu) is not None:
+            gpu_min = gpu_spec.count.min
+            if gpu_min is not None and gpu_min > 0:
+                if not (offer_gpus := instance_offer.instance.resources.gpus):
+                    raise ComputeError(
+                        "GPU is requested but the offer has no GPUs:"
+                        f" {gpu_spec=} {instance_offer=}",
+                    )
+                offer_gpu = offer_gpus[0]
+                matching_gpu_label_values: set[str] = set()
+                # We cannot generate an expected GPU label value from the Gpu model instance
+                # as the actual values may have additional components (socket, memory type, etc.)
+                # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+                # Moreover, a single Gpu may match multiple label values.
+                # As a workaround, we iterate and process all node labels once again (we already
+                # processed them in `get_offers_by_requirements()`).
+                node_list = call_api_method(
+                    self.api.list_node,
+                    client.V1NodeList,
+                )
+                nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+                for node in nodes:
+                    labels = get_value(node, ".metadata.labels", dict[str, str])
+                    if not labels:
+                        continue
+                    gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
+                    if not gpus or gpu_label_value is None:
+                        continue
+                    if gpus[0] == offer_gpu:
+                        matching_gpu_label_values.add(gpu_label_value)
+                if not matching_gpu_label_values:
+                    raise ComputeError(
+                        f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
+                    )
+                logger.debug(
+                    "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
+                )
+                # TODO: support other GPU vendors
+                resources_requests["nvidia.com/gpu"] = str(gpu_min)
+                resources_limits["nvidia.com/gpu"] = str(gpu_min)
+                node_affinity = client.V1NodeAffinity(
+                    required_during_scheduling_ignored_during_execution=[
+                        client.V1NodeSelectorTerm(
+                            match_expressions=[
+                                client.V1NodeSelectorRequirement(
+                                    key="nvidia.com/gpu.product",
+                                    operator="In",
+                                    values=list(matching_gpu_label_values),
+                                ),
                             ],
-                            security_context=client.V1SecurityContext(
-                                # TODO(#1535): support non-root images properly
-                                run_as_user=0,
-                                run_as_group=0,
+                        ),
+                    ],
+                )
+
+        if (memory_min := resources_spec.memory.min) is not None:
+            resources_requests["memory"] = _render_memory(memory_min)
+        if (
+            resources_spec.disk is not None
+            and (disk_min := resources_spec.disk.size.min) is not None
+        ):
+            resources_requests["ephemeral-storage"] = _render_memory(disk_min)
+        if (shm_size := resources_spec.shm_size) is not None:
+            shm_volume_name = "dev-shm"
+            volumes_.append(
+                client.V1Volume(
+                    name=shm_volume_name,
+                    empty_dir=client.V1EmptyDirVolumeSource(
+                        medium="Memory",
+                        size_limit=_render_memory(shm_size),
+                    ),
+                )
+            )
+            volume_mounts.append(
+                client.V1VolumeMount(
+                    name=shm_volume_name,
+                    mount_path="/dev/shm",
+                )
+            )
+
+        pod = client.V1Pod(
+            metadata=client.V1ObjectMeta(
+                name=instance_name,
+                labels={"app.kubernetes.io/name": instance_name},
+            ),
+            spec=client.V1PodSpec(
+                containers=[
+                    client.V1Container(
+                        name=f"{instance_name}-container",
+                        image=job.job_spec.image_name,
+                        command=["/bin/sh"],
+                        args=["-c", " && ".join(commands)],
+                        ports=[
+                            client.V1ContainerPort(
+                                container_port=DSTACK_RUNNER_SSH_PORT,
+                            )
+                        ],
+                        security_context=client.V1SecurityContext(
+                            # TODO(#1535): support non-root images properly
+                            run_as_user=0,
+                            run_as_group=0,
+                            privileged=job.job_spec.privileged,
+                            capabilities=client.V1Capabilities(
+                                add=[
+                                    # Allow to increase hard resource limits, see getrlimit(2)
+                                    "SYS_RESOURCE",
+                                ],
                             ),
-                            # TODO: Pass cpu, memory, gpu as requests.
-                            # Beware that node capacity != allocatable, so
-                            # if the node has 2xCPU – then cpu=2 request will probably fail.
-                            resources=client.V1ResourceRequirements(requests={}),
-                        )
-                    ]
-                ),
+                        ),
+                        resources=client.V1ResourceRequirements(
+                            requests=resources_requests,
+                            limits=resources_limits,
+                        ),
+                        volume_mounts=volume_mounts,
+                    )
+                ],
+                affinity=node_affinity,
+                volumes=volumes_,
             ),
         )
-        service_response = self.api.create_namespaced_service(
-            namespace=DEFAULT_NAMESPACE,
+        call_api_method(
+            self.api.create_namespaced_pod,
+            client.V1Pod,
+            namespace=self.config.namespace,
+            body=pod,
+        )
+        call_api_method(
+            self.api.create_namespaced_service,
+            client.V1Service,
+            namespace=self.config.namespace,
             body=client.V1Service(
                 metadata=client.V1ObjectMeta(name=_get_pod_service_name(instance_name)),
                 spec=client.V1ServiceSpec(
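
The job's resource spec now flows into real requests and limits, resolving the removed TODO (the offers change above already reads .status.allocatable, so the requests are schedulable). For a hypothetical job requesting cpu=4, memory=16GB, one H100, and shm_size=8GB, the scheduling inputs built above would look roughly like this:

    # Rough shape of the scheduling inputs for a hypothetical job asking for
    # cpu=4, memory=16GB, gpu=1 (H100), shm_size=8GB; values are illustrative.
    resources_requests = {
        "cpu": "4",
        "memory": "16.0Gi",     # _render_memory() renders float(Memory) + "Gi"
        "nvidia.com/gpu": "1",
    }
    resources_limits = {
        "nvidia.com/gpu": "1",  # extended resources like GPUs must be set in limits
    }
    # Plus: a required node-affinity term
    #   nvidia.com/gpu.product In ["NVIDIA-H100-80GB-HBM3", ...]  (matching label values)
    # and a "Memory"-medium emptyDir with size_limit "8.0Gi" mounted at /dev/shm.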
@@ -192,14 +326,16 @@ class KubernetesCompute(
                 ),
             ),
         )
-        service_ip = service_response.spec.cluster_ip
         return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=instance_name,
-            hostname=service_ip,
+            # Although we can already get Service's ClusterIP from the `V1Service` object returned
+            # by the `create_namespaced_service` method, we still need PodIP for multinode runs.
+            # We'll update both hostname and internal_ip once the pod is assigned to the node.
+            hostname=None,
            internal_ip=None,
-            region="local",
+            region=instance_offer.region,
            price=instance_offer.price,
            username="root",
            ssh_port=DSTACK_RUNNER_SSH_PORT,
@@ -212,25 +348,49 @@ class KubernetesCompute(
             backend_data=None,
         )
 
+    def update_provisioning_data(
+        self,
+        provisioning_data: JobProvisioningData,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ):
+        pod = call_api_method(
+            self.api.read_namespaced_pod,
+            client.V1Pod,
+            name=provisioning_data.instance_id,
+            namespace=self.config.namespace,
+        )
+        pod_ip = get_value(pod, ".status.pod_ip", str)
+        if not pod_ip:
+            return
+        provisioning_data.internal_ip = pod_ip
+        service = call_api_method(
+            self.api.read_namespaced_service,
+            client.V1Service,
+            name=_get_pod_service_name(provisioning_data.instance_id),
+            namespace=self.config.namespace,
+        )
+        provisioning_data.hostname = get_value(service, ".spec.cluster_ip", str, required=True)
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
     ):
-        try:
-            self.api.delete_namespaced_service(
-                name=_get_pod_service_name(instance_id),
-                namespace=DEFAULT_NAMESPACE,
-                body=client.V1DeleteOptions(),
-            )
-        except client.ApiException as e:
-            if e.status != 404:
-                raise
-        try:
-            self.api.delete_namespaced_pod(
-                name=instance_id, namespace=DEFAULT_NAMESPACE, body=client.V1DeleteOptions()
-            )
-        except client.ApiException as e:
-            if e.status != 404:
-                raise
+        call_api_method(
+            self.api.delete_namespaced_service,
+            client.V1Service,
+            expected=404,
+            name=_get_pod_service_name(instance_id),
+            namespace=self.config.namespace,
+            body=client.V1DeleteOptions(),
+        )
+        call_api_method(
+            self.api.delete_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=instance_id,
+            namespace=self.config.namespace,
+            body=client.V1DeleteOptions(),
+        )
 
     def create_gateway(
         self,
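
Since run_job() now returns hostname=None, the new update_provisioning_data() is what eventually fills in the addresses. The server-side caller is not shown in this diff; conceptually it polls along these lines:

    # Conceptual polling flow (hypothetical; the actual server-side caller is
    # not part of this diff):
    import time

    poll_interval = 5.0  # hypothetical interval in seconds
    while provisioning_data.hostname is None:
        time.sleep(poll_interval)
        compute.update_provisioning_data(
            provisioning_data, project_ssh_public_key, project_ssh_private_key
        )
    # internal_ip -> the pod IP (needed for multinode runs)
    # hostname    -> the Service ClusterIP (used to reach the runner over SSH)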
@@ -247,70 +407,79 @@ class KubernetesCompute(
         # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html
         instance_name = generate_unique_gateway_instance_name(configuration)
         commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub])
-        self.api.create_namespaced_pod(
-            namespace=DEFAULT_NAMESPACE,
-            body=client.V1Pod(
-                metadata=client.V1ObjectMeta(
-                    name=instance_name,
-                    labels={"app.kubernetes.io/name": instance_name},
-                ),
-                spec=client.V1PodSpec(
-                    containers=[
-                        client.V1Container(
-                            name=f"{instance_name}-container",
-                            image="ubuntu:22.04",
-                            command=["/bin/sh"],
-                            args=["-c", " && ".join(commands)],
-                            ports=[
-                                client.V1ContainerPort(
-                                    container_port=22,
-                                ),
-                                client.V1ContainerPort(
-                                    container_port=80,
-                                ),
-                                client.V1ContainerPort(
-                                    container_port=443,
-                                ),
-                            ],
-                        )
-                    ]
-                ),
+        pod = client.V1Pod(
+            metadata=client.V1ObjectMeta(
+                name=instance_name,
+                labels={"app.kubernetes.io/name": instance_name},
+            ),
+            spec=client.V1PodSpec(
+                containers=[
+                    client.V1Container(
+                        name=f"{instance_name}-container",
+                        image="ubuntu:22.04",
+                        command=["/bin/sh"],
+                        args=["-c", " && ".join(commands)],
+                        ports=[
+                            client.V1ContainerPort(
+                                container_port=22,
+                            ),
+                            client.V1ContainerPort(
+                                container_port=80,
+                            ),
+                            client.V1ContainerPort(
+                                container_port=443,
+                            ),
+                        ],
+                    )
+                ]
             ),
         )
-        self.api.create_namespaced_service(
-            namespace=DEFAULT_NAMESPACE,
-            body=client.V1Service(
-                metadata=client.V1ObjectMeta(
-                    name=_get_pod_service_name(instance_name),
-                ),
-                spec=client.V1ServiceSpec(
-                    type="LoadBalancer",
-                    selector={"app.kubernetes.io/name": instance_name},
-                    ports=[
-                        client.V1ServicePort(
-                            name="ssh",
-                            port=22,
-                            target_port=22,
-                        ),
-                        client.V1ServicePort(
-                            name="http",
-                            port=80,
-                            target_port=80,
-                        ),
-                        client.V1ServicePort(
-                            name="https",
-                            port=443,
-                            target_port=443,
-                        ),
-                    ],
-                ),
+        call_api_method(
+            self.api.create_namespaced_pod,
+            client.V1Pod,
+            namespace=self.config.namespace,
+            body=pod,
+        )
+        service = client.V1Service(
+            metadata=client.V1ObjectMeta(
+                name=_get_pod_service_name(instance_name),
+            ),
+            spec=client.V1ServiceSpec(
+                type="LoadBalancer",
+                selector={"app.kubernetes.io/name": instance_name},
+                ports=[
+                    client.V1ServicePort(
+                        name="ssh",
+                        port=22,
+                        target_port=22,
+                    ),
+                    client.V1ServicePort(
+                        name="http",
+                        port=80,
+                        target_port=80,
+                    ),
+                    client.V1ServicePort(
+                        name="https",
+                        port=443,
+                        target_port=443,
+                    ),
+                ],
             ),
         )
+        call_api_method(
+            self.api.create_namespaced_service,
+            client.V1Service,
+            namespace=self.config.namespace,
+            body=service,
+        )
         hostname = _wait_for_load_balancer_hostname(
-            api=self.api, service_name=_get_pod_service_name(instance_name)
+            api=self.api,
+            namespace=self.config.namespace,
+            service_name=_get_pod_service_name(instance_name),
         )
+        region = DUMMY_REGION
         if hostname is None:
-            self.terminate_instance(instance_name, region="-")
+            self.terminate_instance(instance_name, region=region)
             raise ComputeError(
                 "Failed to get gateway hostname. "
                 "Ensure the Kubernetes cluster supports Load Balancer services."
@@ -318,7 +487,7 @@ class KubernetesCompute(
         return GatewayProvisioningData(
             instance_id=instance_name,
             ip_address=hostname,
-            region="-",
+            region=region,
         )
 
     def terminate_gateway(
@@ -334,15 +503,34 @@ class KubernetesCompute(
         )
 
 
-def _get_gpus_from_node_labels(labels: Dict) -> List[Gpu]:
-    # We rely on https://github.com/NVIDIA/gpu-feature-discovery to detect gpus.
-    # Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or "A100" but a product name
-    # from nvidia-smi like "Tesla-T4" or "A100-SXM4-40GB".
+def _parse_cpu(cpu: str) -> int:
+    if cpu.endswith("m"):
+        # "m" means millicpu (1/1000 CPU), e.g., 7900m -> 7.9 -> 7
+        return int(float(cpu[:-1]) / 1000)
+    return int(cpu)
+
+
+def _parse_memory(memory: str) -> int:
+    if memory.isdigit():
+        # no suffix means that the value is in bytes
+        return int(memory) // 2**20
+    return int(parse_memory(memory, as_untis="M"))
+
+
+def _render_memory(memory: Memory) -> str:
+    return f"{float(memory)}Gi"
+
+
+def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optional[str]]:
+    # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
+    # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
+    # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
+    # TODO: support other GPU vendors
     gpu_count = labels.get("nvidia.com/gpu.count")
     gpu_product = labels.get("nvidia.com/gpu.product")
     if gpu_count is None or gpu_product is None:
-        return []
+        return [], None
     gpu_count = int(gpu_count)
     gpu_name = None
     for known_gpu_name in NVIDIA_GPU_NAMES:
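
The new quantity helpers follow Kubernetes conventions for CPU and memory strings; worked examples (derived directly from the definitions above, with the conversions that depend on dstack's parse_memory marked as assumptions):

    # Worked examples for the helpers above:
    _parse_cpu("8")              # -> 8
    _parse_cpu("7900m")          # -> 7; 7900 millicpu = 7.9 CPUs, truncated by int()
    _parse_memory("1073741824")  # -> 1024; bare digits are bytes, 2**30 B // 2**20 = 1024 MiB
    _parse_memory("16Gi")        # -> 16384, assuming parse_memory() converts Gi to MiB
    _render_memory(Memory(8.0))  # -> "8.0Gi", assuming Memory coerces like a float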
@@ -350,20 +538,22 @@ def _get_gpus_from_node_labels(labels: Dict) -> List[Gpu]:
            gpu_name = known_gpu_name
            break
    if gpu_name is None:
-        return []
+        return [], None
    gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
    gpu_memory = gpu_info.memory * 1024
    # A100 may come in two variants
    if "40GB" in gpu_product:
        gpu_memory = 40 * 1024
-    return [
+    gpus = [
        Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
        for _ in range(gpu_count)
    ]
+    return gpus, gpu_product
 
 
 def _continue_setup_jump_pod(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     project_ssh_private_key: str,
     user_ssh_public_key: str,
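
The second return value is the raw nvidia.com/gpu.product label, which run_job() uses verbatim in its node-affinity term. A hypothetical example of both return values:

    # Hypothetical GPU Feature Discovery labels on a node:
    labels = {
        "nvidia.com/gpu.count": "8",
        "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3",
    }
    gpus, label_value = _get_gpus_from_node_labels(labels)
    # gpus        -> eight Gpu(vendor=NVIDIA, name="H100", memory_mib=81920) entries,
    #                assuming "H100" matches a known GPU name with 80 GB of memory
    # label_value -> "NVIDIA-H100-80GB-HBM3", fed into the "nvidia.com/gpu.product"
    #                In [...] node selector built in run_job()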
@@ -372,6 +562,7 @@ def _continue_setup_jump_pod(
 ):
     _wait_for_pod_ready(
         api=api,
+        namespace=namespace,
         pod_name=_get_jump_pod_name(project_name),
     )
     _add_authorized_key_to_jump_pod(
@@ -384,82 +575,135 @@ def _continue_setup_jump_pod(
 
 def _create_jump_pod_service_if_not_exists(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     ssh_public_keys: List[str],
     jump_pod_port: Optional[int],
 ) -> Tuple[int, bool]:
     created = False
-    try:
-        service = api.read_namespaced_service(
+    service: Optional[client.V1Service] = None
+    pod: Optional[client.V1Pod] = None
+    _namespace = call_api_method(
+        api.read_namespace,
+        client.V1Namespace,
+        expected=404,
+        name=namespace,
+    )
+    if _namespace is None:
+        _namespace = client.V1Namespace(
+            metadata=client.V1ObjectMeta(
+                name=namespace,
+                labels={"app.kubernetes.io/name": namespace},
+            ),
+        )
+        call_api_method(
+            api.create_namespace,
+            client.V1Namespace,
+            body=_namespace,
+        )
+    else:
+        service = call_api_method(
+            api.read_namespaced_service,
+            client.V1Service,
+            expected=404,
             name=_get_jump_pod_service_name(project_name),
-            namespace=DEFAULT_NAMESPACE,
+            namespace=namespace,
        )
-    except client.ApiException as e:
-        if e.status == 404:
-            service = _create_jump_pod_service(
-                api=api,
-                project_name=project_name,
-                ssh_public_keys=ssh_public_keys,
-                jump_pod_port=jump_pod_port,
-            )
-            created = True
-        else:
-            raise
-    return service.spec.ports[0].node_port, created
+        pod = call_api_method(
+            api.read_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=_get_jump_pod_name(project_name),
+            namespace=namespace,
+        )
+    # The service may exist without the pod if the node on which the jump pod was running
+    # has been deleted.
+    if service is None or pod is None:
+        service = _create_jump_pod_service(
+            api=api,
+            namespace=namespace,
+            project_name=project_name,
+            ssh_public_keys=ssh_public_keys,
+            jump_pod_port=jump_pod_port,
+        )
+        created = True
+    port = get_value(service, ".spec.ports[0].node_port", int, required=True)
+    return port, created
 
 
 def _create_jump_pod_service(
     api: client.CoreV1Api,
+    namespace: str,
     project_name: str,
     ssh_public_keys: List[str],
     jump_pod_port: Optional[int],
 ) -> client.V1Service:
     # TODO use restricted ssh-forwarding-only user for jump pod instead of root.
-    commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod_name = _get_jump_pod_name(project_name)
-    api.create_namespaced_pod(
-        namespace=DEFAULT_NAMESPACE,
-        body=client.V1Pod(
-            metadata=client.V1ObjectMeta(
-                name=pod_name,
-                labels={"app.kubernetes.io/name": pod_name},
-            ),
-            spec=client.V1PodSpec(
-                containers=[
-                    client.V1Container(
-                        name=f"{pod_name}-container",
-                        # TODO: Choose appropriate image for jump pod
-                        image="dstackai/base:py3.11-0.4rc4",
-                        command=["/bin/sh"],
-                        args=["-c", " && ".join(commands)],
-                        ports=[
-                            client.V1ContainerPort(
-                                container_port=JUMP_POD_SSH_PORT,
-                            )
-                        ],
-                    )
-                ]
-            ),
+    call_api_method(
+        api.delete_namespaced_pod,
+        client.V1Pod,
+        expected=404,
+        namespace=namespace,
+        name=pod_name,
+    )
+    commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
+    pod = client.V1Pod(
+        metadata=client.V1ObjectMeta(
+            name=pod_name,
+            labels={"app.kubernetes.io/name": pod_name},
+        ),
+        spec=client.V1PodSpec(
+            containers=[
+                client.V1Container(
+                    name=f"{pod_name}-container",
+                    # TODO: Choose appropriate image for jump pod
+                    image="dstackai/base:py3.11-0.4rc4",
+                    command=["/bin/sh"],
+                    args=["-c", " && ".join(commands)],
+                    ports=[
+                        client.V1ContainerPort(
+                            container_port=JUMP_POD_SSH_PORT,
+                        )
+                    ],
+                )
+            ]
        ),
    )
-    service_response = api.create_namespaced_service(
-        namespace=DEFAULT_NAMESPACE,
-        body=client.V1Service(
-            metadata=client.V1ObjectMeta(name=_get_jump_pod_service_name(project_name)),
-            spec=client.V1ServiceSpec(
-                type="NodePort",
-                selector={"app.kubernetes.io/name": pod_name},
-                ports=[
-                    client.V1ServicePort(
-                        port=JUMP_POD_SSH_PORT,
-                        target_port=JUMP_POD_SSH_PORT,
-                        node_port=jump_pod_port,
-                    )
-                ],
-            ),
+    call_api_method(
+        api.create_namespaced_pod,
+        client.V1Pod,
+        namespace=namespace,
+        body=pod,
+    )
+    service_name = _get_jump_pod_service_name(project_name)
+    call_api_method(
+        api.delete_namespaced_service,
+        client.V1Service,
+        expected=404,
+        namespace=namespace,
+        name=service_name,
+    )
+    service = client.V1Service(
+        metadata=client.V1ObjectMeta(name=service_name),
+        spec=client.V1ServiceSpec(
+            type="NodePort",
+            selector={"app.kubernetes.io/name": pod_name},
+            ports=[
+                client.V1ServicePort(
+                    port=JUMP_POD_SSH_PORT,
+                    target_port=JUMP_POD_SSH_PORT,
+                    node_port=jump_pod_port,
+                )
+            ],
        ),
    )
-    return service_response
+    return call_api_method(
+        api.create_namespaced_service,
+        client.V1Service,
+        namespace=namespace,
+        body=service,
+    )
 
 
 def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]:
@@ -484,20 +728,25 @@ def _get_jump_pod_commands(authorized_keys: List[str]) -> List[str]:
 
 def _wait_for_pod_ready(
     api: client.CoreV1Api,
+    namespace: str,
     pod_name: str,
     timeout_seconds: int = 300,
 ):
     start_time = time.time()
     while True:
-        try:
-            pod = api.read_namespaced_pod(name=pod_name, namespace=DEFAULT_NAMESPACE)
-        except client.ApiException as e:
-            if e.status != 404:
-                raise
-        else:
-            if pod.status.phase == "Running" and all(
-                container_status.ready for container_status in pod.status.container_statuses
-            ):
+        pod = call_api_method(
+            api.read_namespaced_pod,
+            client.V1Pod,
+            expected=404,
+            name=pod_name,
+            namespace=namespace,
+        )
+        if pod is not None:
+            phase = get_value(pod, ".status.phase", str, required=True)
+            container_statuses = get_value(
+                pod, ".status.container_statuses", list[client.V1ContainerStatus], required=True
+            )
+            if phase == "Running" and all(status.ready for status in container_statuses):
                 return True
         elapsed_time = time.time() - start_time
         if elapsed_time >= timeout_seconds:
@@ -508,19 +757,23 @@ def _wait_for_pod_ready(
 
 def _wait_for_load_balancer_hostname(
     api: client.CoreV1Api,
+    namespace: str,
     service_name: str,
     timeout_seconds: int = 120,
 ) -> Optional[str]:
     start_time = time.time()
     while True:
-        try:
-            service = api.read_namespaced_service(name=service_name, namespace=DEFAULT_NAMESPACE)
-        except client.ApiException as e:
-            if e.status != 404:
-                raise
-        else:
-            if service.status.load_balancer.ingress is not None:
-                return service.status.load_balancer.ingress[0].hostname
+        service = call_api_method(
+            api.read_namespaced_service,
+            client.V1Service,
+            expected=404,
+            name=service_name,
+            namespace=namespace,
+        )
+        if service is not None:
+            hostname = get_value(service, ".status.load_balancer.ingress[0].hostname", str)
+            if hostname is not None:
+                return hostname
         elapsed_time = time.time() - start_time
         if elapsed_time >= timeout_seconds:
             logger.warning("Timeout waiting for load balancer %s to get ip", service_name)
@@ -607,11 +860,11 @@ def _run_ssh_command(hostname: str, port: int, ssh_private_key: str, command: st
 
 
 def _get_jump_pod_name(project_name: str) -> str:
-    return f"{project_name}-ssh-jump-pod"
+    return f"dstack-{project_name}-ssh-jump-pod"
 
 
 def _get_jump_pod_service_name(project_name: str) -> str:
-    return f"{project_name}-ssh-jump-pod-service"
+    return f"dstack-{project_name}-ssh-jump-pod-service"
 
 
 def _get_pod_service_name(pod_name: str) -> str:
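
Finally, the jump pod and its service gain a dstack- prefix, presumably to make the dstack-managed objects easy to identify and to avoid name clashes now that they can live in a user-chosen namespace; for example:

    # Effect of the rename for a hypothetical project "main":
    _get_jump_pod_name("main")          # 0.19.30: "main-ssh-jump-pod"
                                        # 0.19.31: "dstack-main-ssh-jump-pod"
    _get_jump_pod_service_name("main")  # 0.19.31: "dstack-main-ssh-jump-pod-service"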