dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (41)
  1. dstack/_internal/cli/services/configurators/run.py +1 -1
  2. dstack/_internal/core/backends/base/compute.py +20 -1
  3. dstack/_internal/core/backends/base/models.py +10 -0
  4. dstack/_internal/core/backends/base/offers.py +1 -0
  5. dstack/_internal/core/backends/features.py +5 -0
  6. dstack/_internal/core/backends/nebius/compute.py +28 -16
  7. dstack/_internal/core/backends/nebius/configurator.py +1 -1
  8. dstack/_internal/core/backends/nebius/models.py +4 -0
  9. dstack/_internal/core/backends/nebius/resources.py +41 -20
  10. dstack/_internal/core/backends/runpod/api_client.py +245 -59
  11. dstack/_internal/core/backends/runpod/compute.py +157 -13
  12. dstack/_internal/core/models/compute_groups.py +39 -0
  13. dstack/_internal/core/models/fleets.py +6 -1
  14. dstack/_internal/core/models/profiles.py +3 -1
  15. dstack/_internal/core/models/runs.py +3 -0
  16. dstack/_internal/server/app.py +14 -2
  17. dstack/_internal/server/background/__init__.py +7 -0
  18. dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
  19. dstack/_internal/server/background/tasks/process_instances.py +81 -49
  20. dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
  21. dstack/_internal/server/migrations/env.py +20 -2
  22. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
  23. dstack/_internal/server/models.py +39 -0
  24. dstack/_internal/server/routers/runs.py +15 -6
  25. dstack/_internal/server/services/compute_groups.py +22 -0
  26. dstack/_internal/server/services/fleets.py +1 -0
  27. dstack/_internal/server/services/jobs/__init__.py +13 -0
  28. dstack/_internal/server/services/jobs/configurators/base.py +3 -2
  29. dstack/_internal/server/services/requirements/combine.py +1 -0
  30. dstack/_internal/server/services/runs.py +17 -3
  31. dstack/_internal/server/testing/common.py +51 -0
  32. dstack/_internal/server/utils/routers.py +18 -20
  33. dstack/_internal/settings.py +4 -1
  34. dstack/_internal/utils/version.py +22 -0
  35. dstack/version.py +1 -1
  36. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
  37. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
  38. dstack/_internal/core/backends/nebius/fabrics.py +0 -49
  39. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
  40. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
  41. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/runpod/api_client.py

@@ -11,6 +11,14 @@ from dstack._internal.utils.common import get_current_datetime
 API_URL = "https://api.runpod.io/graphql"
 
 
+class RunpodApiClientError(BackendError):
+    errors: List[Dict]
+
+    def __init__(self, errors: List[Dict]):
+        self.errors = errors
+        super().__init__(errors)
+
+
 class RunpodApiClient:
     def __init__(self, api_key: str):
        self.api_key = api_key
@@ -23,7 +31,19 @@ class RunpodApiClient:
         return True
 
     def get_user_details(self) -> Dict:
-        resp = self._make_request({"query": user_details_query, "variable": {}})
+        resp = self._make_request(
+            {
+                "query": """
+                query myself {
+                    myself {
+                        id
+                        authId
+                        email
+                    }
+                }
+                """
+            }
+        )
         return resp.json()
 
     def create_pod(
@@ -52,28 +72,28 @@ class RunpodApiClient:
     ) -> Dict:
         resp = self._make_request(
             {
-                "query": generate_pod_deployment_mutation(
-                    name,
-                    image_name,
-                    gpu_type_id,
-                    cloud_type,
-                    support_public_ip,
-                    start_ssh,
-                    data_center_id,
-                    country_code,
-                    gpu_count,
-                    volume_in_gb,
-                    container_disk_in_gb,
-                    min_vcpu_count,
-                    min_memory_in_gb,
-                    docker_args,
-                    ports,
-                    volume_mount_path,
-                    env,
-                    template_id,
-                    network_volume_id,
-                    allowed_cuda_versions,
-                    bid_per_gpu,
+                "query": _generate_pod_deployment_mutation(
+                    name=name,
+                    image_name=image_name,
+                    gpu_type_id=gpu_type_id,
+                    cloud_type=cloud_type,
+                    support_public_ip=support_public_ip,
+                    start_ssh=start_ssh,
+                    data_center_id=data_center_id,
+                    country_code=country_code,
+                    gpu_count=gpu_count,
+                    volume_in_gb=volume_in_gb,
+                    container_disk_in_gb=container_disk_in_gb,
+                    min_vcpu_count=min_vcpu_count,
+                    min_memory_in_gb=min_memory_in_gb,
+                    docker_args=docker_args,
+                    ports=ports,
+                    volume_mount_path=volume_mount_path,
+                    env=env,
+                    template_id=template_id,
+                    network_volume_id=network_volume_id,
+                    allowed_cuda_versions=allowed_cuda_versions,
+                    bid_per_gpu=bid_per_gpu,
                 )
             }
         )
@@ -86,7 +106,9 @@ class RunpodApiClient:
         image_name: str,
         container_disk_in_gb: int,
         container_registry_auth_id: str,
-        volume_in_gb: int = 0,
+        # Default pod volume is 20GB.
+        # RunPod errors if it's not specified for podEditJob.
+        volume_in_gb: int = 20,
     ) -> str:
         resp = self._make_request(
             {
@@ -108,12 +130,12 @@ class RunpodApiClient:
         return resp.json()["data"]["podEditJob"]["id"]
 
     def get_pod(self, pod_id: str) -> Dict:
-        resp = self._make_request({"query": generate_pod_query(pod_id)})
+        resp = self._make_request({"query": _generate_pod_query(pod_id)})
         data = resp.json()
         return data["data"]["pod"]
 
     def terminate_pod(self, pod_id: str) -> Dict:
-        resp = self._make_request({"query": generate_pod_terminate_mutation(pod_id)})
+        resp = self._make_request({"query": _generate_pod_terminate_mutation(pod_id)})
         data = resp.json()
         return data["data"]
 
@@ -213,7 +235,7 @@ class RunpodApiClient:
         )
         return response.json()["data"]["createNetworkVolume"]["id"]
 
-    def delete_network_volume(self, volume_id: str):
+    def delete_network_volume(self, volume_id: str) -> None:
         self._make_request(
             {
                 "query": f"""
@@ -228,7 +250,66 @@ class RunpodApiClient:
             }
         )
 
-    def _make_request(self, data: Any = None) -> Response:
+    def create_cluster(
+        self,
+        cluster_name: str,
+        gpu_type_id: str,
+        pod_count: int,
+        gpu_count_per_pod: int,
+        image_name: str,
+        deploy_cost: str,
+        template_id: Optional[str] = None,
+        cluster_type: str = "TRAINING",
+        network_volume_id: Optional[str] = None,
+        volume_in_gb: Optional[int] = None,
+        throughput: Optional[int] = None,
+        allowed_cuda_versions: Optional[List[str]] = None,
+        volume_key: Optional[str] = None,
+        data_center_id: Optional[str] = None,
+        start_jupyter: bool = False,
+        start_ssh: bool = False,
+        container_disk_in_gb: Optional[int] = None,
+        docker_args: Optional[str] = None,
+        env: Optional[Dict[str, Any]] = None,
+        volume_mount_path: Optional[str] = None,
+        ports: Optional[str] = None,
+    ) -> Dict:
+        resp = self._make_request(
+            {
+                "query": _generate_create_cluster_mutation(
+                    cluster_name=cluster_name,
+                    gpu_type_id=gpu_type_id,
+                    pod_count=pod_count,
+                    gpu_count_per_pod=gpu_count_per_pod,
+                    image_name=image_name,
+                    cluster_type=cluster_type,
+                    deploy_cost=deploy_cost,
+                    template_id=template_id,
+                    network_volume_id=network_volume_id,
+                    volume_in_gb=volume_in_gb,
+                    throughput=throughput,
+                    allowed_cuda_versions=allowed_cuda_versions,
+                    volume_key=volume_key,
+                    data_center_id=data_center_id,
+                    start_jupyter=start_jupyter,
+                    start_ssh=start_ssh,
+                    container_disk_in_gb=container_disk_in_gb,
+                    docker_args=docker_args,
+                    env=env,
+                    volume_mount_path=volume_mount_path,
+                    ports=ports,
+                )
+            }
+        )
+        data = resp.json()["data"]
+        return data["createCluster"]
+
+    def delete_cluster(self, cluster_id: str) -> bool:
+        resp = self._make_request({"query": _generate_delete_cluster_mutation(cluster_id)})
+        data = resp.json()["data"]
+        return data["deleteCluster"]
+
+    def _make_request(self, data: Optional[Dict[str, Any]] = None) -> Response:
         try:
             response = requests.request(
                 method="POST",
@@ -237,10 +318,10 @@ class RunpodApiClient:
                 timeout=120,
             )
             response.raise_for_status()
-            if "errors" in response.json():
-                if "podTerminate" in response.json()["errors"][0]["path"]:
-                    raise BackendError("Instance Not Found")
-                raise BackendError(response.json()["errors"][0]["message"])
+            response_json = response.json()
+            # RunPod returns 200 on client errors
+            if "errors" in response_json:
+                raise RunpodApiClientError(errors=response_json["errors"])
             return response
         except requests.HTTPError as e:
             if e.response is not None and e.response.status_code in (
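
For orientation, here is a minimal sketch of how a caller can branch on the structured GraphQL errors instead of matching exception strings. The client, the exception, and the "pod not found to terminate" message come from this diff; the API key and pod id are placeholders, and the exact error payload shape beyond the message/extensions keys used later in the diff is an assumption.

from dstack._internal.core.backends.runpod.api_client import (
    RunpodApiClient,
    RunpodApiClientError,
)

client = RunpodApiClient(api_key="...")  # placeholder credentials
try:
    client.terminate_pod("pod-that-does-not-exist")  # placeholder pod id
except RunpodApiClientError as e:
    # e.errors is the raw "errors" list from the GraphQL response, e.g.
    # [{"message": "pod not found to terminate", "extensions": {...}}]
    if e.errors and e.errors[0]["message"] == "pod not found to terminate":
        pass  # pod is already gone; safe to ignore
    else:
        raise
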
@@ -250,7 +331,7 @@ class RunpodApiClient:
                 raise BackendInvalidCredentialsError(e.response.text)
             raise
 
-    def wait_for_instance(self, instance_id) -> Optional[Dict]:
+    def wait_for_instance(self, instance_id: str) -> Optional[Dict]:
         start = get_current_datetime()
         wait_for_instance_interval = 5
         # To change the status to "running," the image must be pulled and then started.
@@ -263,18 +344,7 @@ class RunpodApiClient:
             return
 
 
-user_details_query = """
-    query myself {
-        myself {
-            id
-            authId
-            email
-        }
-    }
-    """
-
-
-def generate_pod_query(pod_id: str) -> str:
+def _generate_pod_query(pod_id: str) -> str:
     """
     Generate a query for a specific GPU type
     """
@@ -283,6 +353,7 @@ def generate_pod_query(pod_id: str) -> str:
     query pod {{
         pod(input: {{podId: "{pod_id}"}}) {{
             id
+            clusterIp
             containerDiskInGb
             costPerHr
             desiredStatus
@@ -319,26 +390,26 @@ def generate_pod_query(pod_id: str) -> str:
     """
 
 
-def generate_pod_deployment_mutation(
+def _generate_pod_deployment_mutation(
     name: str,
     image_name: str,
     gpu_type_id: str,
     cloud_type: str,
     support_public_ip: bool = True,
     start_ssh: bool = True,
-    data_center_id=None,
-    country_code=None,
-    gpu_count=None,
-    volume_in_gb=None,
-    container_disk_in_gb=None,
-    min_vcpu_count=None,
-    min_memory_in_gb=None,
-    docker_args=None,
-    ports=None,
-    volume_mount_path=None,
+    data_center_id: Optional[str] = None,
+    country_code: Optional[str] = None,
+    gpu_count: Optional[int] = None,
+    volume_in_gb: Optional[int] = None,
+    container_disk_in_gb: Optional[int] = None,
+    min_vcpu_count: Optional[int] = None,
+    min_memory_in_gb: Optional[int] = None,
+    docker_args: Optional[str] = None,
+    ports: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
     env: Optional[Dict[str, Any]] = None,
-    template_id=None,
-    network_volume_id=None,
+    template_id: Optional[str] = None,
+    network_volume_id: Optional[str] = None,
     allowed_cuda_versions: Optional[List[str]] = None,
     bid_per_gpu: Optional[float] = None,
 ) -> str:
@@ -425,7 +496,7 @@ def generate_pod_deployment_mutation(
     """
 
 
-def generate_pod_terminate_mutation(pod_id: str) -> str:
+def _generate_pod_terminate_mutation(pod_id: str) -> str:
     """
     Generates a mutation to terminate a pod.
     """
@@ -434,3 +505,118 @@ def generate_pod_terminate_mutation(pod_id: str) -> str:
         podTerminate(input: {{ podId: "{pod_id}" }})
     }}
     """
+
+
+def _generate_delete_cluster_mutation(cluster_id: str) -> str:
+    """
+    Generates a mutation to delete a cluster.
+    """
+    return f"""
+    mutation {{
+        deleteCluster(
+            input: {{
+                id: "{cluster_id}"
+            }}
+        )
+    }}
+    """
+
+
+def _generate_create_cluster_mutation(
+    cluster_name: str,
+    gpu_type_id: str,
+    pod_count: int,
+    gpu_count_per_pod: int,
+    image_name: str,
+    cluster_type: str,
+    deploy_cost: str,
+    template_id: Optional[str] = None,
+    network_volume_id: Optional[str] = None,
+    volume_in_gb: Optional[int] = None,
+    throughput: Optional[int] = None,
+    allowed_cuda_versions: Optional[List[str]] = None,
+    volume_key: Optional[str] = None,
+    data_center_id: Optional[str] = None,
+    start_jupyter: bool = False,
+    start_ssh: bool = False,
+    container_disk_in_gb: Optional[int] = None,
+    docker_args: Optional[str] = None,
+    env: Optional[Dict[str, Any]] = None,
+    volume_mount_path: Optional[str] = None,
+    ports: Optional[str] = None,
+) -> str:
+    """
+    Generates a mutation to create a cluster.
+    """
+    input_fields = []
+
+    # ------------------------------ Required Fields ----------------------------- #
+    input_fields.append(f'clusterName: "{cluster_name}"')
+    input_fields.append(f'gpuTypeId: "{gpu_type_id}"')
+    input_fields.append(f"podCount: {pod_count}")
+    input_fields.append(f'imageName: "{image_name}"')
+    input_fields.append(f"type: {cluster_type}")
+    input_fields.append(f"gpuCountPerPod: {gpu_count_per_pod}")
+    # If deploy_cost is not specified, Runpod returns Insufficient resources error.
+    input_fields.append(f"deployCost: {deploy_cost}")
+
+    # ------------------------------ Optional Fields ----------------------------- #
+    if template_id is not None:
+        input_fields.append(f'templateId: "{template_id}"')
+    if network_volume_id is not None:
+        input_fields.append(f'networkVolumeId: "{network_volume_id}"')
+    if volume_in_gb is not None:
+        input_fields.append(f"volumeInGb: {volume_in_gb}")
+    if throughput is not None:
+        input_fields.append(f"throughput: {throughput}")
+    if allowed_cuda_versions is not None:
+        allowed_cuda_versions_string = ", ".join(
+            [f'"{version}"' for version in allowed_cuda_versions]
+        )
+        input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")
+    if volume_key is not None:
+        input_fields.append(f'volumeKey: "{volume_key}"')
+    if data_center_id is not None:
+        input_fields.append(f'dataCenterId: "{data_center_id}"')
+    if start_jupyter:
+        input_fields.append("startJupyter: true")
+    if start_ssh:
+        input_fields.append("startSsh: true")
+    if container_disk_in_gb is not None:
+        input_fields.append(f"containerDiskInGb: {container_disk_in_gb}")
+    if docker_args is not None:
+        input_fields.append(f'dockerArgs: "{docker_args}"')
+    if env is not None:
+        env_string = ", ".join(
+            [f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()]
+        )
+        input_fields.append(f"env: [{env_string}]")
+    if volume_mount_path is not None:
+        input_fields.append(f'volumeMountPath: "{volume_mount_path}"')
+    if ports is not None:
+        ports = ports.replace(" ", "")
+        input_fields.append(f'ports: "{ports}"')
+
+    # Format input fields
+    input_string = ", ".join(input_fields)
+    return f"""
+    mutation {{
+        createCluster(
+            input: {{
+                {input_string}
+            }}
+        ) {{
+            id
+            name
+            pods {{
+                id
+                clusterIp
+                lastStatusChange
+                imageName
+                machine {{
+                    podHostId
+                }}
+            }}
+        }}
+    }}
+    """
dstack/_internal/core/backends/runpod/compute.py

@@ -2,31 +2,34 @@ import json
 import uuid
 from collections.abc import Iterable
 from datetime import timedelta
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
+    ComputeWithGroupProvisioningSupport,
+    ComputeWithMultinodeSupport,
     ComputeWithVolumeSupport,
     generate_unique_instance_name,
     generate_unique_volume_name,
     get_docker_commands,
     get_job_instance_name,
 )
+from dstack._internal.core.backends.base.models import JobConfiguration
 from dstack._internal.core.backends.base.offers import (
     OfferModifier,
     get_catalog_offers,
     get_offers_disk_modifier,
 )
-from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
+from dstack._internal.core.backends.runpod.api_client import RunpodApiClient, RunpodApiClientError
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import (
-    BackendError,
     ComputeError,
 )
 from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.core.models.common import RegistryAuth
+from dstack._internal.core.models.common import CoreModel, RegistryAuth
+from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
@@ -36,7 +39,7 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
-from dstack._internal.utils.common import get_current_datetime
+from dstack._internal.utils.common import get_current_datetime, get_or_error
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -50,9 +53,15 @@ CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour
 CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
 
 
+class RunpodOfferBackendData(CoreModel):
+    pod_counts: Optional[list[int]] = None
+
+
 class RunpodCompute(
     ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
+    ComputeWithMultinodeSupport,
+    ComputeWithGroupProvisioningSupport,
     Compute,
 ):
     _last_cleanup_time = None
@@ -80,6 +89,18 @@ class RunpodCompute(
     def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
         return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
+    def get_offers_post_filter(
+        self, requirements: Requirements
+    ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
+        def offers_post_filter(offer: InstanceOfferWithAvailability) -> bool:
+            pod_counts = _get_offer_pod_counts(offer)
+            is_cluster_offer = len(pod_counts) > 0 and any(pc != 1 for pc in pod_counts)
+            if requirements.multinode:
+                return is_cluster_offer
+            return not is_cluster_offer
+
+        return offers_post_filter
+
     def run_job(
         self,
         run: Run,
@@ -151,6 +172,8 @@ class RunpodCompute(
 
         instance_id = resp["id"]
 
+        # Call edit_pod to pass container_registry_auth_id.
+        # Expect a long time (~5m) for the pod to pick up the creds.
         # TODO: remove editPod once createPod supports docker's username and password
         # editPod is temporary solution to set container_registry_auth_id because createPod does not
         # support it currently. This will be removed once createPod supports container_registry_auth_id
@@ -186,14 +209,127 @@ class RunpodCompute(
             backend_data=None,
         )
 
+    def run_jobs(
+        self,
+        run: Run,
+        job_configurations: List[JobConfiguration],
+        instance_offer: InstanceOfferWithAvailability,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ) -> ComputeGroupProvisioningData:
+        master_job_configuration = job_configurations[0]
+        master_job = master_job_configuration.job
+        master_job_volumes = master_job_configuration.volumes
+        all_volumes_names = set(v.name for jc in job_configurations for v in jc.volumes)
+        instance_config = InstanceConfiguration(
+            project_name=run.project_name,
+            instance_name=get_job_instance_name(run, master_job),
+            ssh_keys=[
+                SSHKey(public=get_or_error(run.run_spec.ssh_key_pub).strip()),
+                SSHKey(public=project_ssh_public_key.strip()),
+            ],
+            user=run.user,
+        )
+
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
+        authorized_keys = instance_config.get_public_keys()
+        disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
+
+        network_volume_id = None
+        volume_mount_path = None
+        if len(master_job_volumes) > 1:
+            raise ComputeError("Mounting more than one network volume is not supported in runpod")
+        if len(all_volumes_names) > 1:
+            raise ComputeError(
+                "Mounting different volumes to different jobs is not supported in runpod"
+            )
+        if len(master_job_volumes) == 1:
+            network_volume_id = master_job_volumes[0].volume_id
+            volume_mount_path = run.run_spec.configuration.volumes[0].path
+
+        offer_pod_counts = _get_offer_pod_counts(instance_offer)
+        pod_count = len(job_configurations)
+        gpu_count = len(instance_offer.instance.resources.gpus)
+        data_center_id = instance_offer.region
+
+        if pod_count not in offer_pod_counts:
+            raise ComputeError(
+                f"Failed to provision {pod_count} pods. Available pod counts: {offer_pod_counts}"
+            )
+
+        container_registry_auth_id = self._generate_container_registry_auth_id(
+            master_job.job_spec.registry_auth
+        )
+        resp = self.api_client.create_cluster(
+            cluster_name=pod_name,
+            gpu_type_id=instance_offer.instance.name,
+            pod_count=pod_count,
+            gpu_count_per_pod=gpu_count,
+            deploy_cost=f"{instance_offer.price * pod_count:.2f}",
+            image_name=master_job.job_spec.image_name,
+            cluster_type="TRAINING",
+            data_center_id=data_center_id,
+            container_disk_in_gb=disk_size,
+            docker_args=_get_docker_args(authorized_keys),
+            ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp",
+            network_volume_id=network_volume_id,
+            volume_mount_path=volume_mount_path,
+            env={"RUNPOD_POD_USER": "0"},
+        )
+
+        # An "edit pod" trick to pass container registry creds.
+        if container_registry_auth_id is not None:
+            for pod in resp["pods"]:
+                self.api_client.edit_pod(
+                    pod_id=pod["id"],
+                    image_name=master_job.job_spec.image_name,
+                    container_disk_in_gb=disk_size,
+                    container_registry_auth_id=container_registry_auth_id,
+                )
+
+        jpds = [
+            JobProvisioningData(
+                backend=instance_offer.backend,
+                instance_type=instance_offer.instance,
+                instance_id=pod["id"],
+                hostname=None,
+                internal_ip=pod["clusterIp"],
+                region=instance_offer.region,
+                price=instance_offer.price,
+                username="root",
+                dockerized=False,
+            )
+            for pod in resp["pods"]
+        ]
+        return ComputeGroupProvisioningData(
+            compute_group_id=resp["id"],
+            compute_group_name=resp["name"],
+            backend=BackendType.RUNPOD,
+            region=instance_offer.region,
+            job_provisioning_datas=jpds,
+        )
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
-    ) -> None:
+    ):
         try:
             self.api_client.terminate_pod(instance_id)
-        except BackendError as e:
-            if e.args[0] == "Instance Not Found":
-                logger.debug("The instance with name %s not found", instance_id)
+        except RunpodApiClientError as e:
+            if len(e.errors) > 0 and e.errors[0]["message"] == "pod not found to terminate":
+                logger.debug("The instance %s not found. Skipping deletion.", instance_id)
+                return
+            raise
+
+    def terminate_compute_group(self, compute_group: ComputeGroup):
+        provisioning_data = compute_group.provisioning_data
+        try:
+            self.api_client.delete_cluster(provisioning_data.compute_group_id)
+        except RunpodApiClientError as e:
+            if len(e.errors) > 0 and e.errors[0]["extensions"]["code"] == "Cluster not found":
+                logger.debug(
+                    "The cluster %s not found. Skipping deletion.",
+                    provisioning_data.compute_group_id,
+                )
                 return
             raise
 
@@ -216,7 +352,9 @@ class RunpodCompute(
                 provisioning_data.ssh_port = port["publicPort"]
 
     def register_volume(self, volume: Volume) -> VolumeProvisioningData:
-        volume_data = self.api_client.get_network_volume(volume_id=volume.configuration.volume_id)
+        volume_data = self.api_client.get_network_volume(
+            volume_id=get_or_error(volume.configuration.volume_id)
+        )
         if volume_data is None:
             raise ComputeError(f"Volume {volume.configuration.volume_id} not found")
         size_gb = volume_data["size"]
@@ -258,14 +396,12 @@ class RunpodCompute(
     ) -> Optional[str]:
         if registry_auth is None:
             return None
-
         return self.api_client.add_container_registry_auth(
             uuid.uuid4().hex, registry_auth.username, registry_auth.password
         )
 
     def _clean_stale_container_registry_auths(self) -> None:
         container_registry_auths = self.api_client.get_container_registry_auths()
-
         # Container_registry_auths sorted by creation time so try to delete the oldest first
         # when we reach container_registry_auths that is still in use, we stop
         for container_registry_auth in container_registry_auths:
@@ -289,9 +425,17 @@ def _get_volume_price(size: int) -> float:
     return 0.05 * size
 
 
-def _is_secure_cloud(region: str) -> str:
+def _is_secure_cloud(region: str) -> bool:
     """
     Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
     Community cloud regions are country codes: CA, NL, etc.
     """
     return "-" in region
+
+
+def _get_offer_pod_counts(offer: InstanceOfferWithAvailability) -> list[int]:
+    backend_data: RunpodOfferBackendData = RunpodOfferBackendData.__response__.parse_obj(
+        offer.backend_data
+    )
+    pod_counts = backend_data.pod_counts or []
+    return pod_counts