dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/run.py +1 -1
- dstack/_internal/core/backends/base/compute.py +20 -1
- dstack/_internal/core/backends/base/models.py +10 -0
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/nebius/compute.py +28 -16
- dstack/_internal/core/backends/nebius/configurator.py +1 -1
- dstack/_internal/core/backends/nebius/models.py +4 -0
- dstack/_internal/core/backends/nebius/resources.py +41 -20
- dstack/_internal/core/backends/runpod/api_client.py +245 -59
- dstack/_internal/core/backends/runpod/compute.py +157 -13
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/profiles.py +3 -1
- dstack/_internal/core/models/runs.py +3 -0
- dstack/_internal/server/app.py +14 -2
- dstack/_internal/server/background/__init__.py +7 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
- dstack/_internal/server/background/tasks/process_instances.py +81 -49
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
- dstack/_internal/server/migrations/env.py +20 -2
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
- dstack/_internal/server/models.py +39 -0
- dstack/_internal/server/routers/runs.py +15 -6
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +13 -0
- dstack/_internal/server/services/jobs/configurators/base.py +3 -2
- dstack/_internal/server/services/requirements/combine.py +1 -0
- dstack/_internal/server/services/runs.py +17 -3
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/server/utils/routers.py +18 -20
- dstack/_internal/settings.py +4 -1
- dstack/_internal/utils/version.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
- dstack/_internal/core/backends/nebius/fabrics.py +0 -49
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -11,6 +11,14 @@ from dstack._internal.utils.common import get_current_datetime
|
|
|
11
11
|
API_URL = "https://api.runpod.io/graphql"
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
class RunpodApiClientError(BackendError):
|
|
15
|
+
errors: List[Dict]
|
|
16
|
+
|
|
17
|
+
def __init__(self, errors: List[Dict]):
|
|
18
|
+
self.errors = errors
|
|
19
|
+
super().__init__(errors)
|
|
20
|
+
|
|
21
|
+
|
|
14
22
|
class RunpodApiClient:
|
|
15
23
|
def __init__(self, api_key: str):
|
|
16
24
|
self.api_key = api_key
|
|
@@ -23,7 +31,19 @@ class RunpodApiClient:
|
|
|
23
31
|
return True
|
|
24
32
|
|
|
25
33
|
def get_user_details(self) -> Dict:
|
|
26
|
-
resp = self._make_request(
|
|
34
|
+
resp = self._make_request(
|
|
35
|
+
{
|
|
36
|
+
"query": """
|
|
37
|
+
query myself {
|
|
38
|
+
myself {
|
|
39
|
+
id
|
|
40
|
+
authId
|
|
41
|
+
email
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
"""
|
|
45
|
+
}
|
|
46
|
+
)
|
|
27
47
|
return resp.json()
|
|
28
48
|
|
|
29
49
|
def create_pod(
|
|
@@ -52,28 +72,28 @@ class RunpodApiClient:
|
|
|
52
72
|
) -> Dict:
|
|
53
73
|
resp = self._make_request(
|
|
54
74
|
{
|
|
55
|
-
"query":
|
|
56
|
-
name,
|
|
57
|
-
image_name,
|
|
58
|
-
gpu_type_id,
|
|
59
|
-
cloud_type,
|
|
60
|
-
support_public_ip,
|
|
61
|
-
start_ssh,
|
|
62
|
-
data_center_id,
|
|
63
|
-
country_code,
|
|
64
|
-
gpu_count,
|
|
65
|
-
volume_in_gb,
|
|
66
|
-
container_disk_in_gb,
|
|
67
|
-
min_vcpu_count,
|
|
68
|
-
min_memory_in_gb,
|
|
69
|
-
docker_args,
|
|
70
|
-
ports,
|
|
71
|
-
volume_mount_path,
|
|
72
|
-
env,
|
|
73
|
-
template_id,
|
|
74
|
-
network_volume_id,
|
|
75
|
-
allowed_cuda_versions,
|
|
76
|
-
bid_per_gpu,
|
|
75
|
+
"query": _generate_pod_deployment_mutation(
|
|
76
|
+
name=name,
|
|
77
|
+
image_name=image_name,
|
|
78
|
+
gpu_type_id=gpu_type_id,
|
|
79
|
+
cloud_type=cloud_type,
|
|
80
|
+
support_public_ip=support_public_ip,
|
|
81
|
+
start_ssh=start_ssh,
|
|
82
|
+
data_center_id=data_center_id,
|
|
83
|
+
country_code=country_code,
|
|
84
|
+
gpu_count=gpu_count,
|
|
85
|
+
volume_in_gb=volume_in_gb,
|
|
86
|
+
container_disk_in_gb=container_disk_in_gb,
|
|
87
|
+
min_vcpu_count=min_vcpu_count,
|
|
88
|
+
min_memory_in_gb=min_memory_in_gb,
|
|
89
|
+
docker_args=docker_args,
|
|
90
|
+
ports=ports,
|
|
91
|
+
volume_mount_path=volume_mount_path,
|
|
92
|
+
env=env,
|
|
93
|
+
template_id=template_id,
|
|
94
|
+
network_volume_id=network_volume_id,
|
|
95
|
+
allowed_cuda_versions=allowed_cuda_versions,
|
|
96
|
+
bid_per_gpu=bid_per_gpu,
|
|
77
97
|
)
|
|
78
98
|
}
|
|
79
99
|
)
|
|
@@ -86,7 +106,9 @@ class RunpodApiClient:
|
|
|
86
106
|
image_name: str,
|
|
87
107
|
container_disk_in_gb: int,
|
|
88
108
|
container_registry_auth_id: str,
|
|
89
|
-
|
|
109
|
+
# Default pod volume is 20GB.
|
|
110
|
+
# RunPod errors if it's not specified for podEditJob.
|
|
111
|
+
volume_in_gb: int = 20,
|
|
90
112
|
) -> str:
|
|
91
113
|
resp = self._make_request(
|
|
92
114
|
{
|
|
@@ -108,12 +130,12 @@ class RunpodApiClient:
|
|
|
108
130
|
return resp.json()["data"]["podEditJob"]["id"]
|
|
109
131
|
|
|
110
132
|
def get_pod(self, pod_id: str) -> Dict:
|
|
111
|
-
resp = self._make_request({"query":
|
|
133
|
+
resp = self._make_request({"query": _generate_pod_query(pod_id)})
|
|
112
134
|
data = resp.json()
|
|
113
135
|
return data["data"]["pod"]
|
|
114
136
|
|
|
115
137
|
def terminate_pod(self, pod_id: str) -> Dict:
|
|
116
|
-
resp = self._make_request({"query":
|
|
138
|
+
resp = self._make_request({"query": _generate_pod_terminate_mutation(pod_id)})
|
|
117
139
|
data = resp.json()
|
|
118
140
|
return data["data"]
|
|
119
141
|
|
|
@@ -213,7 +235,7 @@ class RunpodApiClient:
|
|
|
213
235
|
)
|
|
214
236
|
return response.json()["data"]["createNetworkVolume"]["id"]
|
|
215
237
|
|
|
216
|
-
def delete_network_volume(self, volume_id: str):
|
|
238
|
+
def delete_network_volume(self, volume_id: str) -> None:
|
|
217
239
|
self._make_request(
|
|
218
240
|
{
|
|
219
241
|
"query": f"""
|
|
@@ -228,7 +250,66 @@ class RunpodApiClient:
|
|
|
228
250
|
}
|
|
229
251
|
)
|
|
230
252
|
|
|
231
|
-
def
|
|
253
|
+
def create_cluster(
|
|
254
|
+
self,
|
|
255
|
+
cluster_name: str,
|
|
256
|
+
gpu_type_id: str,
|
|
257
|
+
pod_count: int,
|
|
258
|
+
gpu_count_per_pod: int,
|
|
259
|
+
image_name: str,
|
|
260
|
+
deploy_cost: str,
|
|
261
|
+
template_id: Optional[str] = None,
|
|
262
|
+
cluster_type: str = "TRAINING",
|
|
263
|
+
network_volume_id: Optional[str] = None,
|
|
264
|
+
volume_in_gb: Optional[int] = None,
|
|
265
|
+
throughput: Optional[int] = None,
|
|
266
|
+
allowed_cuda_versions: Optional[List[str]] = None,
|
|
267
|
+
volume_key: Optional[str] = None,
|
|
268
|
+
data_center_id: Optional[str] = None,
|
|
269
|
+
start_jupyter: bool = False,
|
|
270
|
+
start_ssh: bool = False,
|
|
271
|
+
container_disk_in_gb: Optional[int] = None,
|
|
272
|
+
docker_args: Optional[str] = None,
|
|
273
|
+
env: Optional[Dict[str, Any]] = None,
|
|
274
|
+
volume_mount_path: Optional[str] = None,
|
|
275
|
+
ports: Optional[str] = None,
|
|
276
|
+
) -> Dict:
|
|
277
|
+
resp = self._make_request(
|
|
278
|
+
{
|
|
279
|
+
"query": _generate_create_cluster_mutation(
|
|
280
|
+
cluster_name=cluster_name,
|
|
281
|
+
gpu_type_id=gpu_type_id,
|
|
282
|
+
pod_count=pod_count,
|
|
283
|
+
gpu_count_per_pod=gpu_count_per_pod,
|
|
284
|
+
image_name=image_name,
|
|
285
|
+
cluster_type=cluster_type,
|
|
286
|
+
deploy_cost=deploy_cost,
|
|
287
|
+
template_id=template_id,
|
|
288
|
+
network_volume_id=network_volume_id,
|
|
289
|
+
volume_in_gb=volume_in_gb,
|
|
290
|
+
throughput=throughput,
|
|
291
|
+
allowed_cuda_versions=allowed_cuda_versions,
|
|
292
|
+
volume_key=volume_key,
|
|
293
|
+
data_center_id=data_center_id,
|
|
294
|
+
start_jupyter=start_jupyter,
|
|
295
|
+
start_ssh=start_ssh,
|
|
296
|
+
container_disk_in_gb=container_disk_in_gb,
|
|
297
|
+
docker_args=docker_args,
|
|
298
|
+
env=env,
|
|
299
|
+
volume_mount_path=volume_mount_path,
|
|
300
|
+
ports=ports,
|
|
301
|
+
)
|
|
302
|
+
}
|
|
303
|
+
)
|
|
304
|
+
data = resp.json()["data"]
|
|
305
|
+
return data["createCluster"]
|
|
306
|
+
|
|
307
|
+
def delete_cluster(self, cluster_id: str) -> bool:
|
|
308
|
+
resp = self._make_request({"query": _generate_delete_cluster_mutation(cluster_id)})
|
|
309
|
+
data = resp.json()["data"]
|
|
310
|
+
return data["deleteCluster"]
|
|
311
|
+
|
|
312
|
+
def _make_request(self, data: Optional[Dict[str, Any]] = None) -> Response:
|
|
232
313
|
try:
|
|
233
314
|
response = requests.request(
|
|
234
315
|
method="POST",
|
|
@@ -237,10 +318,10 @@ class RunpodApiClient:
|
|
|
237
318
|
timeout=120,
|
|
238
319
|
)
|
|
239
320
|
response.raise_for_status()
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
raise
|
|
321
|
+
response_json = response.json()
|
|
322
|
+
# RunPod returns 200 on client errors
|
|
323
|
+
if "errors" in response_json:
|
|
324
|
+
raise RunpodApiClientError(errors=response_json["errors"])
|
|
244
325
|
return response
|
|
245
326
|
except requests.HTTPError as e:
|
|
246
327
|
if e.response is not None and e.response.status_code in (
|
|
@@ -250,7 +331,7 @@ class RunpodApiClient:
|
|
|
250
331
|
raise BackendInvalidCredentialsError(e.response.text)
|
|
251
332
|
raise
|
|
252
333
|
|
|
253
|
-
def wait_for_instance(self, instance_id) -> Optional[Dict]:
|
|
334
|
+
def wait_for_instance(self, instance_id: str) -> Optional[Dict]:
|
|
254
335
|
start = get_current_datetime()
|
|
255
336
|
wait_for_instance_interval = 5
|
|
256
337
|
# To change the status to "running," the image must be pulled and then started.
|
|
@@ -263,18 +344,7 @@ class RunpodApiClient:
|
|
|
263
344
|
return
|
|
264
345
|
|
|
265
346
|
|
|
266
|
-
|
|
267
|
-
query myself {
|
|
268
|
-
myself {
|
|
269
|
-
id
|
|
270
|
-
authId
|
|
271
|
-
email
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
"""
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def generate_pod_query(pod_id: str) -> str:
|
|
347
|
+
def _generate_pod_query(pod_id: str) -> str:
|
|
278
348
|
"""
|
|
279
349
|
Generate a query for a specific GPU type
|
|
280
350
|
"""
|
|
@@ -283,6 +353,7 @@ def generate_pod_query(pod_id: str) -> str:
|
|
|
283
353
|
query pod {{
|
|
284
354
|
pod(input: {{podId: "{pod_id}"}}) {{
|
|
285
355
|
id
|
|
356
|
+
clusterIp
|
|
286
357
|
containerDiskInGb
|
|
287
358
|
costPerHr
|
|
288
359
|
desiredStatus
|
|
@@ -319,26 +390,26 @@ def generate_pod_query(pod_id: str) -> str:
|
|
|
319
390
|
"""
|
|
320
391
|
|
|
321
392
|
|
|
322
|
-
def
|
|
393
|
+
def _generate_pod_deployment_mutation(
|
|
323
394
|
name: str,
|
|
324
395
|
image_name: str,
|
|
325
396
|
gpu_type_id: str,
|
|
326
397
|
cloud_type: str,
|
|
327
398
|
support_public_ip: bool = True,
|
|
328
399
|
start_ssh: bool = True,
|
|
329
|
-
data_center_id=None,
|
|
330
|
-
country_code=None,
|
|
331
|
-
gpu_count=None,
|
|
332
|
-
volume_in_gb=None,
|
|
333
|
-
container_disk_in_gb=None,
|
|
334
|
-
min_vcpu_count=None,
|
|
335
|
-
min_memory_in_gb=None,
|
|
336
|
-
docker_args=None,
|
|
337
|
-
ports=None,
|
|
338
|
-
volume_mount_path=None,
|
|
400
|
+
data_center_id: Optional[str] = None,
|
|
401
|
+
country_code: Optional[str] = None,
|
|
402
|
+
gpu_count: Optional[int] = None,
|
|
403
|
+
volume_in_gb: Optional[int] = None,
|
|
404
|
+
container_disk_in_gb: Optional[int] = None,
|
|
405
|
+
min_vcpu_count: Optional[int] = None,
|
|
406
|
+
min_memory_in_gb: Optional[int] = None,
|
|
407
|
+
docker_args: Optional[str] = None,
|
|
408
|
+
ports: Optional[str] = None,
|
|
409
|
+
volume_mount_path: Optional[str] = None,
|
|
339
410
|
env: Optional[Dict[str, Any]] = None,
|
|
340
|
-
template_id=None,
|
|
341
|
-
network_volume_id=None,
|
|
411
|
+
template_id: Optional[str] = None,
|
|
412
|
+
network_volume_id: Optional[str] = None,
|
|
342
413
|
allowed_cuda_versions: Optional[List[str]] = None,
|
|
343
414
|
bid_per_gpu: Optional[float] = None,
|
|
344
415
|
) -> str:
|
|
@@ -425,7 +496,7 @@ def generate_pod_deployment_mutation(
|
|
|
425
496
|
"""
|
|
426
497
|
|
|
427
498
|
|
|
428
|
-
def
|
|
499
|
+
def _generate_pod_terminate_mutation(pod_id: str) -> str:
|
|
429
500
|
"""
|
|
430
501
|
Generates a mutation to terminate a pod.
|
|
431
502
|
"""
|
|
@@ -434,3 +505,118 @@ def generate_pod_terminate_mutation(pod_id: str) -> str:
|
|
|
434
505
|
podTerminate(input: {{ podId: "{pod_id}" }})
|
|
435
506
|
}}
|
|
436
507
|
"""
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def _generate_delete_cluster_mutation(cluster_id: str) -> str:
|
|
511
|
+
"""
|
|
512
|
+
Generates a mutation to delete a cluster.
|
|
513
|
+
"""
|
|
514
|
+
return f"""
|
|
515
|
+
mutation {{
|
|
516
|
+
deleteCluster(
|
|
517
|
+
input: {{
|
|
518
|
+
id: "{cluster_id}"
|
|
519
|
+
}}
|
|
520
|
+
)
|
|
521
|
+
}}
|
|
522
|
+
"""
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def _generate_create_cluster_mutation(
|
|
526
|
+
cluster_name: str,
|
|
527
|
+
gpu_type_id: str,
|
|
528
|
+
pod_count: int,
|
|
529
|
+
gpu_count_per_pod: int,
|
|
530
|
+
image_name: str,
|
|
531
|
+
cluster_type: str,
|
|
532
|
+
deploy_cost: str,
|
|
533
|
+
template_id: Optional[str] = None,
|
|
534
|
+
network_volume_id: Optional[str] = None,
|
|
535
|
+
volume_in_gb: Optional[int] = None,
|
|
536
|
+
throughput: Optional[int] = None,
|
|
537
|
+
allowed_cuda_versions: Optional[List[str]] = None,
|
|
538
|
+
volume_key: Optional[str] = None,
|
|
539
|
+
data_center_id: Optional[str] = None,
|
|
540
|
+
start_jupyter: bool = False,
|
|
541
|
+
start_ssh: bool = False,
|
|
542
|
+
container_disk_in_gb: Optional[int] = None,
|
|
543
|
+
docker_args: Optional[str] = None,
|
|
544
|
+
env: Optional[Dict[str, Any]] = None,
|
|
545
|
+
volume_mount_path: Optional[str] = None,
|
|
546
|
+
ports: Optional[str] = None,
|
|
547
|
+
) -> str:
|
|
548
|
+
"""
|
|
549
|
+
Generates a mutation to create a cluster.
|
|
550
|
+
"""
|
|
551
|
+
input_fields = []
|
|
552
|
+
|
|
553
|
+
# ------------------------------ Required Fields ----------------------------- #
|
|
554
|
+
input_fields.append(f'clusterName: "{cluster_name}"')
|
|
555
|
+
input_fields.append(f'gpuTypeId: "{gpu_type_id}"')
|
|
556
|
+
input_fields.append(f"podCount: {pod_count}")
|
|
557
|
+
input_fields.append(f'imageName: "{image_name}"')
|
|
558
|
+
input_fields.append(f"type: {cluster_type}")
|
|
559
|
+
input_fields.append(f"gpuCountPerPod: {gpu_count_per_pod}")
|
|
560
|
+
# If deploy_cost is not specified, Runpod returns Insufficient resources error.
|
|
561
|
+
input_fields.append(f"deployCost: {deploy_cost}")
|
|
562
|
+
|
|
563
|
+
# ------------------------------ Optional Fields ----------------------------- #
|
|
564
|
+
if template_id is not None:
|
|
565
|
+
input_fields.append(f'templateId: "{template_id}"')
|
|
566
|
+
if network_volume_id is not None:
|
|
567
|
+
input_fields.append(f'networkVolumeId: "{network_volume_id}"')
|
|
568
|
+
if volume_in_gb is not None:
|
|
569
|
+
input_fields.append(f"volumeInGb: {volume_in_gb}")
|
|
570
|
+
if throughput is not None:
|
|
571
|
+
input_fields.append(f"throughput: {throughput}")
|
|
572
|
+
if allowed_cuda_versions is not None:
|
|
573
|
+
allowed_cuda_versions_string = ", ".join(
|
|
574
|
+
[f'"{version}"' for version in allowed_cuda_versions]
|
|
575
|
+
)
|
|
576
|
+
input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")
|
|
577
|
+
if volume_key is not None:
|
|
578
|
+
input_fields.append(f'volumeKey: "{volume_key}"')
|
|
579
|
+
if data_center_id is not None:
|
|
580
|
+
input_fields.append(f'dataCenterId: "{data_center_id}"')
|
|
581
|
+
if start_jupyter:
|
|
582
|
+
input_fields.append("startJupyter: true")
|
|
583
|
+
if start_ssh:
|
|
584
|
+
input_fields.append("startSsh: true")
|
|
585
|
+
if container_disk_in_gb is not None:
|
|
586
|
+
input_fields.append(f"containerDiskInGb: {container_disk_in_gb}")
|
|
587
|
+
if docker_args is not None:
|
|
588
|
+
input_fields.append(f'dockerArgs: "{docker_args}"')
|
|
589
|
+
if env is not None:
|
|
590
|
+
env_string = ", ".join(
|
|
591
|
+
[f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()]
|
|
592
|
+
)
|
|
593
|
+
input_fields.append(f"env: [{env_string}]")
|
|
594
|
+
if volume_mount_path is not None:
|
|
595
|
+
input_fields.append(f'volumeMountPath: "{volume_mount_path}"')
|
|
596
|
+
if ports is not None:
|
|
597
|
+
ports = ports.replace(" ", "")
|
|
598
|
+
input_fields.append(f'ports: "{ports}"')
|
|
599
|
+
|
|
600
|
+
# Format input fields
|
|
601
|
+
input_string = ", ".join(input_fields)
|
|
602
|
+
return f"""
|
|
603
|
+
mutation {{
|
|
604
|
+
createCluster(
|
|
605
|
+
input: {{
|
|
606
|
+
{input_string}
|
|
607
|
+
}}
|
|
608
|
+
) {{
|
|
609
|
+
id
|
|
610
|
+
name
|
|
611
|
+
pods {{
|
|
612
|
+
id
|
|
613
|
+
clusterIp
|
|
614
|
+
lastStatusChange
|
|
615
|
+
imageName
|
|
616
|
+
machine {{
|
|
617
|
+
podHostId
|
|
618
|
+
}}
|
|
619
|
+
}}
|
|
620
|
+
}}
|
|
621
|
+
}}
|
|
622
|
+
"""
|
|
@@ -2,31 +2,34 @@ import json
|
|
|
2
2
|
import uuid
|
|
3
3
|
from collections.abc import Iterable
|
|
4
4
|
from datetime import timedelta
|
|
5
|
-
from typing import List, Optional
|
|
5
|
+
from typing import Callable, List, Optional
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.backends.base.backend import Compute
|
|
8
8
|
from dstack._internal.core.backends.base.compute import (
|
|
9
9
|
ComputeWithAllOffersCached,
|
|
10
|
+
ComputeWithGroupProvisioningSupport,
|
|
11
|
+
ComputeWithMultinodeSupport,
|
|
10
12
|
ComputeWithVolumeSupport,
|
|
11
13
|
generate_unique_instance_name,
|
|
12
14
|
generate_unique_volume_name,
|
|
13
15
|
get_docker_commands,
|
|
14
16
|
get_job_instance_name,
|
|
15
17
|
)
|
|
18
|
+
from dstack._internal.core.backends.base.models import JobConfiguration
|
|
16
19
|
from dstack._internal.core.backends.base.offers import (
|
|
17
20
|
OfferModifier,
|
|
18
21
|
get_catalog_offers,
|
|
19
22
|
get_offers_disk_modifier,
|
|
20
23
|
)
|
|
21
|
-
from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
|
|
24
|
+
from dstack._internal.core.backends.runpod.api_client import RunpodApiClient, RunpodApiClientError
|
|
22
25
|
from dstack._internal.core.backends.runpod.models import RunpodConfig
|
|
23
26
|
from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
|
|
24
27
|
from dstack._internal.core.errors import (
|
|
25
|
-
BackendError,
|
|
26
28
|
ComputeError,
|
|
27
29
|
)
|
|
28
30
|
from dstack._internal.core.models.backends.base import BackendType
|
|
29
|
-
from dstack._internal.core.models.common import RegistryAuth
|
|
31
|
+
from dstack._internal.core.models.common import CoreModel, RegistryAuth
|
|
32
|
+
from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData
|
|
30
33
|
from dstack._internal.core.models.instances import (
|
|
31
34
|
InstanceAvailability,
|
|
32
35
|
InstanceConfiguration,
|
|
@@ -36,7 +39,7 @@ from dstack._internal.core.models.instances import (
|
|
|
36
39
|
from dstack._internal.core.models.resources import Memory, Range
|
|
37
40
|
from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
|
|
38
41
|
from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
|
|
39
|
-
from dstack._internal.utils.common import get_current_datetime
|
|
42
|
+
from dstack._internal.utils.common import get_current_datetime, get_or_error
|
|
40
43
|
from dstack._internal.utils.logging import get_logger
|
|
41
44
|
|
|
42
45
|
logger = get_logger(__name__)
|
|
@@ -50,9 +53,15 @@ CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour
|
|
|
50
53
|
CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
|
|
51
54
|
|
|
52
55
|
|
|
56
|
+
class RunpodOfferBackendData(CoreModel):
|
|
57
|
+
pod_counts: Optional[list[int]] = None
|
|
58
|
+
|
|
59
|
+
|
|
53
60
|
class RunpodCompute(
|
|
54
61
|
ComputeWithAllOffersCached,
|
|
55
62
|
ComputeWithVolumeSupport,
|
|
63
|
+
ComputeWithMultinodeSupport,
|
|
64
|
+
ComputeWithGroupProvisioningSupport,
|
|
56
65
|
Compute,
|
|
57
66
|
):
|
|
58
67
|
_last_cleanup_time = None
|
|
@@ -80,6 +89,18 @@ class RunpodCompute(
|
|
|
80
89
|
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
81
90
|
return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
|
|
82
91
|
|
|
92
|
+
def get_offers_post_filter(
|
|
93
|
+
self, requirements: Requirements
|
|
94
|
+
) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
|
|
95
|
+
def offers_post_filter(offer: InstanceOfferWithAvailability) -> bool:
|
|
96
|
+
pod_counts = _get_offer_pod_counts(offer)
|
|
97
|
+
is_cluster_offer = len(pod_counts) > 0 and any(pc != 1 for pc in pod_counts)
|
|
98
|
+
if requirements.multinode:
|
|
99
|
+
return is_cluster_offer
|
|
100
|
+
return not is_cluster_offer
|
|
101
|
+
|
|
102
|
+
return offers_post_filter
|
|
103
|
+
|
|
83
104
|
def run_job(
|
|
84
105
|
self,
|
|
85
106
|
run: Run,
|
|
@@ -151,6 +172,8 @@ class RunpodCompute(
|
|
|
151
172
|
|
|
152
173
|
instance_id = resp["id"]
|
|
153
174
|
|
|
175
|
+
# Call edit_pod to pass container_registry_auth_id.
|
|
176
|
+
# Expect a long time (~5m) for the pod to pick up the creds.
|
|
154
177
|
# TODO: remove editPod once createPod supports docker's username and password
|
|
155
178
|
# editPod is temporary solution to set container_registry_auth_id because createPod does not
|
|
156
179
|
# support it currently. This will be removed once createPod supports container_registry_auth_id
|
|
@@ -186,14 +209,127 @@ class RunpodCompute(
|
|
|
186
209
|
backend_data=None,
|
|
187
210
|
)
|
|
188
211
|
|
|
212
|
+
def run_jobs(
|
|
213
|
+
self,
|
|
214
|
+
run: Run,
|
|
215
|
+
job_configurations: List[JobConfiguration],
|
|
216
|
+
instance_offer: InstanceOfferWithAvailability,
|
|
217
|
+
project_ssh_public_key: str,
|
|
218
|
+
project_ssh_private_key: str,
|
|
219
|
+
) -> ComputeGroupProvisioningData:
|
|
220
|
+
master_job_configuration = job_configurations[0]
|
|
221
|
+
master_job = master_job_configuration.job
|
|
222
|
+
master_job_volumes = master_job_configuration.volumes
|
|
223
|
+
all_volumes_names = set(v.name for jc in job_configurations for v in jc.volumes)
|
|
224
|
+
instance_config = InstanceConfiguration(
|
|
225
|
+
project_name=run.project_name,
|
|
226
|
+
instance_name=get_job_instance_name(run, master_job),
|
|
227
|
+
ssh_keys=[
|
|
228
|
+
SSHKey(public=get_or_error(run.run_spec.ssh_key_pub).strip()),
|
|
229
|
+
SSHKey(public=project_ssh_public_key.strip()),
|
|
230
|
+
],
|
|
231
|
+
user=run.user,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
|
|
235
|
+
authorized_keys = instance_config.get_public_keys()
|
|
236
|
+
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
|
|
237
|
+
|
|
238
|
+
network_volume_id = None
|
|
239
|
+
volume_mount_path = None
|
|
240
|
+
if len(master_job_volumes) > 1:
|
|
241
|
+
raise ComputeError("Mounting more than one network volume is not supported in runpod")
|
|
242
|
+
if len(all_volumes_names) > 1:
|
|
243
|
+
raise ComputeError(
|
|
244
|
+
"Mounting different volumes to different jobs is not supported in runpod"
|
|
245
|
+
)
|
|
246
|
+
if len(master_job_volumes) == 1:
|
|
247
|
+
network_volume_id = master_job_volumes[0].volume_id
|
|
248
|
+
volume_mount_path = run.run_spec.configuration.volumes[0].path
|
|
249
|
+
|
|
250
|
+
offer_pod_counts = _get_offer_pod_counts(instance_offer)
|
|
251
|
+
pod_count = len(job_configurations)
|
|
252
|
+
gpu_count = len(instance_offer.instance.resources.gpus)
|
|
253
|
+
data_center_id = instance_offer.region
|
|
254
|
+
|
|
255
|
+
if pod_count not in offer_pod_counts:
|
|
256
|
+
raise ComputeError(
|
|
257
|
+
f"Failed to provision {pod_count} pods. Available pod counts: {offer_pod_counts}"
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
container_registry_auth_id = self._generate_container_registry_auth_id(
|
|
261
|
+
master_job.job_spec.registry_auth
|
|
262
|
+
)
|
|
263
|
+
resp = self.api_client.create_cluster(
|
|
264
|
+
cluster_name=pod_name,
|
|
265
|
+
gpu_type_id=instance_offer.instance.name,
|
|
266
|
+
pod_count=pod_count,
|
|
267
|
+
gpu_count_per_pod=gpu_count,
|
|
268
|
+
deploy_cost=f"{instance_offer.price * pod_count:.2f}",
|
|
269
|
+
image_name=master_job.job_spec.image_name,
|
|
270
|
+
cluster_type="TRAINING",
|
|
271
|
+
data_center_id=data_center_id,
|
|
272
|
+
container_disk_in_gb=disk_size,
|
|
273
|
+
docker_args=_get_docker_args(authorized_keys),
|
|
274
|
+
ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp",
|
|
275
|
+
network_volume_id=network_volume_id,
|
|
276
|
+
volume_mount_path=volume_mount_path,
|
|
277
|
+
env={"RUNPOD_POD_USER": "0"},
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
# An "edit pod" trick to pass container registry creds.
|
|
281
|
+
if container_registry_auth_id is not None:
|
|
282
|
+
for pod in resp["pods"]:
|
|
283
|
+
self.api_client.edit_pod(
|
|
284
|
+
pod_id=pod["id"],
|
|
285
|
+
image_name=master_job.job_spec.image_name,
|
|
286
|
+
container_disk_in_gb=disk_size,
|
|
287
|
+
container_registry_auth_id=container_registry_auth_id,
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
jpds = [
|
|
291
|
+
JobProvisioningData(
|
|
292
|
+
backend=instance_offer.backend,
|
|
293
|
+
instance_type=instance_offer.instance,
|
|
294
|
+
instance_id=pod["id"],
|
|
295
|
+
hostname=None,
|
|
296
|
+
internal_ip=pod["clusterIp"],
|
|
297
|
+
region=instance_offer.region,
|
|
298
|
+
price=instance_offer.price,
|
|
299
|
+
username="root",
|
|
300
|
+
dockerized=False,
|
|
301
|
+
)
|
|
302
|
+
for pod in resp["pods"]
|
|
303
|
+
]
|
|
304
|
+
return ComputeGroupProvisioningData(
|
|
305
|
+
compute_group_id=resp["id"],
|
|
306
|
+
compute_group_name=resp["name"],
|
|
307
|
+
backend=BackendType.RUNPOD,
|
|
308
|
+
region=instance_offer.region,
|
|
309
|
+
job_provisioning_datas=jpds,
|
|
310
|
+
)
|
|
311
|
+
|
|
189
312
|
def terminate_instance(
|
|
190
313
|
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
191
|
-
)
|
|
314
|
+
):
|
|
192
315
|
try:
|
|
193
316
|
self.api_client.terminate_pod(instance_id)
|
|
194
|
-
except
|
|
195
|
-
if e.
|
|
196
|
-
logger.debug("The instance
|
|
317
|
+
except RunpodApiClientError as e:
|
|
318
|
+
if len(e.errors) > 0 and e.errors[0]["message"] == "pod not found to terminate":
|
|
319
|
+
logger.debug("The instance %s not found. Skipping deletion.", instance_id)
|
|
320
|
+
return
|
|
321
|
+
raise
|
|
322
|
+
|
|
323
|
+
def terminate_compute_group(self, compute_group: ComputeGroup):
|
|
324
|
+
provisioning_data = compute_group.provisioning_data
|
|
325
|
+
try:
|
|
326
|
+
self.api_client.delete_cluster(provisioning_data.compute_group_id)
|
|
327
|
+
except RunpodApiClientError as e:
|
|
328
|
+
if len(e.errors) > 0 and e.errors[0]["extensions"]["code"] == "Cluster not found":
|
|
329
|
+
logger.debug(
|
|
330
|
+
"The cluster %s not found. Skipping deletion.",
|
|
331
|
+
provisioning_data.compute_group_id,
|
|
332
|
+
)
|
|
197
333
|
return
|
|
198
334
|
raise
|
|
199
335
|
|
|
@@ -216,7 +352,9 @@ class RunpodCompute(
|
|
|
216
352
|
provisioning_data.ssh_port = port["publicPort"]
|
|
217
353
|
|
|
218
354
|
def register_volume(self, volume: Volume) -> VolumeProvisioningData:
|
|
219
|
-
volume_data = self.api_client.get_network_volume(
|
|
355
|
+
volume_data = self.api_client.get_network_volume(
|
|
356
|
+
volume_id=get_or_error(volume.configuration.volume_id)
|
|
357
|
+
)
|
|
220
358
|
if volume_data is None:
|
|
221
359
|
raise ComputeError(f"Volume {volume.configuration.volume_id} not found")
|
|
222
360
|
size_gb = volume_data["size"]
|
|
@@ -258,14 +396,12 @@ class RunpodCompute(
|
|
|
258
396
|
) -> Optional[str]:
|
|
259
397
|
if registry_auth is None:
|
|
260
398
|
return None
|
|
261
|
-
|
|
262
399
|
return self.api_client.add_container_registry_auth(
|
|
263
400
|
uuid.uuid4().hex, registry_auth.username, registry_auth.password
|
|
264
401
|
)
|
|
265
402
|
|
|
266
403
|
def _clean_stale_container_registry_auths(self) -> None:
|
|
267
404
|
container_registry_auths = self.api_client.get_container_registry_auths()
|
|
268
|
-
|
|
269
405
|
# Container_registry_auths sorted by creation time so try to delete the oldest first
|
|
270
406
|
# when we reach container_registry_auths that is still in use, we stop
|
|
271
407
|
for container_registry_auth in container_registry_auths:
|
|
@@ -289,9 +425,17 @@ def _get_volume_price(size: int) -> float:
|
|
|
289
425
|
return 0.05 * size
|
|
290
426
|
|
|
291
427
|
|
|
292
|
-
def _is_secure_cloud(region: str) ->
|
|
428
|
+
def _is_secure_cloud(region: str) -> bool:
|
|
293
429
|
"""
|
|
294
430
|
Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
|
|
295
431
|
Community cloud regions are country codes: CA, NL, etc.
|
|
296
432
|
"""
|
|
297
433
|
return "-" in region
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _get_offer_pod_counts(offer: InstanceOfferWithAvailability) -> list[int]:
|
|
437
|
+
backend_data: RunpodOfferBackendData = RunpodOfferBackendData.__response__.parse_obj(
|
|
438
|
+
offer.backend_data
|
|
439
|
+
)
|
|
440
|
+
pod_counts = backend_data.pod_counts or []
|
|
441
|
+
return pod_counts
|