dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.

Files changed (41)
  1. dstack/_internal/cli/services/configurators/run.py +1 -1
  2. dstack/_internal/core/backends/base/compute.py +20 -1
  3. dstack/_internal/core/backends/base/models.py +10 -0
  4. dstack/_internal/core/backends/base/offers.py +1 -0
  5. dstack/_internal/core/backends/features.py +5 -0
  6. dstack/_internal/core/backends/nebius/compute.py +28 -16
  7. dstack/_internal/core/backends/nebius/configurator.py +1 -1
  8. dstack/_internal/core/backends/nebius/models.py +4 -0
  9. dstack/_internal/core/backends/nebius/resources.py +41 -20
  10. dstack/_internal/core/backends/runpod/api_client.py +245 -59
  11. dstack/_internal/core/backends/runpod/compute.py +157 -13
  12. dstack/_internal/core/models/compute_groups.py +39 -0
  13. dstack/_internal/core/models/fleets.py +6 -1
  14. dstack/_internal/core/models/profiles.py +3 -1
  15. dstack/_internal/core/models/runs.py +3 -0
  16. dstack/_internal/server/app.py +14 -2
  17. dstack/_internal/server/background/__init__.py +7 -0
  18. dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
  19. dstack/_internal/server/background/tasks/process_instances.py +81 -49
  20. dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
  21. dstack/_internal/server/migrations/env.py +20 -2
  22. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
  23. dstack/_internal/server/models.py +39 -0
  24. dstack/_internal/server/routers/runs.py +15 -6
  25. dstack/_internal/server/services/compute_groups.py +22 -0
  26. dstack/_internal/server/services/fleets.py +1 -0
  27. dstack/_internal/server/services/jobs/__init__.py +13 -0
  28. dstack/_internal/server/services/jobs/configurators/base.py +3 -2
  29. dstack/_internal/server/services/requirements/combine.py +1 -0
  30. dstack/_internal/server/services/runs.py +17 -3
  31. dstack/_internal/server/testing/common.py +51 -0
  32. dstack/_internal/server/utils/routers.py +18 -20
  33. dstack/_internal/settings.py +4 -1
  34. dstack/_internal/utils/version.py +22 -0
  35. dstack/version.py +1 -1
  36. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
  37. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
  38. dstack/_internal/core/backends/nebius/fabrics.py +0 -49
  39. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
  40. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
  41. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/services/configurators/run.py

@@ -941,7 +941,7 @@ def _warn_fleet_autocreated(api: APIClient, run: Run):
     if not fleet.spec.autocreated:
         return
     warn(
-        f"\nNo existing fleet matched, so the run created a new fleet [code]{fleet.name}[/code].\n"
+        f"\nThe run is using automatically created fleet [code]{fleet.name}[/code].\n"
         "Future dstack versions won't create fleets automatically.\n"
         "Create a fleet explicitly: https://dstack.ai/docs/concepts/fleets/"
     )
dstack/_internal/core/backends/base/compute.py

@@ -17,6 +17,7 @@ from cachetools import TTLCache, cachedmethod
 from gpuhunt import CPUArchitecture

 from dstack._internal import settings
+from dstack._internal.core.backends.base.models import JobConfiguration
 from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements
 from dstack._internal.core.consts import (
     DSTACK_RUNNER_HTTP_PORT,
@@ -24,6 +25,7 @@ from dstack._internal.core.consts import (
     DSTACK_SHIM_HTTP_PORT,
 )
 from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData
 from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
 from dstack._internal.core.models.gateways import (
     GatewayComputeConfiguration,
@@ -324,6 +326,23 @@ class ComputeWithCreateInstanceSupport(ABC):
     ]


+class ComputeWithGroupProvisioningSupport(ABC):
+    @abstractmethod
+    def run_jobs(
+        self,
+        run: Run,
+        job_configurations: List[JobConfiguration],
+        instance_offer: InstanceOfferWithAvailability,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ) -> ComputeGroupProvisioningData:
+        pass
+
+    @abstractmethod
+    def terminate_compute_group(self, compute_group: ComputeGroup):
+        pass
+
+
 class ComputeWithPrivilegedSupport:
     """
     Must be subclassed to support runs with `privileged: true`.
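The new ComputeWithGroupProvisioningSupport ABC above is the hook a backend implements to provision all jobs of a run as one group and tear the group down as one unit; the RunPod changes elsewhere in this release appear to be its first user. A minimal sketch of a conforming backend, using stand-in types for the dstack models; the ExampleClusterCompute class and its internals are hypothetical:

from abc import ABC, abstractmethod
from typing import Any, List


class ComputeGroupProvisioningData:  # stand-in for the dstack model
    pass


class ComputeWithGroupProvisioningSupport(ABC):
    @abstractmethod
    def run_jobs(
        self,
        run: Any,
        job_configurations: List[Any],
        instance_offer: Any,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ) -> ComputeGroupProvisioningData: ...

    @abstractmethod
    def terminate_compute_group(self, compute_group: Any) -> None: ...


class ExampleClusterCompute(ComputeWithGroupProvisioningSupport):
    """Hypothetical backend that provisions a whole run in one provider call."""

    def run_jobs(self, run, job_configurations, instance_offer,
                 project_ssh_public_key, project_ssh_private_key):
        # e.g. request a cluster sized to len(job_configurations) and
        # return provisioning data describing the whole group at once.
        return ComputeGroupProvisioningData()

    def terminate_compute_group(self, compute_group):
        # Tear down every instance created by run_jobs in a single call.
        pass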
@@ -680,7 +699,7 @@ def get_shim_env(
     backend_shim_env: Optional[Dict[str, str]] = None,
     arch: Optional[str] = None,
 ) -> Dict[str, str]:
-    log_level = "6"  # Trace
+    log_level = "5"  # Debug
     envs = {
         "DSTACK_SHIM_HOME": get_dstack_working_dir(base_path),
         "DSTACK_SHIM_HTTP_PORT": str(DSTACK_SHIM_HTTP_PORT),
dstack/_internal/core/backends/base/models.py

@@ -1,4 +1,14 @@
 from pathlib import Path
+from typing import List
+
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.runs import Job
+from dstack._internal.core.models.volumes import Volume
+
+
+class JobConfiguration(CoreModel):
+    job: Job
+    volumes: List[Volume]


 def fill_data(values: dict, filename_field: str = "filename", data_field: str = "data") -> dict:
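JobConfiguration pairs each job with the volumes it mounts, so the new run_jobs hook can receive a whole run as one list. A hedged illustration with simplified stand-in models, assuming CoreModel behaves like a pydantic BaseModel (as it does elsewhere in dstack):

from typing import List

from pydantic import BaseModel  # stand-in for dstack's CoreModel


class Job(BaseModel):  # simplified stand-in
    name: str


class Volume(BaseModel):  # simplified stand-in
    name: str


class JobConfiguration(BaseModel):
    job: Job
    volumes: List[Volume]


# One entry per job of the run, each carrying its attached volumes.
job_configurations = [
    JobConfiguration(job=Job(name="train-0"), volumes=[Volume(name="data")]),
    JobConfiguration(job=Job(name="train-1"), volumes=[]),
]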
dstack/_internal/core/backends/base/offers.py

@@ -25,6 +25,7 @@ SUPPORTED_GPUHUNT_FLAGS = [
     "gcp-a4",
     "gcp-g4",
     "gcp-dws-calendar-mode",
+    "runpod-cluster",
 ]

dstack/_internal/core/backends/features.py

@@ -1,6 +1,7 @@
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
+    ComputeWithGroupProvisioningSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
@@ -39,6 +40,10 @@ BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
     configurator_classes=_configurator_classes,
     compute_feature_class=ComputeWithCreateInstanceSupport,
 )
+BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithGroupProvisioningSupport,
+)
 BACKENDS_WITH_PRIVILEGED_SUPPORT = _get_backends_with_compute_feature(
     configurator_classes=_configurator_classes,
     compute_feature_class=ComputeWithPrivilegedSupport,
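Judging by the call sites, _get_backends_with_compute_feature presumably keeps the backends whose compute class subclasses the given feature mixin. A rough sketch of that pattern; everything except the feature-class name is an assumption:

from abc import ABC
from typing import List, Sequence, Type


class ComputeWithGroupProvisioningSupport(ABC): ...


class RunpodCompute(ComputeWithGroupProvisioningSupport): ...  # hypothetical
class OtherCompute: ...  # hypothetical


def backends_with_feature(
    compute_classes: Sequence[Type], feature_class: Type
) -> List[str]:
    # Keep only compute implementations that mix in the feature class.
    return [cls.__name__ for cls in compute_classes if issubclass(cls, feature_class)]


print(backends_with_feature([RunpodCompute, OtherCompute],
                            ComputeWithGroupProvisioningSupport))
# ['RunpodCompute']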
dstack/_internal/core/backends/nebius/compute.py

@@ -28,8 +28,11 @@ from dstack._internal.core.backends.base.offers import (
     get_offers_disk_modifier,
 )
 from dstack._internal.core.backends.nebius import resources
-from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
-from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
+from dstack._internal.core.backends.nebius.models import (
+    NebiusConfig,
+    NebiusOfferBackendData,
+    NebiusServiceAccountCreds,
+)
 from dstack._internal.core.errors import (
     BackendError,
     NotYetTerminated,
@@ -281,12 +284,16 @@ class NebiusCompute(
         master_instance_offer: InstanceOffer,
     ) -> PlacementGroupProvisioningData:
         assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
-        backend_data = NebiusPlacementGroupBackendData(cluster=None)
+        master_instance_offer_backend_data: NebiusOfferBackendData = (
+            NebiusOfferBackendData.__response__.parse_obj(master_instance_offer.backend_data)
+        )
+        fabrics = list(master_instance_offer_backend_data.fabrics)
+        if self.config.fabrics is not None:
+            fabrics = [f for f in fabrics if f in self.config.fabrics]
+        placement_group_backend_data = NebiusPlacementGroupBackendData(cluster=None)
         # Only create a Nebius cluster if the instance supports it.
         # For other instances, return dummy PlacementGroupProvisioningData.
-        if fabrics := get_suitable_infiniband_fabrics(
-            master_instance_offer, allowed_fabrics=self.config.fabrics
-        ):
+        if fabrics:
             fabric = random.choice(fabrics)
             op = resources.create_cluster(
                 self._sdk,
@@ -294,10 +301,13 @@ class NebiusCompute(
                 project_id=self._region_to_project_id[placement_group.configuration.region],
                 fabric=fabric,
             )
-            backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+            placement_group_backend_data.cluster = NebiusClusterBackendData(
+                id=op.resource_id,
+                fabric=fabric,
+            )
         return PlacementGroupProvisioningData(
             backend=BackendType.NEBIUS,
-            backend_data=backend_data.json(),
+            backend_data=placement_group_backend_data.json(),
         )

     def delete_placement_group(self, placement_group: PlacementGroup) -> None:
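Fabric selection now starts from the fabrics advertised in the offer's backend_data, intersects them with the user-configured allow-list, and picks one at random; the deleted fabrics.py helper is gone. The same logic isolated as a pure function (a sketch; the function name is ours):

import random
from typing import List, Optional, Set


def pick_fabric(offer_fabrics: Set[str], allowed: Optional[List[str]]) -> Optional[str]:
    fabrics = list(offer_fabrics)
    # An allow-list of None in the backend config means "no restriction".
    if allowed is not None:
        fabrics = [f for f in fabrics if f in allowed]
    # No suitable fabric: no Nebius GPU cluster gets created.
    if not fabrics:
        return None
    return random.choice(fabrics)


assert pick_fabric({"fabric-a", "fabric-b"}, ["fabric-b"]) == "fabric-b"
assert pick_fabric({"fabric-a"}, ["fabric-x"]) is None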
@@ -317,16 +327,15 @@
         if placement_group.configuration.region != instance_offer.region:
             return False
         assert placement_group.provisioning_data is not None
-        backend_data = NebiusPlacementGroupBackendData.load(
+        placement_group_backend_data = NebiusPlacementGroupBackendData.load(
             placement_group.provisioning_data.backend_data
         )
+        instance_offer_backend_data: NebiusOfferBackendData = (
+            NebiusOfferBackendData.__response__.parse_obj(instance_offer.backend_data)
+        )
         return (
-            backend_data.cluster is None
-            or backend_data.cluster.fabric
-            in get_suitable_infiniband_fabrics(
-                instance_offer,
-                allowed_fabrics=None,  # enforced at cluster creation time, no need to enforce here
-            )
+            placement_group_backend_data.cluster is None
+            or placement_group_backend_data.cluster.fabric in instance_offer_backend_data.fabrics
         )

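The suitability check follows the same data path: an offer fits an existing placement group if the group created no cluster, or if the cluster's fabric is among the offer's advertised fabrics. As a standalone predicate (sketch):

from typing import Optional, Set


def offer_fits_placement_group(cluster_fabric: Optional[str], offer_fabrics: Set[str]) -> bool:
    # No cluster was created for the group: any offer in the region fits.
    if cluster_fabric is None:
        return True
    # Otherwise the offer must support the cluster's InfiniBand fabric.
    return cluster_fabric in offer_fabrics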
@@ -380,7 +389,10 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
         )
         time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
         resources.LOOP.await_(
-            op.update(per_retry_timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
+            op.update(
+                per_retry_timeout=resources.REQUEST_TIMEOUT,
+                auth_options=resources.REQUEST_AUTH_OPTIONS,
+            )
         )

dstack/_internal/core/backends/nebius/configurator.py

@@ -10,7 +10,6 @@ from dstack._internal.core.backends.base.configurator import (
 )
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.backend import NebiusBackend
-from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import (
     NebiusBackendConfig,
     NebiusBackendConfigWithCreds,
@@ -19,6 +18,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.backends.nebius.resources import get_all_infiniband_fabrics
 from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType

dstack/_internal/core/backends/nebius/models.py

@@ -179,3 +179,7 @@ class NebiusConfig(NebiusStoredConfig):
     """

     creds: AnyNebiusCreds
+
+
+class NebiusOfferBackendData(CoreModel):
+    fabrics: set[str] = set()
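NebiusOfferBackendData tolerates offers that predate the new catalog field by defaulting fabrics to an empty set. A quick parse example, assuming CoreModel is pydantic-based (the __response__ variant used in the diff is dstack-specific and omitted here):

from pydantic import BaseModel  # stand-in for dstack's CoreModel


class NebiusOfferBackendData(BaseModel):
    fabrics: set[str] = set()


# Offers without fabric data parse to an empty set...
assert NebiusOfferBackendData.parse_obj({}).fabrics == set()
# ...while cluster-capable offers list their InfiniBand fabrics.
data = NebiusOfferBackendData.parse_obj({"fabrics": ["fabric-2", "fabric-4"]})
assert data.fabrics == {"fabric-2", "fabric-4"}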
dstack/_internal/core/backends/nebius/resources.py

@@ -8,7 +8,6 @@ from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
 from typing import Dict, Optional

-from nebius.aio.authorization.options import options_to_metadata
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
 from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
@@ -50,11 +49,14 @@ from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient
 from nebius.sdk import SDK

 from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
+from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.models import (
     DEFAULT_PROJECT_NAME_PREFIX,
+    NebiusOfferBackendData,
     NebiusServiceAccountCreds,
 )
 from dstack._internal.core.errors import BackendError, NoCapacityError
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.utils.event_loop import DaemonEventLoop
 from dstack._internal.utils.logging import get_logger

@@ -66,13 +68,11 @@ from dstack._internal.utils.logging import get_logger
 LOOP = DaemonEventLoop()
 # Pass a timeout to all methods to avoid infinite waiting
 REQUEST_TIMEOUT = 10
-# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
-REQUEST_MD = options_to_metadata(
-    {
-        OPTION_RENEW_SYNCHRONOUS: "true",
-        OPTION_RENEW_REQUEST_TIMEOUT: "5",
-    }
-)
+# Pass REQUEST_AUTH_OPTIONS to all methods to avoid infinite retries in case of invalid credentials
+REQUEST_AUTH_OPTIONS = {
+    OPTION_RENEW_SYNCHRONOUS: "true",
+    OPTION_RENEW_REQUEST_TIMEOUT: "5",
+}

 # disables log messages about errors such as invalid creds or expired timeouts
 logging.getLogger("nebius").setLevel(logging.CRITICAL)
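The mechanical change running through the rest of this file: token-renewal options are no longer pre-baked into gRPC metadata via options_to_metadata but passed as a plain dict through the SDK's auth_options parameter. Since every call now threads the same two kwargs, the pattern could be captured by a helper like this (hypothetical, not in the diff; the option-key values are stand-ins for the nebius SDK constants):

# Stand-ins for the nebius.aio.token.renewable constants.
OPTION_RENEW_SYNCHRONOUS = "renew_synchronous"
OPTION_RENEW_REQUEST_TIMEOUT = "renew_request_timeout"

REQUEST_TIMEOUT = 10
REQUEST_AUTH_OPTIONS = {
    OPTION_RENEW_SYNCHRONOUS: "true",
    OPTION_RENEW_REQUEST_TIMEOUT: "5",
}


def sdk_call_kwargs() -> dict:
    # The two keyword arguments this release threads through every SDK call.
    return {"per_retry_timeout": REQUEST_TIMEOUT, "auth_options": REQUEST_AUTH_OPTIONS}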
@@ -120,7 +120,9 @@ def wait_for_operation(
         if time.monotonic() + interval > deadline:
             raise TimeoutError(f"Operation {op.id} wait timeout")
         time.sleep(interval)
-        LOOP.await_(op.update(per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
+        LOOP.await_(
+            op.update(per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS)
+        )

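wait_for_operation polls at a fixed interval against a monotonic deadline, bailing out before a sleep that would overshoot it. The pattern in isolation (a sketch with generic callables):

import time
from typing import Callable


def wait_for(
    is_done: Callable[[], bool],
    refresh: Callable[[], None],
    interval: float,
    timeout: float,
) -> None:
    # Deadline from a monotonic clock, immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while not is_done():
        # Give up if the next sleep would cross the deadline.
        if time.monotonic() + interval > deadline:
            raise TimeoutError("operation wait timeout")
        time.sleep(interval)
        refresh()  # corresponds to op.update(...) in the diff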
@@ -156,7 +158,9 @@ def validate_regions(configured: set[str], available: set[str]) -> None:
 def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
     tenants = LOOP.await_(
         TenantServiceClient(sdk).list(
-            ListTenantsRequest(), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+            ListTenantsRequest(),
+            per_retry_timeout=REQUEST_TIMEOUT,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )
     if len(tenants.items) != 1:
@@ -166,7 +170,7 @@ def list_tenant_projects(sdk: SDK) -> Sequence[Container]:
         ProjectServiceClient(sdk).list(
             ListProjectsRequest(parent_id=tenant_id, page_size=999),
             per_retry_timeout=REQUEST_TIMEOUT,
-            metadata=REQUEST_MD,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )
     return projects.items
@@ -240,7 +244,7 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
         SubnetServiceClient(sdk).list(
             ListSubnetsRequest(parent_id=project_id, page_size=999),
             per_retry_timeout=REQUEST_TIMEOUT,
-            metadata=REQUEST_MD,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )
     for subnet in subnets.items:
@@ -249,6 +253,17 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
     raise BackendError(f"Could not find default subnet in project {project_id}")


+def get_all_infiniband_fabrics() -> set[str]:
+    offers = get_catalog_offers(backend=BackendType.NEBIUS)
+    result = set()
+    for offer in offers:
+        backend_data: NebiusOfferBackendData = NebiusOfferBackendData.__response__.parse_obj(
+            offer.backend_data
+        )
+        result |= backend_data.fabrics
+    return result
+
+
 def create_disk(
     sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
 ) -> SDKOperation[Operation]:
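get_all_infiniband_fabrics moves from the deleted fabrics.py into resources.py and now derives the fabric universe from catalog offers instead of a hard-coded list. The configurator presumably uses it to validate user-configured fabrics, along these lines (a sketch; the function and wording are ours):

def validate_fabrics(configured: list[str], known: set[str]) -> None:
    # Reject any configured fabric that no catalog offer advertises.
    unknown = [f for f in configured if f not in known]
    if unknown:
        raise ValueError(
            f"Unknown InfiniBand fabrics {unknown}; known fabrics: {sorted(known)}"
        )


validate_fabrics(["fabric-2"], known={"fabric-2", "fabric-4"})  # passes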
@@ -267,14 +282,18 @@ def create_disk(
     )
     with wrap_capacity_errors():
        return LOOP.await_(
-            client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
+            client.create(
+                request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
+            )
         )


 def delete_disk(sdk: SDK, disk_id: str) -> None:
     LOOP.await_(
         DiskServiceClient(sdk).delete(
-            DeleteDiskRequest(id=disk_id), per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+            DeleteDiskRequest(id=disk_id),
+            per_retry_timeout=REQUEST_TIMEOUT,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )

@@ -325,7 +344,9 @@ def create_instance(
     )
     with wrap_capacity_errors():
         return LOOP.await_(
-            client.create(request, per_retry_timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
+            client.create(
+                request, per_retry_timeout=REQUEST_TIMEOUT, auth_options=REQUEST_AUTH_OPTIONS
+            )
         )

@@ -334,7 +355,7 @@ def get_instance(sdk: SDK, instance_id: str) -> Instance:
         InstanceServiceClient(sdk).get(
             GetInstanceRequest(id=instance_id),
             per_retry_timeout=REQUEST_TIMEOUT,
-            metadata=REQUEST_MD,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )

@@ -344,7 +365,7 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
         InstanceServiceClient(sdk).delete(
             DeleteInstanceRequest(id=instance_id),
             per_retry_timeout=REQUEST_TIMEOUT,
-            metadata=REQUEST_MD,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )

@@ -358,17 +379,17 @@ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
             spec=GpuClusterSpec(infiniband_fabric=fabric),
         ),
         per_retry_timeout=REQUEST_TIMEOUT,
-        metadata=REQUEST_MD,
+        auth_options=REQUEST_AUTH_OPTIONS,
     )
 )


 def delete_cluster(sdk: SDK, cluster_id: str) -> None:
-    return LOOP.await_(
+    LOOP.await_(
         GpuClusterServiceClient(sdk).delete(
             DeleteGpuClusterRequest(id=cluster_id),
             per_retry_timeout=REQUEST_TIMEOUT,
-            metadata=REQUEST_MD,
+            auth_options=REQUEST_AUTH_OPTIONS,
         )
     )