dstack 0.19.7__py3-none-any.whl → 0.19.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dstack might be problematic.

Files changed (52)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +38 -2
  3. dstack/_internal/cli/utils/run.py +3 -3
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +2 -0
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +47 -0
  19. dstack/_internal/core/backends/nebius/models.py +8 -0
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +78 -3
  28. dstack/_internal/core/models/runs.py +7 -2
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  33. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  34. dstack/_internal/server/models.py +1 -0
  35. dstack/_internal/server/services/fleets.py +9 -26
  36. dstack/_internal/server/services/instances.py +0 -2
  37. dstack/_internal/server/services/offers.py +15 -0
  38. dstack/_internal/server/services/placement.py +27 -6
  39. dstack/_internal/server/services/resources.py +21 -0
  40. dstack/_internal/server/services/runs.py +16 -6
  41. dstack/_internal/server/testing/common.py +35 -26
  42. dstack/_internal/utils/common.py +13 -1
  43. dstack/_internal/utils/json_schema.py +6 -3
  44. dstack/api/__init__.py +1 -0
  45. dstack/api/server/_fleets.py +16 -0
  46. dstack/api/server/_runs.py +44 -3
  47. dstack/version.py +1 -1
  48. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/METADATA +3 -1
  49. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/RECORD +52 -50
  50. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
  51. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
  52. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/fleets.py CHANGED
@@ -1,5 +1,3 @@
- import random
- import string
  import uuid
  from datetime import datetime, timezone
  from typing import List, Literal, Optional, Tuple, Union, cast
@@ -33,6 +31,7 @@ from dstack._internal.core.models.instances import (
      SSHConnectionParams,
      SSHKey,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.profiles import (
      Profile,
      SpotPolicy,
@@ -62,6 +61,7 @@ from dstack._internal.server.services.projects import (
      list_project_models,
      list_user_project_models,
  )
+ from dstack._internal.server.services.resources import set_resources_defaults
  from dstack._internal.utils import random_names
  from dstack._internal.utils.logging import get_logger
  from dstack._internal.utils.ssh import pkey_from_str
@@ -243,6 +243,7 @@ async def get_plan(
          spec=effective_spec,
      )
      effective_spec = FleetSpec.parse_obj(effective_spec.dict())
+     _validate_fleet_spec_and_set_defaults(spec)
      current_fleet: Optional[Fleet] = None
      current_fleet_id: Optional[uuid.UUID] = None
      if effective_spec.configuration.name is not None:
@@ -282,6 +283,7 @@ async def get_create_instance_offers(
      project: ProjectModel,
      profile: Profile,
      requirements: Requirements,
+     placement_group: Optional[PlacementGroup] = None,
      fleet_spec: Optional[FleetSpec] = None,
      fleet_model: Optional[FleetModel] = None,
      blocks: Union[int, Literal["auto"]] = 1,
@@ -307,6 +309,7 @@ async def get_create_instance_offers(
          exclude_not_available=exclude_not_available,
          multinode=multinode,
          master_job_provisioning_data=master_job_provisioning_data,
+         placement_group=placement_group,
          blocks=blocks,
      )
      offers = [
@@ -345,7 +348,7 @@ async def create_fleet(
          spec=spec,
      )
      spec = FleetSpec.parse_obj(spec.dict())
-     _validate_fleet_spec(spec)
+     _validate_fleet_spec_and_set_defaults(spec)

      if spec.configuration.ssh_config is not None:
          _check_can_manage_ssh_fleets(user=user, project=project)
@@ -393,17 +396,12 @@ async def create_fleet(
              )
              fleet_model.instances.append(instances_model)
      else:
-         placement_group_name = _get_placement_group_name(
-             project=project,
-             fleet_spec=spec,
-         )
          for i in range(_get_fleet_nodes_to_provision(spec)):
              instance_model = await create_fleet_instance_model(
                  session=session,
                  project=project,
                  user=user,
                  spec=spec,
-                 placement_group_name=placement_group_name,
                  reservation=spec.configuration.reservation,
                  instance_num=i,
              )
@@ -417,7 +415,6 @@ async def create_fleet_instance_model(
      project: ProjectModel,
      user: UserModel,
      spec: FleetSpec,
-     placement_group_name: Optional[str],
      reservation: Optional[str],
      instance_num: int,
  ) -> InstanceModel:
@@ -431,7 +428,6 @@ async def create_fleet_instance_model(
          requirements=requirements,
          instance_name=f"{spec.configuration.name}-{instance_num}",
          instance_num=instance_num,
-         placement_group_name=placement_group_name,
          reservation=reservation,
          blocks=spec.configuration.blocks,
          tags=spec.configuration.tags,
@@ -652,7 +648,7 @@ def _remove_fleet_spec_sensitive_info(spec: FleetSpec):
              host.ssh_key = None


- def _validate_fleet_spec(spec: FleetSpec):
+ def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
      if spec.configuration.name is not None:
          validate_dstack_resource_name(spec.configuration.name)
      if spec.configuration.ssh_config is None and spec.configuration.nodes is None:
@@ -665,6 +661,8 @@ def _validate_fleet_spec(spec: FleetSpec):
              if isinstance(host, SSHHostParams) and host.ssh_key is not None:
                  _validate_ssh_key(host.ssh_key)
          _validate_internal_ips(spec.configuration.ssh_config)
+     if spec.configuration.resources is not None:
+         set_resources_defaults(spec.configuration.resources)


  def _validate_all_ssh_params_specified(ssh_config: SSHParams):
@@ -735,18 +733,3 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
          reservation=fleet_spec.configuration.reservation,
      )
      return requirements
-
-
- def _get_placement_group_name(
-     project: ProjectModel,
-     fleet_spec: FleetSpec,
- ) -> Optional[str]:
-     if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER:
-         return None
-     # A random suffix to avoid clashing with to-be-deleted placement groups left by old fleets
-     suffix = _generate_random_placement_group_suffix()
-     return f"{project.name}-{fleet_spec.configuration.name}-{suffix}-pg"
-
-
- def _generate_random_placement_group_suffix(length: int = 8) -> str:
-     return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))
dstack/_internal/server/services/instances.py CHANGED
@@ -408,7 +408,6 @@ async def create_instance_model(
      requirements: Requirements,
      instance_name: str,
      instance_num: int,
-     placement_group_name: Optional[str],
      reservation: Optional[str],
      blocks: Union[Literal["auto"], int],
      tags: Optional[Dict[str, str]],
@@ -427,7 +426,6 @@ async def create_instance_model(
          user=user.name,
          ssh_keys=[project_ssh_key],
          instance_id=str(instance_id),
-         placement_group_name=placement_group_name,
          reservation=reservation,
          tags=tags,
      )
dstack/_internal/server/services/offers.py CHANGED
@@ -8,12 +8,14 @@ from dstack._internal.core.backends import (
      BACKENDS_WITH_RESERVATION_SUPPORT,
  )
  from dstack._internal.core.backends.base.backend import Backend
+ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.instances import (
      InstanceOfferWithAvailability,
      InstanceType,
      Resources,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.profiles import Profile
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.core.models.volumes import Volume
@@ -31,6 +33,7 @@ async def get_offers_by_requirements(
      volumes: Optional[List[List[Volume]]] = None,
      privileged: bool = False,
      instance_mounts: bool = False,
+     placement_group: Optional[PlacementGroup] = None,
      blocks: Union[int, Literal["auto"]] = 1,
  ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
      backends: List[Backend] = await backends_services.get_project_backends(project=project)
@@ -116,6 +119,18 @@ async def get_offers_by_requirements(
              new_offers.append((b, new_offer))
          offers = new_offers

+     if placement_group is not None:
+         new_offers = []
+         for b, o in offers:
+             for backend in backends:
+                 compute = backend.compute()
+                 if isinstance(
+                     compute, ComputeWithPlacementGroupSupport
+                 ) and compute.is_suitable_placement_group(placement_group, o):
+                     new_offers.append((b, o))
+                     break
+         offers = new_offers
+
      if profile.instance_types is not None:
          instance_types = [i.lower() for i in profile.instance_types]
          offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
dstack/_internal/server/services/placement.py CHANGED
@@ -1,8 +1,9 @@
+ from collections.abc import Iterable
  from typing import Optional
  from uuid import UUID

  from git import List
- from sqlalchemy import select
+ from sqlalchemy import and_, select, update
  from sqlalchemy.ext.asyncio import AsyncSession

  from dstack._internal.core.models.placement import (
@@ -13,15 +14,35 @@ from dstack._internal.core.models.placement import (
  from dstack._internal.server.models import PlacementGroupModel


- async def get_fleet_placement_groups(
+ async def get_fleet_placement_group_models(
      session: AsyncSession,
      fleet_id: UUID,
- ) -> List[PlacementGroup]:
+ ) -> List[PlacementGroupModel]:
      res = await session.execute(
-         select(PlacementGroupModel).where(PlacementGroupModel.fleet_id == fleet_id)
+         select(PlacementGroupModel).where(
+             and_(
+                 PlacementGroupModel.fleet_id == fleet_id,
+                 PlacementGroupModel.deleted == False,
+                 PlacementGroupModel.fleet_deleted == False,
+             )
+         )
+     )
+     return list(res.scalars().all())
+
+
+ async def schedule_fleet_placement_groups_deletion(
+     session: AsyncSession, fleet_id: UUID, except_placement_group_ids: Iterable[UUID] = ()
+ ) -> None:
+     await session.execute(
+         update(PlacementGroupModel)
+         .where(
+             and_(
+                 PlacementGroupModel.fleet_id == fleet_id,
+                 PlacementGroupModel.id.not_in(except_placement_group_ids),
+             )
+         )
+         .values(fleet_deleted=True)  # TODO: rename `fleet_deleted` -> `to_be_deleted`
      )
-     placement_groups = res.scalars().all()
-     return [placement_group_model_to_placement_group(pg) for pg in placement_groups]


  def placement_group_model_to_placement_group(
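Not part of the diff: a minimal sketch of how the two helpers above could be combined, assuming an AsyncSession, a fleet id, and a set of placement-group ids to keep are already available; prune_fleet_placement_groups is a hypothetical wrapper name.

    from collections.abc import Iterable
    from uuid import UUID

    from sqlalchemy.ext.asyncio import AsyncSession

    from dstack._internal.server.services.placement import (
        get_fleet_placement_group_models,
        schedule_fleet_placement_groups_deletion,
    )


    async def prune_fleet_placement_groups(
        session: AsyncSession, fleet_id: UUID, keep_ids: Iterable[UUID] = ()
    ) -> None:
        # Placement groups still active for the fleet (deleted/fleet_deleted rows are filtered out).
        active = await get_fleet_placement_group_models(session=session, fleet_id=fleet_id)
        assert all(not pg.fleet_deleted for pg in active)
        # Mark every group of the fleet except keep_ids; a background task is expected
        # to pick up the marked rows and delete the cloud resources later.
        await schedule_fleet_placement_groups_deletion(
            session=session, fleet_id=fleet_id, except_placement_group_ids=keep_ids
        )
        await session.commit()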
dstack/_internal/server/services/resources.py ADDED
@@ -0,0 +1,21 @@
+ import gpuhunt
+ from pydantic import parse_obj_as
+
+ from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec
+
+
+ def set_resources_defaults(resources: ResourcesSpec) -> None:
+     # TODO: Remove in 0.20. Use resources.cpu directly
+     cpu = parse_obj_as(CPUSpec, resources.cpu)
+     if cpu.arch is None:
+         gpu = resources.gpu
+         if (
+             gpu is not None
+             and gpu.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA]
+             and gpu.name
+             and any(map(gpuhunt.is_nvidia_superchip, gpu.name))
+         ):
+             cpu.arch = gpuhunt.CPUArchitecture.ARM
+         else:
+             cpu.arch = gpuhunt.CPUArchitecture.X86
+     resources.cpu = cpu
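Not part of the diff: a minimal sketch of what set_resources_defaults does to a spec with no explicit CPU architecture, using CPUSpec.parse as seen in the testing changes below; GH200 is only an illustrative NVIDIA Grace superchip name.

    from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec
    from dstack._internal.server.services.resources import set_resources_defaults

    spec = ResourcesSpec(cpu=CPUSpec.parse("4"))  # no explicit architecture
    set_resources_defaults(spec)
    # With no GPU (or a non-superchip NVIDIA GPU) the CPU architecture defaults to x86;
    # a Grace superchip such as GH200 in resources.gpu.name would switch the default to ARM.
    print(spec.cpu.arch)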
dstack/_internal/server/services/runs.py CHANGED
@@ -81,6 +81,7 @@ from dstack._internal.server.services.logging import fmt
  from dstack._internal.server.services.offers import get_offers_by_requirements
  from dstack._internal.server.services.plugins import apply_plugin_policies
  from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+ from dstack._internal.server.services.resources import set_resources_defaults
  from dstack._internal.server.services.users import get_user_model_by_name
  from dstack._internal.utils.logging import get_logger
  from dstack._internal.utils.random_names import generate_name
@@ -301,12 +302,14 @@ async def get_plan(
          project=project,
          run_name=effective_run_spec.run_name,
      )
-     if (
-         current_resource is not None
-         and not current_resource.status.is_finished()
-         and _can_update_run_spec(current_resource.run_spec, effective_run_spec)
-     ):
-         action = ApplyAction.UPDATE
+     if current_resource is not None:
+         # For backward compatibility (current_resource may has been submitted before
+         # some fields, e.g., CPUSpec.arch, were added)
+         set_resources_defaults(current_resource.run_spec.configuration.resources)
+         if not current_resource.status.is_finished() and _can_update_run_spec(
+             current_resource.run_spec, effective_run_spec
+         ):
+             action = ApplyAction.UPDATE

      jobs = await get_jobs_from_run_spec(effective_run_spec, replica_num=0)

@@ -406,6 +409,10 @@ async def apply_plan(
          project=project,
          run_spec=run_spec,
      )
+
+     # For backward compatibility (current_resource may has been submitted before
+     # some fields, e.g., CPUSpec.arch, were added)
+     set_resources_defaults(current_resource.run_spec.configuration.resources)
      try:
          _check_can_update_run_spec(current_resource.run_spec, run_spec)
      except ServerClientError:
@@ -414,6 +421,8 @@ async def apply_plan(
              raise ServerClientError("Cannot override active run. Stop the run first.")
          raise
      if not force:
+         if plan.current_resource is not None:
+             set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
          if (
              plan.current_resource is None
              or plan.current_resource.id != current_resource.id
@@ -866,6 +875,7 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
          raise ServerClientError(
              f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
          )
+     set_resources_defaults(run_spec.configuration.resources)


  _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
dstack/_internal/server/testing/common.py CHANGED
@@ -2,7 +2,7 @@ import json
  import uuid
  from contextlib import contextmanager
  from datetime import datetime, timezone
- from typing import Dict, List, Optional, Union
+ from typing import Dict, List, Literal, Optional, Union
  from uuid import UUID

  import gpuhunt
@@ -25,7 +25,12 @@ from dstack._internal.core.models.configurations import (
      DevEnvironmentConfiguration,
  )
  from dstack._internal.core.models.envs import Env
- from dstack._internal.core.models.fleets import FleetConfiguration, FleetSpec, FleetStatus
+ from dstack._internal.core.models.fleets import (
+     FleetConfiguration,
+     FleetSpec,
+     FleetStatus,
+     InstanceGroupPlacement,
+ )
  from dstack._internal.core.models.gateways import GatewayComputeConfiguration, GatewayStatus
  from dstack._internal.core.models.instances import (
      Disk,
@@ -51,7 +56,7 @@ from dstack._internal.core.models.profiles import (
  )
  from dstack._internal.core.models.repos.base import RepoType
  from dstack._internal.core.models.repos.local import LocalRunRepoData
- from dstack._internal.core.models.resources import Memory, Range, ResourcesSpec
+ from dstack._internal.core.models.resources import CPUSpec, Memory, Range, ResourcesSpec
  from dstack._internal.core.models.runs import (
      JobProvisioningData,
      JobRuntimeData,
@@ -497,10 +502,12 @@ def get_fleet_spec(conf: Optional[FleetConfiguration] = None) -> FleetSpec:
  def get_fleet_configuration(
      name: str = "test-fleet",
      nodes: Range[int] = Range(min=1, max=1),
+     placement: Optional[InstanceGroupPlacement] = None,
  ) -> FleetConfiguration:
      return FleetConfiguration(
          name=name,
          nodes=nodes,
+         placement=placement,
      )


@@ -519,13 +526,13 @@ async def create_instance(
      instance_id: Optional[UUID] = None,
      job: Optional[JobModel] = None,
      instance_num: int = 0,
-     backend: BackendType = BackendType.DATACRUNCH,
+     backend: Optional[BackendType] = BackendType.DATACRUNCH,
      termination_policy: Optional[TerminationPolicy] = None,
      termination_idle_time: int = DEFAULT_FLEET_TERMINATION_IDLE_TIME,
-     region: str = "eu-west",
+     region: Optional[str] = "eu-west",
      remote_connection_info: Optional[RemoteConnectionInfo] = None,
-     offer: Optional[InstanceOfferWithAvailability] = None,
-     job_provisioning_data: Optional[JobProvisioningData] = None,
+     offer: Optional[Union[InstanceOfferWithAvailability, Literal["auto"]]] = "auto",
+     job_provisioning_data: Optional[Union[JobProvisioningData, Literal["auto"]]] = "auto",
      total_blocks: Optional[int] = 1,
      busy_blocks: int = 0,
      name: str = "test_instance",
@@ -534,7 +541,7 @@ async def create_instance(
  ) -> InstanceModel:
      if instance_id is None:
          instance_id = uuid.uuid4()
-     if job_provisioning_data is None:
+     if job_provisioning_data == "auto":
          job_provisioning_data = get_job_provisioning_data(
              dockerized=True,
              backend=backend,
@@ -543,13 +550,13 @@ async def create_instance(
              hostname="running_instance.ip",
              internal_ip=None,
          )
-     if offer is None:
+     if offer == "auto":
          offer = get_instance_offer_with_availability(backend=backend, region=region, spot=spot)
      if profile is None:
          profile = Profile(name="test_name")

      if requirements is None:
-         requirements = Requirements(resources=ResourcesSpec(cpu=1))
+         requirements = Requirements(resources=ResourcesSpec(cpu=CPUSpec.parse("1")))

      if instance_configuration is None:
          instance_configuration = get_instance_configuration()
@@ -571,8 +578,8 @@ async def create_instance(
          created_at=created_at,
          started_at=created_at,
          finished_at=finished_at,
-         job_provisioning_data=job_provisioning_data.json(),
-         offer=offer.json(),
+         job_provisioning_data=job_provisioning_data.json() if job_provisioning_data else None,
+         offer=offer.json() if offer else None,
          price=price,
          region=region,
          backend=backend,
@@ -659,20 +666,7 @@ def get_remote_connection_info(
      env: Optional[Union[Env, dict]] = None,
  ):
      if ssh_keys is None:
-         ssh_keys = [
-             SSHKey(
-                 public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
-                 private="""
- -----BEGIN OPENSSH PRIVATE KEY-----
- b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
- QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
- VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
- AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
- CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
- -----END OPENSSH PRIVATE KEY-----
- """,
-             )
-         ]
+         ssh_keys = [get_ssh_key()]
      if env is None:
          env = Env()
      elif isinstance(env, dict):
@@ -686,6 +680,21 @@ def get_remote_connection_info(
      )


+ def get_ssh_key() -> SSHKey:
+     return SSHKey(
+         public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
+         private="""
+ -----BEGIN OPENSSH PRIVATE KEY-----
+ b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
+ QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
+ VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
+ AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
+ CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
+ -----END OPENSSH PRIVATE KEY-----
+ """,
+     )
+
+
  async def create_volume(
      session: AsyncSession,
      project: ProjectModel,
dstack/_internal/utils/common.py CHANGED
@@ -1,4 +1,5 @@
  import asyncio
+ import enum
  import itertools
  import re
  import time
@@ -83,6 +84,8 @@ def pretty_date(time: datetime) -> str:


  def pretty_resources(
+     *,
+     cpu_arch: Optional[Any] = None,
      cpus: Optional[Any] = None,
      memory: Optional[Any] = None,
      gpu_count: Optional[Any] = None,
@@ -110,7 +113,16 @@ def pretty_resources(
      """
      parts = []
      if cpus is not None:
-         parts.append(f"cpu={cpus}")
+         cpu_arch_lower: Optional[str] = None
+         if isinstance(cpu_arch, enum.Enum):
+             cpu_arch_lower = str(cpu_arch.value).lower()
+         elif isinstance(cpu_arch, str):
+             cpu_arch_lower = cpu_arch.lower()
+         if cpu_arch_lower == "arm":
+             cpu_arch_prefix = "arm:"
+         else:
+             cpu_arch_prefix = ""
+         parts.append(f"cpu={cpu_arch_prefix}{cpus}")
      if memory is not None:
          parts.append(f"mem={memory}")
      if disk_size:
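Not part of the diff: a small sketch of how the new keyword-only cpu_arch argument affects the cpu part of the formatted string; the joining and ordering of the other parts are untouched by this release.

    from dstack._internal.utils.common import pretty_resources

    # cpu_arch may be a string or an enum member; only "arm" adds a prefix to the cpu part.
    pretty_resources(cpus=8, memory="32GB", cpu_arch="arm")  # cpu part renders as "cpu=arm:8"
    pretty_resources(cpus=8, memory="32GB", cpu_arch="x86")  # cpu part renders as "cpu=8"
    pretty_resources(cpus=8, memory="32GB")                  # cpu_arch defaults to None -> "cpu=8"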
dstack/_internal/utils/json_schema.py CHANGED
@@ -1,6 +1,9 @@
  def add_extra_schema_types(schema_property: dict, extra_types: list[dict]):
      if "allOf" in schema_property:
-         ref = schema_property.pop("allOf")[0]
+         refs = [schema_property.pop("allOf")[0]]
+     elif "anyOf" in schema_property:
+         refs = schema_property.pop("anyOf")
      else:
-         ref = {"type": schema_property.pop("type")}
-     schema_property["anyOf"] = [ref, *extra_types]
+         refs = [{"type": schema_property.pop("type")}]
+     refs.extend(extra_types)
+     schema_property["anyOf"] = refs
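Not part of the diff: a sketch of the behavior change in add_extra_schema_types; properties that already carry anyOf now keep all of their variants instead of being handled only via allOf or a plain type. The property dict below is a made-up example.

    from dstack._internal.utils.json_schema import add_extra_schema_types

    prop = {"anyOf": [{"$ref": "#/definitions/CPUSpec"}, {"type": "integer"}]}
    add_extra_schema_types(prop, extra_types=[{"type": "string"}])
    # prop == {"anyOf": [{"$ref": "#/definitions/CPUSpec"}, {"type": "integer"}, {"type": "string"}]}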
dstack/api/__init__.py CHANGED
@@ -14,6 +14,7 @@ from dstack._internal.core.models.repos.local import LocalRepo
  from dstack._internal.core.models.repos.remote import RemoteRepo
  from dstack._internal.core.models.repos.virtual import VirtualRepo
  from dstack._internal.core.models.resources import ComputeCapability, Memory, Range
+ from dstack._internal.core.models.resources import CPUSpec as CPU
  from dstack._internal.core.models.resources import DiskSpec as Disk
  from dstack._internal.core.models.resources import GPUSpec as GPU
  from dstack._internal.core.models.resources import ResourcesSpec as Resources
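Not part of the diff: the new export makes CPU a public alias of the internal CPUSpec model; a minimal usage sketch.

    from dstack.api import CPU, Resources

    # CPU.parse accepts the same string form used in the testing changes above (e.g. "1");
    # the architecture is left unset here and is defaulted server-side via set_resources_defaults.
    resources = Resources(cpu=CPU.parse("4"))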
dstack/api/server/_fleets.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union
  from pydantic import parse_obj_as

  from dstack._internal.core.models.fleets import ApplyFleetPlanInput, Fleet, FleetPlan, FleetSpec
+ from dstack._internal.core.models.instances import Instance
  from dstack._internal.server.schemas.fleets import (
      ApplyFleetPlanRequest,
      CreateFleetRequest,
@@ -83,9 +84,24 @@ def _get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
      spec_excludes = _get_fleet_spec_excludes(plan_input.spec)
      if spec_excludes:
          apply_plan_excludes["spec"] = apply_plan_excludes
+     current_resource = plan_input.current_resource
+     if current_resource is not None:
+         current_resource_excludes = {}
+         apply_plan_excludes["current_resource"] = current_resource_excludes
+         if all(map(_should_exclude_instance_cpu_arch, current_resource.instances)):
+             current_resource_excludes["instances"] = {
+                 "__all__": {"instance_type": {"resources": {"cpu_arch"}}}
+             }
      return {"plan": apply_plan_excludes}


+ def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
+     try:
+         return instance.instance_type.resources.cpu_arch is None
+     except AttributeError:
+         return True
+
+
  def _get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
      create_fleet_excludes = {}
      spec_excludes = _get_fleet_spec_excludes(fleet_spec)
dstack/api/server/_runs.py CHANGED
@@ -7,6 +7,7 @@ from pydantic import parse_obj_as
  from dstack._internal.core.models.configurations import ServiceConfiguration
  from dstack._internal.core.models.runs import (
      ApplyRunPlanInput,
+     JobSubmission,
      Run,
      RunPlan,
      RunSpec,
@@ -96,13 +97,53 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
      run_spec_excludes = _get_run_spec_excludes(plan.run_spec)
      if run_spec_excludes is not None:
          apply_plan_excludes["run_spec"] = run_spec_excludes
-     if plan.current_resource is not None:
-         apply_plan_excludes["current_resource"] = {
-             "run_spec": _get_run_spec_excludes(plan.current_resource.run_spec)
+     current_resource = plan.current_resource
+     if current_resource is not None:
+         current_resource_excludes = {}
+         apply_plan_excludes["current_resource"] = current_resource_excludes
+         current_resource_excludes["run_spec"] = _get_run_spec_excludes(current_resource.run_spec)
+         job_submissions_excludes = {}
+         current_resource_excludes["jobs"] = {
+             "__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
          }
+         job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
+         if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
+             job_submissions_excludes["job_provisioning_data"] = {
+                 "instance_type": {"resources": {"cpu_arch"}}
+             }
+         if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
+             job_submissions_excludes["job_runtime_data"] = {
+                 "offer": {"instance": {"resources": {"cpu_arch"}}}
+             }
+         latest_job_submission = current_resource.latest_job_submission
+         if latest_job_submission is not None:
+             latest_job_submission_excludes = {}
+             current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
+             if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
+                 latest_job_submission_excludes["job_provisioning_data"] = {
+                     "instance_type": {"resources": {"cpu_arch"}}
+                 }
+             if _should_exclude_job_submission_jrd_cpu_arch(latest_job_submission):
+                 latest_job_submission_excludes["job_runtime_data"] = {
+                     "offer": {"instance": {"resources": {"cpu_arch"}}}
+                 }
      return {"plan": apply_plan_excludes}


+ def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
+     try:
+         return job_submission.job_provisioning_data.instance_type.resources.cpu_arch is None
+     except AttributeError:
+         return True
+
+
+ def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
+     try:
+         return job_submission.job_runtime_data.offer.instance.resources.cpu_arch is None
+     except AttributeError:
+         return True
+
+
  def _get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
      """
      Excludes new fields when they are not set to keep
dstack/version.py CHANGED
@@ -1,3 +1,3 @@
- __version__ = "0.19.7"
+ __version__ = "0.19.8"
  __is_release__ = True
  base_image = "0.7"
{dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dstack
- Version: 0.19.7
+ Version: 0.19.8
  Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
  Project-URL: Homepage, https://dstack.ai
  Project-URL: Source, https://github.com/dstackai/dstack
@@ -49,6 +49,7 @@ Requires-Dist: asyncpg; extra == 'all'
  Requires-Dist: azure-identity>=1.12.0; extra == 'all'
  Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'all'
  Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'all'
+ Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'all'
  Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'all'
  Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'all'
  Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'all'
@@ -116,6 +117,7 @@ Requires-Dist: asyncpg; extra == 'azure'
  Requires-Dist: azure-identity>=1.12.0; extra == 'azure'
  Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'azure'
  Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'azure'
+ Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'azure'
  Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'azure'
  Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'azure'
  Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'azure'