dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the registry's advisory page for more details.

Files changed (60)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +56 -13
  3. dstack/_internal/cli/utils/run.py +10 -5
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +3 -1
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +48 -0
  19. dstack/_internal/core/backends/nebius/models.py +9 -1
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +79 -4
  28. dstack/_internal/core/models/runs.py +26 -9
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_metrics.py +26 -9
  33. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  34. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  36. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  37. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  38. dstack/_internal/server/models.py +6 -1
  39. dstack/_internal/server/schemas/runner.py +41 -8
  40. dstack/_internal/server/services/fleets.py +9 -26
  41. dstack/_internal/server/services/instances.py +0 -2
  42. dstack/_internal/server/services/jobs/__init__.py +1 -0
  43. dstack/_internal/server/services/offers.py +15 -0
  44. dstack/_internal/server/services/placement.py +27 -6
  45. dstack/_internal/server/services/resources.py +21 -0
  46. dstack/_internal/server/services/runner/client.py +7 -4
  47. dstack/_internal/server/services/runs.py +18 -8
  48. dstack/_internal/server/settings.py +20 -1
  49. dstack/_internal/server/testing/common.py +37 -26
  50. dstack/_internal/utils/common.py +13 -1
  51. dstack/_internal/utils/json_schema.py +6 -3
  52. dstack/api/__init__.py +1 -0
  53. dstack/api/server/_fleets.py +16 -0
  54. dstack/api/server/_runs.py +48 -3
  55. dstack/version.py +1 -1
  56. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
  57. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
  58. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
@@ -6,8 +6,9 @@ from textwrap import dedent
6
6
  from typing import Any, Dict, Generator, List, Optional
7
7
 
8
8
  import paramiko
9
- from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
9
+ from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib
10
10
 
11
+ from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
11
12
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
12
13
 
13
14
  # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
36
37
  HOST_INFO_FILE = "host_info.json"
37
38
 
38
39
 
40
+ def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
41
+ cmd = "uname -m"
42
+ try:
43
+ _, stdout, stderr = client.exec_command(cmd, timeout=20)
44
+ except (paramiko.SSHException, OSError) as e:
45
+ raise ProvisioningError(f"detect_cpu_arch: {e}") from e
46
+ out = stdout.read().strip().decode()
47
+ err = stderr.read().strip().decode()
48
+ if err:
49
+ raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
50
+ try:
51
+ return normalize_arch(out)
52
+ except ValueError as e:
53
+ raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
54
+
55
+
39
56
  def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
40
57
  try:
41
58
  sftp = client.open_sftp()
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
226
243
  raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e
227
244
 
228
245
 
229
- def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
246
+ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
247
+ _cpu_arch: CPUArchitecture
248
+ if cpu_arch == "amd64":
249
+ _cpu_arch = CPUArchitecture.X86
250
+ elif cpu_arch == "arm64":
251
+ _cpu_arch = CPUArchitecture.ARM
252
+ else:
253
+ raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
230
254
  gpu_count = host_info.get("gpu_count", 0)
231
255
  if gpu_count > 0:
232
256
  gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
251
275
  instance_type = InstanceType(
252
276
  name="instance",
253
277
  resources=Resources(
278
+ cpu_arch=_cpu_arch,
254
279
  cpus=host_info["cpus"],
255
280
  memory_mib=host_info["memory"] / 1024 / 1024,
256
281
  spot=False,
@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
18
18
  InstanceConfiguration,
19
19
  InstanceOfferWithAvailability,
20
20
  )
21
+ from dstack._internal.core.models.placement import PlacementGroup
21
22
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
22
23
  from dstack._internal.core.models.volumes import Volume
23
24
  from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
64
65
  self,
65
66
  instance_offer: InstanceOfferWithAvailability,
66
67
  instance_config: InstanceConfiguration,
68
+ placement_group: Optional[PlacementGroup],
67
69
  ) -> JobProvisioningData:
68
70
  # TODO: Implement if backend supports creating instances (VM-based).
69
71
  # Delete if backend can only run jobs (container-based).
@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
19
19
  InstanceConfiguration,
20
20
  InstanceOfferWithAvailability,
21
21
  )
22
+ from dstack._internal.core.models.placement import PlacementGroup
22
23
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
23
24
  from dstack._internal.utils.logging import get_logger
24
25
 
@@ -57,6 +58,7 @@ class TensorDockCompute(
57
58
  self,
58
59
  instance_offer: InstanceOfferWithAvailability,
59
60
  instance_config: InstanceConfiguration,
61
+ placement_group: Optional[PlacementGroup],
60
62
  ) -> JobProvisioningData:
61
63
  instance_name = generate_unique_instance_name(
62
64
  instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
22
22
  InstanceOffer,
23
23
  InstanceOfferWithAvailability,
24
24
  )
25
+ from dstack._internal.core.models.placement import PlacementGroup
25
26
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
26
27
  from dstack._internal.utils.logging import get_logger
27
28
 
@@ -58,7 +59,10 @@ class VultrCompute(
58
59
  return offers
59
60
 
60
61
  def create_instance(
61
- self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
62
+ self,
63
+ instance_offer: InstanceOfferWithAvailability,
64
+ instance_config: InstanceConfiguration,
65
+ placement_group: Optional[PlacementGroup],
62
66
  ) -> JobProvisioningData:
63
67
  instance_name = generate_unique_instance_name(
64
68
  instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -49,11 +49,13 @@ class Resources(CoreModel):
49
49
  spot: bool
50
50
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
51
51
  description: str = ""
52
+ cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
52
53
 
53
54
  def pretty_format(self, include_spot: bool = False) -> str:
54
55
  resources = {}
55
56
  if self.cpus > 0:
56
57
  resources["cpus"] = self.cpus
58
+ resources["cpu_arch"] = self.cpu_arch
57
59
  if self.memory_mib > 0:
58
60
  resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
59
61
  if self.disk.size_mib > 0:
@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
105
107
  user: str # dstack user name
106
108
  ssh_keys: List[SSHKey]
107
109
  instance_id: Optional[str] = None
108
- placement_group_name: Optional[str] = None
109
110
  reservation: Optional[str] = None
110
111
  volumes: Optional[List[Volume]] = None
111
112
  tags: Optional[Dict[str, str]] = None
@@ -1,8 +1,9 @@
1
1
  import math
2
+ from collections.abc import Mapping
2
3
  from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union
3
4
 
4
5
  import gpuhunt
5
- from pydantic import Field, root_validator, validator
6
+ from pydantic import Field, parse_obj_as, root_validator, validator
6
7
  from pydantic.generics import GenericModel
7
8
  from typing_extensions import Annotated
8
9
 
@@ -125,7 +126,68 @@ class ComputeCapability(Tuple[int, int]):
125
126
 
126
127
  DEFAULT_CPU_COUNT = Range[int](min=2)
127
128
  DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
128
- DEFAULT_GPU_COUNT = Range[int](min=1, max=1)
129
+ DEFAULT_GPU_COUNT = Range[int](min=1)
130
+
131
+
132
+ class CPUSpec(CoreModel):
133
+ class Config:
134
+ @staticmethod
135
+ def schema_extra(schema: Dict[str, Any]):
136
+ add_extra_schema_types(
137
+ schema["properties"]["count"],
138
+ extra_types=[{"type": "integer"}, {"type": "string"}],
139
+ )
140
+
141
+ arch: Annotated[
142
+ Optional[gpuhunt.CPUArchitecture],
143
+ Field(description="The CPU architecture, one of: `x86`, `arm`"),
144
+ ] = None
145
+ count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
146
+
147
+ @classmethod
148
+ def __get_validators__(cls):
149
+ yield cls.parse
150
+ yield cls.validate
151
+
152
+ @classmethod
153
+ def parse(cls, v: Any) -> Any:
154
+ if isinstance(v, int):
155
+ v = str(v)
156
+ if isinstance(v, str):
157
+ tokens = v.replace(" ", "").split(":")
158
+ spec = {}
159
+ for token in tokens:
160
+ if not token:
161
+ raise ValueError(f"CPU spec contains empty token: {v}")
162
+ if ".." in token or token.isdigit():
163
+ if "count" in spec:
164
+ raise ValueError(f"CPU spec count conflict: {v}")
165
+ spec["count"] = token
166
+ else:
167
+ try:
168
+ arch = gpuhunt.CPUArchitecture.cast(token)
169
+ except ValueError:
170
+ raise ValueError(f"Invalid CPU architecture: {v}")
171
+ if "arch" in spec:
172
+ raise ValueError(f"CPU spec arch conflict: {v}")
173
+ spec["arch"] = arch
174
+ return spec
175
+ # Range and min/max dict - for backward compatibility
176
+ if isinstance(v, Range):
177
+ return {"arch": None, "count": v}
178
+ if isinstance(v, Mapping) and v.keys() == {"min", "max"}:
179
+ return {"arch": None, "count": v}
180
+ return v
181
+
182
+ @validator("arch", pre=True)
183
+ def _validate_arch(cls, v: Any) -> Any:
184
+ if v is None:
185
+ return None
186
+ if isinstance(v, gpuhunt.CPUArchitecture):
187
+ return v
188
+ if isinstance(v, str):
189
+ return gpuhunt.CPUArchitecture.cast(v)
190
+ return v
129
191
 
130
192
 
131
193
  class GPUSpec(CoreModel):
@@ -302,7 +364,10 @@ class ResourcesSpec(CoreModel):
302
364
  extra_types=[{"type": "integer"}, {"type": "string"}],
303
365
  )
304
366
 
305
- cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
367
+ # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
368
+ cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
369
+ CPUSpec()
370
+ )
306
371
  memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
307
372
  DEFAULT_MEMORY_SIZE
308
373
  )
@@ -317,8 +382,18 @@ class ResourcesSpec(CoreModel):
317
382
  gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
318
383
  disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
319
384
 
385
+ # TODO: Remove in 0.20. Added for backward compatibility.
386
+ @root_validator
387
+ def _post_validate(cls, values):
388
+ cpu = values.get("cpu")
389
+ if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
390
+ values["cpu"] = cpu.count
391
+ return values
392
+
320
393
  def pretty_format(self) -> str:
321
- resources: Dict[str, Any] = dict(cpus=self.cpu, memory=self.memory)
394
+ # TODO: Remove in 0.20. Use self.cpu directly
395
+ cpu = parse_obj_as(CPUSpec, self.cpu)
396
+ resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory)
322
397
  if self.gpu:
323
398
  gpu = self.gpu
324
399
  resources.update(
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
104
104
  # Set by the server
105
105
  FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
106
106
  INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
107
+ INSTANCE_UNREACHABLE = "instance_unreachable"
107
108
  WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
108
109
  WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
109
110
  TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
126
127
  mapping = {
127
128
  self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
128
129
  self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
130
+ self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
129
131
  self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
130
132
  self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
131
133
  self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
262
264
  # or not applicable (container-based backends)
263
265
  ports: Optional[dict[int, int]] = None
264
266
  # List of volumes used by the job
265
- volume_names: Optional[list[str]] = None # None for backward compalibility
267
+ volume_names: Optional[list[str]] = None # None for backward compatibility
266
268
  # Virtual shared offer
267
- offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
269
+ offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility
268
270
 
269
271
 
270
272
  class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
283
285
  status: JobStatus
284
286
  termination_reason: Optional[JobTerminationReason]
285
287
  termination_reason_message: Optional[str]
288
+ exit_status: Optional[int]
286
289
  job_provisioning_data: Optional[JobProvisioningData]
287
290
  job_runtime_data: Optional[JobRuntimeData]
288
291
 
@@ -439,9 +442,14 @@ class Run(CoreModel):
439
442
 
440
443
  @root_validator
441
444
  def _error(cls, values) -> Dict:
445
+ try:
446
+ termination_reason = values["termination_reason"]
447
+ jobs = values["jobs"]
448
+ except KeyError:
449
+ return values
442
450
  values["error"] = _get_run_error(
443
- run_termination_reason=values["termination_reason"],
444
- run_jobs=values["jobs"],
451
+ run_termination_reason=termination_reason,
452
+ run_jobs=jobs,
445
453
  )
446
454
  return values
447
455
 
@@ -503,7 +511,9 @@ def _get_run_error(
503
511
  return ""
504
512
  if len(run_jobs) > 1:
505
513
  return run_termination_reason.name
506
- run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
514
+ run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
515
+ run_jobs
516
+ )
507
517
  # For failed runs, also show termination reason to provide more context.
508
518
  # For other run statuses, the job termination reason will duplicate run status.
509
519
  if run_job_termination_reason is not None and run_termination_reason in [
@@ -511,13 +521,20 @@ def _get_run_error(
511
521
  RunTerminationReason.SERVER_ERROR,
512
522
  RunTerminationReason.RETRY_LIMIT_EXCEEDED,
513
523
  ]:
524
+ if exit_status:
525
+ return (
526
+ f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
527
+ )
514
528
  return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
515
529
  return run_termination_reason.name
516
530
 
517
531
 
518
- def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
532
+ def _get_run_job_termination_reason_and_exit_status(
533
+ run_jobs: List[Job],
534
+ ) -> tuple[Optional[JobTerminationReason], Optional[int]]:
519
535
  for job in run_jobs:
520
536
  if len(job.job_submissions) > 0:
521
- if job.job_submissions[-1].termination_reason is not None:
522
- return job.job_submissions[-1].termination_reason
523
- return None
537
+ job_submission = job.job_submissions[-1]
538
+ if job_submission.termination_reason is not None:
539
+ return job_submission.termination_reason, job_submission.exit_status
540
+ return None, None
@@ -159,7 +159,7 @@ class VolumeMountPoint(CoreModel):
159
159
  description=(
160
160
  "The network volume name or the list of network volume names to mount."
161
161
  " If a list is specified, one of the volumes in the list will be mounted."
162
- " Specify volumes from different backends/regions to increase availability."
162
+ " Specify volumes from different backends/regions to increase availability"
163
163
  )
164
164
  ),
165
165
  ]
@@ -1,15 +1,16 @@
1
- from sqlalchemy import select, update
1
+ from sqlalchemy import select
2
2
  from sqlalchemy.ext.asyncio import AsyncSession
3
3
  from sqlalchemy.orm import joinedload
4
4
 
5
5
  from dstack._internal.core.models.fleets import FleetStatus
6
6
  from dstack._internal.server.db import get_session_ctx
7
- from dstack._internal.server.models import FleetModel, PlacementGroupModel
7
+ from dstack._internal.server.models import FleetModel
8
8
  from dstack._internal.server.services.fleets import (
9
9
  is_fleet_empty,
10
10
  is_fleet_in_use,
11
11
  )
12
12
  from dstack._internal.server.services.locking import get_locker
13
+ from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
13
14
  from dstack._internal.utils.common import get_current_datetime
14
15
  from dstack._internal.utils.logging import get_logger
15
16
 
@@ -68,16 +69,6 @@ async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
68
69
  fleet_model.status = FleetStatus.TERMINATED
69
70
  fleet_model.deleted = True
70
71
  fleet_model.last_processed_at = get_current_datetime()
71
- await _mark_placement_groups_as_ready_for_deletion(session=session, fleet_model=fleet_model)
72
+ await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
72
73
  await session.commit()
73
74
  logger.info("Fleet %s deleted", fleet_model.name)
74
-
75
-
76
- async def _mark_placement_groups_as_ready_for_deletion(
77
- session: AsyncSession, fleet_model: FleetModel
78
- ):
79
- await session.execute(
80
- update(PlacementGroupModel)
81
- .where(PlacementGroupModel.fleet_id == fleet_model.id)
82
- .values(fleet_deleted=True)
83
- )