dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +56 -13
- dstack/_internal/cli/utils/run.py +10 -5
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +3 -1
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +48 -0
- dstack/_internal/core/backends/nebius/models.py +9 -1
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +79 -4
- dstack/_internal/core/models/runs.py +26 -9
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +18 -8
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +37 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +48 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py
@@ -0,0 +1,100 @@
+"""Add JobModel.disconnected_at
+
+Revision ID: 20166748b60c
+Revises: 6c1a9d6530ee
+Create Date: 2025-05-13 16:24:32.496578
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "20166748b60c"
+down_revision = "6c1a9d6530ee"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
+            )
+        )
+
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "INSTANCE_UNREACHABLE",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("disconnected_at")
+
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py
@@ -0,0 +1,26 @@
+"""Add JobModel.exit_status
+
+Revision ID: 6c1a9d6530ee
+Revises: 7ba3b59d7ca6
+Create Date: 2025-05-09 10:25:19.715852
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "6c1a9d6530ee"
+down_revision = "7ba3b59d7ca6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("exit_status")
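The two migrations above chain as 7ba3b59d7ca6 → 6c1a9d6530ee → 20166748b60c, so a single upgrade adds the `exit_status` and `disconnected_at` columns and the `INSTANCE_UNREACHABLE` enum value in one pass. A minimal verification sketch (the SQLite path is hypothetical; the dstack server applies its migrations on startup):

```python
import sqlalchemy as sa

# Hypothetical local database URL -- adjust to your server's store.
engine = sa.create_engine("sqlite:///server.db")
columns = {c["name"] for c in sa.inspect(engine).get_columns("jobs")}
assert "exit_status" in columns      # added by revision 6c1a9d6530ee
assert "disconnected_at" in columns  # added by revision 20166748b60c
```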
dstack/_internal/server/models.py
@@ -382,6 +382,10 @@ class JobModel(BaseModel):
         Enum(JobTerminationReason)
     )
     termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
+    # `disconnected_at` stores the first time of connectivity issues with the instance.
+    # Resets every time connectivity is restored.
+    disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    exit_status: Mapped[Optional[int]] = mapped_column(Integer)
     job_spec_data: Mapped[str] = mapped_column(Text)
     job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
     runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
@@ -390,7 +394,7 @@ class JobModel(BaseModel):
     remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # `instance_assigned` means instance assignment was done.
-    # if `instance_assigned` is True and `instance` is None, no instance was
+    # if `instance_assigned` is True and `instance` is None, no instance was assigned.
     instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
     instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
         ForeignKey("instances.id", ondelete="CASCADE")
@@ -659,6 +663,7 @@ class PlacementGroupModel(BaseModel):
 
     fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id"))
     fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id])
+    # TODO: rename `fleet_deleted` -> `to_be_deleted`
     fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False)
 
     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
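The new column pairs with the `INSTANCE_UNREACHABLE` termination reason from the migrations above: the server records when connectivity to an instance was first lost and clears the field on reconnect. An illustrative, standalone sketch of the resulting grace-period check (the threshold and helper name are hypothetical, not the server's actual logic):

```python
from datetime import datetime, timedelta
from typing import Optional

DISCONNECT_GRACE = timedelta(minutes=4)  # hypothetical threshold

def is_instance_unreachable(disconnected_at: Optional[datetime], now: datetime) -> bool:
    # None means connectivity is currently fine: the field resets on every reconnect.
    return disconnected_at is not None and now - disconnected_at > DISCONNECT_GRACE
```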
dstack/_internal/server/schemas/runner.py
@@ -7,7 +7,14 @@ from typing_extensions import Annotated
 
 from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, JobStatus, RunSpec
+from dstack._internal.core.models.runs import (
+    ClusterInfo,
+    JobSpec,
+    JobStatus,
+    JobSubmission,
+    Run,
+    RunSpec,
+)
 from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
 
 
@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
     state: JobStatus
     termination_reason: Optional[str] = None
     termination_message: Optional[str] = None
+    exit_status: Optional[int] = None
 
 
 class LogEvent(CoreModel):
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):
 
 
 class SubmitBody(CoreModel):
-    run_spec: Annotated[
-        RunSpec,
+    run: Annotated[
+        Run,
         Field(
             include={
-                "run_name",
-                "repo_id",
-                "repo_data",
-                "configuration",
-                "configuration_path",
+                "id": True,
+                "run_spec": {
+                    "run_name",
+                    "repo_id",
+                    "repo_data",
+                    "configuration",
+                    "configuration_path",
+                },
             }
         ),
     ]
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
             }
         ),
     ]
+    job_submission: Annotated[
+        JobSubmission,
+        Field(
+            include={
+                "id",
+            }
+        ),
+    ]
     cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
     secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
     repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
+    # run_spec is deprecated in favor of run.run_spec
+    # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
+    run_spec: Annotated[
+        RunSpec,
+        Field(
+            include={
+                "run_name",
+                "repo_id",
+                "repo_data",
+                "configuration",
+                "configuration_path",
+            },
+        ),
+    ]
 
 
 class HealthcheckResponse(CoreModel):
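`SubmitBody` relies on pydantic v1's field-level `include` export filter: only the whitelisted (sub)fields of the nested models end up in the JSON sent to the runner, which keeps the payload small and avoids leaking spec fields the runner does not need. A self-contained sketch of the mechanism (model names here are made up):

```python
from pydantic import BaseModel, Field
from typing_extensions import Annotated

class RunInfo(BaseModel):
    run_name: str
    internal_note: str

class Body(BaseModel):
    run: Annotated[RunInfo, Field(include={"run_name"})]

body = Body(run=RunInfo(run_name="demo", internal_note="dropped on export"))
print(body.json())  # {"run": {"run_name": "demo"}}
```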
dstack/_internal/server/services/fleets.py
@@ -1,5 +1,3 @@
-import random
-import string
 import uuid
 from datetime import datetime, timezone
 from typing import List, Literal, Optional, Tuple, Union, cast
@@ -33,6 +31,7 @@ from dstack._internal.core.models.instances import (
     SSHConnectionParams,
     SSHKey,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.profiles import (
     Profile,
     SpotPolicy,
@@ -62,6 +61,7 @@ from dstack._internal.server.services.projects import (
     list_project_models,
     list_user_project_models,
 )
+from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.utils import random_names
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.ssh import pkey_from_str
@@ -243,6 +243,7 @@ async def get_plan(
         spec=effective_spec,
     )
     effective_spec = FleetSpec.parse_obj(effective_spec.dict())
+    _validate_fleet_spec_and_set_defaults(spec)
     current_fleet: Optional[Fleet] = None
     current_fleet_id: Optional[uuid.UUID] = None
     if effective_spec.configuration.name is not None:
@@ -282,6 +283,7 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
+    placement_group: Optional[PlacementGroup] = None,
     fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
@@ -307,6 +309,7 @@ async def get_create_instance_offers(
         exclude_not_available=exclude_not_available,
         multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
+        placement_group=placement_group,
         blocks=blocks,
     )
     offers = [
@@ -345,7 +348,7 @@ async def create_fleet(
         spec=spec,
     )
     spec = FleetSpec.parse_obj(spec.dict())
-    _validate_fleet_spec(spec)
+    _validate_fleet_spec_and_set_defaults(spec)
 
     if spec.configuration.ssh_config is not None:
         _check_can_manage_ssh_fleets(user=user, project=project)
@@ -393,17 +396,12 @@ async def create_fleet(
             )
             fleet_model.instances.append(instances_model)
     else:
-        placement_group_name = _get_placement_group_name(
-            project=project,
-            fleet_spec=spec,
-        )
         for i in range(_get_fleet_nodes_to_provision(spec)):
             instance_model = await create_fleet_instance_model(
                 session=session,
                 project=project,
                 user=user,
                 spec=spec,
-                placement_group_name=placement_group_name,
                 reservation=spec.configuration.reservation,
                 instance_num=i,
             )
@@ -417,7 +415,6 @@ async def create_fleet_instance_model(
     project: ProjectModel,
     user: UserModel,
     spec: FleetSpec,
-    placement_group_name: Optional[str],
     reservation: Optional[str],
     instance_num: int,
 ) -> InstanceModel:
@@ -431,7 +428,6 @@ async def create_fleet_instance_model(
         requirements=requirements,
         instance_name=f"{spec.configuration.name}-{instance_num}",
         instance_num=instance_num,
-        placement_group_name=placement_group_name,
         reservation=reservation,
         blocks=spec.configuration.blocks,
         tags=spec.configuration.tags,
@@ -652,7 +648,7 @@ def _remove_fleet_spec_sensitive_info(spec: FleetSpec):
             host.ssh_key = None
 
 
-def _validate_fleet_spec(spec: FleetSpec):
+def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
     if spec.configuration.name is not None:
         validate_dstack_resource_name(spec.configuration.name)
     if spec.configuration.ssh_config is None and spec.configuration.nodes is None:
@@ -665,6 +661,8 @@ def _validate_fleet_spec(spec: FleetSpec):
         if isinstance(host, SSHHostParams) and host.ssh_key is not None:
             _validate_ssh_key(host.ssh_key)
         _validate_internal_ips(spec.configuration.ssh_config)
+    if spec.configuration.resources is not None:
+        set_resources_defaults(spec.configuration.resources)
 
 
 def _validate_all_ssh_params_specified(ssh_config: SSHParams):
@@ -735,18 +733,3 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
         reservation=fleet_spec.configuration.reservation,
     )
     return requirements
-
-
-def _get_placement_group_name(
-    project: ProjectModel,
-    fleet_spec: FleetSpec,
-) -> Optional[str]:
-    if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER:
-        return None
-    # A random suffix to avoid clashing with to-be-deleted placement groups left by old fleets
-    suffix = _generate_random_placement_group_suffix()
-    return f"{project.name}-{fleet_spec.configuration.name}-{suffix}-pg"
-
-
-def _generate_random_placement_group_suffix(length: int = 8) -> str:
-    return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))
dstack/_internal/server/services/instances.py
@@ -408,7 +408,6 @@ async def create_instance_model(
     requirements: Requirements,
     instance_name: str,
     instance_num: int,
-    placement_group_name: Optional[str],
     reservation: Optional[str],
     blocks: Union[Literal["auto"], int],
     tags: Optional[Dict[str, str]],
@@ -427,7 +426,6 @@ async def create_instance_model(
         user=user.name,
         ssh_keys=[project_ssh_key],
         instance_id=str(instance_id),
-        placement_group_name=placement_group_name,
         reservation=reservation,
         tags=tags,
     )
dstack/_internal/server/services/jobs/__init__.py
@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         status=job_model.status,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
+        exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
     )
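With `exit_status` copied into `JobSubmission`, API consumers can tell an application-level failure (non-zero exit code) apart from infrastructure errors. A hedged usage sketch (`job_model` is assumed to come from the surrounding server code):

```python
from dstack._internal.core.models.runs import JobTerminationReason

submission = job_model_to_job_submission(job_model)  # job_model assumed
if submission.termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
    print(f"container exited with code {submission.exit_status}")
```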
dstack/_internal/server/services/offers.py
@@ -8,12 +8,14 @@ from dstack._internal.core.backends import (
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceType,
     Resources,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.profiles import Profile
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import Volume
@@ -31,6 +33,7 @@ async def get_offers_by_requirements(
     volumes: Optional[List[List[Volume]]] = None,
     privileged: bool = False,
     instance_mounts: bool = False,
+    placement_group: Optional[PlacementGroup] = None,
     blocks: Union[int, Literal["auto"]] = 1,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     backends: List[Backend] = await backends_services.get_project_backends(project=project)
@@ -116,6 +119,18 @@ async def get_offers_by_requirements(
             new_offers.append((b, new_offer))
         offers = new_offers
 
+    if placement_group is not None:
+        new_offers = []
+        for b, o in offers:
+            for backend in backends:
+                compute = backend.compute()
+                if isinstance(
+                    compute, ComputeWithPlacementGroupSupport
+                ) and compute.is_suitable_placement_group(placement_group, o):
+                    new_offers.append((b, o))
+                    break
+        offers = new_offers
+
     if profile.instance_types is not None:
         instance_types = [i.lower() for i in profile.instance_types]
         offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
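The new filter keeps an offer only if at least one backend's compute supports placement groups and accepts the (placement group, offer) pair. A hedged sketch of the hook a backend would implement (parameter names and the rule in the body are illustrative; the abstract base also declares create/delete methods omitted here):

```python
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
from dstack._internal.core.models.instances import InstanceOfferWithAvailability
from dstack._internal.core.models.placement import PlacementGroup

class ExampleCompute(ComputeWithPlacementGroupSupport):  # other abstract methods omitted
    def is_suitable_placement_group(
        self, placement_group: PlacementGroup, offer: InstanceOfferWithAvailability
    ) -> bool:
        # Illustrative rule: only offers in the placement group's region qualify.
        return offer.region == placement_group.configuration.region
```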
dstack/_internal/server/services/placement.py
@@ -1,8 +1,9 @@
+from collections.abc import Iterable
 from typing import Optional
 from uuid import UUID
 
 from git import List
-from sqlalchemy import select
+from sqlalchemy import and_, select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.core.models.placement import (
@@ -13,15 +14,35 @@ from dstack._internal.core.models.placement import (
 from dstack._internal.server.models import PlacementGroupModel
 
 
-async def get_fleet_placement_groups(
+async def get_fleet_placement_group_models(
     session: AsyncSession,
     fleet_id: UUID,
-) -> List[PlacementGroup]:
+) -> List[PlacementGroupModel]:
     res = await session.execute(
-        select(PlacementGroupModel).where(PlacementGroupModel.fleet_id == fleet_id)
+        select(PlacementGroupModel).where(
+            and_(
+                PlacementGroupModel.fleet_id == fleet_id,
+                PlacementGroupModel.deleted == False,
+                PlacementGroupModel.fleet_deleted == False,
+            )
+        )
     )
-    placement_groups = res.scalars().all()
-    return [placement_group_model_to_placement_group(pg) for pg in placement_groups]
+    return list(res.scalars().all())
+
+
+async def schedule_fleet_placement_groups_deletion(
+    session: AsyncSession, fleet_id: UUID, except_placement_group_ids: Iterable[UUID] = ()
+) -> None:
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            and_(
+                PlacementGroupModel.fleet_id == fleet_id,
+                PlacementGroupModel.id.not_in(except_placement_group_ids),
+            )
+        )
+        .values(fleet_deleted=True)  # TODO: rename `fleet_deleted` -> `to_be_deleted`
    )
 
 
 def placement_group_model_to_placement_group(
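Deletion is now a soft, two-phase operation: `schedule_fleet_placement_groups_deletion` only flags rows (`fleet_deleted=True`) and a background task performs the actual cleanup. A hedged usage sketch (`session`, `fleet_id`, and the keep-criterion are assumed):

```python
pg_models = await get_fleet_placement_group_models(session, fleet_id=fleet_id)
keep_ids = [pg.id for pg in pg_models if pg.name == current_pg_name]  # criterion assumed
await schedule_fleet_placement_groups_deletion(
    session, fleet_id=fleet_id, except_placement_group_ids=keep_ids
)
await session.commit()
```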
dstack/_internal/server/services/resources.py
@@ -0,0 +1,21 @@
+import gpuhunt
+from pydantic import parse_obj_as
+
+from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec
+
+
+def set_resources_defaults(resources: ResourcesSpec) -> None:
+    # TODO: Remove in 0.20. Use resources.cpu directly
+    cpu = parse_obj_as(CPUSpec, resources.cpu)
+    if cpu.arch is None:
+        gpu = resources.gpu
+        if (
+            gpu is not None
+            and gpu.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA]
+            and gpu.name
+            and any(map(gpuhunt.is_nvidia_superchip, gpu.name))
+        ):
+            cpu.arch = gpuhunt.CPUArchitecture.ARM
+        else:
+            cpu.arch = gpuhunt.CPUArchitecture.X86
+    resources.cpu = cpu
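The default is derived from the GPU: an NVIDIA Grace-based superchip (e.g. GH200) implies an ARM host CPU, anything else falls back to x86. A hedged example (the string shorthand for `gpu` is assumed to parse as it does in run configurations):

```python
from pydantic import parse_obj_as

from dstack._internal.core.models.resources import ResourcesSpec

spec = parse_obj_as(ResourcesSpec, {"gpu": "GH200"})  # string shorthand assumed
set_resources_defaults(spec)
print(spec.cpu.arch)  # CPUArchitecture.ARM; a non-superchip like "A100" yields X86
```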
dstack/_internal/server/services/runner/client.py
@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
 from dstack._internal.core.models.resources import Memory
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec
+from dstack._internal.core.models.runs import ClusterInfo, Job, Run
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.schemas.runner import (
     GPUDevice,
@@ -72,8 +72,8 @@ class RunnerClient:
 
     def submit_job(
         self,
-        run_spec: RunSpec,
-        job_spec: JobSpec,
+        run: Run,
+        job: Job,
         cluster_info: ClusterInfo,
         secrets: Dict[str, str],
         repo_credentials: Optional[RemoteRepoCreds],
@@ -81,6 +81,7 @@ class RunnerClient:
     ):
         # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
         # variables to the runner without runner API modification.
+        job_spec = job.job_spec
         if instance_env is not None:
             if isinstance(instance_env, Env):
                 merged_env = instance_env.as_dict()
@@ -90,11 +91,13 @@ class RunnerClient:
             job_spec = job_spec.copy(deep=True)
             job_spec.env = merged_env
         body = SubmitBody(
-            run_spec=run_spec,
+            run=run,
             job_spec=job_spec,
+            job_submission=job.job_submissions[-1],
             cluster_info=cluster_info,
             secrets=secrets,
             repo_credentials=repo_credentials,
+            run_spec=run.run_spec,
         )
         resp = requests.post(
             # use .json() to encode enums
dstack/_internal/server/services/runs.py
@@ -81,6 +81,7 @@ from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.plugins import apply_plugin_policies
 from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.server.services.users import get_user_model_by_name
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.random_names import generate_name
@@ -301,12 +302,14 @@ async def get_plan(
         project=project,
         run_name=effective_run_spec.run_name,
     )
-    if (
-        current_resource is not None
-        and not current_resource.status.is_finished()
-        and _can_update_run_spec(current_resource.run_spec, effective_run_spec)
-    ):
-        action = ApplyAction.UPDATE
+    if current_resource is not None:
+        # For backward compatibility (current_resource may has been submitted before
+        # some fields, e.g., CPUSpec.arch, were added)
+        set_resources_defaults(current_resource.run_spec.configuration.resources)
+        if not current_resource.status.is_finished() and _can_update_run_spec(
+            current_resource.run_spec, effective_run_spec
+        ):
+            action = ApplyAction.UPDATE
 
     jobs = await get_jobs_from_run_spec(effective_run_spec, replica_num=0)
 
@@ -406,6 +409,10 @@ async def apply_plan(
         project=project,
         run_spec=run_spec,
     )
+
+    # For backward compatibility (current_resource may has been submitted before
+    # some fields, e.g., CPUSpec.arch, were added)
+    set_resources_defaults(current_resource.run_spec.configuration.resources)
     try:
         _check_can_update_run_spec(current_resource.run_spec, run_spec)
     except ServerClientError:
@@ -414,6 +421,8 @@ async def apply_plan(
             raise ServerClientError("Cannot override active run. Stop the run first.")
         raise
     if not force:
+        if plan.current_resource is not None:
+            set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
         if (
             plan.current_resource is None
             or plan.current_resource.id != current_resource.id
@@ -861,11 +870,12 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
     if (
         run_spec.merged_profile.utilization_policy is not None
         and run_spec.merged_profile.utilization_policy.time_window
-        > settings.SERVER_METRICS_TTL_SECONDS
+        > settings.SERVER_METRICS_RUNNING_TTL_SECONDS
     ):
         raise ServerClientError(
-            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
+            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    set_resources_defaults(run_spec.configuration.resources)
 
 
 _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
dstack/_internal/server/settings.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 from pathlib import Path
 
 DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")
 
 SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")
 
-SERVER_METRICS_TTL_SECONDS = int(os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS", 3600))
+SERVER_METRICS_RUNNING_TTL_SECONDS: int
+_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
+if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
+    _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
+    if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
+        warnings.warn(
+            (
+                "DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
+                " use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
+            ),
+            DeprecationWarning,
+        )
+    else:
+        _SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
+SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
+del _SERVER_METRICS_RUNNING_TTL_SECONDS
+SERVER_METRICS_FINISHED_TTL_SECONDS = int(
+    os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
+)
 
 DEFAULT_PROJECT_NAME = "main"
 
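The single metrics TTL is split into separate knobs for running and finished jobs, with a deprecation shim for the old variable. A hedged configuration sketch (values arbitrary; the settings module reads the environment at import time, so it must be imported afterwards):

```python
import os

os.environ["DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS"] = "7200"
os.environ["DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS"] = str(3 * 24 * 3600)
# Legacy DSTACK_SERVER_METRICS_TTL_SECONDS still works as a fallback for the
# running TTL but now emits a DeprecationWarning.

from dstack._internal.server import settings

print(settings.SERVER_METRICS_RUNNING_TTL_SECONDS)   # 7200
print(settings.SERVER_METRICS_FINISHED_TTL_SECONDS)  # 259200
```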