dstack 0.19.8__py3-none-any.whl → 0.19.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/config.py +1 -1
- dstack/_internal/cli/commands/project.py +161 -0
- dstack/_internal/cli/commands/ps.py +9 -2
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/configurators/run.py +18 -11
- dstack/_internal/cli/utils/run.py +7 -2
- dstack/_internal/core/backends/azure/compute.py +5 -2
- dstack/_internal/core/backends/cudo/compute.py +1 -1
- dstack/_internal/core/backends/nebius/fabrics.py +1 -0
- dstack/_internal/core/backends/nebius/models.py +1 -1
- dstack/_internal/core/models/configurations.py +19 -3
- dstack/_internal/core/models/resources.py +1 -1
- dstack/_internal/core/models/runs.py +19 -7
- dstack/_internal/server/background/tasks/process_metrics.py +30 -11
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/routers/repos.py +8 -4
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/instances.py +6 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +3 -3
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +33 -20
- dstack/_internal/server/settings.py +21 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
- dstack/_internal/server/testing/common.py +4 -0
- dstack/_internal/server/utils/routers.py +3 -6
- dstack/_internal/settings.py +4 -0
- dstack/api/_public/runs.py +6 -3
- dstack/api/server/_runs.py +6 -0
- dstack/version.py +1 -1
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/METADATA +46 -34
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/RECORD +42 -38
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
|
|
|
93
93
|
async with lock:
|
|
94
94
|
res = await session.execute(
|
|
95
95
|
select(JobModel)
|
|
96
|
+
.join(JobModel.run)
|
|
96
97
|
.where(
|
|
97
98
|
JobModel.status == JobStatus.SUBMITTED,
|
|
98
99
|
JobModel.id.not_in(lockset),
|
|
99
100
|
)
|
|
100
|
-
|
|
101
|
+
# Jobs are processed in FIFO order, sorted by priority globally,
|
|
102
|
+
# thus runs from different projects can "overtake" each other by using higher priorities.
|
|
103
|
+
# That's not a big problem as long as projects do not compete for the same compute resources.
|
|
104
|
+
# Jobs with lower priorities from other projects will be processed without major lag
|
|
105
|
+
# as long as new higher priority runs are not constantly submitted.
|
|
106
|
+
# TODO: Consider processing jobs from different projects fairly/round-robin
|
|
107
|
+
# Fully fair processing can be tricky to implement via the current DB queue as
|
|
108
|
+
# there can be many projects and we are limited by the max DB connections.
|
|
109
|
+
.order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
|
|
101
110
|
.limit(1)
|
|
102
111
|
.with_for_update(skip_locked=True)
|
|
103
112
|
)
|
|
@@ -360,16 +369,16 @@ async def _assign_job_to_pool_instance(
|
|
|
360
369
|
(instance, common_utils.get_or_error(get_instance_offer(instance)))
|
|
361
370
|
for instance in nonshared_instances
|
|
362
371
|
]
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
372
|
+
shared_instances_with_offers = get_shared_pool_instances_with_offers(
|
|
373
|
+
pool_instances=pool_instances,
|
|
374
|
+
profile=profile,
|
|
375
|
+
requirements=job.job_spec.requirements,
|
|
376
|
+
idle_only=True,
|
|
377
|
+
fleet_model=fleet_model,
|
|
378
|
+
multinode=multinode,
|
|
379
|
+
volumes=volumes,
|
|
380
|
+
)
|
|
381
|
+
instances_with_offers.extend(shared_instances_with_offers)
|
|
373
382
|
|
|
374
383
|
if len(instances_with_offers) == 0:
|
|
375
384
|
return None
|
|
@@ -572,7 +581,7 @@ def _create_instance_model_for_job(
|
|
|
572
581
|
|
|
573
582
|
|
|
574
583
|
def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
|
|
575
|
-
if offer.
|
|
584
|
+
if offer.blocks == offer.total_blocks:
|
|
576
585
|
if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
|
|
577
586
|
network_mode = NetworkMode.BRIDGE
|
|
578
587
|
else:
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Add JobModel.disconnected_at
|
|
2
|
+
|
|
3
|
+
Revision ID: 20166748b60c
|
|
4
|
+
Revises: 6c1a9d6530ee
|
|
5
|
+
Create Date: 2025-05-13 16:24:32.496578
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
from alembic_postgresql_enum import TableReference
|
|
12
|
+
|
|
13
|
+
import dstack._internal.server.models
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision = "20166748b60c"
|
|
17
|
+
down_revision = "6c1a9d6530ee"
|
|
18
|
+
branch_labels = None
|
|
19
|
+
depends_on = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
|
|
23
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
24
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
25
|
+
batch_op.add_column(
|
|
26
|
+
sa.Column(
|
|
27
|
+
"disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
|
|
28
|
+
)
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
op.sync_enum_values(
|
|
32
|
+
enum_schema="public",
|
|
33
|
+
enum_name="jobterminationreason",
|
|
34
|
+
new_values=[
|
|
35
|
+
"FAILED_TO_START_DUE_TO_NO_CAPACITY",
|
|
36
|
+
"INTERRUPTED_BY_NO_CAPACITY",
|
|
37
|
+
"INSTANCE_UNREACHABLE",
|
|
38
|
+
"WAITING_INSTANCE_LIMIT_EXCEEDED",
|
|
39
|
+
"WAITING_RUNNER_LIMIT_EXCEEDED",
|
|
40
|
+
"TERMINATED_BY_USER",
|
|
41
|
+
"VOLUME_ERROR",
|
|
42
|
+
"GATEWAY_ERROR",
|
|
43
|
+
"SCALED_DOWN",
|
|
44
|
+
"DONE_BY_RUNNER",
|
|
45
|
+
"ABORTED_BY_USER",
|
|
46
|
+
"TERMINATED_BY_SERVER",
|
|
47
|
+
"INACTIVITY_DURATION_EXCEEDED",
|
|
48
|
+
"TERMINATED_DUE_TO_UTILIZATION_POLICY",
|
|
49
|
+
"CONTAINER_EXITED_WITH_ERROR",
|
|
50
|
+
"PORTS_BINDING_FAILED",
|
|
51
|
+
"CREATING_CONTAINER_ERROR",
|
|
52
|
+
"EXECUTOR_ERROR",
|
|
53
|
+
"MAX_DURATION_EXCEEDED",
|
|
54
|
+
],
|
|
55
|
+
affected_columns=[
|
|
56
|
+
TableReference(
|
|
57
|
+
table_schema="public", table_name="jobs", column_name="termination_reason"
|
|
58
|
+
)
|
|
59
|
+
],
|
|
60
|
+
enum_values_to_rename=[],
|
|
61
|
+
)
|
|
62
|
+
# ### end Alembic commands ###
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def downgrade() -> None:
|
|
66
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
67
|
+
op.sync_enum_values(
|
|
68
|
+
enum_schema="public",
|
|
69
|
+
enum_name="jobterminationreason",
|
|
70
|
+
new_values=[
|
|
71
|
+
"FAILED_TO_START_DUE_TO_NO_CAPACITY",
|
|
72
|
+
"INTERRUPTED_BY_NO_CAPACITY",
|
|
73
|
+
"WAITING_INSTANCE_LIMIT_EXCEEDED",
|
|
74
|
+
"WAITING_RUNNER_LIMIT_EXCEEDED",
|
|
75
|
+
"TERMINATED_BY_USER",
|
|
76
|
+
"VOLUME_ERROR",
|
|
77
|
+
"GATEWAY_ERROR",
|
|
78
|
+
"SCALED_DOWN",
|
|
79
|
+
"DONE_BY_RUNNER",
|
|
80
|
+
"ABORTED_BY_USER",
|
|
81
|
+
"TERMINATED_BY_SERVER",
|
|
82
|
+
"INACTIVITY_DURATION_EXCEEDED",
|
|
83
|
+
"TERMINATED_DUE_TO_UTILIZATION_POLICY",
|
|
84
|
+
"CONTAINER_EXITED_WITH_ERROR",
|
|
85
|
+
"PORTS_BINDING_FAILED",
|
|
86
|
+
"CREATING_CONTAINER_ERROR",
|
|
87
|
+
"EXECUTOR_ERROR",
|
|
88
|
+
"MAX_DURATION_EXCEEDED",
|
|
89
|
+
],
|
|
90
|
+
affected_columns=[
|
|
91
|
+
TableReference(
|
|
92
|
+
table_schema="public", table_name="jobs", column_name="termination_reason"
|
|
93
|
+
)
|
|
94
|
+
],
|
|
95
|
+
enum_values_to_rename=[],
|
|
96
|
+
)
|
|
97
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
98
|
+
batch_op.drop_column("disconnected_at")
|
|
99
|
+
|
|
100
|
+
# ### end Alembic commands ###
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Add JobModel.exit_status
|
|
2
|
+
|
|
3
|
+
Revision ID: 6c1a9d6530ee
|
|
4
|
+
Revises: 7ba3b59d7ca6
|
|
5
|
+
Create Date: 2025-05-09 10:25:19.715852
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
|
|
12
|
+
# revision identifiers, used by Alembic.
|
|
13
|
+
revision = "6c1a9d6530ee"
|
|
14
|
+
down_revision = "7ba3b59d7ca6"
|
|
15
|
+
branch_labels = None
|
|
16
|
+
depends_on = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def upgrade() -> None:
|
|
20
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
21
|
+
batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def downgrade() -> None:
|
|
25
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
26
|
+
batch_op.drop_column("exit_status")
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Add RunModel.priority
|
|
2
|
+
|
|
3
|
+
Revision ID: bca2fdf130bf
|
|
4
|
+
Revises: 20166748b60c
|
|
5
|
+
Create Date: 2025-05-14 15:24:21.269775
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
|
|
12
|
+
# revision identifiers, used by Alembic.
|
|
13
|
+
revision = "bca2fdf130bf"
|
|
14
|
+
down_revision = "20166748b60c"
|
|
15
|
+
branch_labels = None
|
|
16
|
+
depends_on = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def upgrade() -> None:
|
|
20
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
21
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
22
|
+
batch_op.add_column(sa.Column("priority", sa.Integer(), nullable=True))
|
|
23
|
+
batch_op.execute("UPDATE runs SET priority = 0")
|
|
24
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
25
|
+
batch_op.alter_column("priority", nullable=False)
|
|
26
|
+
# ### end Alembic commands ###
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def downgrade() -> None:
|
|
30
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
31
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
32
|
+
batch_op.drop_column("priority")
|
|
33
|
+
|
|
34
|
+
# ### end Alembic commands ###
|
|
@@ -348,6 +348,7 @@ class RunModel(BaseModel):
|
|
|
348
348
|
resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0)
|
|
349
349
|
run_spec: Mapped[str] = mapped_column(Text)
|
|
350
350
|
service_spec: Mapped[Optional[str]] = mapped_column(Text)
|
|
351
|
+
priority: Mapped[int] = mapped_column(Integer, default=0)
|
|
351
352
|
|
|
352
353
|
jobs: Mapped[List["JobModel"]] = relationship(
|
|
353
354
|
back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]"
|
|
@@ -382,6 +383,10 @@ class JobModel(BaseModel):
|
|
|
382
383
|
Enum(JobTerminationReason)
|
|
383
384
|
)
|
|
384
385
|
termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
|
|
386
|
+
# `disconnected_at` stores the first time of connectivity issues with the instance.
|
|
387
|
+
# Resets every time connectivity is restored.
|
|
388
|
+
disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
389
|
+
exit_status: Mapped[Optional[int]] = mapped_column(Integer)
|
|
385
390
|
job_spec_data: Mapped[str] = mapped_column(Text)
|
|
386
391
|
job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
|
|
387
392
|
runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
|
|
@@ -390,7 +395,7 @@ class JobModel(BaseModel):
|
|
|
390
395
|
remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
391
396
|
volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
392
397
|
# `instance_assigned` means instance assignment was done.
|
|
393
|
-
# if `instance_assigned` is True and `instance` is None, no instance was
|
|
398
|
+
# if `instance_assigned` is True and `instance` is None, no instance was assigned.
|
|
394
399
|
instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
|
|
395
400
|
instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
|
|
396
401
|
ForeignKey("instances.id", ondelete="CASCADE")
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import List, Tuple
|
|
2
2
|
|
|
3
3
|
from fastapi import APIRouter, Depends, Request, UploadFile
|
|
4
|
+
from humanize import naturalsize
|
|
4
5
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
6
|
|
|
6
7
|
from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
|
|
@@ -14,9 +15,10 @@ from dstack._internal.server.schemas.repos import (
|
|
|
14
15
|
)
|
|
15
16
|
from dstack._internal.server.security.permissions import ProjectMember
|
|
16
17
|
from dstack._internal.server.services import repos
|
|
18
|
+
from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
|
|
17
19
|
from dstack._internal.server.utils.routers import (
|
|
18
20
|
get_base_api_additional_responses,
|
|
19
|
-
|
|
21
|
+
get_request_size,
|
|
20
22
|
)
|
|
21
23
|
|
|
22
24
|
router = APIRouter(
|
|
@@ -94,10 +96,12 @@ async def upload_code(
|
|
|
94
96
|
session: AsyncSession = Depends(get_session),
|
|
95
97
|
user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
|
|
96
98
|
):
|
|
97
|
-
|
|
99
|
+
request_size = get_request_size(request)
|
|
100
|
+
if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
|
|
98
101
|
raise ServerClientError(
|
|
99
|
-
"Repo diff size exceeds the limit of
|
|
100
|
-
"Use .gitignore to exclude large files from the repo."
|
|
102
|
+
f"Repo diff size is {naturalsize(request_size)}, which exceeds the limit of "
|
|
103
|
+
f"{naturalsize(SERVER_CODE_UPLOAD_LIMIT)}. Use .gitignore to exclude large files from the repo. This "
|
|
104
|
+
f"limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT_BYTES environment variable"
|
|
101
105
|
)
|
|
102
106
|
_, project = user_project
|
|
103
107
|
await repos.upload_code(
|
|
@@ -7,7 +7,14 @@ from typing_extensions import Annotated
|
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.models.common import CoreModel, NetworkMode
|
|
9
9
|
from dstack._internal.core.models.repos.remote import RemoteRepoCreds
|
|
10
|
-
from dstack._internal.core.models.runs import
|
|
10
|
+
from dstack._internal.core.models.runs import (
|
|
11
|
+
ClusterInfo,
|
|
12
|
+
JobSpec,
|
|
13
|
+
JobStatus,
|
|
14
|
+
JobSubmission,
|
|
15
|
+
Run,
|
|
16
|
+
RunSpec,
|
|
17
|
+
)
|
|
11
18
|
from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
|
|
12
19
|
|
|
13
20
|
|
|
@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
|
|
|
16
23
|
state: JobStatus
|
|
17
24
|
termination_reason: Optional[str] = None
|
|
18
25
|
termination_message: Optional[str] = None
|
|
26
|
+
exit_status: Optional[int] = None
|
|
19
27
|
|
|
20
28
|
|
|
21
29
|
class LogEvent(CoreModel):
|
|
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):
|
|
|
38
46
|
|
|
39
47
|
|
|
40
48
|
class SubmitBody(CoreModel):
|
|
41
|
-
|
|
42
|
-
|
|
49
|
+
run: Annotated[
|
|
50
|
+
Run,
|
|
43
51
|
Field(
|
|
44
52
|
include={
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
53
|
+
"id": True,
|
|
54
|
+
"run_spec": {
|
|
55
|
+
"run_name",
|
|
56
|
+
"repo_id",
|
|
57
|
+
"repo_data",
|
|
58
|
+
"configuration",
|
|
59
|
+
"configuration_path",
|
|
60
|
+
},
|
|
50
61
|
}
|
|
51
62
|
),
|
|
52
63
|
]
|
|
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
|
|
|
69
80
|
}
|
|
70
81
|
),
|
|
71
82
|
]
|
|
83
|
+
job_submission: Annotated[
|
|
84
|
+
JobSubmission,
|
|
85
|
+
Field(
|
|
86
|
+
include={
|
|
87
|
+
"id",
|
|
88
|
+
}
|
|
89
|
+
),
|
|
90
|
+
]
|
|
72
91
|
cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
|
|
73
92
|
secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
|
|
74
93
|
repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
|
|
94
|
+
# run_spec is deprecated in favor of run.run_spec
|
|
95
|
+
# TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
|
|
96
|
+
run_spec: Annotated[
|
|
97
|
+
RunSpec,
|
|
98
|
+
Field(
|
|
99
|
+
include={
|
|
100
|
+
"run_name",
|
|
101
|
+
"repo_id",
|
|
102
|
+
"repo_data",
|
|
103
|
+
"configuration",
|
|
104
|
+
"configuration_path",
|
|
105
|
+
},
|
|
106
|
+
),
|
|
107
|
+
]
|
|
75
108
|
|
|
76
109
|
|
|
77
110
|
class HealthcheckResponse(CoreModel):
|
|
@@ -235,6 +235,7 @@ def get_shared_pool_instances_with_offers(
|
|
|
235
235
|
*,
|
|
236
236
|
idle_only: bool = False,
|
|
237
237
|
fleet_model: Optional[FleetModel] = None,
|
|
238
|
+
multinode: bool = False,
|
|
238
239
|
volumes: Optional[List[List[Volume]]] = None,
|
|
239
240
|
) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
|
|
240
241
|
instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = []
|
|
@@ -243,19 +244,22 @@ def get_shared_pool_instances_with_offers(
|
|
|
243
244
|
pool_instances=pool_instances,
|
|
244
245
|
profile=profile,
|
|
245
246
|
fleet_model=fleet_model,
|
|
246
|
-
multinode=
|
|
247
|
+
multinode=multinode,
|
|
247
248
|
volumes=volumes,
|
|
248
249
|
shared=True,
|
|
249
250
|
)
|
|
250
251
|
for instance in filtered_instances:
|
|
251
252
|
if idle_only and instance.status not in [InstanceStatus.IDLE, InstanceStatus.BUSY]:
|
|
252
253
|
continue
|
|
254
|
+
if multinode and instance.busy_blocks > 0:
|
|
255
|
+
continue
|
|
253
256
|
offer = get_instance_offer(instance)
|
|
254
257
|
if offer is None:
|
|
255
258
|
continue
|
|
256
259
|
total_blocks = common_utils.get_or_error(instance.total_blocks)
|
|
257
260
|
idle_blocks = total_blocks - instance.busy_blocks
|
|
258
|
-
|
|
261
|
+
min_blocks = total_blocks if multinode else 1
|
|
262
|
+
for blocks in range(min_blocks, total_blocks + 1):
|
|
259
263
|
shared_offer = generate_shared_offer(offer, blocks, total_blocks)
|
|
260
264
|
catalog_item = offer_to_catalog_item(shared_offer)
|
|
261
265
|
if gpuhunt.matches(catalog_item, query_filter):
|
|
@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
135
135
|
status=job_model.status,
|
|
136
136
|
termination_reason=job_model.termination_reason,
|
|
137
137
|
termination_reason_message=job_model.termination_reason_message,
|
|
138
|
+
exit_status=job_model.exit_status,
|
|
138
139
|
job_provisioning_data=job_provisioning_data,
|
|
139
140
|
job_runtime_data=get_job_runtime_data(job_model),
|
|
140
141
|
)
|
|
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Union
|
|
|
6
6
|
|
|
7
7
|
from cachetools import TTLCache, cached
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
from dstack._internal import settings
|
|
10
10
|
from dstack._internal.core.errors import DockerRegistryError, ServerClientError
|
|
11
11
|
from dstack._internal.core.models.common import RegistryAuth
|
|
12
12
|
from dstack._internal.core.models.configurations import (
|
|
@@ -53,14 +53,14 @@ def get_default_image(python_version: str, nvcc: bool = False) -> str:
|
|
|
53
53
|
suffix = ""
|
|
54
54
|
if nvcc:
|
|
55
55
|
suffix = "-devel"
|
|
56
|
-
return f"
|
|
56
|
+
return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}"
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
class JobConfigurator(ABC):
|
|
60
60
|
TYPE: RunConfigurationType
|
|
61
61
|
|
|
62
62
|
_image_config: Optional[ImageConfig] = None
|
|
63
|
-
# JobSSHKey should be shared for all jobs in a replica for inter-node
|
|
63
|
+
# JobSSHKey should be shared for all jobs in a replica for inter-node communication.
|
|
64
64
|
_job_ssh_key: Optional[JobSSHKey] = None
|
|
65
65
|
|
|
66
66
|
def __init__(self, run_spec: RunSpec):
|
|
@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
|
|
|
12
12
|
from dstack._internal.core.models.envs import Env
|
|
13
13
|
from dstack._internal.core.models.repos.remote import RemoteRepoCreds
|
|
14
14
|
from dstack._internal.core.models.resources import Memory
|
|
15
|
-
from dstack._internal.core.models.runs import ClusterInfo,
|
|
15
|
+
from dstack._internal.core.models.runs import ClusterInfo, Job, Run
|
|
16
16
|
from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
|
|
17
17
|
from dstack._internal.server.schemas.runner import (
|
|
18
18
|
GPUDevice,
|
|
@@ -72,8 +72,8 @@ class RunnerClient:
|
|
|
72
72
|
|
|
73
73
|
def submit_job(
|
|
74
74
|
self,
|
|
75
|
-
|
|
76
|
-
|
|
75
|
+
run: Run,
|
|
76
|
+
job: Job,
|
|
77
77
|
cluster_info: ClusterInfo,
|
|
78
78
|
secrets: Dict[str, str],
|
|
79
79
|
repo_credentials: Optional[RemoteRepoCreds],
|
|
@@ -81,6 +81,7 @@ class RunnerClient:
|
|
|
81
81
|
):
|
|
82
82
|
# XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
|
|
83
83
|
# variables to the runner without runner API modification.
|
|
84
|
+
job_spec = job.job_spec
|
|
84
85
|
if instance_env is not None:
|
|
85
86
|
if isinstance(instance_env, Env):
|
|
86
87
|
merged_env = instance_env.as_dict()
|
|
@@ -90,11 +91,13 @@ class RunnerClient:
|
|
|
90
91
|
job_spec = job_spec.copy(deep=True)
|
|
91
92
|
job_spec.env = merged_env
|
|
92
93
|
body = SubmitBody(
|
|
93
|
-
|
|
94
|
+
run=run,
|
|
94
95
|
job_spec=job_spec,
|
|
96
|
+
job_submission=job.job_submissions[-1],
|
|
95
97
|
cluster_info=cluster_info,
|
|
96
98
|
secrets=secrets,
|
|
97
99
|
repo_credentials=repo_credentials,
|
|
100
|
+
run_spec=run.run_spec,
|
|
98
101
|
)
|
|
99
102
|
resp = requests.post(
|
|
100
103
|
# use .json() to encode enums
|
|
@@ -16,7 +16,7 @@ from dstack._internal.core.errors import (
|
|
|
16
16
|
ServerClientError,
|
|
17
17
|
)
|
|
18
18
|
from dstack._internal.core.models.common import ApplyAction
|
|
19
|
-
from dstack._internal.core.models.configurations import AnyRunConfiguration
|
|
19
|
+
from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, AnyRunConfiguration
|
|
20
20
|
from dstack._internal.core.models.instances import (
|
|
21
21
|
InstanceAvailability,
|
|
22
22
|
InstanceOfferWithAvailability,
|
|
@@ -434,7 +434,12 @@ async def apply_plan(
|
|
|
434
434
|
# FIXME: potentially long write transaction
|
|
435
435
|
# Avoid getting run_model after update
|
|
436
436
|
await session.execute(
|
|
437
|
-
update(RunModel)
|
|
437
|
+
update(RunModel)
|
|
438
|
+
.where(RunModel.id == current_resource.id)
|
|
439
|
+
.values(
|
|
440
|
+
run_spec=run_spec.json(),
|
|
441
|
+
priority=run_spec.configuration.priority,
|
|
442
|
+
)
|
|
438
443
|
)
|
|
439
444
|
run = await get_run_by_name(
|
|
440
445
|
session=session,
|
|
@@ -495,6 +500,7 @@ async def submit_run(
|
|
|
495
500
|
status=RunStatus.SUBMITTED,
|
|
496
501
|
run_spec=run_spec.json(),
|
|
497
502
|
last_processed_at=submitted_at,
|
|
503
|
+
priority=run_spec.configuration.priority,
|
|
498
504
|
)
|
|
499
505
|
session.add(run_model)
|
|
500
506
|
|
|
@@ -721,15 +727,15 @@ async def _get_pool_offers(
|
|
|
721
727
|
pool_instances = [i for i in pool_instances if i.id not in detaching_instances_ids]
|
|
722
728
|
multinode = job.job_spec.jobs_per_replica > 1
|
|
723
729
|
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
730
|
+
shared_instances_with_offers = get_shared_pool_instances_with_offers(
|
|
731
|
+
pool_instances=pool_instances,
|
|
732
|
+
profile=run_spec.merged_profile,
|
|
733
|
+
requirements=job.job_spec.requirements,
|
|
734
|
+
volumes=volumes,
|
|
735
|
+
multinode=multinode,
|
|
736
|
+
)
|
|
737
|
+
for _, offer in shared_instances_with_offers:
|
|
738
|
+
pool_offers.append(offer)
|
|
733
739
|
|
|
734
740
|
nonshared_instances = filter_pool_instances(
|
|
735
741
|
pool_instances=pool_instances,
|
|
@@ -852,6 +858,13 @@ def _get_job_submission_cost(job_submission: JobSubmission) -> float:
|
|
|
852
858
|
|
|
853
859
|
|
|
854
860
|
def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
|
|
861
|
+
# This function may set defaults for null run_spec values,
|
|
862
|
+
# although most defaults are resolved when building job_spec
|
|
863
|
+
# so that we can keep both the original user-supplied value (null in run_spec)
|
|
864
|
+
# and the default in job_spec.
|
|
865
|
+
# If a property is stored in job_spec - resolve the default there.
|
|
866
|
+
# Server defaults are preferable over client defaults so that
|
|
867
|
+
# the defaults depend on the server version, not the client version.
|
|
855
868
|
if run_spec.run_name is not None:
|
|
856
869
|
validate_dstack_resource_name(run_spec.run_name)
|
|
857
870
|
for mount_point in run_spec.configuration.volumes:
|
|
@@ -870,16 +883,19 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
|
|
|
870
883
|
if (
|
|
871
884
|
run_spec.merged_profile.utilization_policy is not None
|
|
872
885
|
and run_spec.merged_profile.utilization_policy.time_window
|
|
873
|
-
> settings.
|
|
886
|
+
> settings.SERVER_METRICS_RUNNING_TTL_SECONDS
|
|
874
887
|
):
|
|
875
888
|
raise ServerClientError(
|
|
876
|
-
f"Maximum utilization_policy.time_window is {settings.
|
|
889
|
+
f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
|
|
877
890
|
)
|
|
891
|
+
if run_spec.configuration.priority is None:
|
|
892
|
+
run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
|
|
878
893
|
set_resources_defaults(run_spec.configuration.resources)
|
|
879
894
|
|
|
880
895
|
|
|
881
896
|
_UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
|
|
882
|
-
|
|
897
|
+
_CONF_UPDATABLE_FIELDS = ["priority"]
|
|
898
|
+
_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
|
|
883
899
|
"dev-environment": ["inactivity_duration"],
|
|
884
900
|
# Most service fields can be updated via replica redeployment.
|
|
885
901
|
# TODO: Allow updating other fields when rolling deployment is supported.
|
|
@@ -915,12 +931,9 @@ def _check_can_update_configuration(
|
|
|
915
931
|
raise ServerClientError(
|
|
916
932
|
f"Configuration type changed from {current.type} to {new.type}, cannot update"
|
|
917
933
|
)
|
|
918
|
-
updatable_fields =
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
f"Can only update {', '.join(_CONF_TYPE_TO_UPDATABLE_FIELDS)} configurations."
|
|
922
|
-
f" Not {new.type}"
|
|
923
|
-
)
|
|
934
|
+
updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get(
|
|
935
|
+
new.type, []
|
|
936
|
+
)
|
|
924
937
|
diff = diff_models(current, new)
|
|
925
938
|
changed_fields = list(diff.keys())
|
|
926
939
|
for key in changed_fields:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import warnings
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
|
|
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")
|
|
|
45
46
|
|
|
46
47
|
SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")
|
|
47
48
|
|
|
48
|
-
|
|
49
|
+
SERVER_METRICS_RUNNING_TTL_SECONDS: int
|
|
50
|
+
_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
|
|
51
|
+
if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
|
|
52
|
+
_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
|
|
53
|
+
if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
|
|
54
|
+
warnings.warn(
|
|
55
|
+
(
|
|
56
|
+
"DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
|
|
57
|
+
" use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
|
|
58
|
+
),
|
|
59
|
+
DeprecationWarning,
|
|
60
|
+
)
|
|
61
|
+
else:
|
|
62
|
+
_SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
|
|
63
|
+
SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
|
|
64
|
+
del _SERVER_METRICS_RUNNING_TTL_SECONDS
|
|
65
|
+
SERVER_METRICS_FINISHED_TTL_SECONDS = int(
|
|
66
|
+
os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
|
|
67
|
+
)
|
|
49
68
|
|
|
50
69
|
DEFAULT_PROJECT_NAME = "main"
|
|
51
70
|
|
|
@@ -66,6 +85,7 @@ DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE = int(
|
|
|
66
85
|
USER_PROJECT_DEFAULT_QUOTA = int(os.getenv("DSTACK_USER_PROJECT_DEFAULT_QUOTA", 10))
|
|
67
86
|
FORBID_SERVICES_WITHOUT_GATEWAY = os.getenv("DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY") is not None
|
|
68
87
|
|
|
88
|
+
SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20))
|
|
69
89
|
|
|
70
90
|
# Development settings
|
|
71
91
|
|