dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack has been flagged as potentially problematic by the registry scanner.
- dstack/_internal/cli/services/configurators/fleet.py +111 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +29 -21
- dstack/_internal/core/compatibility/volumes.py +11 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +45 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +56 -3
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +37 -9
- dstack/_internal/server/background/__init__.py +66 -40
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +47 -29
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_runs.py +8 -4
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +2 -2
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +358 -75
- dstack/_internal/server/services/gateways/__init__.py +17 -6
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +12 -1
- dstack/_internal/server/services/locking.py +104 -13
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +2 -4
- dstack/_internal/server/services/logs/filelog.py +33 -27
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +139 -72
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +15 -2
- dstack/_internal/server/settings.py +25 -6
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
- dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/testing/common.py +48 -8
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
@@ -11,6 +11,7 @@ from dstack._internal.core.models.configurations import (
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.files import FileArchiveMapping
 from dstack._internal.core.models.instances import (
@@ -101,6 +102,14 @@ class RunTerminationReason(str, Enum):
         }
         return mapping[self]

+    def to_error(self) -> Optional[str]:
+        if self == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif self == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+

 class JobTerminationReason(str, Enum):
     # Set by the server
@@ -162,6 +171,24 @@ class JobTerminationReason(str, Enum):
         default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
         return mapping.get(self, default)

+    def to_error(self) -> Optional[str]:
+        # Should return None for values that are already
+        # handled and shown in status_message.
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "volume error",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(self)
+

 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
@@ -227,6 +254,8 @@ class JobSpec(CoreModel):
     # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
     repo_code_hash: Optional[str] = None
     file_archives: list[FileArchiveMapping] = []
+    # None for non-services and pre-0.19.19 services. See `get_service_port`
+    service_port: Optional[int] = None


 class JobProvisioningData(CoreModel):
@@ -305,13 +334,12 @@ class JobSubmission(CoreModel):
     finished_at: Optional[datetime]
     inactivity_secs: Optional[int]
     status: JobStatus
+    status_message: str = ""  # default for backward compatibility
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
-    # TODO: make status_message and error a computed field after migrating to pydanticV2
-    status_message: Optional[str] = None
     error: Optional[str] = None

     @property
@@ -325,71 +353,6 @@ class JobSubmission(CoreModel):
         end_time = self.finished_at
         return end_time - self.submitted_at

-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            termination_reason = values["termination_reason"]
-            exit_code = values["exit_status"]
-        except KeyError:
-            return values
-        values["status_message"] = JobSubmission._get_status_message(
-            status=status,
-            termination_reason=termination_reason,
-            exit_status=exit_code,
-        )
-        return values
-
-    @staticmethod
-    def _get_status_message(
-        status: JobStatus,
-        termination_reason: Optional[JobTerminationReason],
-        exit_status: Optional[int],
-    ) -> str:
-        if status == JobStatus.DONE:
-            return "exited (0)"
-        elif status == JobStatus.FAILED:
-            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
-                return f"exited ({exit_status})"
-            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
-                return "no offers"
-            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
-                return "interrupted"
-            else:
-                return "error"
-        elif status == JobStatus.TERMINATED:
-            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
-                return "stopped"
-            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
-                return "aborted"
-        return status.value
-
-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
-        error_mapping = {
-            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
-            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
-            JobTerminationReason.VOLUME_ERROR: "volume error",
-            JobTerminationReason.GATEWAY_ERROR: "gateway error",
-            JobTerminationReason.SCALED_DOWN: "scaled down",
-            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
-            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
-            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
-            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
-            JobTerminationReason.EXECUTOR_ERROR: "executor error",
-            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
-        }
-        return error_mapping.get(termination_reason)
-

 class Job(CoreModel):
     job_spec: JobSpec
@@ -524,85 +487,17 @@ class Run(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     status: RunStatus
-    status_message:
-    termination_reason: Optional[RunTerminationReason]
+    status_message: str = ""  # default for backward compatibility
+    termination_reason: Optional[RunTerminationReason] = None
     run_spec: RunSpec
     jobs: List[Job]
-    latest_job_submission: Optional[JobSubmission]
+    latest_job_submission: Optional[JobSubmission] = None
     cost: float = 0
     service: Optional[ServiceSpec] = None
     deployment_num: int = 0  # default for compatibility with pre-0.19.14 servers
-    # TODO: make error a computed field after migrating to pydanticV2
     error: Optional[str] = None
     deleted: Optional[bool] = None

-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = Run._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
-        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
-            return "retry limit exceeded"
-        elif termination_reason == RunTerminationReason.SERVER_ERROR:
-            return "server error"
-        else:
-            return None
-
-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            jobs: List[Job] = values["jobs"]
-            retry_on_events = (
-                jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
-            )
-            job_status = (
-                jobs[0].job_submissions[-1].status
-                if len(jobs) == 1 and jobs[0].job_submissions
-                else None
-            )
-            termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
-        except KeyError:
-            return values
-        values["status_message"] = Run._get_status_message(
-            status=status,
-            job_status=job_status,
-            retry_on_events=retry_on_events,
-            termination_reason=termination_reason,
-        )
-        return values
-
-    @staticmethod
-    def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
-        for submission in reversed(job.job_submissions):
-            if submission.termination_reason is not None:
-                return submission.termination_reason
-        return None
-
-    @staticmethod
-    def _get_status_message(
-        status: RunStatus,
-        job_status: Optional[JobStatus],
-        retry_on_events: List[RetryEvent],
-        termination_reason: Optional[JobTerminationReason],
-    ) -> str:
-        if job_status == JobStatus.PULLING:
-            return "pulling"
-        # Currently, `retrying` is shown only for `no-capacity` events
-        if (
-            status in [RunStatus.SUBMITTED, RunStatus.PENDING]
-            and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
-            and RetryEvent.NO_CAPACITY in retry_on_events
-        ):
-            return "retrying"
-        return status.value
-
     def is_deployment_in_progress(self) -> bool:
         return any(
             not j.job_submissions[-1].status.is_finished()
@@ -658,3 +553,11 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
+
+
+def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
+    # Compatibility with pre-0.19.19 job specs that do not have the `service_port` property.
+    # TODO: drop when pre-0.19.19 jobs are no longer relevant.
+    if job_spec.service_port is None:
+        return configuration.port.container_port
+    return job_spec.service_port
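Note (not part of the diff): the hunks above appear to belong to dstack/_internal/core/models/runs.py from the file list. They move the status_message/error derivation out of pydantic root_validators into plain fields with backward-compatible defaults, add to_error() helpers on the termination-reason enums, and add get_service_port() for pre-0.19.19 job specs. A rough usage sketch of the new helpers, assuming dstack 0.19.19 is installed; these are internal modules, not a stable API:

    # Illustrative only: exercises the to_error() helpers added in this release.
    from dstack._internal.core.models.runs import JobTerminationReason, RunTerminationReason

    print(RunTerminationReason.RETRY_LIMIT_EXCEEDED.to_error())  # "retry limit exceeded"
    print(RunTerminationReason.SERVER_ERROR.to_error())          # "server error"
    print(JobTerminationReason.VOLUME_ERROR.to_error())          # "volume error"
    # Reasons already surfaced via status_message map to None:
    print(JobTerminationReason.TERMINATED_BY_USER.to_error())    # None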
@@ -9,6 +9,7 @@ from typing_extensions import Annotated, Self

 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.profiles import parse_idle_duration
 from dstack._internal.core.models.resources import Memory
 from dstack._internal.utils.common import get_or_error
 from dstack._internal.utils.tags import tags_validator
@@ -44,6 +45,16 @@ class VolumeConfiguration(CoreModel):
         Optional[str],
         Field(description="The volume ID. Must be specified when registering external volumes"),
     ] = None
+    auto_cleanup_duration: Annotated[
+        Optional[Union[str, int]],
+        Field(
+            description=(
+                "Time to wait after volume is no longer used by any job before deleting it. "
+                "Defaults to keep the volume indefinitely. "
+                "Use the value 'off' or -1 to disable auto-cleanup."
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(
@@ -56,6 +67,9 @@ class VolumeConfiguration(CoreModel):
     ] = None

     _validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator)
+    _validate_auto_cleanup_duration = validator(
+        "auto_cleanup_duration", pre=True, allow_reuse=True
+    )(parse_idle_duration)

     @property
     def size_gb(self) -> int:
@@ -1,14 +1,46 @@
-from typing import Any,
+from typing import Any, Optional, TypedDict, TypeVar

 from pydantic import BaseModel

+from dstack._internal.core.models.common import IncludeExcludeType
+
+
+class ModelFieldDiff(TypedDict):
+    old: Any
+    new: Any
+
+
+ModelDiff = dict[str, ModelFieldDiff]
+

 # TODO: calculate nested diffs
-def diff_models(
+def diff_models(
+    old: BaseModel, new: BaseModel, reset: Optional[IncludeExcludeType] = None
+) -> ModelDiff:
+    """
+    Returns a diff of model instances fields.
+
+    The fields specified in the `reset` option are reset to their default values, effectively
+    excluding them from comparison (assuming that the default value is equal to itself, e.g,
+    `None == None`, `"task" == "task"`, but `math.nan != math.nan`).
+
+    Args:
+        old: The "old" model instance.
+        new: The "new" model instance.
+        reset: Fields to reset to their default values before comparison.
+
+    Returns:
+        A dict of changed fields in the form of
+        `{<field_name>: {"old": old_value, "new": new_value}}`
+    """
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")

-
+    if reset is not None:
+        old = copy_model(old, reset=reset)
+        new = copy_model(new, reset=reset)
+
+    changes: ModelDiff = {}
     for field in old.__fields__:
         old_value = getattr(old, field)
         new_value = getattr(new, field)
@@ -16,3 +48,24 @@ def diff_models(old: BaseModel, new: BaseModel) -> Dict[str, Any]:
             changes[field] = {"old": old_value, "new": new_value}

     return changes
+
+
+M = TypeVar("M", bound=BaseModel)
+
+
+def copy_model(model: M, reset: Optional[IncludeExcludeType] = None) -> M:
+    """
+    Returns a deep copy of the model instance.
+
+    Implemented as `BaseModel.parse_obj(BaseModel.dict())`, thus,
+    unlike `BaseModel.copy(deep=True)`, runs all validations.
+
+    The fields specified in the `reset` option are reset to their default values.
+
+    Args:
+        reset: Fields to reset to their default values.
+
+    Returns:
+        A deep copy of the model instance.
+    """
+    return type(model).parse_obj(model.dict(exclude=reset))
@@ -64,6 +64,7 @@ class SSHAttach:
         run_name: str,
         dockerized: bool,
         ssh_proxy: Optional[SSHConnectionParams] = None,
+        service_port: Optional[int] = None,
         local_backend: bool = False,
         bind_address: Optional[str] = None,
     ):
@@ -90,6 +91,7 @@ class SSHAttach:
             },
         )
         self.ssh_proxy = ssh_proxy
+        self.service_port = service_port

         hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
         self.hosts = hosts
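Note (not part of the diff): the diff_models/copy_model hunks above apparently come from dstack/_internal/core/services/diff.py. A small standalone sketch of the new reset behavior; the Profile model is hypothetical and exists only for illustration, and the internal import path is assumed unchanged:

    from pydantic import BaseModel

    from dstack._internal.core.services.diff import copy_model, diff_models


    class Profile(BaseModel):
        # Hypothetical example model, not a dstack type.
        name: str
        spot: bool = False


    old = Profile(name="dev", spot=False)
    new = Profile(name="dev", spot=True)

    print(diff_models(old, new))                  # {'spot': {'old': False, 'new': True}}
    # reset accepts pydantic-style include/exclude input; a set of field names is assumed here.
    print(diff_models(old, new, reset={"spot"}))  # {} - 'spot' is reset to its default on both sides
    print(copy_model(new, reset={"spot"}).spot)   # False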
dstack/_internal/server/app.py
CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import importlib.resources
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Awaitable, Callable, List
@@ -9,7 +10,7 @@ from typing import Awaitable, Callable, List
 import sentry_sdk
 from fastapi import FastAPI, Request, Response, status
 from fastapi.datastructures import URL
-from fastapi.responses import HTMLResponse,
+from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram

@@ -55,6 +56,7 @@ from dstack._internal.server.settings import (
 )
 from dstack._internal.server.utils.logging import configure_logging
 from dstack._internal.server.utils.routers import (
+    CustomORJSONResponse,
     check_client_server_compatibility,
     error_detail,
     get_server_client_error_details,
@@ -89,7 +91,10 @@ def create_app() -> FastAPI:
         profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
     )

-    app = FastAPI(
+    app = FastAPI(
+        docs_url="/api/docs",
+        lifespan=lifespan,
+    )
     app.state.proxy_dependency_injector = ServerProxyDependencyInjector()
     return app

@@ -97,6 +102,8 @@ def create_app() -> FastAPI:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     configure_logging()
+    server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
+    asyncio.get_running_loop().set_default_executor(server_executor)
     await migrate()
     _print_dstack_logo()
     if not check_required_ssh_version():
@@ -144,7 +151,10 @@ async def lifespan(app: FastAPI):
     )
     if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
-
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler = start_background_tasks()
+    else:
+        logger.info("Background processing is disabled")
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
@@ -154,7 +164,8 @@ async def lifespan(app: FastAPI):
     for func in _ON_STARTUP_HOOKS:
         await func(app)
     yield
-
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler.shutdown()
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()
@@ -205,14 +216,14 @@ def register_routes(app: FastAPI, ui: bool = True):
         msg = "Access denied"
         if len(exc.args) > 0:
             msg = exc.args[0]
-        return
+        return CustomORJSONResponse(
            status_code=status.HTTP_403_FORBIDDEN,
            content=error_detail(msg),
        )

     @app.exception_handler(ServerClientError)
     async def server_client_error_handler(request: Request, exc: ServerClientError):
-        return
+        return CustomORJSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
            content={"detail": get_server_client_error_details(exc)},
        )
@@ -220,7 +231,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     @app.exception_handler(OSError)
     async def os_error_handler(request, exc: OSError):
         if exc.errno in [36, 63]:
-            return
+            return CustomORJSONResponse(
                {"detail": "Filename too long"},
                status_code=status.HTTP_400_BAD_REQUEST,
            )
@@ -242,6 +253,23 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response

+    if settings.SERVER_PROFILING_ENABLED:
+        from pyinstrument import Profiler
+
+        @app.middleware("http")
+        async def profile_request(request: Request, call_next):
+            profiling = request.query_params.get("profile", False)
+            if profiling:
+                profiler = Profiler()
+                profiler.start()
+                respone = await call_next(request)
+                profiler.stop()
+                with open("profiling_results.html", "w+") as f:
+                    f.write(profiler.output_html())
+                return respone
+            else:
+                return await call_next(request)
+
     # this middleware must be defined after the log_request middleware
     @app.middleware("http")
     async def log_http_metrics(request: Request, call_next):
@@ -289,7 +317,7 @@ def register_routes(app: FastAPI, ui: bool = True):

     @app.get("/healthcheck")
     async def healthcheck():
-        return
+        return CustomORJSONResponse(content={"status": "running"})

     if ui and Path(__file__).parent.joinpath("statics").exists():
         app.mount(
@@ -303,7 +331,7 @@ def register_routes(app: FastAPI, ui: bool = True):
             or _is_proxy_request(request)
             or _is_prometheus_request(request)
         ):
-            return
+            return CustomORJSONResponse(
                {"detail": exc.detail},
                status_code=status.HTTP_404_NOT_FOUND,
            )
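Note (not part of the diff): most of the app.py changes are mechanical (CustomORJSONResponse everywhere, an optional pyinstrument middleware, background processing gated behind SERVER_BACKGROUND_PROCESSING_ENABLED). The less obvious one is the lifespan hook installing a ThreadPoolExecutor as the event loop's default executor. A minimal standalone sketch of that pattern; the worker count here is arbitrary, while dstack takes it from settings.SERVER_EXECUTOR_MAX_WORKERS:

    import asyncio
    import time
    from concurrent.futures import ThreadPoolExecutor


    async def main():
        # Same pattern as in lifespan(): every run_in_executor(None, ...) call in the
        # process now shares this pool instead of the loop's default-sized one.
        executor = ThreadPoolExecutor(max_workers=8)
        loop = asyncio.get_running_loop()
        loop.set_default_executor(executor)

        # Any blocking call offloaded with executor=None ends up in the pool above.
        await loop.run_in_executor(None, time.sleep, 0.1)


    asyncio.run(main())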
@@ -4,9 +4,10 @@ from apscheduler.triggers.interval import IntervalTrigger
 from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
+    process_gateways,
     process_gateways_connections,
-    process_submitted_gateways,
 )
+from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
     process_instances,
 )
@@ -37,15 +38,31 @@ def get_scheduler() -> AsyncIOScheduler:


 def start_background_tasks() -> AsyncIOScheduler:
+    # We try to process as many resources as possible without exhausting DB connections.
+    #
+    # Quick tasks can process multiple resources per transaction.
+    # Potentially long tasks process one resource per transaction
+    # to avoid holding locks for all the resources if one is slow to process.
+    # Still, the next batch won't be processed unless all resources are processed,
+    # so larger batches do not increase processing rate linearly.
+    #
+    # The interval, batch_size, and max_instances determine background tasks processing rates.
+    # By default, one server replica can handle:
+    #
+    # * 150 active jobs with 2 minutes processing latency
+    # * 150 active runs with 2 minutes processing latency
+    # * 150 active instances with 2 minutes processing latency
+    #
+    # These latency numbers do not account for provisioning time,
+    # so it may be slower if a backend is slow to provision.
+    #
+    # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica.
+    # They also need to increase max db connections on the client side and db side.
+    #
     # In-memory locking via locksets does not guarantee
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.

-    # The batch_size and interval determine background tasks processing rates.
-    # Currently one server replica can handle:
-    # * 150 active jobs with up to 2 minutes processing latency
-    # * 150 active runs with up to 2 minutes processing latency
-    # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -53,45 +70,54 @@ def start_background_tasks() -> AsyncIOScheduler:
             collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
         )
         _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
-    # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
-    _scheduler.add_job(
-        process_submitted_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_running_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_terminating_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_runs,
-        IntervalTrigger(seconds=2, jitter=1),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_instances,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2))
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
+    _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5)
     _scheduler.add_job(
-
+        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(
-
+        process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
+        # Add multiple copies of tasks if requested.
+        # max_instances=1 for additional copies to avoid running too many tasks.
+        # Move other tasks here when they need per-replica scaling.
+        _scheduler.add_job(
+            process_submitted_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=4 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_running_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_terminating_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_runs,
+            IntervalTrigger(seconds=2, jitter=1),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_instances,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_fleets,
+            IntervalTrigger(seconds=10, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
     _scheduler.start()
     return _scheduler
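Note (not part of the diff): the scheduler hunk above replaces the flat list of add_job calls with a loop over SERVER_BACKGROUND_PROCESSING_FACTOR, registering extra copies of the hot tasks with max_instances=1. A self-contained sketch of the same pattern with a stand-in task; the factor of 2 and the dummy coroutine are illustrative, not dstack's values:

    import asyncio

    from apscheduler.schedulers.asyncio import AsyncIOScheduler
    from apscheduler.triggers.interval import IntervalTrigger


    async def process_batch(batch_size: int = 5):
        # Stand-in for process_submitted_jobs and friends.
        print(f"processing up to {batch_size} resources")


    async def main():
        scheduler = AsyncIOScheduler()
        factor = 2  # stands in for settings.SERVER_BACKGROUND_PROCESSING_FACTOR
        for replica in range(factor):
            scheduler.add_job(
                process_batch,
                IntervalTrigger(seconds=4, jitter=2),
                kwargs={"batch_size": 5},
                # the first copy keeps the higher concurrency, extra copies are capped at 1
                max_instances=4 if replica == 0 else 1,
            )
        scheduler.start()
        await asyncio.sleep(15)


    asyncio.run(main())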
@@ -1,9 +1,12 @@
+import asyncio
+from datetime import timedelta
+
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

 from dstack._internal.core.models.fleets import FleetStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
@@ -17,8 +20,18 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)


-
-
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+
+async def process_fleets(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_fleet())
+    await asyncio.gather(*tasks)
+
+
+async def _process_next_fleet():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -26,6 +39,8 @@ async def process_fleets():
                 .where(
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
+                    FleetModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(FleetModel.last_processed_at.asc())
                 .limit(1)
@@ -43,6 +58,7 @@


 async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+    logger.debug("Processing fleet %s", fleet_model.name)
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
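Note (not part of the diff): process_fleets now fans out into batch_size concurrent _process_next_fleet() calls, each picking at most one fleet under the lockset, and skips fleets processed within MIN_PROCESSING_INTERVAL. The fan-out itself is plain asyncio; a minimal sketch of that shape with a dummy worker:

    import asyncio


    async def _process_next_resource(i: int):
        # Stand-in for _process_next_fleet(): each call processes at most one resource.
        await asyncio.sleep(0.1)
        print(f"worker {i} done")


    async def process_resources(batch_size: int = 5):
        # Same shape as the new process_fleets(): schedule batch_size independent
        # workers and wait for all of them in one go.
        await asyncio.gather(*(_process_next_resource(i) for i in range(batch_size)))


    asyncio.run(process_resources())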
|