dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.

Files changed (86)
  1. dstack/_internal/cli/services/configurators/fleet.py +111 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/backends/aws/compute.py +237 -18
  4. dstack/_internal/core/backends/base/compute.py +20 -2
  5. dstack/_internal/core/backends/cudo/compute.py +23 -9
  6. dstack/_internal/core/backends/gcp/compute.py +13 -7
  7. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  8. dstack/_internal/core/compatibility/fleets.py +12 -11
  9. dstack/_internal/core/compatibility/gateways.py +9 -8
  10. dstack/_internal/core/compatibility/logs.py +4 -3
  11. dstack/_internal/core/compatibility/runs.py +29 -21
  12. dstack/_internal/core/compatibility/volumes.py +11 -8
  13. dstack/_internal/core/errors.py +4 -0
  14. dstack/_internal/core/models/common.py +45 -2
  15. dstack/_internal/core/models/configurations.py +9 -1
  16. dstack/_internal/core/models/fleets.py +2 -1
  17. dstack/_internal/core/models/profiles.py +8 -5
  18. dstack/_internal/core/models/resources.py +15 -8
  19. dstack/_internal/core/models/runs.py +41 -138
  20. dstack/_internal/core/models/volumes.py +14 -0
  21. dstack/_internal/core/services/diff.py +56 -3
  22. dstack/_internal/core/services/ssh/attach.py +2 -0
  23. dstack/_internal/server/app.py +37 -9
  24. dstack/_internal/server/background/__init__.py +66 -40
  25. dstack/_internal/server/background/tasks/process_fleets.py +19 -3
  26. dstack/_internal/server/background/tasks/process_gateways.py +47 -29
  27. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  28. dstack/_internal/server/background/tasks/process_instances.py +13 -2
  29. dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
  30. dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
  31. dstack/_internal/server/background/tasks/process_runs.py +8 -4
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
  33. dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
  34. dstack/_internal/server/background/tasks/process_volumes.py +2 -2
  35. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  36. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  37. dstack/_internal/server/models.py +1 -0
  38. dstack/_internal/server/routers/backends.py +23 -16
  39. dstack/_internal/server/routers/files.py +7 -6
  40. dstack/_internal/server/routers/fleets.py +47 -36
  41. dstack/_internal/server/routers/gateways.py +27 -18
  42. dstack/_internal/server/routers/instances.py +18 -13
  43. dstack/_internal/server/routers/logs.py +7 -3
  44. dstack/_internal/server/routers/metrics.py +14 -8
  45. dstack/_internal/server/routers/projects.py +33 -22
  46. dstack/_internal/server/routers/repos.py +7 -6
  47. dstack/_internal/server/routers/runs.py +49 -28
  48. dstack/_internal/server/routers/secrets.py +20 -15
  49. dstack/_internal/server/routers/server.py +7 -4
  50. dstack/_internal/server/routers/users.py +22 -19
  51. dstack/_internal/server/routers/volumes.py +34 -25
  52. dstack/_internal/server/schemas/logs.py +2 -2
  53. dstack/_internal/server/schemas/runs.py +17 -5
  54. dstack/_internal/server/services/fleets.py +358 -75
  55. dstack/_internal/server/services/gateways/__init__.py +17 -6
  56. dstack/_internal/server/services/gateways/client.py +5 -3
  57. dstack/_internal/server/services/instances.py +8 -0
  58. dstack/_internal/server/services/jobs/__init__.py +45 -0
  59. dstack/_internal/server/services/jobs/configurators/base.py +12 -1
  60. dstack/_internal/server/services/locking.py +104 -13
  61. dstack/_internal/server/services/logging.py +4 -2
  62. dstack/_internal/server/services/logs/__init__.py +15 -2
  63. dstack/_internal/server/services/logs/aws.py +2 -4
  64. dstack/_internal/server/services/logs/filelog.py +33 -27
  65. dstack/_internal/server/services/logs/gcp.py +3 -5
  66. dstack/_internal/server/services/proxy/repo.py +4 -1
  67. dstack/_internal/server/services/runs.py +139 -72
  68. dstack/_internal/server/services/services/__init__.py +2 -1
  69. dstack/_internal/server/services/users.py +3 -1
  70. dstack/_internal/server/services/volumes.py +15 -2
  71. dstack/_internal/server/settings.py +25 -6
  72. dstack/_internal/server/statics/index.html +1 -1
  73. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
  74. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  75. dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
  76. dstack/_internal/server/testing/common.py +48 -8
  77. dstack/_internal/server/utils/routers.py +31 -8
  78. dstack/_internal/utils/json_utils.py +54 -0
  79. dstack/api/_public/runs.py +13 -2
  80. dstack/api/server/_runs.py +12 -2
  81. dstack/version.py +1 -1
  82. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
  83. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
  84. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  85. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  86. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/models/runs.py

@@ -11,6 +11,7 @@ from dstack._internal.core.models.configurations import (
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.files import FileArchiveMapping
 from dstack._internal.core.models.instances import (
@@ -101,6 +102,14 @@ class RunTerminationReason(str, Enum):
         }
         return mapping[self]

+    def to_error(self) -> Optional[str]:
+        if self == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif self == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+

 class JobTerminationReason(str, Enum):
     # Set by the server
@@ -162,6 +171,24 @@ class JobTerminationReason(str, Enum):
         default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
         return mapping.get(self, default)

+    def to_error(self) -> Optional[str]:
+        # Should return None for values that are already
+        # handled and shown in status_message.
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "volume error",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(self)
+

 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
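
With this change the error strings move from client-side root validators (removed further below) onto the termination-reason enums themselves. A minimal sketch of the resulting behavior, assuming only the mapping shown in this hunk:

from dstack._internal.core.models.runs import JobTerminationReason

# Reasons in the mapping produce a short error string:
assert JobTerminationReason.VOLUME_ERROR.to_error() == "volume error"
# Reasons already surfaced via status_message map to None:
assert JobTerminationReason.TERMINATED_BY_USER.to_error() is None
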
@@ -227,6 +254,8 @@ class JobSpec(CoreModel):
     # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
     repo_code_hash: Optional[str] = None
     file_archives: list[FileArchiveMapping] = []
+    # None for non-services and pre-0.19.19 services. See `get_service_port`
+    service_port: Optional[int] = None


 class JobProvisioningData(CoreModel):
@@ -305,13 +334,12 @@ class JobSubmission(CoreModel):
     finished_at: Optional[datetime]
     inactivity_secs: Optional[int]
     status: JobStatus
+    status_message: str = ""  # default for backward compatibility
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
-    # TODO: make status_message and error a computed field after migrating to pydanticV2
-    status_message: Optional[str] = None
     error: Optional[str] = None

     @property
@@ -325,71 +353,6 @@ class JobSubmission(CoreModel):
         end_time = self.finished_at
         return end_time - self.submitted_at

-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            termination_reason = values["termination_reason"]
-            exit_code = values["exit_status"]
-        except KeyError:
-            return values
-        values["status_message"] = JobSubmission._get_status_message(
-            status=status,
-            termination_reason=termination_reason,
-            exit_status=exit_code,
-        )
-        return values
-
-    @staticmethod
-    def _get_status_message(
-        status: JobStatus,
-        termination_reason: Optional[JobTerminationReason],
-        exit_status: Optional[int],
-    ) -> str:
-        if status == JobStatus.DONE:
-            return "exited (0)"
-        elif status == JobStatus.FAILED:
-            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
-                return f"exited ({exit_status})"
-            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
-                return "no offers"
-            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
-                return "interrupted"
-            else:
-                return "error"
-        elif status == JobStatus.TERMINATED:
-            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
-                return "stopped"
-            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
-                return "aborted"
-        return status.value
-
-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
-        error_mapping = {
-            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
-            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
-            JobTerminationReason.VOLUME_ERROR: "volume error",
-            JobTerminationReason.GATEWAY_ERROR: "gateway error",
-            JobTerminationReason.SCALED_DOWN: "scaled down",
-            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
-            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
-            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
-            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
-            JobTerminationReason.EXECUTOR_ERROR: "executor error",
-            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
-        }
-        return error_mapping.get(termination_reason)
-

 class Job(CoreModel):
     job_spec: JobSpec
@@ -524,85 +487,17 @@ class Run(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     status: RunStatus
-    status_message: Optional[str] = None
-    termination_reason: Optional[RunTerminationReason]
+    status_message: str = ""  # default for backward compatibility
+    termination_reason: Optional[RunTerminationReason] = None
     run_spec: RunSpec
     jobs: List[Job]
-    latest_job_submission: Optional[JobSubmission]
+    latest_job_submission: Optional[JobSubmission] = None
     cost: float = 0
     service: Optional[ServiceSpec] = None
     deployment_num: int = 0  # default for compatibility with pre-0.19.14 servers
-    # TODO: make error a computed field after migrating to pydanticV2
     error: Optional[str] = None
     deleted: Optional[bool] = None

-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = Run._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
-        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
-            return "retry limit exceeded"
-        elif termination_reason == RunTerminationReason.SERVER_ERROR:
-            return "server error"
-        else:
-            return None
-
-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            jobs: List[Job] = values["jobs"]
-            retry_on_events = (
-                jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
-            )
-            job_status = (
-                jobs[0].job_submissions[-1].status
-                if len(jobs) == 1 and jobs[0].job_submissions
-                else None
-            )
-            termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
-        except KeyError:
-            return values
-        values["status_message"] = Run._get_status_message(
-            status=status,
-            job_status=job_status,
-            retry_on_events=retry_on_events,
-            termination_reason=termination_reason,
-        )
-        return values
-
-    @staticmethod
-    def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
-        for submission in reversed(job.job_submissions):
-            if submission.termination_reason is not None:
-                return submission.termination_reason
-        return None
-
-    @staticmethod
-    def _get_status_message(
-        status: RunStatus,
-        job_status: Optional[JobStatus],
-        retry_on_events: List[RetryEvent],
-        termination_reason: Optional[JobTerminationReason],
-    ) -> str:
-        if job_status == JobStatus.PULLING:
-            return "pulling"
-        # Currently, `retrying` is shown only for `no-capacity` events
-        if (
-            status in [RunStatus.SUBMITTED, RunStatus.PENDING]
-            and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
-            and RetryEvent.NO_CAPACITY in retry_on_events
-        ):
-            return "retrying"
-        return status.value
-
     def is_deployment_in_progress(self) -> bool:
         return any(
             not j.job_submissions[-1].status.is_finished()
@@ -658,3 +553,11 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
+
+
+def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
+    # Compatibility with pre-0.19.19 job specs that do not have the `service_port` property.
+    # TODO: drop when pre-0.19.19 jobs are no longer relevant.
+    if job_spec.service_port is None:
+        return configuration.port.container_port
+    return job_spec.service_port
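
The fallback above keeps pre-0.19.19 job specs working. A sketch with duck-typed stand-ins (hypothetical objects; real JobSpec and ServiceConfiguration instances have many more required fields):

from types import SimpleNamespace

from dstack._internal.core.models.runs import get_service_port

configuration = SimpleNamespace(port=SimpleNamespace(container_port=8000))
old_spec = SimpleNamespace(service_port=None)  # submitted before 0.19.19
new_spec = SimpleNamespace(service_port=8001)  # submitted by a 0.19.19+ server

assert get_service_port(old_spec, configuration) == 8000  # falls back to the configured port
assert get_service_port(new_spec, configuration) == 8001
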
dstack/_internal/core/models/volumes.py

@@ -9,6 +9,7 @@ from typing_extensions import Annotated, Self

 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.profiles import parse_idle_duration
 from dstack._internal.core.models.resources import Memory
 from dstack._internal.utils.common import get_or_error
 from dstack._internal.utils.tags import tags_validator
@@ -44,6 +45,16 @@ class VolumeConfiguration(CoreModel):
         Optional[str],
         Field(description="The volume ID. Must be specified when registering external volumes"),
     ] = None
+    auto_cleanup_duration: Annotated[
+        Optional[Union[str, int]],
+        Field(
+            description=(
+                "Time to wait after volume is no longer used by any job before deleting it. "
+                "Defaults to keep the volume indefinitely. "
+                "Use the value 'off' or -1 to disable auto-cleanup."
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(
@@ -56,6 +67,9 @@ class VolumeConfiguration(CoreModel):
     ] = None

     _validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator)
+    _validate_auto_cleanup_duration = validator(
+        "auto_cleanup_duration", pre=True, allow_reuse=True
+    )(parse_idle_duration)

     @property
     def size_gb(self) -> int:
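
For illustration, a volume configuration that opts into the new auto-cleanup; a sketch that assumes name, backend, region, and size are enough to define a new volume (only auto_cleanup_duration comes from this diff):

from dstack._internal.core.models.volumes import VolumeConfiguration

volume = VolumeConfiguration.parse_obj(
    {
        "name": "scratch-volume",
        "backend": "aws",
        "region": "eu-west-1",
        "size": "100GB",
        # Validated by parse_idle_duration; "off" or -1 disables auto-cleanup.
        "auto_cleanup_duration": "3d",
    }
)
assert volume.auto_cleanup_duration is not None
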
dstack/_internal/core/services/diff.py

@@ -1,14 +1,46 @@
-from typing import Any, Dict
+from typing import Any, Optional, TypedDict, TypeVar

 from pydantic import BaseModel

+from dstack._internal.core.models.common import IncludeExcludeType
+
+
+class ModelFieldDiff(TypedDict):
+    old: Any
+    new: Any
+
+
+ModelDiff = dict[str, ModelFieldDiff]
+

 # TODO: calculate nested diffs
-def diff_models(old: BaseModel, new: BaseModel) -> Dict[str, Any]:
+def diff_models(
+    old: BaseModel, new: BaseModel, reset: Optional[IncludeExcludeType] = None
+) -> ModelDiff:
+    """
+    Returns a diff of model instances fields.
+
+    The fields specified in the `reset` option are reset to their default values, effectively
+    excluding them from comparison (assuming that the default value is equal to itself, e.g,
+    `None == None`, `"task" == "task"`, but `math.nan != math.nan`).
+
+    Args:
+        old: The "old" model instance.
+        new: The "new" model instance.
+        reset: Fields to reset to their default values before comparison.
+
+    Returns:
+        A dict of changed fields in the form of
+        `{<field_name>: {"old": old_value, "new": new_value}}`
+    """
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")

-    changes = {}
+    if reset is not None:
+        old = copy_model(old, reset=reset)
+        new = copy_model(new, reset=reset)
+
+    changes: ModelDiff = {}
     for field in old.__fields__:
         old_value = getattr(old, field)
         new_value = getattr(new, field)
@@ -16,3 +48,24 @@ def diff_models(old: BaseModel, new: BaseModel) -> Dict[str, Any]:
             changes[field] = {"old": old_value, "new": new_value}

     return changes
+
+
+M = TypeVar("M", bound=BaseModel)
+
+
+def copy_model(model: M, reset: Optional[IncludeExcludeType] = None) -> M:
+    """
+    Returns a deep copy of the model instance.
+
+    Implemented as `BaseModel.parse_obj(BaseModel.dict())`, thus,
+    unlike `BaseModel.copy(deep=True)`, runs all validations.
+
+    The fields specified in the `reset` option are reset to their default values.
+
+    Args:
+        reset: Fields to reset to their default values.
+
+    Returns:
+        A deep copy of the model instance.
+    """
+    return type(model).parse_obj(model.dict(exclude=reset))
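
A quick usage sketch of the new helpers with a toy model (the model is illustrative, not from the package):

from pydantic import BaseModel

from dstack._internal.core.services.diff import copy_model, diff_models

class Point(BaseModel):
    x: int = 0
    y: int = 0

old, new = Point(x=1, y=2), Point(x=1, y=3)
assert diff_models(old, new) == {"y": {"old": 2, "new": 3}}
# Resetting `y` to its default in both copies excludes it from the comparison:
assert diff_models(old, new, reset={"y"}) == {}
# copy_model re-runs validation and returns an equal but distinct instance:
copied = copy_model(old)
assert copied == old and copied is not old
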
dstack/_internal/core/services/ssh/attach.py

@@ -64,6 +64,7 @@ class SSHAttach:
         run_name: str,
         dockerized: bool,
         ssh_proxy: Optional[SSHConnectionParams] = None,
+        service_port: Optional[int] = None,
         local_backend: bool = False,
         bind_address: Optional[str] = None,
     ):
@@ -90,6 +91,7 @@ class SSHAttach:
             },
         )
         self.ssh_proxy = ssh_proxy
+        self.service_port = service_port

         hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
         self.hosts = hosts

dstack/_internal/server/app.py

@@ -2,6 +2,7 @@ import asyncio
 import importlib.resources
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Awaitable, Callable, List
@@ -9,7 +10,7 @@ from typing import Awaitable, Callable, List
 import sentry_sdk
 from fastapi import FastAPI, Request, Response, status
 from fastapi.datastructures import URL
-from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
+from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram

@@ -55,6 +56,7 @@ from dstack._internal.server.settings import (
 )
 from dstack._internal.server.utils.logging import configure_logging
 from dstack._internal.server.utils.routers import (
+    CustomORJSONResponse,
     check_client_server_compatibility,
     error_detail,
     get_server_client_error_details,
@@ -89,7 +91,10 @@ def create_app() -> FastAPI:
         profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
     )

-    app = FastAPI(docs_url="/api/docs", lifespan=lifespan)
+    app = FastAPI(
+        docs_url="/api/docs",
+        lifespan=lifespan,
+    )
     app.state.proxy_dependency_injector = ServerProxyDependencyInjector()
     return app

@@ -97,6 +102,8 @@ def create_app() -> FastAPI:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     configure_logging()
+    server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
+    asyncio.get_running_loop().set_default_executor(server_executor)
     await migrate()
     _print_dstack_logo()
     if not check_required_ssh_version():
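
For context, a self-contained sketch of the default-executor pattern introduced above (the worker count here is illustrative, not dstack's setting):

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def main():
    # After this call, loop.run_in_executor(None, ...) and asyncio.to_thread(...)
    # run on the configured pool instead of the implicitly sized default one.
    loop = asyncio.get_running_loop()
    loop.set_default_executor(ThreadPoolExecutor(max_workers=32))
    assert await loop.run_in_executor(None, sum, range(10)) == 45

asyncio.run(main())
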
@@ -144,7 +151,10 @@ async def lifespan(app: FastAPI):
     )
     if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
-    scheduler = start_background_tasks()
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler = start_background_tasks()
+    else:
+        logger.info("Background processing is disabled")
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
@@ -154,7 +164,8 @@ async def lifespan(app: FastAPI):
     for func in _ON_STARTUP_HOOKS:
         await func(app)
     yield
-    scheduler.shutdown()
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler.shutdown()
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()
@@ -205,14 +216,14 @@ def register_routes(app: FastAPI, ui: bool = True):
         msg = "Access denied"
         if len(exc.args) > 0:
             msg = exc.args[0]
-        return JSONResponse(
+        return CustomORJSONResponse(
             status_code=status.HTTP_403_FORBIDDEN,
             content=error_detail(msg),
         )

     @app.exception_handler(ServerClientError)
     async def server_client_error_handler(request: Request, exc: ServerClientError):
-        return JSONResponse(
+        return CustomORJSONResponse(
             status_code=status.HTTP_400_BAD_REQUEST,
             content={"detail": get_server_client_error_details(exc)},
         )
@@ -220,7 +231,7 @@
     @app.exception_handler(OSError)
     async def os_error_handler(request, exc: OSError):
         if exc.errno in [36, 63]:
-            return JSONResponse(
+            return CustomORJSONResponse(
                 {"detail": "Filename too long"},
                 status_code=status.HTTP_400_BAD_REQUEST,
             )
@@ -242,6 +253,23 @@
         )
         return response

+    if settings.SERVER_PROFILING_ENABLED:
+        from pyinstrument import Profiler
+
+        @app.middleware("http")
+        async def profile_request(request: Request, call_next):
+            profiling = request.query_params.get("profile", False)
+            if profiling:
+                profiler = Profiler()
+                profiler.start()
+                respone = await call_next(request)
+                profiler.stop()
+                with open("profiling_results.html", "w+") as f:
+                    f.write(profiler.output_html())
+                return respone
+            else:
+                return await call_next(request)
+
     # this middleware must be defined after the log_request middleware
     @app.middleware("http")
     async def log_http_metrics(request: Request, call_next):
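
A hypothetical way to exercise the new profiling middleware (the env var name and port are assumptions based on dstack's DSTACK_-prefixed settings and default server port; the output file name comes from the code above):

import requests

# Start the server with profiling enabled, e.g.:
#   DSTACK_SERVER_PROFILING_ENABLED=true dstack server
# then profile any request by adding ?profile=true:
requests.get("http://127.0.0.1:3000/api/docs", params={"profile": "true"})
# The server writes profiling_results.html to its working directory.
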
@@ -289,7 +317,7 @@

     @app.get("/healthcheck")
     async def healthcheck():
-        return JSONResponse(content={"status": "running"})
+        return CustomORJSONResponse(content={"status": "running"})

     if ui and Path(__file__).parent.joinpath("statics").exists():
         app.mount(
@@ -303,7 +331,7 @@
             or _is_proxy_request(request)
             or _is_prometheus_request(request)
         ):
-            return JSONResponse(
+            return CustomORJSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
             )

dstack/_internal/server/background/__init__.py

@@ -4,9 +4,10 @@ from apscheduler.triggers.interval import IntervalTrigger
 from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
+    process_gateways,
     process_gateways_connections,
-    process_submitted_gateways,
 )
+from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
     process_instances,
 )
@@ -37,15 +38,31 @@ def get_scheduler() -> AsyncIOScheduler:


 def start_background_tasks() -> AsyncIOScheduler:
+    # We try to process as many resources as possible without exhausting DB connections.
+    #
+    # Quick tasks can process multiple resources per transaction.
+    # Potentially long tasks process one resource per transaction
+    # to avoid holding locks for all the resources if one is slow to process.
+    # Still, the next batch won't be processed unless all resources are processed,
+    # so larger batches do not increase processing rate linearly.
+    #
+    # The interval, batch_size, and max_instances determine background tasks processing rates.
+    # By default, one server replica can handle:
+    #
+    # * 150 active jobs with 2 minutes processing latency
+    # * 150 active runs with 2 minutes processing latency
+    # * 150 active instances with 2 minutes processing latency
+    #
+    # These latency numbers do not account for provisioning time,
+    # so it may be slower if a backend is slow to provision.
+    #
+    # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica.
+    # They also need to increase max db connections on the client side and db side.
+    #
     # In-memory locking via locksets does not guarantee
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.

-    # The batch_size and interval determine background tasks processing rates.
-    # Currently one server replica can handle:
-    # * 150 active jobs with up to 2 minutes processing latency
-    # * 150 active runs with up to 2 minutes processing latency
-    # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -53,45 +70,54 @@ def start_background_tasks() -> AsyncIOScheduler:
             collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
         )
         _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
-    # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
-    _scheduler.add_job(
-        process_submitted_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_running_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_terminating_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_runs,
-        IntervalTrigger(seconds=2, jitter=1),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_instances,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2))
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
+    _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5)
     _scheduler.add_job(
-        process_submitted_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5
+        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(
-        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
+        process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
+        # Add multiple copies of tasks if requested.
+        # max_instances=1 for additional copies to avoid running too many tasks.
+        # Move other tasks here when they need per-replica scaling.
+        _scheduler.add_job(
+            process_submitted_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=4 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_running_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_terminating_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_runs,
+            IntervalTrigger(seconds=2, jitter=1),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_instances,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_fleets,
+            IntervalTrigger(seconds=10, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
     _scheduler.start()
     return _scheduler
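
The documented rates follow directly from the interval and batch size; a back-of-the-envelope check (plain arithmetic, assuming each batch completes within its interval):

# One process_submitted_jobs task instance handles batch_size resources
# every `interval_secs` seconds:
batch_size, interval_secs = 5, 4
max_rate_per_min = batch_size * 60 / interval_secs  # 75 resources/minute
# With 150 active jobs, a full pass over all of them takes:
assert 150 / max_rate_per_min == 2.0  # the "2 minutes processing latency" above
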
dstack/_internal/server/background/tasks/process_fleets.py

@@ -1,9 +1,12 @@
+import asyncio
+from datetime import timedelta
+
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

 from dstack._internal.core.models.fleets import FleetStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
@@ -17,8 +20,18 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)


-async def process_fleets():
-    lock, lockset = get_locker().get_lockset(FleetModel.__tablename__)
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+
+async def process_fleets(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_fleet())
+    await asyncio.gather(*tasks)
+
+
+async def _process_next_fleet():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
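
The batching pattern above fans out batch_size concurrent single-fleet workers instead of one worker holding locks for a whole batch; sketched in isolation (illustrative names, not from the package):

import asyncio

async def process_batch(process_one, batch_size: int = 5):
    # Each worker picks and locks its own next resource, so one slow
    # item does not serialize the rest of the batch.
    await asyncio.gather(*(process_one() for _ in range(batch_size)))
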
@@ -26,6 +39,8 @@ async def process_fleets():
             .where(
                 FleetModel.deleted == False,
                 FleetModel.id.not_in(lockset),
+                FleetModel.last_processed_at
+                < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
             )
             .order_by(FleetModel.last_processed_at.asc())
             .limit(1)
@@ -43,6 +58,7 @@


 async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+    logger.debug("Processing fleet %s", fleet_model.name)
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(