dstack 0.19.16__py3-none-any.whl → 0.19.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (80)
  1. dstack/_internal/cli/commands/secrets.py +92 -0
  2. dstack/_internal/cli/main.py +2 -0
  3. dstack/_internal/cli/services/completion.py +5 -0
  4. dstack/_internal/cli/services/configurators/fleet.py +13 -1
  5. dstack/_internal/cli/services/configurators/run.py +59 -17
  6. dstack/_internal/cli/utils/secrets.py +25 -0
  7. dstack/_internal/core/backends/__init__.py +10 -4
  8. dstack/_internal/core/backends/aws/compute.py +237 -18
  9. dstack/_internal/core/backends/base/compute.py +20 -2
  10. dstack/_internal/core/backends/cudo/compute.py +23 -9
  11. dstack/_internal/core/backends/gcp/compute.py +13 -7
  12. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  13. dstack/_internal/core/compatibility/fleets.py +12 -11
  14. dstack/_internal/core/compatibility/gateways.py +9 -8
  15. dstack/_internal/core/compatibility/logs.py +4 -3
  16. dstack/_internal/core/compatibility/runs.py +41 -17
  17. dstack/_internal/core/compatibility/volumes.py +9 -8
  18. dstack/_internal/core/errors.py +4 -0
  19. dstack/_internal/core/models/common.py +7 -0
  20. dstack/_internal/core/models/configurations.py +11 -0
  21. dstack/_internal/core/models/files.py +67 -0
  22. dstack/_internal/core/models/runs.py +14 -0
  23. dstack/_internal/core/models/secrets.py +9 -2
  24. dstack/_internal/core/services/diff.py +36 -3
  25. dstack/_internal/server/app.py +22 -0
  26. dstack/_internal/server/background/__init__.py +61 -37
  27. dstack/_internal/server/background/tasks/process_fleets.py +19 -3
  28. dstack/_internal/server/background/tasks/process_gateways.py +1 -1
  29. dstack/_internal/server/background/tasks/process_instances.py +13 -2
  30. dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
  31. dstack/_internal/server/background/tasks/process_running_jobs.py +123 -15
  32. dstack/_internal/server/background/tasks/process_runs.py +23 -7
  33. dstack/_internal/server/background/tasks/process_submitted_jobs.py +36 -7
  34. dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
  35. dstack/_internal/server/background/tasks/process_volumes.py +2 -2
  36. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  37. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  38. dstack/_internal/server/models.py +33 -0
  39. dstack/_internal/server/routers/files.py +67 -0
  40. dstack/_internal/server/routers/secrets.py +57 -15
  41. dstack/_internal/server/schemas/files.py +5 -0
  42. dstack/_internal/server/schemas/runner.py +2 -0
  43. dstack/_internal/server/schemas/secrets.py +7 -11
  44. dstack/_internal/server/services/backends/__init__.py +1 -1
  45. dstack/_internal/server/services/files.py +91 -0
  46. dstack/_internal/server/services/fleets.py +5 -4
  47. dstack/_internal/server/services/gateways/__init__.py +4 -2
  48. dstack/_internal/server/services/jobs/__init__.py +19 -8
  49. dstack/_internal/server/services/jobs/configurators/base.py +25 -3
  50. dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
  51. dstack/_internal/server/services/locking.py +101 -12
  52. dstack/_internal/server/services/proxy/repo.py +3 -0
  53. dstack/_internal/server/services/runner/client.py +8 -0
  54. dstack/_internal/server/services/runs.py +76 -47
  55. dstack/_internal/server/services/secrets.py +204 -0
  56. dstack/_internal/server/services/storage/base.py +21 -0
  57. dstack/_internal/server/services/storage/gcs.py +28 -6
  58. dstack/_internal/server/services/storage/s3.py +27 -9
  59. dstack/_internal/server/services/volumes.py +2 -2
  60. dstack/_internal/server/settings.py +19 -5
  61. dstack/_internal/server/statics/index.html +1 -1
  62. dstack/_internal/server/statics/{main-a4eafa74304e587d037c.js → main-d1ac2e8c38ed5f08a114.js} +86 -34
  63. dstack/_internal/server/statics/{main-a4eafa74304e587d037c.js.map → main-d1ac2e8c38ed5f08a114.js.map} +1 -1
  64. dstack/_internal/server/statics/{main-f53d6d0d42f8d61df1de.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
  65. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  66. dstack/_internal/server/testing/common.py +50 -8
  67. dstack/_internal/settings.py +4 -0
  68. dstack/_internal/utils/files.py +69 -0
  69. dstack/_internal/utils/nested_list.py +47 -0
  70. dstack/_internal/utils/path.py +12 -4
  71. dstack/api/_public/runs.py +67 -7
  72. dstack/api/server/__init__.py +6 -0
  73. dstack/api/server/_files.py +18 -0
  74. dstack/api/server/_secrets.py +15 -15
  75. dstack/version.py +1 -1
  76. {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/METADATA +13 -13
  77. {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/RECORD +80 -67
  78. {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/WHEEL +0 -0
  79. {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/entry_points.txt +0 -0
  80. {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/compatibility/logs.py
@@ -1,15 +1,16 @@
-from typing import Dict, Optional
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.server.schemas.logs import PollLogsRequest
 
 
-def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[Dict]:
+def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[IncludeExcludeDictType]:
     """
     Returns exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    excludes = {}
+    excludes: IncludeExcludeDictType = {}
     if request.next_token is None:
         excludes["next_token"] = True
     return excludes if excludes else None

dstack/_internal/core/compatibility/runs.py
@@ -1,31 +1,35 @@
-from typing import Any, Dict, Optional
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
 from dstack._internal.core.models.configurations import ServiceConfiguration
-from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
+from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
 from dstack._internal.server.schemas.runs import GetRunPlanRequest
 
 
-def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
+def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]:
     """
     Returns `plan` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    apply_plan_excludes = {}
+    apply_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(plan.run_spec)
     if run_spec_excludes is not None:
         apply_plan_excludes["run_spec"] = run_spec_excludes
     current_resource = plan.current_resource
     if current_resource is not None:
-        current_resource_excludes = {}
+        current_resource_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["status_message"] = True
         if current_resource.deployment_num == 0:
             current_resource_excludes["deployment_num"] = True
         apply_plan_excludes["current_resource"] = current_resource_excludes
         current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
-        job_submissions_excludes = {}
+        job_submissions_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["jobs"] = {
-            "__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
+            "__all__": {
+                "job_spec": get_job_spec_excludes([job.job_spec for job in current_resource.jobs]),
+                "job_submissions": {"__all__": job_submissions_excludes},
+            }
         }
         job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
         if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
@@ -42,7 +46,7 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             job_submissions_excludes["deployment_num"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
-            latest_job_submission_excludes = {}
+            latest_job_submission_excludes: IncludeExcludeDictType = {}
             current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
             if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
                 latest_job_submission_excludes["job_provisioning_data"] = {
@@ -59,12 +63,12 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
     return {"plan": apply_plan_excludes}
 
 
-def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
+def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[IncludeExcludeDictType]:
     """
     Excludes new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    get_plan_excludes = {}
+    get_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(request.run_spec)
     if run_spec_excludes is not None:
         get_plan_excludes["run_spec"] = run_spec_excludes
@@ -73,15 +77,15 @@ def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
     return get_plan_excludes
 
 
-def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
+def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
     """
     Returns `run_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes: dict[str, Any] = {}
-    configuration_excludes: dict[str, Any] = {}
-    profile_excludes: set[str] = set()
+    spec_excludes: IncludeExcludeDictType = {}
+    configuration_excludes: IncludeExcludeDictType = {}
+    profile_excludes: IncludeExcludeSetType = set()
     configuration = run_spec.configuration
     profile = run_spec.profile
 
@@ -109,14 +113,34 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
         configuration_excludes["stop_criteria"] = True
     if profile is not None and profile.stop_criteria is None:
         profile_excludes.add("stop_criteria")
+    if not configuration.files:
+        configuration_excludes["files"] = True
+    if not run_spec.file_archives:
+        spec_excludes["file_archives"] = True
 
     if configuration_excludes:
         spec_excludes["configuration"] = configuration_excludes
     if profile_excludes:
         spec_excludes["profile"] = profile_excludes
-    if spec_excludes:
-        return spec_excludes
-    return None
+    return spec_excludes
+
+
+def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
+    """
+    Returns `job_spec` exclude mapping to exclude certain fields from the request.
+    Use this method to exclude new fields when they are not set to keep
+    clients backward-compatibility with older servers.
+    """
+    spec_excludes: IncludeExcludeDictType = {}
+
+    if all(s.repo_code_hash is None for s in job_specs):
+        spec_excludes["repo_code_hash"] = True
+    if all(s.repo_data is None for s in job_specs):
+        spec_excludes["repo_data"] = True
+    if all(not s.file_archives for s in job_specs):
+        spec_excludes["file_archives"] = True
+
+    return spec_excludes
 
 
 def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:

dstack/_internal/core/compatibility/volumes.py
@@ -1,32 +1,33 @@
-from typing import Dict
-
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
 
 
-def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
+def get_volume_spec_excludes(volume_spec: VolumeSpec) -> IncludeExcludeDictType:
     """
     Returns `volume_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes = {}
+    spec_excludes: IncludeExcludeDictType = {}
     spec_excludes["configuration"] = _get_volume_configuration_excludes(volume_spec.configuration)
     return spec_excludes
 
 
-def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
+def get_create_volume_excludes(configuration: VolumeConfiguration) -> IncludeExcludeDictType:
     """
     Returns an exclude mapping to exclude certain fields from the create volume request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    create_volume_excludes = {}
+    create_volume_excludes: IncludeExcludeDictType = {}
    create_volume_excludes["configuration"] = _get_volume_configuration_excludes(configuration)
     return create_volume_excludes
 
 
-def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
-    configuration_excludes = {}
+def _get_volume_configuration_excludes(
+    configuration: VolumeConfiguration,
+) -> IncludeExcludeDictType:
+    configuration_excludes: IncludeExcludeDictType = {}
     if configuration.tags is None:
         configuration_excludes["tags"] = True
     return configuration_excludes

dstack/_internal/core/errors.py
@@ -110,6 +110,10 @@ class PlacementGroupInUseError(ComputeError):
     pass
 
 
+class PlacementGroupNotSupportedError(ComputeError):
+    pass
+
+
 class NotYetTerminated(ComputeError):
     """
     Used by Compute.terminate_instance to signal that instance termination is not complete

dstack/_internal/core/models/common.py
@@ -6,6 +6,13 @@ from pydantic import Field
 from pydantic_duality import DualBaseModel
 from typing_extensions import Annotated
 
+IncludeExcludeFieldType = Union[int, str]
+IncludeExcludeSetType = set[IncludeExcludeFieldType]
+IncludeExcludeDictType = dict[
+    IncludeExcludeFieldType, Union[bool, IncludeExcludeSetType, "IncludeExcludeDictType"]
+]
+IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType]
+
 
 # DualBaseModel creates two classes for the model:
 # one with extra = "forbid" (CoreModel/CoreModel.__request__),
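
Note: these aliases describe the same include/exclude shape pydantic v1 accepts in BaseModel.dict(include=..., exclude=...): True drops a field entirely, a set drops nested fields by name, and dicts nest (with "__all__" addressing every list item). A minimal standalone sketch of that behavior; the Profile/Spec models are illustrative, not from dstack:

    from typing import Optional, Union

    from pydantic import BaseModel

    IncludeExcludeFieldType = Union[int, str]
    IncludeExcludeSetType = set[IncludeExcludeFieldType]
    IncludeExcludeDictType = dict[
        IncludeExcludeFieldType, Union[bool, IncludeExcludeSetType, "IncludeExcludeDictType"]
    ]

    class Profile(BaseModel):
        name: str
        stop_criteria: Optional[str] = None

    class Spec(BaseModel):
        profile: Profile
        file_archives: list[str] = []

    excludes: IncludeExcludeDictType = {
        "file_archives": True,         # True drops the field entirely
        "profile": {"stop_criteria"},  # a set drops nested fields by name
    }
    spec = Spec(profile=Profile(name="p"), file_archives=["a"])
    print(spec.dict(exclude=excludes))  # {'profile': {'name': 'p'}}

This is the shape the compatibility modules above now annotate explicitly instead of a plain Dict.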

dstack/_internal/core/models/configurations.py
@@ -10,6 +10,7 @@ from typing_extensions import Annotated, Literal
 from dstack._internal.core.errors import ConfigurationError
 from dstack._internal.core.models.common import CoreModel, Duration, RegistryAuth
 from dstack._internal.core.models.envs import Env
+from dstack._internal.core.models.files import FilePathMapping
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
 from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
@@ -252,6 +253,10 @@ class BaseRunConfiguration(CoreModel):
             description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. Overrides `privileged`"
         ),
     ] = None
+    files: Annotated[
+        list[Union[FilePathMapping, str]],
+        Field(description="The local to container file path mappings"),
+    ] = []
     # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
     setup: CommandsList = []
 
@@ -285,6 +290,12 @@ class BaseRunConfiguration(CoreModel):
             return parse_mount_point(v)
         return v
 
+    @validator("files", each_item=True)
+    def convert_files(cls, v) -> FilePathMapping:
+        if isinstance(v, str):
+            return FilePathMapping.parse(v)
+        return v
+
     @validator("user")
     def validate_user(cls, v) -> Optional[str]:
         if v is None:
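
Note: since the validator runs with each_item=True, `files` entries may be written either as `local:container` strings or as explicit mappings; both normalize to FilePathMapping. A sketch under the assumption that a concrete configuration class such as TaskConfiguration accepts these fields:

    from dstack._internal.core.models.configurations import TaskConfiguration

    # String form: "local:container", or a single path used on both sides.
    conf_a = TaskConfiguration(commands=["python train.py"], files=["~/.aws:aws", ".env"])

    # Mapping form: explicit local_path/path keys.
    conf_b = TaskConfiguration(
        commands=["python train.py"],
        files=[{"local_path": "~/.aws", "path": "aws"}, {"local_path": ".env", "path": ".env"}],
    )

    assert conf_a.files == conf_b.files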

dstack/_internal/core/models/files.py (new file)
@@ -0,0 +1,67 @@
+import pathlib
+import string
+from uuid import UUID
+
+from pydantic import Field, validator
+from typing_extensions import Annotated, Self
+
+from dstack._internal.core.models.common import CoreModel
+
+
+class FileArchive(CoreModel):
+    id: UUID
+    hash: str
+
+
+class FilePathMapping(CoreModel):
+    local_path: Annotated[
+        str,
+        Field(
+            description=(
+                "The path on the user's machine. Relative paths are resolved relative to"
+                " the parent directory of the configuration file"
+            )
+        ),
+    ]
+    path: Annotated[
+        str,
+        Field(
+            description=(
+                "The path in the container. Relative paths are resolved relative to"
+                " the repo directory (`/workflow`)"
+            )
+        ),
+    ]
+
+    @classmethod
+    def parse(cls, v: str) -> Self:
+        local_path: str
+        path: str
+        parts = v.split(":")
+        # A special case for Windows paths, e.g., `C:\path\to`, `c:/path/to`
+        if (
+            len(parts) > 1
+            and len(parts[0]) == 1
+            and parts[0] in string.ascii_letters
+            and parts[1][:1] in ["\\", "/"]
+        ):
+            parts = [f"{parts[0]}:{parts[1]}", *parts[2:]]
+        if len(parts) == 1:
+            local_path = path = parts[0]
+        elif len(parts) == 2:
+            local_path, path = parts
+        else:
+            raise ValueError(f"invalid file path mapping: {v}")
+        return cls(local_path=local_path, path=path)
+
+    @validator("path")
+    def validate_path(cls, v) -> str:
+        # True for `C:/.*`, False otherwise, including `/abs/unix/path`, `rel\windows\path`, etc.
+        if pathlib.PureWindowsPath(v).is_absolute():
+            raise ValueError(f"path must be a Unix file path: {v}")
+        return v
+
+
+class FileArchiveMapping(CoreModel):
+    id: Annotated[UUID, Field(description="The File archive ID")]
+    path: Annotated[str, Field(description="The path in the container")]
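
Note: a few examples of how FilePathMapping.parse resolves the mapping syntax, including the Windows drive-letter special case handled above:

    from dstack._internal.core.models.files import FilePathMapping

    # One path: used both locally and in the container.
    fm = FilePathMapping.parse(".env")
    assert (fm.local_path, fm.path) == (".env", ".env")

    # Two paths: local:container.
    fm = FilePathMapping.parse("~/data/train.csv:data/train.csv")
    assert (fm.local_path, fm.path) == ("~/data/train.csv", "data/train.csv")

    # A Windows drive letter on the local side is not treated as a separator.
    fm = FilePathMapping.parse(r"C:\data\train.csv:data/train.csv")
    assert (fm.local_path, fm.path) == (r"C:\data\train.csv", "data/train.csv")

    # The container path must be a Unix path; more than two parts is an error:
    # FilePathMapping.parse("a:b:c")  -> ValueError: invalid file path mapping: a:b:c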

dstack/_internal/core/models/runs.py
@@ -12,6 +12,7 @@ from dstack._internal.core.models.configurations import (
     AnyRunConfiguration,
     RunConfiguration,
 )
+from dstack._internal.core.models.files import FileArchiveMapping
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceType,
@@ -217,6 +218,15 @@ class JobSpec(CoreModel):
     volumes: Optional[List[MountPoint]] = None
     ssh_key: Optional[JobSSHKey] = None
     working_dir: Optional[str]
+    # `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility
+    # with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`.
+    # For --no-repo runs, `repo_data` is `VirtualRunRepoData()`.
+    repo_data: Annotated[Optional[AnyRunRepoData], Field(discriminator="repo_type")] = None
+    # `repo_code_hash` can be None because it is not used for the repo or because the job was
+    # submitted before 0.19.17. See `_get_repo_code_hash` on how to get the correct `repo_code_hash`
+    # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
+    repo_code_hash: Optional[str] = None
+    file_archives: list[FileArchiveMapping] = []
 
 
 class JobProvisioningData(CoreModel):
@@ -413,6 +423,10 @@ class RunSpec(CoreModel):
         Optional[str],
         Field(description="The hash of the repo diff. Can be omitted if there is no repo diff."),
     ] = None
+    file_archives: Annotated[
+        list[FileArchiveMapping],
+        Field(description="The list of file archive ID to container path mappings"),
+    ] = []
     working_dir: Annotated[
         Optional[str],
         Field(

dstack/_internal/core/models/secrets.py
@@ -1,9 +1,16 @@
+from typing import Optional
+from uuid import UUID
+
 from dstack._internal.core.models.common import CoreModel
 
 
 class Secret(CoreModel):
+    id: UUID
     name: str
-    value: str
+    value: Optional[str] = None
 
     def __str__(self) -> str:
-        return f'Secret(name="{self.name}", value={"*" * len(self.value)})'
+        displayed_value = "*"
+        if self.value is not None:
+            displayed_value = "*" * len(self.value)
+        return f'Secret(name="{self.name}", value={displayed_value})'

dstack/_internal/core/services/diff.py
@@ -1,14 +1,47 @@
-from typing import Any, Dict
+from typing import Any, Optional, TypedDict
 
 from pydantic import BaseModel
 
+from dstack._internal.core.models.common import IncludeExcludeType
+
+
+class ModelFieldDiff(TypedDict):
+    old: Any
+    new: Any
+
+
+ModelDiff = dict[str, ModelFieldDiff]
+
 
 # TODO: calculate nested diffs
-def diff_models(old: BaseModel, new: BaseModel) -> Dict[str, Any]:
+def diff_models(
+    old: BaseModel, new: BaseModel, ignore: Optional[IncludeExcludeType] = None
+) -> ModelDiff:
+    """
+    Returns a diff of model instances fields.
+
+    NOTE: `ignore` is implemented as `BaseModel.parse_obj(BaseModel.dict(exclude=ignore))`,
+    that is, the "ignored" fields are actually not ignored but reset to the default values
+    before comparison, meaning that 1) any field in `ignore` must have a default value,
+    2) the default value must be equal to itself (e.g. `math.nan` != `math.nan`).
+
+    Args:
+        old: The "old" model instance.
+        new: The "new" model instance.
+        ignore: Optional fields to ignore.
+
+    Returns:
+        A dict of changed fields in the form of
+        `{<field_name>: {"old": old_value, "new": new_value}}`
+    """
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")
 
-    changes = {}
+    if ignore is not None:
+        old = type(old).parse_obj(old.dict(exclude=ignore))
+        new = type(new).parse_obj(new.dict(exclude=ignore))
+
+    changes: ModelDiff = {}
     for field in old.__fields__:
         old_value = getattr(old, field)
         new_value = getattr(new, field)
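
Note: a short usage sketch of the extended diff_models; the Config model is illustrative, not from dstack:

    from pydantic import BaseModel

    from dstack._internal.core.services.diff import diff_models

    class Config(BaseModel):
        name: str
        replicas: int = 1
        tags: list = []  # must have a default to be usable in `ignore` (see NOTE above)

    old = Config(name="app", replicas=1, tags=["a"])
    new = Config(name="app", replicas=3, tags=["b"])

    print(diff_models(old, new))
    # {'replicas': {'old': 1, 'new': 3}, 'tags': {'old': ['a'], 'new': ['b']}}
    print(diff_models(old, new, ignore={"tags"}))
    # {'replicas': {'old': 1, 'new': 3}}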

dstack/_internal/server/app.py
@@ -2,6 +2,7 @@ import asyncio
 import importlib.resources
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Awaitable, Callable, List
@@ -23,6 +24,7 @@ from dstack._internal.server.background import start_background_tasks
 from dstack._internal.server.db import get_db, get_session_ctx, migrate
 from dstack._internal.server.routers import (
     backends,
+    files,
     fleets,
     gateways,
     instances,
@@ -96,6 +98,8 @@ def create_app() -> FastAPI:
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         configure_logging()
+        server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
+        asyncio.get_running_loop().set_default_executor(server_executor)
         await migrate()
         _print_dstack_logo()
         if not check_required_ssh_version():
@@ -197,6 +201,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(service_proxy.router, prefix="/proxy/services", tags=["service-proxy"])
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(prometheus.router)
+    app.include_router(files.router)
 
     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -240,6 +245,23 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response
 
+    if settings.SERVER_PROFILING_ENABLED:
+        from pyinstrument import Profiler
+
+        @app.middleware("http")
+        async def profile_request(request: Request, call_next):
+            profiling = request.query_params.get("profile", False)
+            if profiling:
+                profiler = Profiler()
+                profiler.start()
+                response = await call_next(request)
+                profiler.stop()
+                with open("profiling_results.html", "w+") as f:
+                    f.write(profiler.output_html())
+                return response
+            else:
+                return await call_next(request)
+
     # this middleware must be defined after the log_request middleware
     @app.middleware("http")
     async def log_http_metrics(request: Request, call_next):
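
Note: the lifespan change replaces the event loop's default executor, which bounds every loop.run_in_executor(None, ...) call; asyncio.to_thread routes through the same pool. A minimal sketch of the mechanism (the worker count is illustrative; the server reads it from settings.SERVER_EXECUTOR_MAX_WORKERS):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def blocking_io() -> str:
        # Placeholder for blocking work (file I/O, sync DB drivers, etc.).
        return "done"

    async def main() -> None:
        # Without this, asyncio lazily creates a default ThreadPoolExecutor
        # with min(32, os.cpu_count() + 4) workers.
        asyncio.get_running_loop().set_default_executor(ThreadPoolExecutor(max_workers=8))
        # Runs on the bounded pool above, as does loop.run_in_executor(None, ...).
        print(await asyncio.to_thread(blocking_io))

    asyncio.run(main())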

dstack/_internal/server/background/__init__.py
@@ -37,15 +37,31 @@ def get_scheduler() -> AsyncIOScheduler:
 
 
 def start_background_tasks() -> AsyncIOScheduler:
+    # We try to process as many resources as possible without exhausting DB connections.
+    #
+    # Quick tasks can process multiple resources per transaction.
+    # Potentially long tasks process one resource per transaction
+    # to avoid holding locks for all the resources if one is slow to process.
+    # Still, the next batch won't be processed unless all resources are processed,
+    # so larger batches do not increase processing rate linearly.
+    #
+    # The interval, batch_size, and max_instances determine background tasks processing rates.
+    # By default, one server replica can handle:
+    #
+    # * 150 active jobs with 2 minutes processing latency
+    # * 150 active runs with 2 minutes processing latency
+    # * 150 active instances with 2 minutes processing latency
+    #
+    # These latency numbers do not account for provisioning time,
+    # so it may be slower if a backend is slow to provision.
+    #
+    # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica.
+    # They also need to increase max db connections on the client side and db side.
+    #
     # In-memory locking via locksets does not guarantee
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.
 
-    # The batch_size and interval determine background tasks processing rates.
-    # Currently one server replica can handle:
-    # * 150 active jobs with up to 2 minutes processing latency
-    # * 150 active runs with up to 2 minutes processing latency
-    # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -53,38 +69,6 @@ def start_background_tasks() -> AsyncIOScheduler:
             collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
         )
         _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
-    # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
-    _scheduler.add_job(
-        process_submitted_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_running_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_terminating_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_runs,
-        IntervalTrigger(seconds=2, jitter=1),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_instances,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2))
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
     _scheduler.add_job(
         process_submitted_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5
@@ -93,5 +77,45 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
+        # Add multiple copies of tasks if requested.
+        # max_instances=1 for additional copies to avoid running too many tasks.
+        # Move other tasks here when they need per-replica scaling.
+        _scheduler.add_job(
+            process_submitted_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=4 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_running_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_terminating_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_runs,
+            IntervalTrigger(seconds=2, jitter=1),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_instances,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_fleets,
+            IntervalTrigger(seconds=10, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
     _scheduler.start()
     return _scheduler
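
Note: the throughput figures in the new comment follow from the defaults; rough arithmetic, ignoring provisioning time and DB contention:

    # batch_size=5 every ~4 seconds gives the per-task rate the old comment
    # stated explicitly (75 jobs/instances per minute):
    batch_size = 5
    interval_seconds = 4
    per_minute = batch_size * 60 / interval_seconds  # 75.0

    # At that rate, ~150 active jobs are each revisited about every
    # 150 / 75 = 2 minutes -- the "2 minutes processing latency" above.
    latency_minutes = 150 / per_minute  # 2.0

    # SERVER_BACKGROUND_PROCESSING_FACTOR adds extra task copies per replica,
    # scaling these rates at the cost of more DB connections.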

dstack/_internal/server/background/tasks/process_fleets.py
@@ -1,9 +1,12 @@
+import asyncio
+from datetime import timedelta
+
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.models.fleets import FleetStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
@@ -17,8 +20,18 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
-async def process_fleets():
-    lock, lockset = get_locker().get_lockset(FleetModel.__tablename__)
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+
+async def process_fleets(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_fleet())
+    await asyncio.gather(*tasks)
+
+
+async def _process_next_fleet():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -26,6 +39,8 @@ async def process_fleets():
                 .where(
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
+                    FleetModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(FleetModel.last_processed_at.asc())
                 .limit(1)
@@ -43,6 +58,7 @@
 
 
 async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+    logger.debug("Processing fleet %s", fleet_model.name)
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(

dstack/_internal/server/background/tasks/process_gateways.py
@@ -28,7 +28,7 @@ async def process_gateways_connections():
 
 
 async def process_submitted_gateways():
-    lock, lockset = get_locker().get_lockset(GatewayModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(