dstack 0.19.15rc1__py3-none-any.whl → 0.19.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/completion.py +5 -0
- dstack/_internal/cli/services/configurators/run.py +59 -17
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/core/backends/__init__.py +10 -4
- dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/compatibility/logs.py +15 -0
- dstack/_internal/core/compatibility/runs.py +31 -2
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +33 -2
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +24 -1
- dstack/_internal/core/models/secrets.py +9 -2
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +1 -1
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13
- dstack/_internal/server/background/tasks/process_runs.py +36 -5
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/models.py +33 -0
- dstack/_internal/server/routers/files.py +67 -0
- dstack/_internal/server/routers/gateways.py +6 -3
- dstack/_internal/server/routers/projects.py +63 -0
- dstack/_internal/server/routers/prometheus.py +5 -5
- dstack/_internal/server/routers/secrets.py +57 -15
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/logs.py +10 -1
- dstack/_internal/server/schemas/projects.py +12 -0
- dstack/_internal/server/schemas/runner.py +2 -0
- dstack/_internal/server/schemas/secrets.py +7 -11
- dstack/_internal/server/security/permissions.py +75 -2
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1 -1
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/__init__.py +19 -8
- dstack/_internal/server/services/jobs/configurators/base.py +27 -3
- dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
- dstack/_internal/server/services/logs/aws.py +38 -38
- dstack/_internal/server/services/logs/filelog.py +48 -14
- dstack/_internal/server/services/logs/gcp.py +17 -16
- dstack/_internal/server/services/projects.py +164 -5
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
- dstack/_internal/server/services/proxy/repo.py +3 -0
- dstack/_internal/server/services/runner/client.py +8 -0
- dstack/_internal/server/services/runs.py +55 -10
- dstack/_internal/server/services/secrets.py +204 -0
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/storage/base.py +21 -0
- dstack/_internal/server/services/storage/gcs.py +28 -6
- dstack/_internal/server/services/storage/s3.py +27 -9
- dstack/_internal/server/services/users.py +1 -3
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/settings.py +2 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-d151637af20f70b2e796.js} +104 -48
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-d151637af20f70b2e796.js.map} +1 -1
- dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-d48635d8fe670d53961c.css} +1 -1
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/_internal/server/testing/common.py +43 -5
- dstack/_internal/settings.py +5 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/path.py +12 -4
- dstack/api/_public/runs.py +73 -12
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_logs.py +5 -1
- dstack/api/server/_projects.py +24 -0
- dstack/api/server/_secrets.py +15 -15
- dstack/version.py +1 -1
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/METADATA +3 -4
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/RECORD +93 -71
- /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/WHEEL +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import re
|
|
3
|
+
import uuid
|
|
3
4
|
from collections.abc import Iterable
|
|
4
5
|
from datetime import timedelta, timezone
|
|
5
6
|
from typing import Dict, List, Optional
|
|
@@ -14,6 +15,7 @@ from dstack._internal.core.errors import GatewayError
|
|
|
14
15
|
from dstack._internal.core.models.backends.base import BackendType
|
|
15
16
|
from dstack._internal.core.models.common import NetworkMode, RegistryAuth
|
|
16
17
|
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
18
|
+
from dstack._internal.core.models.files import FileArchiveMapping
|
|
17
19
|
from dstack._internal.core.models.instances import (
|
|
18
20
|
InstanceStatus,
|
|
19
21
|
RemoteConnectionInfo,
|
|
@@ -42,8 +44,10 @@ from dstack._internal.server.models import (
|
|
|
42
44
|
ProjectModel,
|
|
43
45
|
RepoModel,
|
|
44
46
|
RunModel,
|
|
47
|
+
UserModel,
|
|
45
48
|
)
|
|
46
49
|
from dstack._internal.server.schemas.runner import GPUDevice, TaskStatus
|
|
50
|
+
from dstack._internal.server.services import files as files_services
|
|
47
51
|
from dstack._internal.server.services import logs as logs_services
|
|
48
52
|
from dstack._internal.server.services import services
|
|
49
53
|
from dstack._internal.server.services.instances import get_instance_ssh_private_keys
|
|
@@ -66,9 +70,10 @@ from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
|
66
70
|
from dstack._internal.server.services.runs import (
|
|
67
71
|
run_model_to_run,
|
|
68
72
|
)
|
|
73
|
+
from dstack._internal.server.services.secrets import get_project_secrets_mapping
|
|
69
74
|
from dstack._internal.server.services.storage import get_default_storage
|
|
70
75
|
from dstack._internal.utils import common as common_utils
|
|
71
|
-
from dstack._internal.utils.interpolator import VariablesInterpolator
|
|
76
|
+
from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
|
|
72
77
|
from dstack._internal.utils.logging import get_logger
|
|
73
78
|
|
|
74
79
|
logger = get_logger(__name__)
|
|
@@ -101,7 +106,7 @@ async def _process_next_running_job():
|
|
|
101
106
|
)
|
|
102
107
|
.order_by(JobModel.last_processed_at.asc())
|
|
103
108
|
.limit(1)
|
|
104
|
-
.with_for_update(skip_locked=True)
|
|
109
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
105
110
|
)
|
|
106
111
|
job_model = res.unique().scalar()
|
|
107
112
|
if job_model is None:
|
|
@@ -177,7 +182,17 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
177
182
|
common_utils.get_or_error(job_model.instance)
|
|
178
183
|
)
|
|
179
184
|
|
|
180
|
-
secrets =
|
|
185
|
+
secrets = await get_project_secrets_mapping(session=session, project=project)
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
_interpolate_secrets(secrets, job.job_spec)
|
|
189
|
+
except InterpolatorError as e:
|
|
190
|
+
logger.info("%s: terminating due to secrets interpolation error", fmt(job_model))
|
|
191
|
+
job_model.status = JobStatus.TERMINATING
|
|
192
|
+
job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
|
|
193
|
+
job_model.termination_reason_message = e.args[0]
|
|
194
|
+
job_model.last_processed_at = common_utils.get_current_datetime()
|
|
195
|
+
return
|
|
181
196
|
|
|
182
197
|
repo_creds_model = await get_repo_creds(session=session, repo=repo_model, user=run_model.user)
|
|
183
198
|
repo_creds = repo_model_to_repo_head_with_creds(repo_model, repo_creds_model).repo_creds
|
|
@@ -214,7 +229,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
214
229
|
job_model,
|
|
215
230
|
job_provisioning_data,
|
|
216
231
|
volumes,
|
|
217
|
-
secrets,
|
|
218
232
|
job.job_spec.registry_auth,
|
|
219
233
|
public_keys,
|
|
220
234
|
ssh_user,
|
|
@@ -226,12 +240,20 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
226
240
|
fmt(job_model),
|
|
227
241
|
job_submission.age,
|
|
228
242
|
)
|
|
243
|
+
# FIXME: downloading file archives and code here is a waste of time if
|
|
244
|
+
# the runner is not ready yet
|
|
245
|
+
file_archives = await _get_job_file_archives(
|
|
246
|
+
session=session,
|
|
247
|
+
archive_mappings=job.job_spec.file_archives,
|
|
248
|
+
user=run_model.user,
|
|
249
|
+
)
|
|
229
250
|
code = await _get_job_code(
|
|
230
251
|
session=session,
|
|
231
252
|
project=project,
|
|
232
253
|
repo=repo_model,
|
|
233
|
-
code_hash=run
|
|
254
|
+
code_hash=_get_repo_code_hash(run, job),
|
|
234
255
|
)
|
|
256
|
+
|
|
235
257
|
success = await common_utils.run_async(
|
|
236
258
|
_submit_job_to_runner,
|
|
237
259
|
server_ssh_private_keys,
|
|
@@ -242,6 +264,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
242
264
|
job,
|
|
243
265
|
cluster_info,
|
|
244
266
|
code,
|
|
267
|
+
file_archives,
|
|
245
268
|
secrets,
|
|
246
269
|
repo_creds,
|
|
247
270
|
success_if_not_available=False,
|
|
@@ -269,11 +292,18 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
269
292
|
logger.debug(
|
|
270
293
|
"%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
|
|
271
294
|
)
|
|
295
|
+
# FIXME: downloading file archives and code here is a waste of time if
|
|
296
|
+
# the runner is not ready yet
|
|
297
|
+
file_archives = await _get_job_file_archives(
|
|
298
|
+
session=session,
|
|
299
|
+
archive_mappings=job.job_spec.file_archives,
|
|
300
|
+
user=run_model.user,
|
|
301
|
+
)
|
|
272
302
|
code = await _get_job_code(
|
|
273
303
|
session=session,
|
|
274
304
|
project=project,
|
|
275
305
|
repo=repo_model,
|
|
276
|
-
code_hash=run
|
|
306
|
+
code_hash=_get_repo_code_hash(run, job),
|
|
277
307
|
)
|
|
278
308
|
success = await common_utils.run_async(
|
|
279
309
|
_process_pulling_with_shim,
|
|
@@ -285,6 +315,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
285
315
|
job,
|
|
286
316
|
cluster_info,
|
|
287
317
|
code,
|
|
318
|
+
file_archives,
|
|
288
319
|
secrets,
|
|
289
320
|
repo_creds,
|
|
290
321
|
server_ssh_private_keys,
|
|
@@ -306,8 +337,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
306
337
|
else:
|
|
307
338
|
if job_model.termination_reason:
|
|
308
339
|
logger.warning(
|
|
309
|
-
"%s: failed
|
|
340
|
+
"%s: failed due to %s, age=%s",
|
|
310
341
|
fmt(job_model),
|
|
342
|
+
job_model.termination_reason.value,
|
|
311
343
|
job_submission.age,
|
|
312
344
|
)
|
|
313
345
|
job_model.status = JobStatus.TERMINATING
|
|
@@ -450,7 +482,6 @@ def _process_provisioning_with_shim(
|
|
|
450
482
|
job_model: JobModel,
|
|
451
483
|
job_provisioning_data: JobProvisioningData,
|
|
452
484
|
volumes: List[Volume],
|
|
453
|
-
secrets: Dict[str, str],
|
|
454
485
|
registry_auth: Optional[RegistryAuth],
|
|
455
486
|
public_keys: List[str],
|
|
456
487
|
ssh_user: str,
|
|
@@ -476,10 +507,8 @@ def _process_provisioning_with_shim(
|
|
|
476
507
|
registry_username = ""
|
|
477
508
|
registry_password = ""
|
|
478
509
|
if registry_auth is not None:
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
registry_username = interpolate(registry_auth.username)
|
|
482
|
-
registry_password = interpolate(registry_auth.password)
|
|
510
|
+
registry_username = registry_auth.username
|
|
511
|
+
registry_password = registry_auth.password
|
|
483
512
|
|
|
484
513
|
volume_mounts: List[VolumeMountPoint] = []
|
|
485
514
|
instance_mounts: List[InstanceMountPoint] = []
|
|
@@ -588,6 +617,7 @@ def _process_pulling_with_shim(
|
|
|
588
617
|
job: Job,
|
|
589
618
|
cluster_info: ClusterInfo,
|
|
590
619
|
code: bytes,
|
|
620
|
+
file_archives: Iterable[tuple[uuid.UUID, bytes]],
|
|
591
621
|
secrets: Dict[str, str],
|
|
592
622
|
repo_credentials: Optional[RemoteRepoCreds],
|
|
593
623
|
server_ssh_private_keys: tuple[str, Optional[str]],
|
|
@@ -663,6 +693,7 @@ def _process_pulling_with_shim(
|
|
|
663
693
|
job=job,
|
|
664
694
|
cluster_info=cluster_info,
|
|
665
695
|
code=code,
|
|
696
|
+
file_archives=file_archives,
|
|
666
697
|
secrets=secrets,
|
|
667
698
|
repo_credentials=repo_credentials,
|
|
668
699
|
success_if_not_available=True,
|
|
@@ -826,6 +857,19 @@ def _get_cluster_info(
|
|
|
826
857
|
return cluster_info
|
|
827
858
|
|
|
828
859
|
|
|
860
|
+
def _get_repo_code_hash(run: Run, job: Job) -> Optional[str]:
    """
    Pick the repo code hash to use when fetching the code for `job`.

    The job spec's own `repo_code_hash` wins whenever it is set. When it is missing,
    the run spec's hash is used instead, but only if the run spec has one and the
    job's latest submission has the same `deployment_num` as the run. In every
    other case `None` is returned.
    """
    # TODO: drop this function when supporting jobs submitted before 0.19.17 is no longer relevant.
    job_code_hash = job.job_spec.repo_code_hash
    if job_code_hash is not None:
        return job_code_hash

    run_code_hash = run.run_spec.repo_code_hash
    if run_code_hash is None:
        return None

    # Only fall back to the run's hash for jobs of the run's current deployment.
    if job.job_submissions[-1].deployment_num != run.deployment_num:
        return None

    # The job spec does not have `repo_code_hash`, because it was submitted before 0.19.17.
    # Use `repo_code_hash` from the run.
    return run_code_hash
|
|
871
|
+
|
|
872
|
+
|
|
829
873
|
async def _get_job_code(
|
|
830
874
|
session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
|
|
831
875
|
) -> bytes:
|
|
@@ -853,6 +897,43 @@ async def _get_job_code(
|
|
|
853
897
|
return blob
|
|
854
898
|
|
|
855
899
|
|
|
900
|
+
async def _get_job_file_archives(
    session: AsyncSession,
    archive_mappings: Iterable[FileArchiveMapping],
    user: UserModel,
) -> list[tuple[uuid.UUID, bytes]]:
    """
    Fetch the blob of every file archive referenced by `archive_mappings`.

    Archives are fetched one at a time, in iteration order, through
    `_get_job_file_archive`. The result is a list of `(archive id, archive blob)`
    pairs in that same order.
    """
    return [
        (
            mapping.id,
            await _get_job_file_archive(session=session, archive_id=mapping.id, user=user),
        )
        for mapping in archive_mappings
    ]
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
async def _get_job_file_archive(
    session: AsyncSession, archive_id: uuid.UUID, user: UserModel
) -> bytes:
    """
    Return the contents of the file archive `archive_id` looked up for `user`.

    The blob stored on the archive model is returned when present. Otherwise the
    blob is fetched from the default storage using the archive's user id and
    blob hash.

    The lookup is best-effort: `b""` is returned when the archive model is not
    found, when no default storage is configured, or when the storage returns no
    blob. Each of these cases is logged as an error so that an empty archive
    can be traced back to its cause.
    """
    archive_model = await files_services.get_archive_model(session, id=archive_id, user=user)
    if archive_model is None:
        logger.error("Failed to get file archive %s: archive not found", archive_id)
        return b""
    if archive_model.blob is not None:
        return archive_model.blob
    # The blob is not stored on the model, so it has to come from the storage.
    storage = get_default_storage()
    if storage is None:
        logger.error(
            "Failed to get file archive %s: blob is not in the database"
            " and no default storage is configured",
            archive_id,
        )
        return b""
    blob = await common_utils.run_async(
        storage.get_archive,
        str(archive_model.user_id),
        archive_model.blob_hash,
    )
    if blob is None:
        logger.error("Failed to get file archive %s from storage", archive_id)
        return b""
    return blob
|
|
935
|
+
|
|
936
|
+
|
|
856
937
|
@runner_ssh_tunnel(ports=[DSTACK_RUNNER_HTTP_PORT], retries=1)
|
|
857
938
|
def _submit_job_to_runner(
|
|
858
939
|
ports: Dict[int, int],
|
|
@@ -861,6 +942,7 @@ def _submit_job_to_runner(
|
|
|
861
942
|
job: Job,
|
|
862
943
|
cluster_info: ClusterInfo,
|
|
863
944
|
code: bytes,
|
|
945
|
+
file_archives: Iterable[tuple[uuid.UUID, bytes]],
|
|
864
946
|
secrets: Dict[str, str],
|
|
865
947
|
repo_credentials: Optional[RemoteRepoCreds],
|
|
866
948
|
success_if_not_available: bool,
|
|
@@ -896,10 +978,15 @@ def _submit_job_to_runner(
|
|
|
896
978
|
run=run,
|
|
897
979
|
job=job,
|
|
898
980
|
cluster_info=cluster_info,
|
|
899
|
-
secrets
|
|
981
|
+
# Do not send all the secrets since interpolation is already done by the server.
|
|
982
|
+
# TODO: Passing secrets may be necessary for filtering out secret values from logs.
|
|
983
|
+
secrets={},
|
|
900
984
|
repo_credentials=repo_credentials,
|
|
901
985
|
instance_env=instance_env,
|
|
902
986
|
)
|
|
987
|
+
logger.debug("%s: uploading file archive(s)", fmt(job_model))
|
|
988
|
+
for archive_id, archive in file_archives:
|
|
989
|
+
runner_client.upload_archive(archive_id, archive)
|
|
903
990
|
logger.debug("%s: uploading code", fmt(job_model))
|
|
904
991
|
runner_client.upload_code(code)
|
|
905
992
|
logger.debug("%s: starting job", fmt(job_model))
|
|
@@ -911,6 +998,16 @@ def _submit_job_to_runner(
|
|
|
911
998
|
return True
|
|
912
999
|
|
|
913
1000
|
|
|
1001
|
+
def _interpolate_secrets(secrets: Dict[str, str], job_spec: JobSpec):
    """
    Substitute secret references in `job_spec`, modifying it in place.

    Every value of `job_spec.env` and, when `job_spec.registry_auth` is set, its
    username and password are passed through
    `VariablesInterpolator.interpolate_or_error`, with `secrets` exposed under the
    `secrets` namespace. Errors raised by the interpolator are not handled here
    and propagate to the caller.
    """
    resolve = VariablesInterpolator({"secrets": secrets}).interpolate_or_error

    # Build the new mapping first so `job_spec.env` is only replaced if every value resolves.
    resolved_env = {}
    for env_name, env_value in job_spec.env.items():
        resolved_env[env_name] = resolve(env_value)
    job_spec.env = resolved_env

    registry_auth = job_spec.registry_auth
    if registry_auth is None:
        return
    job_spec.registry_auth = RegistryAuth(
        username=resolve(registry_auth.username),
        password=resolve(registry_auth.password),
    )
|
|
1009
|
+
|
|
1010
|
+
|
|
914
1011
|
def _get_instance_specific_mounts(
|
|
915
1012
|
backend_type: BackendType, instance_type_name: str
|
|
916
1013
|
) -> List[InstanceMountPoint]:
|
|
@@ -27,6 +27,7 @@ from dstack._internal.server.services.jobs import (
|
|
|
27
27
|
group_jobs_by_replica_latest,
|
|
28
28
|
)
|
|
29
29
|
from dstack._internal.server.services.locking import get_locker
|
|
30
|
+
from dstack._internal.server.services.prometheus.client_metrics import run_metrics
|
|
30
31
|
from dstack._internal.server.services.runs import (
|
|
31
32
|
fmt,
|
|
32
33
|
process_terminating_run,
|
|
@@ -34,6 +35,7 @@ from dstack._internal.server.services.runs import (
|
|
|
34
35
|
run_model_to_run,
|
|
35
36
|
scale_run_replicas,
|
|
36
37
|
)
|
|
38
|
+
from dstack._internal.server.services.secrets import get_project_secrets_mapping
|
|
37
39
|
from dstack._internal.server.services.services import update_service_desired_replica_count
|
|
38
40
|
from dstack._internal.utils import common
|
|
39
41
|
from dstack._internal.utils.logging import get_logger
|
|
@@ -62,7 +64,7 @@ async def _process_next_run():
|
|
|
62
64
|
)
|
|
63
65
|
.order_by(RunModel.last_processed_at.asc())
|
|
64
66
|
.limit(1)
|
|
65
|
-
.with_for_update(skip_locked=True)
|
|
67
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
66
68
|
)
|
|
67
69
|
run_model = res.scalar()
|
|
68
70
|
if run_model is None:
|
|
@@ -74,7 +76,7 @@ async def _process_next_run():
|
|
|
74
76
|
JobModel.id.not_in(job_lockset),
|
|
75
77
|
)
|
|
76
78
|
.order_by(JobModel.id) # take locks in order
|
|
77
|
-
.with_for_update(skip_locked=True)
|
|
79
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
78
80
|
)
|
|
79
81
|
job_models = res.scalars().all()
|
|
80
82
|
if len(run_model.jobs) != len(job_models):
|
|
@@ -329,6 +331,24 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
|
|
|
329
331
|
run_model.status.name,
|
|
330
332
|
new_status.name,
|
|
331
333
|
)
|
|
334
|
+
if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
|
|
335
|
+
current_time = common.get_current_datetime()
|
|
336
|
+
submit_to_provision_duration = (
|
|
337
|
+
current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
|
|
338
|
+
).total_seconds()
|
|
339
|
+
logger.info(
|
|
340
|
+
"%s: run took %.2f seconds from submision to provisioning.",
|
|
341
|
+
fmt(run_model),
|
|
342
|
+
submit_to_provision_duration,
|
|
343
|
+
)
|
|
344
|
+
project_name = run_model.project.name
|
|
345
|
+
run_metrics.log_submit_to_provision_duration(
|
|
346
|
+
submit_to_provision_duration, project_name, run_spec.configuration.type
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
if new_status == RunStatus.PENDING:
|
|
350
|
+
run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
|
|
351
|
+
|
|
332
352
|
run_model.status = new_status
|
|
333
353
|
run_model.termination_reason = termination_reason
|
|
334
354
|
# While a run goes to pending without provisioning, resubmission_attempt increases.
|
|
@@ -385,7 +405,11 @@ async def _handle_run_replicas(
|
|
|
385
405
|
)
|
|
386
406
|
return
|
|
387
407
|
|
|
388
|
-
await _update_jobs_to_new_deployment_in_place(
|
|
408
|
+
await _update_jobs_to_new_deployment_in_place(
|
|
409
|
+
session=session,
|
|
410
|
+
run_model=run_model,
|
|
411
|
+
run_spec=run_spec,
|
|
412
|
+
)
|
|
389
413
|
if _has_out_of_date_replicas(run_model):
|
|
390
414
|
non_terminated_replica_count = len(
|
|
391
415
|
{j.replica_num for j in run_model.jobs if not j.status.is_finished()}
|
|
@@ -425,18 +449,25 @@ async def _handle_run_replicas(
|
|
|
425
449
|
)
|
|
426
450
|
|
|
427
451
|
|
|
428
|
-
async def _update_jobs_to_new_deployment_in_place(
|
|
452
|
+
async def _update_jobs_to_new_deployment_in_place(
|
|
453
|
+
session: AsyncSession, run_model: RunModel, run_spec: RunSpec
|
|
454
|
+
) -> None:
|
|
429
455
|
"""
|
|
430
456
|
Bump deployment_num for jobs that do not require redeployment.
|
|
431
457
|
"""
|
|
432
|
-
|
|
458
|
+
secrets = await get_project_secrets_mapping(
|
|
459
|
+
session=session,
|
|
460
|
+
project=run_model.project,
|
|
461
|
+
)
|
|
433
462
|
for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
|
|
434
463
|
if all(j.status.is_finished() for j in job_models):
|
|
435
464
|
continue
|
|
436
465
|
if all(j.deployment_num == run_model.deployment_num for j in job_models):
|
|
437
466
|
continue
|
|
467
|
+
# FIXME: Handle getting image configuration errors or skip it.
|
|
438
468
|
new_job_specs = await get_job_specs_from_run_spec(
|
|
439
469
|
run_spec=run_spec,
|
|
470
|
+
secrets=secrets,
|
|
440
471
|
replica_num=replica_num,
|
|
441
472
|
)
|
|
442
473
|
assert len(new_job_specs) == len(job_models), (
|
|
@@ -99,7 +99,7 @@ async def _process_next_submitted_job():
|
|
|
99
99
|
JobModel.id.not_in(lockset),
|
|
100
100
|
)
|
|
101
101
|
# Jobs are processed in FIFO order sorted by priority globally,
|
|
102
|
-
# thus runs from different
|
|
102
|
+
# thus runs from different projects can "overtake" each other by using higher priorities.
|
|
103
103
|
# That's not a big problem as long as projects do not compete for the same compute resources.
|
|
104
104
|
# Jobs with lower priorities from other projects will be processed without major lag
|
|
105
105
|
# as long as new higher priority runs are not constantly submitted.
|
|
@@ -108,7 +108,13 @@ async def _process_next_submitted_job():
|
|
|
108
108
|
# there can be many projects and we are limited by the max DB connections.
|
|
109
109
|
.order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
|
|
110
110
|
.limit(1)
|
|
111
|
-
.with_for_update(
|
|
111
|
+
.with_for_update(
|
|
112
|
+
skip_locked=True,
|
|
113
|
+
key_share=True,
|
|
114
|
+
# Do not lock joined run, only job.
|
|
115
|
+
# Locking run here may cause deadlock.
|
|
116
|
+
of=JobModel,
|
|
117
|
+
)
|
|
112
118
|
)
|
|
113
119
|
job_model = res.scalar()
|
|
114
120
|
if job_model is None:
|
|
@@ -201,7 +207,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
201
207
|
)
|
|
202
208
|
.options(lazyload(InstanceModel.jobs))
|
|
203
209
|
.order_by(InstanceModel.id) # take locks in order
|
|
204
|
-
.with_for_update()
|
|
210
|
+
.with_for_update(key_share=True)
|
|
205
211
|
)
|
|
206
212
|
pool_instances = list(res.unique().scalars().all())
|
|
207
213
|
instances_ids = sorted([i.id for i in pool_instances])
|
|
@@ -326,7 +332,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
326
332
|
.where(VolumeModel.id.in_(volumes_ids))
|
|
327
333
|
.options(selectinload(VolumeModel.user))
|
|
328
334
|
.order_by(VolumeModel.id) # take locks in order
|
|
329
|
-
.with_for_update()
|
|
335
|
+
.with_for_update(key_share=True)
|
|
330
336
|
)
|
|
331
337
|
async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
|
|
332
338
|
if len(volume_models) > 0:
|
|
@@ -45,7 +45,7 @@ async def _process_next_terminating_job():
|
|
|
45
45
|
)
|
|
46
46
|
.order_by(JobModel.last_processed_at.asc())
|
|
47
47
|
.limit(1)
|
|
48
|
-
.with_for_update(skip_locked=True)
|
|
48
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
49
49
|
)
|
|
50
50
|
job_model = res.scalar()
|
|
51
51
|
if job_model is None:
|
|
@@ -58,7 +58,7 @@ async def _process_next_terminating_job():
|
|
|
58
58
|
InstanceModel.id.not_in(instance_lockset),
|
|
59
59
|
)
|
|
60
60
|
.options(lazyload(InstanceModel.jobs))
|
|
61
|
-
.with_for_update(skip_locked=True)
|
|
61
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
62
62
|
)
|
|
63
63
|
instance_model = res.scalar()
|
|
64
64
|
if instance_model is None:
|
|
@@ -33,7 +33,7 @@ async def process_submitted_volumes():
|
|
|
33
33
|
)
|
|
34
34
|
.order_by(VolumeModel.last_processed_at.asc())
|
|
35
35
|
.limit(1)
|
|
36
|
-
.with_for_update(skip_locked=True)
|
|
36
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
37
37
|
)
|
|
38
38
|
volume_model = res.scalar()
|
|
39
39
|
if volume_model is None:
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Add FileArchiveModel
|
|
2
|
+
|
|
3
|
+
Revision ID: 5f1707c525d2
|
|
4
|
+
Revises: 35e90e1b0d3e
|
|
5
|
+
Create Date: 2025-06-12 12:28:26.678380
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
import sqlalchemy_utils
|
|
11
|
+
from alembic import op
|
|
12
|
+
|
|
13
|
+
# revision identifiers, used by Alembic.
|
|
14
|
+
revision = "5f1707c525d2"
|
|
15
|
+
down_revision = "35e90e1b0d3e"
|
|
16
|
+
branch_labels = None
|
|
17
|
+
depends_on = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def upgrade() -> None:
    """Create the `file_archives` table."""
    uuid_type = sqlalchemy_utils.types.uuid.UUIDType
    columns = [
        sa.Column("id", uuid_type(binary=False), nullable=False),
        sa.Column("user_id", uuid_type(binary=False), nullable=False),
        sa.Column("blob_hash", sa.Text(), nullable=False),
        sa.Column("blob", sa.LargeBinary(), nullable=True),
    ]
    constraints = [
        # Archive rows are deleted together with the referenced user.
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["users.id"],
            name=op.f("fk_file_archives_user_id_users"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_file_archives")),
        # At most one archive per (user, blob hash) pair.
        sa.UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"),
    ]
    op.create_table("file_archives", *columns, *constraints)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def downgrade() -> None:
    """Drop the `file_archives` table created by `upgrade`."""
    table_name = "file_archives"
    op.drop_table(table_name)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Add SecretModel
|
|
2
|
+
|
|
3
|
+
Revision ID: 644b8a114187
|
|
4
|
+
Revises: 5f1707c525d2
|
|
5
|
+
Create Date: 2025-06-30 11:00:04.326290
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
import sqlalchemy_utils
|
|
11
|
+
from alembic import op
|
|
12
|
+
|
|
13
|
+
import dstack._internal.server.models
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision = "644b8a114187"
|
|
17
|
+
down_revision = "5f1707c525d2"
|
|
18
|
+
branch_labels = None
|
|
19
|
+
depends_on = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
    """Create the `secrets` table."""
    # Originally auto generated by Alembic.
    uuid_type = sqlalchemy_utils.types.uuid.UUIDType
    server_models = dstack._internal.server.models
    columns = [
        sa.Column("id", uuid_type(binary=False), nullable=False),
        sa.Column("project_id", uuid_type(binary=False), nullable=False),
        sa.Column("created_at", server_models.NaiveDateTime(), nullable=False),
        sa.Column("updated_at", server_models.NaiveDateTime(), nullable=False),
        sa.Column("name", sa.String(length=200), nullable=False),
        sa.Column("value", server_models.EncryptedString(), nullable=False),
    ]
    constraints = [
        # Secret rows are deleted together with the referenced project.
        sa.ForeignKeyConstraint(
            ["project_id"],
            ["projects.id"],
            name=op.f("fk_secrets_project_id_projects"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_secrets")),
        # Secret names are unique within a project.
        sa.UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),
    ]
    op.create_table("secrets", *columns, *constraints)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def downgrade() -> None:
    """Drop the `secrets` table created by `upgrade`."""
    # Originally auto generated by Alembic.
    table_name = "secrets"
    op.drop_table(table_name)
|
|
@@ -315,6 +315,21 @@ class CodeModel(BaseModel):
|
|
|
315
315
|
blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3
|
|
316
316
|
|
|
317
317
|
|
|
318
|
+
class FileArchiveModel(BaseModel):
    """
    A file archive associated with a user.

    Archives are unique per `(user_id, blob_hash)` pair, and the foreign key to
    `users.id` is declared with `ON DELETE CASCADE`.
    """

    __tablename__ = "file_archives"
    __table_args__ = (
        UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUIDType(binary=False), primary_key=True, default=uuid.uuid4
    )
    # Foreign-key column: annotated with the key's value type (a UUID), not with
    # the related model. The related model is exposed through `user`.
    user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"))
    user: Mapped["UserModel"] = relationship()
    blob_hash: Mapped[str] = mapped_column(Text)
    blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary)  # None means blob is stored on s3
|
|
331
|
+
|
|
332
|
+
|
|
318
333
|
class RunModel(BaseModel):
|
|
319
334
|
__tablename__ = "runs"
|
|
320
335
|
|
|
@@ -711,3 +726,21 @@ class JobPrometheusMetrics(BaseModel):
|
|
|
711
726
|
collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
712
727
|
# Raw Prometheus text response
|
|
713
728
|
text: Mapped[str] = mapped_column(Text)
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
class SecretModel(BaseModel):
    """
    A named secret belonging to a project.

    Names are unique within a project, and the foreign key to `projects.id` is
    declared with `ON DELETE CASCADE`.
    """

    __tablename__ = "secrets"
    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "name",
            name="uq_secrets_project_id_name",
        ),
    )

    id: Mapped[uuid.UUID] = mapped_column(
        UUIDType(binary=False),
        primary_key=True,
        default=uuid.uuid4,
    )

    # Owning project.
    project_id: Mapped[uuid.UUID] = mapped_column(
        ForeignKey("projects.id", ondelete="CASCADE"),
    )
    project: Mapped["ProjectModel"] = relationship()

    # Both timestamps default to the current time on insert. No `onupdate` is
    # configured on `updated_at`, so it changes only when assigned explicitly.
    created_at: Mapped[datetime] = mapped_column(
        NaiveDateTime,
        default=get_current_datetime,
    )
    updated_at: Mapped[datetime] = mapped_column(
        NaiveDateTime,
        default=get_current_datetime,
    )

    name: Mapped[str] = mapped_column(String(200))
    # Stored through the `EncryptedString` column type and read back as `DecryptedString`.
    value: Mapped[DecryptedString] = mapped_column(EncryptedString())
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Depends, Request, UploadFile
|
|
4
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
|
+
|
|
6
|
+
from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
|
|
7
|
+
from dstack._internal.core.models.files import FileArchive
|
|
8
|
+
from dstack._internal.server.db import get_session
|
|
9
|
+
from dstack._internal.server.models import UserModel
|
|
10
|
+
from dstack._internal.server.schemas.files import GetFileArchiveByHashRequest
|
|
11
|
+
from dstack._internal.server.security.permissions import Authenticated
|
|
12
|
+
from dstack._internal.server.services import files
|
|
13
|
+
from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
|
|
14
|
+
from dstack._internal.server.utils.routers import (
|
|
15
|
+
get_base_api_additional_responses,
|
|
16
|
+
get_request_size,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.utils.common import sizeof_fmt
|
|
19
|
+
|
|
20
|
+
# Router for the file-archive endpoints; every route below is served under /api/files.
router = APIRouter(
    prefix="/api/files",
    tags=["files"],
    responses=get_base_api_additional_responses(),
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@router.post("/get_archive_by_hash")
async def get_archive_by_hash(
    body: GetFileArchiveByHashRequest,
    session: Annotated[AsyncSession, Depends(get_session)],
    user: Annotated[UserModel, Depends(Authenticated())],
) -> FileArchive:
    # Looks up the authenticated user's archive by ``body.hash`` through the files
    # service and returns it; raises ResourceNotExistsError when the service
    # returns None.
    found = await files.get_archive_by_hash(session=session, user=user, hash=body.hash)
    if found is not None:
        return found
    raise ResourceNotExistsError()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@router.post("/upload_archive")
async def upload_archive(
    request: Request,
    file: UploadFile,
    session: Annotated[AsyncSession, Depends(get_session)],
    user: Annotated[UserModel, Depends(Authenticated())],
) -> FileArchive:
    # Stores the uploaded archive for the authenticated user via the files service.
    # Requests whose size exceeds SERVER_CODE_UPLOAD_LIMIT are rejected with
    # ServerClientError; a non-positive limit disables the check.
    size = get_request_size(request)
    limit = SERVER_CODE_UPLOAD_LIMIT
    limit_enabled = limit > 0
    if limit_enabled and size > limit:
        size_fmt, limit_fmt = sizeof_fmt(size), sizeof_fmt(limit)
        if size_fmt == limit_fmt:
            # The human-readable forms are identical, which would make the message
            # confusing; fall back to exact byte counts.
            size_fmt, limit_fmt = f"{size}B", f"{limit}B"
        raise ServerClientError(
            f"Archive size is {size_fmt}, which exceeds the limit of {limit_fmt}."
            " Use .gitignore/.dstackignore to exclude large files."
            " This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable."
        )
    return await files.upload_archive(session=session, user=user, file=file)
|
|
@@ -9,7 +9,10 @@ import dstack._internal.server.services.gateways as gateways
|
|
|
9
9
|
from dstack._internal.core.errors import ResourceNotExistsError
|
|
10
10
|
from dstack._internal.server.db import get_session
|
|
11
11
|
from dstack._internal.server.models import ProjectModel, UserModel
|
|
12
|
-
from dstack._internal.server.security.permissions import
|
|
12
|
+
from dstack._internal.server.security.permissions import (
|
|
13
|
+
ProjectAdmin,
|
|
14
|
+
ProjectMemberOrPublicAccess,
|
|
15
|
+
)
|
|
13
16
|
from dstack._internal.server.utils.routers import get_base_api_additional_responses
|
|
14
17
|
|
|
15
18
|
router = APIRouter(
|
|
@@ -22,7 +25,7 @@ router = APIRouter(
|
|
|
22
25
|
@router.post("/list")
|
|
23
26
|
async def list_gateways(
|
|
24
27
|
session: AsyncSession = Depends(get_session),
|
|
25
|
-
user_project: Tuple[UserModel, ProjectModel] = Depends(
|
|
28
|
+
user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
|
|
26
29
|
) -> List[models.Gateway]:
|
|
27
30
|
_, project = user_project
|
|
28
31
|
return await gateways.list_project_gateways(session=session, project=project)
|
|
@@ -32,7 +35,7 @@ async def list_gateways(
|
|
|
32
35
|
async def get_gateway(
|
|
33
36
|
body: schemas.GetGatewayRequest,
|
|
34
37
|
session: AsyncSession = Depends(get_session),
|
|
35
|
-
user_project: Tuple[UserModel, ProjectModel] = Depends(
|
|
38
|
+
user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
|
|
36
39
|
) -> models.Gateway:
|
|
37
40
|
_, project = user_project
|
|
38
41
|
gateway = await gateways.get_gateway_by_name(session=session, project=project, name=body.name)
|