dstack 0.19.15rc1__py3-none-any.whl → 0.19.17__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (93)
  1. dstack/_internal/cli/commands/secrets.py +92 -0
  2. dstack/_internal/cli/main.py +2 -0
  3. dstack/_internal/cli/services/completion.py +5 -0
  4. dstack/_internal/cli/services/configurators/run.py +59 -17
  5. dstack/_internal/cli/utils/secrets.py +25 -0
  6. dstack/_internal/core/backends/__init__.py +10 -4
  7. dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
  8. dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
  9. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  10. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  11. dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
  12. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  13. dstack/_internal/core/backends/configurators.py +9 -0
  14. dstack/_internal/core/backends/models.py +7 -0
  15. dstack/_internal/core/compatibility/logs.py +15 -0
  16. dstack/_internal/core/compatibility/runs.py +31 -2
  17. dstack/_internal/core/models/backends/base.py +2 -0
  18. dstack/_internal/core/models/configurations.py +33 -2
  19. dstack/_internal/core/models/files.py +67 -0
  20. dstack/_internal/core/models/logs.py +2 -1
  21. dstack/_internal/core/models/runs.py +24 -1
  22. dstack/_internal/core/models/secrets.py +9 -2
  23. dstack/_internal/server/app.py +2 -0
  24. dstack/_internal/server/background/tasks/process_fleets.py +1 -1
  25. dstack/_internal/server/background/tasks/process_gateways.py +1 -1
  26. dstack/_internal/server/background/tasks/process_instances.py +1 -1
  27. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  28. dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13
  29. dstack/_internal/server/background/tasks/process_runs.py +36 -5
  30. dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
  31. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  32. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  33. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  34. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  35. dstack/_internal/server/models.py +33 -0
  36. dstack/_internal/server/routers/files.py +67 -0
  37. dstack/_internal/server/routers/gateways.py +6 -3
  38. dstack/_internal/server/routers/projects.py +63 -0
  39. dstack/_internal/server/routers/prometheus.py +5 -5
  40. dstack/_internal/server/routers/secrets.py +57 -15
  41. dstack/_internal/server/schemas/files.py +5 -0
  42. dstack/_internal/server/schemas/logs.py +10 -1
  43. dstack/_internal/server/schemas/projects.py +12 -0
  44. dstack/_internal/server/schemas/runner.py +2 -0
  45. dstack/_internal/server/schemas/secrets.py +7 -11
  46. dstack/_internal/server/security/permissions.py +75 -2
  47. dstack/_internal/server/services/backends/__init__.py +1 -1
  48. dstack/_internal/server/services/files.py +91 -0
  49. dstack/_internal/server/services/fleets.py +1 -1
  50. dstack/_internal/server/services/gateways/__init__.py +1 -1
  51. dstack/_internal/server/services/jobs/__init__.py +19 -8
  52. dstack/_internal/server/services/jobs/configurators/base.py +27 -3
  53. dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
  54. dstack/_internal/server/services/logs/aws.py +38 -38
  55. dstack/_internal/server/services/logs/filelog.py +48 -14
  56. dstack/_internal/server/services/logs/gcp.py +17 -16
  57. dstack/_internal/server/services/projects.py +164 -5
  58. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  59. dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
  60. dstack/_internal/server/services/proxy/repo.py +3 -0
  61. dstack/_internal/server/services/runner/client.py +8 -0
  62. dstack/_internal/server/services/runs.py +55 -10
  63. dstack/_internal/server/services/secrets.py +204 -0
  64. dstack/_internal/server/services/services/__init__.py +2 -1
  65. dstack/_internal/server/services/storage/base.py +21 -0
  66. dstack/_internal/server/services/storage/gcs.py +28 -6
  67. dstack/_internal/server/services/storage/s3.py +27 -9
  68. dstack/_internal/server/services/users.py +1 -3
  69. dstack/_internal/server/services/volumes.py +1 -1
  70. dstack/_internal/server/settings.py +2 -2
  71. dstack/_internal/server/statics/index.html +1 -1
  72. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-d151637af20f70b2e796.js} +104 -48
  73. dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-d151637af20f70b2e796.js.map} +1 -1
  74. dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-d48635d8fe670d53961c.css} +1 -1
  75. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  76. dstack/_internal/server/testing/common.py +43 -5
  77. dstack/_internal/settings.py +5 -0
  78. dstack/_internal/utils/files.py +69 -0
  79. dstack/_internal/utils/nested_list.py +47 -0
  80. dstack/_internal/utils/path.py +12 -4
  81. dstack/api/_public/runs.py +73 -12
  82. dstack/api/server/__init__.py +6 -0
  83. dstack/api/server/_files.py +18 -0
  84. dstack/api/server/_logs.py +5 -1
  85. dstack/api/server/_projects.py +24 -0
  86. dstack/api/server/_secrets.py +15 -15
  87. dstack/version.py +1 -1
  88. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/METADATA +3 -4
  89. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/RECORD +93 -71
  90. /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
  91. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13

@@ -1,5 +1,6 @@
  import asyncio
  import re
+ import uuid
  from collections.abc import Iterable
  from datetime import timedelta, timezone
  from typing import Dict, List, Optional
@@ -14,6 +15,7 @@ from dstack._internal.core.errors import GatewayError
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.common import NetworkMode, RegistryAuth
  from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
+ from dstack._internal.core.models.files import FileArchiveMapping
  from dstack._internal.core.models.instances import (
      InstanceStatus,
      RemoteConnectionInfo,
@@ -42,8 +44,10 @@ from dstack._internal.server.models import (
      ProjectModel,
      RepoModel,
      RunModel,
+     UserModel,
  )
  from dstack._internal.server.schemas.runner import GPUDevice, TaskStatus
+ from dstack._internal.server.services import files as files_services
  from dstack._internal.server.services import logs as logs_services
  from dstack._internal.server.services import services
  from dstack._internal.server.services.instances import get_instance_ssh_private_keys
@@ -66,9 +70,10 @@ from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
  from dstack._internal.server.services.runs import (
      run_model_to_run,
  )
+ from dstack._internal.server.services.secrets import get_project_secrets_mapping
  from dstack._internal.server.services.storage import get_default_storage
  from dstack._internal.utils import common as common_utils
- from dstack._internal.utils.interpolator import VariablesInterpolator
+ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
  from dstack._internal.utils.logging import get_logger

  logger = get_logger(__name__)
@@ -101,7 +106,7 @@ async def _process_next_running_job():
          )
          .order_by(JobModel.last_processed_at.asc())
          .limit(1)
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      job_model = res.unique().scalar()
      if job_model is None:
@@ -177,7 +182,17 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
          common_utils.get_or_error(job_model.instance)
      )

-     secrets = {} # TODO secrets
+     secrets = await get_project_secrets_mapping(session=session, project=project)
+
+     try:
+         _interpolate_secrets(secrets, job.job_spec)
+     except InterpolatorError as e:
+         logger.info("%s: terminating due to secrets interpolation error", fmt(job_model))
+         job_model.status = JobStatus.TERMINATING
+         job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+         job_model.termination_reason_message = e.args[0]
+         job_model.last_processed_at = common_utils.get_current_datetime()
+         return

      repo_creds_model = await get_repo_creds(session=session, repo=repo_model, user=run_model.user)
      repo_creds = repo_model_to_repo_head_with_creds(repo_model, repo_creds_model).repo_creds
@@ -214,7 +229,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
              job_model,
              job_provisioning_data,
              volumes,
-             secrets,
              job.job_spec.registry_auth,
              public_keys,
              ssh_user,
@@ -226,12 +240,20 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                  fmt(job_model),
                  job_submission.age,
              )
+             # FIXME: downloading file archives and code here is a waste of time if
+             # the runner is not ready yet
+             file_archives = await _get_job_file_archives(
+                 session=session,
+                 archive_mappings=job.job_spec.file_archives,
+                 user=run_model.user,
+             )
              code = await _get_job_code(
                  session=session,
                  project=project,
                  repo=repo_model,
-                 code_hash=run.run_spec.repo_code_hash,
+                 code_hash=_get_repo_code_hash(run, job),
              )
+
              success = await common_utils.run_async(
                  _submit_job_to_runner,
                  server_ssh_private_keys,
@@ -242,6 +264,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                  job,
                  cluster_info,
                  code,
+                 file_archives,
                  secrets,
                  repo_creds,
                  success_if_not_available=False,
@@ -269,11 +292,18 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
              logger.debug(
                  "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
              )
+             # FIXME: downloading file archives and code here is a waste of time if
+             # the runner is not ready yet
+             file_archives = await _get_job_file_archives(
+                 session=session,
+                 archive_mappings=job.job_spec.file_archives,
+                 user=run_model.user,
+             )
              code = await _get_job_code(
                  session=session,
                  project=project,
                  repo=repo_model,
-                 code_hash=run.run_spec.repo_code_hash,
+                 code_hash=_get_repo_code_hash(run, job),
              )
              success = await common_utils.run_async(
                  _process_pulling_with_shim,
@@ -285,6 +315,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                  job,
                  cluster_info,
                  code,
+                 file_archives,
                  secrets,
                  repo_creds,
                  server_ssh_private_keys,
@@ -306,8 +337,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
      else:
          if job_model.termination_reason:
              logger.warning(
-                 "%s: failed because shim/runner returned an error, age=%s",
+                 "%s: failed due to %s, age=%s",
                  fmt(job_model),
+                 job_model.termination_reason.value,
                  job_submission.age,
              )
              job_model.status = JobStatus.TERMINATING
@@ -450,7 +482,6 @@ def _process_provisioning_with_shim(
      job_model: JobModel,
      job_provisioning_data: JobProvisioningData,
      volumes: List[Volume],
-     secrets: Dict[str, str],
      registry_auth: Optional[RegistryAuth],
      public_keys: List[str],
      ssh_user: str,
@@ -476,10 +507,8 @@ def _process_provisioning_with_shim(
      registry_username = ""
      registry_password = ""
      if registry_auth is not None:
-         logger.debug("%s: authenticating to the registry...", fmt(job_model))
-         interpolate = VariablesInterpolator({"secrets": secrets}).interpolate
-         registry_username = interpolate(registry_auth.username)
-         registry_password = interpolate(registry_auth.password)
+         registry_username = registry_auth.username
+         registry_password = registry_auth.password

      volume_mounts: List[VolumeMountPoint] = []
      instance_mounts: List[InstanceMountPoint] = []
@@ -588,6 +617,7 @@ def _process_pulling_with_shim(
      job: Job,
      cluster_info: ClusterInfo,
      code: bytes,
+     file_archives: Iterable[tuple[uuid.UUID, bytes]],
      secrets: Dict[str, str],
      repo_credentials: Optional[RemoteRepoCreds],
      server_ssh_private_keys: tuple[str, Optional[str]],
@@ -663,6 +693,7 @@ def _process_pulling_with_shim(
          job=job,
          cluster_info=cluster_info,
          code=code,
+         file_archives=file_archives,
          secrets=secrets,
          repo_credentials=repo_credentials,
          success_if_not_available=True,
@@ -826,6 +857,19 @@ def _get_cluster_info(
      return cluster_info


+ def _get_repo_code_hash(run: Run, job: Job) -> Optional[str]:
+     # TODO: drop this function when supporting jobs submitted before 0.19.17 is no longer relevant.
+     if (
+         job.job_spec.repo_code_hash is None
+         and run.run_spec.repo_code_hash is not None
+         and job.job_submissions[-1].deployment_num == run.deployment_num
+     ):
+         # The job spec does not have `repo_code_hash`, because it was submitted before 0.19.17.
+         # Use `repo_code_hash` from the run.
+         return run.run_spec.repo_code_hash
+     return job.job_spec.repo_code_hash
+
+
  async def _get_job_code(
      session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
  ) -> bytes:
@@ -853,6 +897,43 @@ async def _get_job_code(
      return blob


+ async def _get_job_file_archives(
+     session: AsyncSession,
+     archive_mappings: Iterable[FileArchiveMapping],
+     user: UserModel,
+ ) -> list[tuple[uuid.UUID, bytes]]:
+     archives: list[tuple[uuid.UUID, bytes]] = []
+     for archive_mapping in archive_mappings:
+         archive_id = archive_mapping.id
+         archive_blob = await _get_job_file_archive(
+             session=session, archive_id=archive_id, user=user
+         )
+         archives.append((archive_id, archive_blob))
+     return archives
+
+
+ async def _get_job_file_archive(
+     session: AsyncSession, archive_id: uuid.UUID, user: UserModel
+ ) -> bytes:
+     archive_model = await files_services.get_archive_model(session, id=archive_id, user=user)
+     if archive_model is None:
+         return b""
+     if archive_model.blob is not None:
+         return archive_model.blob
+     storage = get_default_storage()
+     if storage is None:
+         return b""
+     blob = await common_utils.run_async(
+         storage.get_archive,
+         str(archive_model.user_id),
+         archive_model.blob_hash,
+     )
+     if blob is None:
+         logger.error("Failed to get file archive %s from storage", archive_id)
+         return b""
+     return blob
+
+
  @runner_ssh_tunnel(ports=[DSTACK_RUNNER_HTTP_PORT], retries=1)
  def _submit_job_to_runner(
      ports: Dict[int, int],
@@ -861,6 +942,7 @@ def _submit_job_to_runner(
      job: Job,
      cluster_info: ClusterInfo,
      code: bytes,
+     file_archives: Iterable[tuple[uuid.UUID, bytes]],
      secrets: Dict[str, str],
      repo_credentials: Optional[RemoteRepoCreds],
      success_if_not_available: bool,
@@ -896,10 +978,15 @@ def _submit_job_to_runner(
          run=run,
          job=job,
          cluster_info=cluster_info,
-         secrets=secrets,
+         # Do not send all the secrets since interpolation is already done by the server.
+         # TODO: Passing secrets may be necessary for filtering out secret values from logs.
+         secrets={},
          repo_credentials=repo_credentials,
          instance_env=instance_env,
      )
+     logger.debug("%s: uploading file archive(s)", fmt(job_model))
+     for archive_id, archive in file_archives:
+         runner_client.upload_archive(archive_id, archive)
      logger.debug("%s: uploading code", fmt(job_model))
      runner_client.upload_code(code)
      logger.debug("%s: starting job", fmt(job_model))
@@ -911,6 +998,16 @@ def _submit_job_to_runner(
      return True


+ def _interpolate_secrets(secrets: Dict[str, str], job_spec: JobSpec):
+     interpolate = VariablesInterpolator({"secrets": secrets}).interpolate_or_error
+     job_spec.env = {k: interpolate(v) for k, v in job_spec.env.items()}
+     if job_spec.registry_auth is not None:
+         job_spec.registry_auth = RegistryAuth(
+             username=interpolate(job_spec.registry_auth.username),
+             password=interpolate(job_spec.registry_auth.password),
+         )
+
+
  def _get_instance_specific_mounts(
      backend_type: BackendType, instance_type_name: str
  ) -> List[InstanceMountPoint]:
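Note: with this change the server resolves `${{ secrets.* }}` references itself (see `_interpolate_secrets` above) instead of shipping the full secrets mapping to the runner. For context, here is a minimal standalone sketch of that kind of substitution; it does not use dstack's actual `VariablesInterpolator`, and the regex, error type, and names are illustrative assumptions only.

```python
import re
from typing import Dict


class InterpolationError(ValueError):
    """Raised when a referenced secret is missing (illustrative stand-in)."""


def interpolate_secrets(value: str, secrets: Dict[str, str]) -> str:
    # Replace ${{ secrets.NAME }} with the secret value, failing on unknown names.
    pattern = re.compile(r"\$\{\{\s*secrets\.([A-Za-z0-9_]+)\s*\}\}")

    def _sub(match: re.Match) -> str:
        name = match.group(1)
        if name not in secrets:
            raise InterpolationError(f"Secret {name} is not set in the project")
        return secrets[name]

    return pattern.sub(_sub, value)


# Example: env values are resolved before the job spec is sent to the runner.
env = {"HF_TOKEN": "${{ secrets.hf_token }}"}
resolved = {k: interpolate_secrets(v, {"hf_token": "hf_abc123"}) for k, v in env.items()}
print(resolved)  # {'HF_TOKEN': 'hf_abc123'}
```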
dstack/_internal/server/background/tasks/process_runs.py +36 -5

@@ -27,6 +27,7 @@ from dstack._internal.server.services.jobs import (
      group_jobs_by_replica_latest,
  )
  from dstack._internal.server.services.locking import get_locker
+ from dstack._internal.server.services.prometheus.client_metrics import run_metrics
  from dstack._internal.server.services.runs import (
      fmt,
      process_terminating_run,
@@ -34,6 +35,7 @@ from dstack._internal.server.services.runs import (
      run_model_to_run,
      scale_run_replicas,
  )
+ from dstack._internal.server.services.secrets import get_project_secrets_mapping
  from dstack._internal.server.services.services import update_service_desired_replica_count
  from dstack._internal.utils import common
  from dstack._internal.utils.logging import get_logger
@@ -62,7 +64,7 @@ async def _process_next_run():
          )
          .order_by(RunModel.last_processed_at.asc())
          .limit(1)
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      run_model = res.scalar()
      if run_model is None:
@@ -74,7 +76,7 @@ async def _process_next_run():
              JobModel.id.not_in(job_lockset),
          )
          .order_by(JobModel.id) # take locks in order
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      job_models = res.scalars().all()
      if len(run_model.jobs) != len(job_models):
@@ -329,6 +331,24 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
          run_model.status.name,
          new_status.name,
      )
+     if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
+         current_time = common.get_current_datetime()
+         submit_to_provision_duration = (
+             current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
+         ).total_seconds()
+         logger.info(
+             "%s: run took %.2f seconds from submision to provisioning.",
+             fmt(run_model),
+             submit_to_provision_duration,
+         )
+         project_name = run_model.project.name
+         run_metrics.log_submit_to_provision_duration(
+             submit_to_provision_duration, project_name, run_spec.configuration.type
+         )
+
+     if new_status == RunStatus.PENDING:
+         run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
+
      run_model.status = new_status
      run_model.termination_reason = termination_reason
      # While a run goes to pending without provisioning, resubmission_attempt increases.
@@ -385,7 +405,11 @@ async def _handle_run_replicas(
          )
          return

-     await _update_jobs_to_new_deployment_in_place(run_model, run_spec)
+     await _update_jobs_to_new_deployment_in_place(
+         session=session,
+         run_model=run_model,
+         run_spec=run_spec,
+     )
      if _has_out_of_date_replicas(run_model):
          non_terminated_replica_count = len(
              {j.replica_num for j in run_model.jobs if not j.status.is_finished()}
@@ -425,18 +449,25 @@ async def _handle_run_replicas(
      )


- async def _update_jobs_to_new_deployment_in_place(run_model: RunModel, run_spec: RunSpec) -> None:
+ async def _update_jobs_to_new_deployment_in_place(
+     session: AsyncSession, run_model: RunModel, run_spec: RunSpec
+ ) -> None:
      """
      Bump deployment_num for jobs that do not require redeployment.
      """
-
+     secrets = await get_project_secrets_mapping(
+         session=session,
+         project=run_model.project,
+     )
      for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
          if all(j.status.is_finished() for j in job_models):
              continue
          if all(j.deployment_num == run_model.deployment_num for j in job_models):
              continue
+         # FIXME: Handle getting image configuration errors or skip it.
          new_job_specs = await get_job_specs_from_run_spec(
              run_spec=run_spec,
+             secrets=secrets,
              replica_num=replica_num,
          )
          assert len(new_job_specs) == len(job_models), (
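Note: `run_metrics` comes from the new `services/prometheus/client_metrics.py` module (added in this release but not shown in these hunks). A plausible sketch of such helpers built on `prometheus_client` is below; the metric names, label sets, and class shape are assumptions for illustration, not dstack's actual definitions.

```python
from prometheus_client import Counter, Histogram


class RunMetrics:
    """Illustrative wrapper around Prometheus client metrics for run processing."""

    def __init__(self) -> None:
        # Metric names and labels are hypothetical, not dstack's actual ones.
        self._submit_to_provision = Histogram(
            "run_submit_to_provision_duration_seconds",
            "Time from run submission to provisioning",
            labelnames=["project", "run_type"],
        )
        self._pending_runs = Counter(
            "pending_runs_total",
            "Number of runs that entered the pending state",
            labelnames=["project", "run_type"],
        )

    def log_submit_to_provision_duration(
        self, duration: float, project_name: str, run_type: str
    ) -> None:
        self._submit_to_provision.labels(project=project_name, run_type=run_type).observe(duration)

    def increment_pending_runs(self, project_name: str, run_type: str) -> None:
        self._pending_runs.labels(project=project_name, run_type=run_type).inc()


run_metrics = RunMetrics()
```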
dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4

@@ -99,7 +99,7 @@ async def _process_next_submitted_job():
              JobModel.id.not_in(lockset),
          )
          # Jobs are process in FIFO sorted by priority globally,
-         # thus runs from different project can "overtake" each other by using higher priorities.
+         # thus runs from different projects can "overtake" each other by using higher priorities.
          # That's not a big problem as long as projects do not compete for the same compute resources.
          # Jobs with lower priorities from other projects will be processed without major lag
          # as long as new higher priority runs are not constantly submitted.
@@ -108,7 +108,13 @@ async def _process_next_submitted_job():
          # there can be many projects and we are limited by the max DB connections.
          .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
          .limit(1)
-         .with_for_update(skip_locked=True)
+         .with_for_update(
+             skip_locked=True,
+             key_share=True,
+             # Do not lock joined run, only job.
+             # Locking run here may cause deadlock.
+             of=JobModel,
+         )
      )
      job_model = res.scalar()
      if job_model is None:
@@ -201,7 +207,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
          )
          .options(lazyload(InstanceModel.jobs))
          .order_by(InstanceModel.id) # take locks in order
-         .with_for_update()
+         .with_for_update(key_share=True)
      )
      pool_instances = list(res.unique().scalars().all())
      instances_ids = sorted([i.id for i in pool_instances])
@@ -326,7 +332,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
          .where(VolumeModel.id.in_(volumes_ids))
          .options(selectinload(VolumeModel.user))
          .order_by(VolumeModel.id) # take locks in order
-         .with_for_update()
+         .with_for_update(key_share=True)
      )
      async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
          if len(volume_models) > 0:
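Note: across the background tasks, row locks switch from plain `SELECT ... FOR UPDATE` to `with_for_update(key_share=True, ...)`, which SQLAlchemy renders as `FOR NO KEY UPDATE` on PostgreSQL, a weaker lock that does not block concurrent inserts referencing the row via foreign keys, while `of=JobModel` restricts the lock to the jobs table when other tables are joined in. A minimal sketch of the pattern, assuming an async session and dstack's `JobModel`:

```python
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server.models import JobModel


async def pick_next_job(session: AsyncSession, lockset: set) -> Optional[JobModel]:
    # Pick one unprocessed job and lock only that row; rows already locked by
    # other workers are skipped. On PostgreSQL this renders as
    # SELECT ... FOR NO KEY UPDATE OF jobs SKIP LOCKED; dialects without row
    # locking (e.g. SQLite) ignore the FOR UPDATE clause.
    res = await session.execute(
        select(JobModel)
        .where(JobModel.id.not_in(lockset))
        .order_by(JobModel.last_processed_at.asc())
        .limit(1)
        .with_for_update(skip_locked=True, key_share=True, of=JobModel)
    )
    return res.scalar()
```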
dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2

@@ -45,7 +45,7 @@ async def _process_next_terminating_job():
          )
          .order_by(JobModel.last_processed_at.asc())
          .limit(1)
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      job_model = res.scalar()
      if job_model is None:
@@ -58,7 +58,7 @@ async def _process_next_terminating_job():
              InstanceModel.id.not_in(instance_lockset),
          )
          .options(lazyload(InstanceModel.jobs))
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      instance_model = res.scalar()
      if instance_model is None:
dstack/_internal/server/background/tasks/process_volumes.py +1 -1

@@ -33,7 +33,7 @@ async def process_submitted_volumes():
          )
          .order_by(VolumeModel.last_processed_at.asc())
          .limit(1)
-         .with_for_update(skip_locked=True)
+         .with_for_update(skip_locked=True, key_share=True)
      )
      volume_model = res.scalar()
      if volume_model is None:
dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0

@@ -0,0 +1,39 @@
+ """Add FileArchiveModel
+
+ Revision ID: 5f1707c525d2
+ Revises: 35e90e1b0d3e
+ Create Date: 2025-06-12 12:28:26.678380
+
+ """
+
+ import sqlalchemy as sa
+ import sqlalchemy_utils
+ from alembic import op
+
+ # revision identifiers, used by Alembic.
+ revision = "5f1707c525d2"
+ down_revision = "35e90e1b0d3e"
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     op.create_table(
+         "file_archives",
+         sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+         sa.Column("user_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+         sa.Column("blob_hash", sa.Text(), nullable=False),
+         sa.Column("blob", sa.LargeBinary(), nullable=True),
+         sa.ForeignKeyConstraint(
+             ["user_id"],
+             ["users.id"],
+             name=op.f("fk_file_archives_user_id_users"),
+             ondelete="CASCADE",
+         ),
+         sa.PrimaryKeyConstraint("id", name=op.f("pk_file_archives")),
+         sa.UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"),
+     )
+
+
+ def downgrade() -> None:
+     op.drop_table("file_archives")
dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0

@@ -0,0 +1,49 @@
+ """Add SecretModel
+
+ Revision ID: 644b8a114187
+ Revises: 5f1707c525d2
+ Create Date: 2025-06-30 11:00:04.326290
+
+ """
+
+ import sqlalchemy as sa
+ import sqlalchemy_utils
+ from alembic import op
+
+ import dstack._internal.server.models
+
+ # revision identifiers, used by Alembic.
+ revision = "644b8a114187"
+ down_revision = "5f1707c525d2"
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     op.create_table(
+         "secrets",
+         sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+         sa.Column(
+             "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
+         ),
+         sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+         sa.Column("updated_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+         sa.Column("name", sa.String(length=200), nullable=False),
+         sa.Column("value", dstack._internal.server.models.EncryptedString(), nullable=False),
+         sa.ForeignKeyConstraint(
+             ["project_id"],
+             ["projects.id"],
+             name=op.f("fk_secrets_project_id_projects"),
+             ondelete="CASCADE",
+         ),
+         sa.PrimaryKeyConstraint("id", name=op.f("pk_secrets")),
+         sa.UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),
+     )
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     op.drop_table("secrets")
+     # ### end Alembic commands ###
dstack/_internal/server/models.py +33 -0

@@ -315,6 +315,21 @@ class CodeModel(BaseModel):
      blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3


+ class FileArchiveModel(BaseModel):
+     __tablename__ = "file_archives"
+     __table_args__ = (
+         UniqueConstraint("user_id", "blob_hash", name="uq_file_archives_user_id_blob_hash"),
+     )
+
+     id: Mapped[uuid.UUID] = mapped_column(
+         UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+     )
+     user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"))
+     user: Mapped["UserModel"] = relationship()
+     blob_hash: Mapped[str] = mapped_column(Text)
+     blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3
+
+
  class RunModel(BaseModel):
      __tablename__ = "runs"

@@ -711,3 +726,21 @@ class JobPrometheusMetrics(BaseModel):
      collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
      # Raw Prometheus text response
      text: Mapped[str] = mapped_column(Text)
+
+
+ class SecretModel(BaseModel):
+     __tablename__ = "secrets"
+     __table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),)
+
+     id: Mapped[uuid.UUID] = mapped_column(
+         UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+     )
+
+     project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE"))
+     project: Mapped["ProjectModel"] = relationship()
+
+     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
+     updated_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
+
+     name: Mapped[str] = mapped_column(String(200))
+     value: Mapped[DecryptedString] = mapped_column(EncryptedString())
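Note: the new `secrets` table backs `get_project_secrets_mapping`, which the job and run processors call to build a `{name: value}` dict for interpolation. Its real implementation lives in `services/secrets.py` (+204 lines, not shown here); the rough sketch below only illustrates the shape of such a query, with the decryption step simplified and the plaintext accessor left as a placeholder.

```python
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server.models import ProjectModel, SecretModel


async def get_project_secrets_mapping_sketch(
    session: AsyncSession, project: ProjectModel
) -> dict[str, str]:
    # Load all secrets of the project and map secret names to values.
    res = await session.execute(
        select(SecretModel).where(SecretModel.project_id == project.id)
    )
    secret_models = res.scalars().all()
    # EncryptedString columns decrypt into a DecryptedString wrapper; converting
    # it with str() here is a placeholder, the real code extracts the plaintext.
    return {s.name: str(s.value) for s in secret_models}
```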
dstack/_internal/server/routers/files.py +67 -0

@@ -0,0 +1,67 @@
+ from typing import Annotated
+
+ from fastapi import APIRouter, Depends, Request, UploadFile
+ from sqlalchemy.ext.asyncio import AsyncSession
+
+ from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
+ from dstack._internal.core.models.files import FileArchive
+ from dstack._internal.server.db import get_session
+ from dstack._internal.server.models import UserModel
+ from dstack._internal.server.schemas.files import GetFileArchiveByHashRequest
+ from dstack._internal.server.security.permissions import Authenticated
+ from dstack._internal.server.services import files
+ from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
+ from dstack._internal.server.utils.routers import (
+     get_base_api_additional_responses,
+     get_request_size,
+ )
+ from dstack._internal.utils.common import sizeof_fmt
+
+ router = APIRouter(
+     prefix="/api/files",
+     tags=["files"],
+     responses=get_base_api_additional_responses(),
+ )
+
+
+ @router.post("/get_archive_by_hash")
+ async def get_archive_by_hash(
+     body: GetFileArchiveByHashRequest,
+     session: Annotated[AsyncSession, Depends(get_session)],
+     user: Annotated[UserModel, Depends(Authenticated())],
+ ) -> FileArchive:
+     archive = await files.get_archive_by_hash(
+         session=session,
+         user=user,
+         hash=body.hash,
+     )
+     if archive is None:
+         raise ResourceNotExistsError()
+     return archive
+
+
+ @router.post("/upload_archive")
+ async def upload_archive(
+     request: Request,
+     file: UploadFile,
+     session: Annotated[AsyncSession, Depends(get_session)],
+     user: Annotated[UserModel, Depends(Authenticated())],
+ ) -> FileArchive:
+     request_size = get_request_size(request)
+     if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
+         diff_size_fmt = sizeof_fmt(request_size)
+         limit_fmt = sizeof_fmt(SERVER_CODE_UPLOAD_LIMIT)
+         if diff_size_fmt == limit_fmt:
+             diff_size_fmt = f"{request_size}B"
+             limit_fmt = f"{SERVER_CODE_UPLOAD_LIMIT}B"
+         raise ServerClientError(
+             f"Archive size is {diff_size_fmt}, which exceeds the limit of {limit_fmt}."
+             " Use .gitignore/.dstackignore to exclude large files."
+             " This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable."
+         )
+     archive = await files.upload_archive(
+         session=session,
+         user=user,
+         file=file,
+     )
+     return archive
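Note: the two endpoints above let a client deduplicate uploads, checking whether an archive with a given hash already exists for the user and uploading it only if not; in practice the dstack Python client (`dstack/api/server/_files.py`) does this. A hedged sketch of the flow with `requests` is below; the server address, bearer-token header format, and hash value are assumptions, while the endpoint paths and the `file` multipart field name come from the router above.

```python
import requests

SERVER = "http://localhost:3000"  # assumed server address
HEADERS = {"Authorization": "Bearer <token>"}  # auth header format is an assumption


def ensure_archive_uploaded(archive_path: str, archive_hash: str) -> dict:
    # Ask the server whether this user already has an archive with this hash.
    resp = requests.post(
        f"{SERVER}/api/files/get_archive_by_hash",
        json={"hash": archive_hash},
        headers=HEADERS,
    )
    if resp.status_code == 200:
        return resp.json()  # archive already stored, nothing to upload
    # Otherwise upload the archive as multipart form data (field name "file").
    with open(archive_path, "rb") as f:
        resp = requests.post(
            f"{SERVER}/api/files/upload_archive",
            files={"file": f},
            headers=HEADERS,
        )
    resp.raise_for_status()
    return resp.json()
```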
dstack/_internal/server/routers/gateways.py +6 -3

@@ -9,7 +9,10 @@ import dstack._internal.server.services.gateways as gateways
  from dstack._internal.core.errors import ResourceNotExistsError
  from dstack._internal.server.db import get_session
  from dstack._internal.server.models import ProjectModel, UserModel
- from dstack._internal.server.security.permissions import ProjectAdmin, ProjectMember
+ from dstack._internal.server.security.permissions import (
+     ProjectAdmin,
+     ProjectMemberOrPublicAccess,
+ )
  from dstack._internal.server.utils.routers import get_base_api_additional_responses

  router = APIRouter(
@@ -22,7 +25,7 @@ router = APIRouter(
  @router.post("/list")
  async def list_gateways(
      session: AsyncSession = Depends(get_session),
-     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
  ) -> List[models.Gateway]:
      _, project = user_project
      return await gateways.list_project_gateways(session=session, project=project)
@@ -32,7 +35,7 @@ async def list_gateways(
  async def get_gateway(
      body: schemas.GetGatewayRequest,
      session: AsyncSession = Depends(get_session),
-     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMemberOrPublicAccess()),
  ) -> models.Gateway:
      _, project = user_project
      gateway = await gateways.get_gateway_by_name(session=session, project=project, name=body.name)