dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (115)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +11 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/run.py +11 -0
  17. dstack/_internal/core/backends/aws/compute.py +23 -10
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +15 -9
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +26 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +32 -7
  33. dstack/_internal/core/backends/runpod/config.py +8 -0
  34. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  35. dstack/_internal/core/backends/vastai/compute.py +12 -2
  36. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  37. dstack/_internal/core/backends/vultr/compute.py +9 -3
  38. dstack/_internal/core/models/backends/aws.py +2 -0
  39. dstack/_internal/core/models/backends/base.py +1 -0
  40. dstack/_internal/core/models/backends/runpod.py +2 -0
  41. dstack/_internal/core/models/configurations.py +2 -2
  42. dstack/_internal/core/models/profiles.py +46 -1
  43. dstack/_internal/core/models/runs.py +4 -0
  44. dstack/_internal/core/services/__init__.py +5 -1
  45. dstack/_internal/core/services/configs/__init__.py +3 -0
  46. dstack/_internal/server/app.py +11 -1
  47. dstack/_internal/server/background/__init__.py +10 -0
  48. dstack/_internal/server/background/tasks/common.py +22 -0
  49. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  50. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
  51. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  52. dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
  53. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  54. dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
  55. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  56. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  57. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  58. dstack/_internal/server/models.py +11 -0
  59. dstack/_internal/server/routers/logs.py +3 -0
  60. dstack/_internal/server/routers/metrics.py +21 -2
  61. dstack/_internal/server/routers/prometheus.py +36 -0
  62. dstack/_internal/server/security/permissions.py +1 -1
  63. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  64. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  65. dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
  66. dstack/_internal/server/services/config.py +24 -4
  67. dstack/_internal/server/services/fleets.py +1 -0
  68. dstack/_internal/server/services/gateways/__init__.py +1 -0
  69. dstack/_internal/server/services/jobs/__init__.py +12 -9
  70. dstack/_internal/server/services/jobs/configurators/base.py +9 -1
  71. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  72. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  73. dstack/_internal/server/services/logs/__init__.py +78 -0
  74. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  75. dstack/_internal/server/services/logs/base.py +47 -0
  76. dstack/_internal/server/services/logs/filelog.py +110 -0
  77. dstack/_internal/server/services/logs/gcp.py +165 -0
  78. dstack/_internal/server/services/metrics.py +103 -70
  79. dstack/_internal/server/services/pools.py +16 -17
  80. dstack/_internal/server/services/prometheus.py +87 -0
  81. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  82. dstack/_internal/server/services/runner/client.py +14 -3
  83. dstack/_internal/server/services/runs.py +43 -15
  84. dstack/_internal/server/services/volumes.py +1 -0
  85. dstack/_internal/server/settings.py +6 -0
  86. dstack/_internal/server/statics/index.html +1 -1
  87. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
  88. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
  89. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
  90. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  91. dstack/_internal/server/testing/common.py +50 -8
  92. dstack/api/_public/runs.py +4 -1
  93. dstack/api/server/_fleets.py +2 -0
  94. dstack/api/server/_runs.py +4 -0
  95. dstack/api/utils.py +3 -0
  96. dstack/version.py +2 -2
  97. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
  98. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
  99. tests/_internal/core/backends/base/__init__.py +0 -0
  100. tests/_internal/core/backends/base/test_compute.py +56 -0
  101. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
  102. tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
  103. tests/_internal/server/conftest.py +4 -5
  104. tests/_internal/server/routers/test_backends.py +1 -0
  105. tests/_internal/server/routers/test_fleets.py +2 -0
  106. tests/_internal/server/routers/test_logs.py +1 -1
  107. tests/_internal/server/routers/test_metrics.py +15 -0
  108. tests/_internal/server/routers/test_prometheus.py +244 -0
  109. tests/_internal/server/routers/test_runs.py +81 -58
  110. tests/_internal/server/services/test_logs.py +3 -3
  111. tests/_internal/server/services/test_metrics.py +163 -0
  112. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
  113. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
  114. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
  115. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,16 @@
1
- from typing import Tuple
1
+ from datetime import datetime
2
+ from typing import Optional, Tuple
2
3
 
3
4
  from fastapi import APIRouter, Depends
4
5
  from sqlalchemy.ext.asyncio import AsyncSession
5
6
 
7
+ from dstack._internal.core.errors import ResourceNotExistsError
6
8
  from dstack._internal.core.models.metrics import JobMetrics
7
9
  from dstack._internal.server.db import get_session
8
10
  from dstack._internal.server.models import ProjectModel, UserModel
9
11
  from dstack._internal.server.security.permissions import ProjectMember
10
12
  from dstack._internal.server.services import metrics
13
+ from dstack._internal.server.services.jobs import get_run_job_model
11
14
  from dstack._internal.server.utils.routers import get_base_api_additional_responses
12
15
 
13
16
  router = APIRouter(
@@ -24,6 +27,9 @@ async def get_job_metrics(
24
27
  run_name: str,
25
28
  replica_num: int = 0,
26
29
  job_num: int = 0,
30
+ limit: int = 1,
31
+ after: Optional[datetime] = None,
32
+ before: Optional[datetime] = None,
27
33
  session: AsyncSession = Depends(get_session),
28
34
  user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
29
35
  ) -> JobMetrics:
@@ -31,6 +37,8 @@ async def get_job_metrics(
31
37
  Returns job-level metrics such as hardware utilization
32
38
  given `run_name`, `replica_num`, and `job_num`.
33
39
  If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`.
40
+ By default, returns one latest sample. To control time window/number of samples, use
41
+ `limit`, `after`, `before`.
34
42
 
35
43
  Supported metrics: [
36
44
  "cpu_usage_percent",
@@ -42,10 +50,21 @@ async def get_job_metrics(
42
50
  ]
43
51
  """
44
52
  _, project = user_project
45
- return await metrics.get_job_metrics(
53
+
54
+ job_model = await get_run_job_model(
46
55
  session=session,
47
56
  project=project,
48
57
  run_name=run_name,
49
58
  replica_num=replica_num,
50
59
  job_num=job_num,
51
60
  )
61
+ if job_model is None:
62
+ raise ResourceNotExistsError("Found no job with given parameters")
63
+
64
+ return await metrics.get_job_metrics(
65
+ session=session,
66
+ job_model=job_model,
67
+ limit=limit,
68
+ after=after,
69
+ before=before,
70
+ )
@@ -0,0 +1,36 @@
1
+ from typing import Annotated
2
+
3
+ from fastapi import APIRouter, Depends
4
+ from fastapi.responses import PlainTextResponse
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+
7
+ from dstack._internal.server import settings
8
+ from dstack._internal.server.db import get_session
9
+ from dstack._internal.server.deps import Project
10
+ from dstack._internal.server.models import ProjectModel
11
+ from dstack._internal.server.services import prometheus
12
+ from dstack._internal.server.utils.routers import error_not_found
13
+
14
+ router = APIRouter(
15
+ tags=["prometheus"],
16
+ default_response_class=PlainTextResponse,
17
+ )
18
+
19
+
20
+ @router.get("/metrics")
21
+ async def get_prometheus_metrics(
22
+ session: Annotated[AsyncSession, Depends(get_session)],
23
+ ) -> str:
24
+ if not settings.ENABLE_PROMETHEUS_METRICS:
25
+ raise error_not_found()
26
+ return await prometheus.get_metrics(session=session)
27
+
28
+
29
+ @router.get("/metrics/project/{project_name}")
30
+ async def get_project_prometheus_metrics(
31
+ session: Annotated[AsyncSession, Depends(get_session)],
32
+ project: Annotated[ProjectModel, Depends(Project())],
33
+ ) -> str:
34
+ if not settings.ENABLE_PROMETHEUS_METRICS:
35
+ raise error_not_found()
36
+ return await prometheus.get_project_metrics(session=session, project=project)
@@ -80,7 +80,7 @@ class ProjectManager:
80
80
  project = await get_project_model_by_name(session=session, project_name=project_name)
81
81
  if project is None:
82
82
  raise error_forbidden()
83
- if user.global_role in GlobalRole.ADMIN:
83
+ if user.global_role == GlobalRole.ADMIN:
84
84
  return user, project
85
85
  project_role = get_user_project_role(user=user, project=project)
86
86
  if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
@@ -2,6 +2,7 @@ import concurrent.futures
2
2
  import json
3
3
  from typing import List
4
4
 
5
+ import botocore.exceptions
5
6
  from boto3.session import Session
6
7
 
7
8
  from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
35
36
  Configurator,
36
37
  raise_invalid_credentials_error,
37
38
  )
39
+ from dstack._internal.utils.logging import get_logger
40
+
41
+ logger = get_logger(__name__)
38
42
 
39
43
  REGIONS = [
40
44
  ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):
137
141
 
138
142
  def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
139
143
  self._check_tags_config(config)
140
- self._check_vpc_config(session=session, config=config)
144
+ self._check_iam_instance_profile_config(session, config)
145
+ self._check_vpc_config(session, config)
141
146
 
142
147
  def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
143
148
  if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
151
156
  except BackendError as e:
152
157
  raise ServerClientError(e.args[0])
153
158
 
159
+ def _check_iam_instance_profile_config(
160
+ self, session: Session, config: AWSConfigInfoWithCredsPartial
161
+ ):
162
+ if config.iam_instance_profile is None:
163
+ return
164
+ try:
165
+ iam_client = session.client("iam")
166
+ iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
167
+ except botocore.exceptions.ClientError as e:
168
+ if e.response["Error"]["Code"] == "NoSuchEntity":
169
+ raise ServerClientError(
170
+ f"IAM instance profile {config.iam_instance_profile} not found"
171
+ )
172
+ logger.exception(
173
+ "Got botocore.exceptions.ClientError when checking iam_instance_profile"
174
+ )
175
+ raise ServerClientError(
176
+ f"Failed to check IAM instance profile {config.iam_instance_profile}"
177
+ )
178
+ except Exception:
179
+ logger.exception("Got exception when checking iam_instance_profile")
180
+ raise ServerClientError(
181
+ f"Failed to check IAM instance profile {config.iam_instance_profile}"
182
+ )
183
+
154
184
  def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
155
185
  allocate_public_ip = config.public_ips if config.public_ips is not None else True
156
186
  use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
127
127
  _, project_id = auth.authenticate(GCPDefaultCreds())
128
128
  except BackendAuthError:
129
129
  return []
130
-
131
- if project_id is None:
132
- return []
133
-
134
130
  return [
135
131
  GCPConfigInfoWithCreds(
136
132
  project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
152
148
  ):
153
149
  raise_invalid_credentials_error(fields=[["creds"]])
154
150
  try:
155
- credentials, project_id = auth.authenticate(creds=config.creds)
156
- except BackendAuthError:
151
+ credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
152
+ except BackendAuthError as e:
153
+ details = None
154
+ if len(e.args) > 0:
155
+ details = e.args[0]
157
156
  if is_core_model_instance(config.creds, GCPServiceAccountCreds):
158
- raise_invalid_credentials_error(fields=[["creds", "data"]])
157
+ raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
159
158
  else:
160
- raise_invalid_credentials_error(fields=[["creds"]])
161
- if (
162
- project_id is not None
163
- and config.project_id is not None
164
- and config.project_id != project_id
165
- ):
166
- raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
167
- config_values.project_id = self._get_project_id_element(selected=project_id)
159
+ raise_invalid_credentials_error(fields=[["creds"]], details=details)
168
160
  config_values.regions = self._get_regions_element(
169
161
  selected=config.regions or DEFAULT_REGIONS
170
162
  )
171
163
  if config.project_id is None:
172
164
  return config_values
165
+ config_values.project_id = self._get_project_id_element(selected=config.project_id)
173
166
  self._check_config(config=config, credentials=credentials)
174
167
  return config_values
175
168
 
@@ -3,11 +3,7 @@ from typing import List
3
3
 
4
4
  from dstack._internal.core.backends.base import Backend
5
5
  from dstack._internal.core.backends.runpod import RunpodBackend, RunpodConfig, api_client
6
- from dstack._internal.core.models.backends.base import (
7
- BackendType,
8
- ConfigElementValue,
9
- ConfigMultiElement,
10
- )
6
+ from dstack._internal.core.models.backends.base import BackendType, ConfigMultiElement
11
7
  from dstack._internal.core.models.backends.runpod import (
12
8
  RunpodConfigInfo,
13
9
  RunpodConfigInfoWithCreds,
@@ -22,25 +18,6 @@ from dstack._internal.server.services.backends.configurators.base import (
22
18
  raise_invalid_credentials_error,
23
19
  )
24
20
 
25
- REGIONS = [
26
- "CA-MTL-1",
27
- "CA-MTL-2",
28
- "CA-MTL-3",
29
- "EU-NL-1",
30
- "EU-RO-1",
31
- "EU-SE-1",
32
- "EUR-IS-1",
33
- "EUR-IS-2",
34
- "US-CA-1",
35
- "US-GA-1",
36
- "US-GA-2",
37
- "US-KS-2",
38
- "US-OR-1",
39
- "US-TX-3",
40
- ]
41
-
42
- DEFAULT_REGION = "CA-MTL-1"
43
-
44
21
 
45
22
  class RunpodConfigurator(Configurator):
46
23
  TYPE: BackendType = BackendType.RUNPOD
@@ -50,16 +27,12 @@ class RunpodConfigurator(Configurator):
50
27
  if config.creds is None:
51
28
  return config_values
52
29
  self._validate_runpod_api_key(config.creds.api_key)
53
- config_values.regions = self._get_regions_element(
54
- selected=config.regions or [DEFAULT_REGION]
55
- )
30
+ config_values.regions = self._get_regions_element(selected=config.regions or [])
56
31
  return config_values
57
32
 
58
33
  def create_backend(
59
34
  self, project: ProjectModel, config: RunpodConfigInfoWithCreds
60
35
  ) -> BackendModel:
61
- if config.regions is None:
62
- config.regions = REGIONS
63
36
  return BackendModel(
64
37
  project_id=project.id,
65
38
  type=self.TYPE.value,
@@ -80,10 +53,7 @@ class RunpodConfigurator(Configurator):
80
53
  return RunpodBackend(config=config)
81
54
 
82
55
  def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement:
83
- element = ConfigMultiElement(selected=selected)
84
- for r in REGIONS:
85
- element.values.append(ConfigElementValue(value=r, label=r))
86
- return element
56
+ return ConfigMultiElement(selected=selected)
87
57
 
88
58
  def _get_backend_config(self, model: BackendModel) -> RunpodConfig:
89
59
  return RunpodConfig(
@@ -6,6 +6,7 @@ from pydantic import BaseModel, Field, ValidationError, root_validator
6
6
  from sqlalchemy.ext.asyncio import AsyncSession
7
7
  from typing_extensions import Annotated
8
8
 
9
+ from dstack._internal.core.backends.runpod.config import RUNPOD_COMMUNITY_CLOUD_DEFAULT
9
10
  from dstack._internal.core.errors import (
10
11
  BackendNotAvailable,
11
12
  ResourceNotExistsError,
@@ -45,7 +46,7 @@ logger = get_logger(__name__)
45
46
  # By default, PyYAML chooses the style of a collection depending on whether it has nested collections.
46
47
  # If a collection has nested collections, it will be assigned the block style. Otherwise it will have the flow style.
47
48
  #
48
- # We want mapping to always be display in block-style but lists without nested objects in flow-style.
49
+ # We want mapping to always be displayed in block-style but lists without nested objects in flow-style.
49
50
  # So we define a custom representer
50
51
 
51
52
 
@@ -107,6 +108,16 @@ class AWSConfig(CoreModel):
107
108
  )
108
109
  ),
109
110
  ] = None
111
+ iam_instance_profile: Annotated[
112
+ Optional[str],
113
+ Field(
114
+ description=(
115
+ "The name of the IAM instance profile to associate with EC2 instances."
116
+ " You can also specify the IAM role name for roles created via the AWS console."
117
+ " AWS automatically creates an instance profile and gives it the same name as the role"
118
+ )
119
+ ),
120
+ ] = None
110
121
  tags: Annotated[
111
122
  Optional[Dict[str, str]],
112
123
  Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +262,7 @@ class GCPConfig(CoreModel):
251
262
  ),
252
263
  ] = None
253
264
  vm_service_account: Annotated[
254
- Optional[str], Field(description="The service account associated with provisioned VMs")
265
+ Optional[str], Field(description="The service account to associate with provisioned VMs")
255
266
  ] = None
256
267
  tags: Annotated[
257
268
  Optional[Dict[str, str]],
@@ -330,7 +341,7 @@ class KubernetesConfig(CoreModel):
330
341
  kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
331
342
  networking: Annotated[
332
343
  Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
333
- ]
344
+ ] = None
334
345
 
335
346
 
336
347
  class KubernetesAPIConfig(CoreModel):
@@ -338,7 +349,7 @@ class KubernetesAPIConfig(CoreModel):
338
349
  kubeconfig: Annotated[KubeconfigAPIConfig, Field(description="The kubeconfig configuration")]
339
350
  networking: Annotated[
340
351
  Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
341
- ]
352
+ ] = None
342
353
 
343
354
 
344
355
  class LambdaConfig(CoreModel):
@@ -418,6 +429,15 @@ class RunpodConfig(CoreModel):
418
429
  Optional[List[str]],
419
430
  Field(description="The list of RunPod regions. Omit to use all regions"),
420
431
  ] = None
432
+ community_cloud: Annotated[
433
+ Optional[bool],
434
+ Field(
435
+ description=(
436
+ "Whether Community Cloud offers can be suggested in addition to Secure Cloud."
437
+ f" Defaults to `{str(RUNPOD_COMMUNITY_CLOUD_DEFAULT).lower()}`"
438
+ )
439
+ ),
440
+ ] = None
421
441
  creds: Annotated[AnyRunpodCreds, Field(description="The credentials")]
422
442
 
423
443
 
@@ -517,6 +517,7 @@ async def delete_fleets(
517
517
  .options(selectinload(FleetModel.instances))
518
518
  .options(selectinload(FleetModel.runs))
519
519
  .execution_options(populate_existing=True)
520
+ .order_by(FleetModel.id) # take locks in order
520
521
  .with_for_update()
521
522
  )
522
523
  fleet_models = res.scalars().unique().all()
@@ -220,6 +220,7 @@ async def delete_gateways(
220
220
  )
221
221
  .options(selectinload(GatewayModel.gateway_compute))
222
222
  .execution_options(populate_existing=True)
223
+ .order_by(GatewayModel.id) # take locks in order
223
224
  .with_for_update()
224
225
  )
225
226
  gateway_models = res.scalars().all()
@@ -236,13 +236,14 @@ async def process_terminating_job(
236
236
  logger.debug("%s: stopping container", fmt(job_model))
237
237
  ssh_private_keys = get_instance_ssh_private_keys(instance_model)
238
238
  await stop_container(job_model, jpd, ssh_private_keys)
239
- volume_models: list[VolumeModel]
240
239
  if jrd is not None and jrd.volume_names is not None:
241
- volume_models = await list_project_volume_models(
242
- session=session, project=instance_model.project, names=jrd.volume_names
243
- )
240
+ volume_names = jrd.volume_names
244
241
  else:
245
- volume_models = [va.volume for va in instance_model.volume_attachments]
242
+ # Legacy jobs before job_runtime_data/blocks were introduced
243
+ volume_names = [va.volume.name for va in instance_model.volume_attachments]
244
+ volume_models = await list_project_volume_models(
245
+ session=session, project=instance_model.project, names=volume_names
246
+ )
246
247
  if len(volume_models) > 0:
247
248
  logger.info("Detaching volumes: %s", [v.name for v in volume_models])
248
249
  all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
302
303
  jpd = get_or_error(get_job_provisioning_data(job_model))
303
304
  jrd = get_job_runtime_data(job_model)
304
305
  if jrd is not None and jrd.volume_names is not None:
305
- volume_models = await list_project_volume_models(
306
- session=session, project=instance_model.project, names=jrd.volume_names
307
- )
306
+ volume_names = jrd.volume_names
308
307
  else:
309
- volume_models = [va.volume for va in instance_model.volume_attachments]
308
+ # Legacy jobs before job_runtime_data/blocks were introduced
309
+ volume_names = [va.volume.name for va in instance_model.volume_attachments]
310
+ volume_models = await list_project_volume_models(
311
+ session=session, project=instance_model.project, names=volume_names
312
+ )
310
313
  logger.info("Detaching volumes: %s", [v.name for v in volume_models])
311
314
  all_volumes_detached = await _detach_volumes_from_job_instance(
312
315
  project=instance_model.project,
@@ -13,7 +13,11 @@ from dstack._internal.core.models.configurations import (
13
13
  PythonVersion,
14
14
  RunConfigurationType,
15
15
  )
16
- from dstack._internal.core.models.profiles import DEFAULT_STOP_DURATION, SpotPolicy
16
+ from dstack._internal.core.models.profiles import (
17
+ DEFAULT_STOP_DURATION,
18
+ SpotPolicy,
19
+ UtilizationPolicy,
20
+ )
17
21
  from dstack._internal.core.models.runs import (
18
22
  AppSpec,
19
23
  JobSpec,
@@ -113,6 +117,7 @@ class JobConfigurator(ABC):
113
117
  single_branch=self._single_branch(),
114
118
  max_duration=self._max_duration(),
115
119
  stop_duration=self._stop_duration(),
120
+ utilization_policy=self._utilization_policy(),
116
121
  registry_auth=self._registry_auth(),
117
122
  requirements=self._requirements(),
118
123
  retry=self._retry(),
@@ -201,6 +206,9 @@ class JobConfigurator(ABC):
201
206
  # pydantic validator ensures this is int
202
207
  return self.run_spec.merged_profile.stop_duration
203
208
 
209
+ def _utilization_policy(self) -> Optional[UtilizationPolicy]:
210
+ return self.run_spec.merged_profile.utilization_policy
211
+
204
212
  def _registry_auth(self) -> Optional[RegistryAuth]:
205
213
  return self.run_spec.configuration.registry_auth
206
214
 
@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
6
6
  from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
7
7
  from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
8
8
 
9
- DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
10
-
11
9
  INSTALL_IPYKERNEL = (
12
10
  "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
13
11
  'echo "no pip, ipykernel was not installed"'
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
44
42
  return False
45
43
 
46
44
  def _default_max_duration(self) -> Optional[int]:
47
- return DEFAULT_MAX_DURATION_SECONDS
45
+ return None
48
46
 
49
47
  def _spot_policy(self) -> SpotPolicy:
50
48
  return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
@@ -5,8 +5,6 @@ from dstack._internal.core.models.profiles import SpotPolicy
5
5
  from dstack._internal.core.models.runs import JobSpec
6
6
  from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
7
7
 
8
- DEFAULT_MAX_DURATION_SECONDS = 72 * 3600
9
-
10
8
 
11
9
  class TaskJobConfigurator(JobConfigurator):
12
10
  TYPE: RunConfigurationType = RunConfigurationType.TASK
@@ -29,7 +27,7 @@ class TaskJobConfigurator(JobConfigurator):
29
27
  return True
30
28
 
31
29
  def _default_max_duration(self) -> Optional[int]:
32
- return DEFAULT_MAX_DURATION_SECONDS
30
+ return None
33
31
 
34
32
  def _spot_policy(self) -> SpotPolicy:
35
33
  return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
@@ -0,0 +1,78 @@
1
+ import atexit
2
+ from typing import List, Optional
3
+ from uuid import UUID
4
+
5
+ from dstack._internal.core.models.logs import JobSubmissionLogs
6
+ from dstack._internal.server import settings
7
+ from dstack._internal.server.models import ProjectModel
8
+ from dstack._internal.server.schemas.logs import PollLogsRequest
9
+ from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
10
+ from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
11
+ from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
12
+ from dstack._internal.server.services.logs.filelog import FileLogStorage
13
+ from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
14
+ from dstack._internal.utils.common import run_async
15
+ from dstack._internal.utils.logging import get_logger
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
+ _log_storage: Optional[LogStorage] = None
21
+
22
+
23
+ def get_log_storage() -> LogStorage:
24
+ global _log_storage
25
+ if _log_storage is not None:
26
+ return _log_storage
27
+ if settings.SERVER_CLOUDWATCH_LOG_GROUP:
28
+ if BOTO_AVAILABLE:
29
+ try:
30
+ _log_storage = CloudWatchLogStorage(
31
+ group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
32
+ region=settings.SERVER_CLOUDWATCH_LOG_REGION,
33
+ )
34
+ except LogStorageError as e:
35
+ logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
36
+ except Exception:
37
+ logger.exception("Got exception when initializing CloudWatch Logs storage")
38
+ else:
39
+ logger.debug("Using CloudWatch Logs storage")
40
+ else:
41
+ logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
42
+ elif settings.SERVER_GCP_LOGGING_PROJECT:
43
+ if GCP_LOGGING_AVAILABLE:
44
+ try:
45
+ _log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
46
+ except LogStorageError as e:
47
+ logger.error("Failed to initialize GCP Logs storage: %s", e)
48
+ except Exception:
49
+ logger.exception("Got exception when initializing GCP Logs storage")
50
+ else:
51
+ logger.debug("Using GCP Logs storage")
52
+ else:
53
+ logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
54
+ if _log_storage is None:
55
+ _log_storage = FileLogStorage()
56
+ logger.debug("Using file-based storage")
57
+ atexit.register(_log_storage.close)
58
+ return _log_storage
59
+
60
+
61
+ def write_logs(
62
+ project: ProjectModel,
63
+ run_name: str,
64
+ job_submission_id: UUID,
65
+ runner_logs: List[RunnerLogEvent],
66
+ job_logs: List[RunnerLogEvent],
67
+ ) -> None:
68
+ return get_log_storage().write_logs(
69
+ project=project,
70
+ run_name=run_name,
71
+ job_submission_id=job_submission_id,
72
+ runner_logs=runner_logs,
73
+ job_logs=job_logs,
74
+ )
75
+
76
+
77
+ async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
78
+ return await run_async(get_log_storage().poll_logs, project=project, request=request)