dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +11 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +23 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +15 -9
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +26 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +32 -7
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -2
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +24 -4
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +6 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/api/_public/runs.py +4 -1
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +81 -58
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
from
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Optional, Tuple
|
|
2
3
|
|
|
3
4
|
from fastapi import APIRouter, Depends
|
|
4
5
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
6
|
|
|
7
|
+
from dstack._internal.core.errors import ResourceNotExistsError
|
|
6
8
|
from dstack._internal.core.models.metrics import JobMetrics
|
|
7
9
|
from dstack._internal.server.db import get_session
|
|
8
10
|
from dstack._internal.server.models import ProjectModel, UserModel
|
|
9
11
|
from dstack._internal.server.security.permissions import ProjectMember
|
|
10
12
|
from dstack._internal.server.services import metrics
|
|
13
|
+
from dstack._internal.server.services.jobs import get_run_job_model
|
|
11
14
|
from dstack._internal.server.utils.routers import get_base_api_additional_responses
|
|
12
15
|
|
|
13
16
|
router = APIRouter(
|
|
@@ -24,6 +27,9 @@ async def get_job_metrics(
|
|
|
24
27
|
run_name: str,
|
|
25
28
|
replica_num: int = 0,
|
|
26
29
|
job_num: int = 0,
|
|
30
|
+
limit: int = 1,
|
|
31
|
+
after: Optional[datetime] = None,
|
|
32
|
+
before: Optional[datetime] = None,
|
|
27
33
|
session: AsyncSession = Depends(get_session),
|
|
28
34
|
user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
|
|
29
35
|
) -> JobMetrics:
|
|
@@ -31,6 +37,8 @@ async def get_job_metrics(
|
|
|
31
37
|
Returns job-level metrics such as hardware utilization
|
|
32
38
|
given `run_name`, `replica_num`, and `job_num`.
|
|
33
39
|
If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`.
|
|
40
|
+
By default, returns one latest sample. To control time window/number of samples, use
|
|
41
|
+
`limit`, `after`, `before`.
|
|
34
42
|
|
|
35
43
|
Supported metrics: [
|
|
36
44
|
"cpu_usage_percent",
|
|
@@ -42,10 +50,21 @@ async def get_job_metrics(
|
|
|
42
50
|
]
|
|
43
51
|
"""
|
|
44
52
|
_, project = user_project
|
|
45
|
-
|
|
53
|
+
|
|
54
|
+
job_model = await get_run_job_model(
|
|
46
55
|
session=session,
|
|
47
56
|
project=project,
|
|
48
57
|
run_name=run_name,
|
|
49
58
|
replica_num=replica_num,
|
|
50
59
|
job_num=job_num,
|
|
51
60
|
)
|
|
61
|
+
if job_model is None:
|
|
62
|
+
raise ResourceNotExistsError("Found no job with given parameters")
|
|
63
|
+
|
|
64
|
+
return await metrics.get_job_metrics(
|
|
65
|
+
session=session,
|
|
66
|
+
job_model=job_model,
|
|
67
|
+
limit=limit,
|
|
68
|
+
after=after,
|
|
69
|
+
before=before,
|
|
70
|
+
)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Depends
|
|
4
|
+
from fastapi.responses import PlainTextResponse
|
|
5
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
|
+
|
|
7
|
+
from dstack._internal.server import settings
|
|
8
|
+
from dstack._internal.server.db import get_session
|
|
9
|
+
from dstack._internal.server.deps import Project
|
|
10
|
+
from dstack._internal.server.models import ProjectModel
|
|
11
|
+
from dstack._internal.server.services import prometheus
|
|
12
|
+
from dstack._internal.server.utils.routers import error_not_found
|
|
13
|
+
|
|
14
|
+
router = APIRouter(
|
|
15
|
+
tags=["prometheus"],
|
|
16
|
+
default_response_class=PlainTextResponse,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@router.get("/metrics")
|
|
21
|
+
async def get_prometheus_metrics(
|
|
22
|
+
session: Annotated[AsyncSession, Depends(get_session)],
|
|
23
|
+
) -> str:
|
|
24
|
+
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
25
|
+
raise error_not_found()
|
|
26
|
+
return await prometheus.get_metrics(session=session)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@router.get("/metrics/project/{project_name}")
|
|
30
|
+
async def get_project_prometheus_metrics(
|
|
31
|
+
session: Annotated[AsyncSession, Depends(get_session)],
|
|
32
|
+
project: Annotated[ProjectModel, Depends(Project())],
|
|
33
|
+
) -> str:
|
|
34
|
+
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
35
|
+
raise error_not_found()
|
|
36
|
+
return await prometheus.get_project_metrics(session=session, project=project)
|
|
@@ -80,7 +80,7 @@ class ProjectManager:
|
|
|
80
80
|
project = await get_project_model_by_name(session=session, project_name=project_name)
|
|
81
81
|
if project is None:
|
|
82
82
|
raise error_forbidden()
|
|
83
|
-
if user.global_role
|
|
83
|
+
if user.global_role == GlobalRole.ADMIN:
|
|
84
84
|
return user, project
|
|
85
85
|
project_role = get_user_project_role(user=user, project=project)
|
|
86
86
|
if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
|
|
@@ -2,6 +2,7 @@ import concurrent.futures
|
|
|
2
2
|
import json
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
+
import botocore.exceptions
|
|
5
6
|
from boto3.session import Session
|
|
6
7
|
|
|
7
8
|
from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
|
|
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
|
|
|
35
36
|
Configurator,
|
|
36
37
|
raise_invalid_credentials_error,
|
|
37
38
|
)
|
|
39
|
+
from dstack._internal.utils.logging import get_logger
|
|
40
|
+
|
|
41
|
+
logger = get_logger(__name__)
|
|
38
42
|
|
|
39
43
|
REGIONS = [
|
|
40
44
|
("US East, N. Virginia", "us-east-1"),
|
|
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):
|
|
|
137
141
|
|
|
138
142
|
def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
|
|
139
143
|
self._check_tags_config(config)
|
|
140
|
-
self.
|
|
144
|
+
self._check_iam_instance_profile_config(session, config)
|
|
145
|
+
self._check_vpc_config(session, config)
|
|
141
146
|
|
|
142
147
|
def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
|
|
143
148
|
if not config.tags:
|
|
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
|
|
|
151
156
|
except BackendError as e:
|
|
152
157
|
raise ServerClientError(e.args[0])
|
|
153
158
|
|
|
159
|
+
def _check_iam_instance_profile_config(
|
|
160
|
+
self, session: Session, config: AWSConfigInfoWithCredsPartial
|
|
161
|
+
):
|
|
162
|
+
if config.iam_instance_profile is None:
|
|
163
|
+
return
|
|
164
|
+
try:
|
|
165
|
+
iam_client = session.client("iam")
|
|
166
|
+
iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
|
|
167
|
+
except botocore.exceptions.ClientError as e:
|
|
168
|
+
if e.response["Error"]["Code"] == "NoSuchEntity":
|
|
169
|
+
raise ServerClientError(
|
|
170
|
+
f"IAM instance profile {config.iam_instance_profile} not found"
|
|
171
|
+
)
|
|
172
|
+
logger.exception(
|
|
173
|
+
"Got botocore.exceptions.ClientError when checking iam_instance_profile"
|
|
174
|
+
)
|
|
175
|
+
raise ServerClientError(
|
|
176
|
+
f"Failed to check IAM instance profile {config.iam_instance_profile}"
|
|
177
|
+
)
|
|
178
|
+
except Exception:
|
|
179
|
+
logger.exception("Got exception when checking iam_instance_profile")
|
|
180
|
+
raise ServerClientError(
|
|
181
|
+
f"Failed to check IAM instance profile {config.iam_instance_profile}"
|
|
182
|
+
)
|
|
183
|
+
|
|
154
184
|
def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
|
|
155
185
|
allocate_public_ip = config.public_ips if config.public_ips is not None else True
|
|
156
186
|
use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
|
|
@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
|
|
|
127
127
|
_, project_id = auth.authenticate(GCPDefaultCreds())
|
|
128
128
|
except BackendAuthError:
|
|
129
129
|
return []
|
|
130
|
-
|
|
131
|
-
if project_id is None:
|
|
132
|
-
return []
|
|
133
|
-
|
|
134
130
|
return [
|
|
135
131
|
GCPConfigInfoWithCreds(
|
|
136
132
|
project_id=project_id,
|
|
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
|
|
|
152
148
|
):
|
|
153
149
|
raise_invalid_credentials_error(fields=[["creds"]])
|
|
154
150
|
try:
|
|
155
|
-
credentials,
|
|
156
|
-
except BackendAuthError:
|
|
151
|
+
credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
|
|
152
|
+
except BackendAuthError as e:
|
|
153
|
+
details = None
|
|
154
|
+
if len(e.args) > 0:
|
|
155
|
+
details = e.args[0]
|
|
157
156
|
if is_core_model_instance(config.creds, GCPServiceAccountCreds):
|
|
158
|
-
raise_invalid_credentials_error(fields=[["creds", "data"]])
|
|
157
|
+
raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
|
|
159
158
|
else:
|
|
160
|
-
raise_invalid_credentials_error(fields=[["creds"]])
|
|
161
|
-
if (
|
|
162
|
-
project_id is not None
|
|
163
|
-
and config.project_id is not None
|
|
164
|
-
and config.project_id != project_id
|
|
165
|
-
):
|
|
166
|
-
raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
|
|
167
|
-
config_values.project_id = self._get_project_id_element(selected=project_id)
|
|
159
|
+
raise_invalid_credentials_error(fields=[["creds"]], details=details)
|
|
168
160
|
config_values.regions = self._get_regions_element(
|
|
169
161
|
selected=config.regions or DEFAULT_REGIONS
|
|
170
162
|
)
|
|
171
163
|
if config.project_id is None:
|
|
172
164
|
return config_values
|
|
165
|
+
config_values.project_id = self._get_project_id_element(selected=config.project_id)
|
|
173
166
|
self._check_config(config=config, credentials=credentials)
|
|
174
167
|
return config_values
|
|
175
168
|
|
|
@@ -3,11 +3,7 @@ from typing import List
|
|
|
3
3
|
|
|
4
4
|
from dstack._internal.core.backends.base import Backend
|
|
5
5
|
from dstack._internal.core.backends.runpod import RunpodBackend, RunpodConfig, api_client
|
|
6
|
-
from dstack._internal.core.models.backends.base import
|
|
7
|
-
BackendType,
|
|
8
|
-
ConfigElementValue,
|
|
9
|
-
ConfigMultiElement,
|
|
10
|
-
)
|
|
6
|
+
from dstack._internal.core.models.backends.base import BackendType, ConfigMultiElement
|
|
11
7
|
from dstack._internal.core.models.backends.runpod import (
|
|
12
8
|
RunpodConfigInfo,
|
|
13
9
|
RunpodConfigInfoWithCreds,
|
|
@@ -22,25 +18,6 @@ from dstack._internal.server.services.backends.configurators.base import (
|
|
|
22
18
|
raise_invalid_credentials_error,
|
|
23
19
|
)
|
|
24
20
|
|
|
25
|
-
REGIONS = [
|
|
26
|
-
"CA-MTL-1",
|
|
27
|
-
"CA-MTL-2",
|
|
28
|
-
"CA-MTL-3",
|
|
29
|
-
"EU-NL-1",
|
|
30
|
-
"EU-RO-1",
|
|
31
|
-
"EU-SE-1",
|
|
32
|
-
"EUR-IS-1",
|
|
33
|
-
"EUR-IS-2",
|
|
34
|
-
"US-CA-1",
|
|
35
|
-
"US-GA-1",
|
|
36
|
-
"US-GA-2",
|
|
37
|
-
"US-KS-2",
|
|
38
|
-
"US-OR-1",
|
|
39
|
-
"US-TX-3",
|
|
40
|
-
]
|
|
41
|
-
|
|
42
|
-
DEFAULT_REGION = "CA-MTL-1"
|
|
43
|
-
|
|
44
21
|
|
|
45
22
|
class RunpodConfigurator(Configurator):
|
|
46
23
|
TYPE: BackendType = BackendType.RUNPOD
|
|
@@ -50,16 +27,12 @@ class RunpodConfigurator(Configurator):
|
|
|
50
27
|
if config.creds is None:
|
|
51
28
|
return config_values
|
|
52
29
|
self._validate_runpod_api_key(config.creds.api_key)
|
|
53
|
-
config_values.regions = self._get_regions_element(
|
|
54
|
-
selected=config.regions or [DEFAULT_REGION]
|
|
55
|
-
)
|
|
30
|
+
config_values.regions = self._get_regions_element(selected=config.regions or [])
|
|
56
31
|
return config_values
|
|
57
32
|
|
|
58
33
|
def create_backend(
|
|
59
34
|
self, project: ProjectModel, config: RunpodConfigInfoWithCreds
|
|
60
35
|
) -> BackendModel:
|
|
61
|
-
if config.regions is None:
|
|
62
|
-
config.regions = REGIONS
|
|
63
36
|
return BackendModel(
|
|
64
37
|
project_id=project.id,
|
|
65
38
|
type=self.TYPE.value,
|
|
@@ -80,10 +53,7 @@ class RunpodConfigurator(Configurator):
|
|
|
80
53
|
return RunpodBackend(config=config)
|
|
81
54
|
|
|
82
55
|
def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement:
|
|
83
|
-
|
|
84
|
-
for r in REGIONS:
|
|
85
|
-
element.values.append(ConfigElementValue(value=r, label=r))
|
|
86
|
-
return element
|
|
56
|
+
return ConfigMultiElement(selected=selected)
|
|
87
57
|
|
|
88
58
|
def _get_backend_config(self, model: BackendModel) -> RunpodConfig:
|
|
89
59
|
return RunpodConfig(
|
|
@@ -6,6 +6,7 @@ from pydantic import BaseModel, Field, ValidationError, root_validator
|
|
|
6
6
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
7
|
from typing_extensions import Annotated
|
|
8
8
|
|
|
9
|
+
from dstack._internal.core.backends.runpod.config import RUNPOD_COMMUNITY_CLOUD_DEFAULT
|
|
9
10
|
from dstack._internal.core.errors import (
|
|
10
11
|
BackendNotAvailable,
|
|
11
12
|
ResourceNotExistsError,
|
|
@@ -45,7 +46,7 @@ logger = get_logger(__name__)
|
|
|
45
46
|
# By default, PyYAML chooses the style of a collection depending on whether it has nested collections.
|
|
46
47
|
# If a collection has nested collections, it will be assigned the block style. Otherwise it will have the flow style.
|
|
47
48
|
#
|
|
48
|
-
# We want mapping to always be
|
|
49
|
+
# We want mapping to always be displayed in block-style but lists without nested objects in flow-style.
|
|
49
50
|
# So we define a custom representer
|
|
50
51
|
|
|
51
52
|
|
|
@@ -107,6 +108,16 @@ class AWSConfig(CoreModel):
|
|
|
107
108
|
)
|
|
108
109
|
),
|
|
109
110
|
] = None
|
|
111
|
+
iam_instance_profile: Annotated[
|
|
112
|
+
Optional[str],
|
|
113
|
+
Field(
|
|
114
|
+
description=(
|
|
115
|
+
"The name of the IAM instance profile to associate with EC2 instances."
|
|
116
|
+
" You can also specify the IAM role name for roles created via the AWS console."
|
|
117
|
+
" AWS automatically creates an instance profile and gives it the same name as the role"
|
|
118
|
+
)
|
|
119
|
+
),
|
|
120
|
+
] = None
|
|
110
121
|
tags: Annotated[
|
|
111
122
|
Optional[Dict[str, str]],
|
|
112
123
|
Field(description="The tags that will be assigned to resources created by `dstack`"),
|
|
@@ -251,7 +262,7 @@ class GCPConfig(CoreModel):
|
|
|
251
262
|
),
|
|
252
263
|
] = None
|
|
253
264
|
vm_service_account: Annotated[
|
|
254
|
-
Optional[str], Field(description="The service account
|
|
265
|
+
Optional[str], Field(description="The service account to associate with provisioned VMs")
|
|
255
266
|
] = None
|
|
256
267
|
tags: Annotated[
|
|
257
268
|
Optional[Dict[str, str]],
|
|
@@ -330,7 +341,7 @@ class KubernetesConfig(CoreModel):
|
|
|
330
341
|
kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
|
|
331
342
|
networking: Annotated[
|
|
332
343
|
Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
|
|
333
|
-
]
|
|
344
|
+
] = None
|
|
334
345
|
|
|
335
346
|
|
|
336
347
|
class KubernetesAPIConfig(CoreModel):
|
|
@@ -338,7 +349,7 @@ class KubernetesAPIConfig(CoreModel):
|
|
|
338
349
|
kubeconfig: Annotated[KubeconfigAPIConfig, Field(description="The kubeconfig configuration")]
|
|
339
350
|
networking: Annotated[
|
|
340
351
|
Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
|
|
341
|
-
]
|
|
352
|
+
] = None
|
|
342
353
|
|
|
343
354
|
|
|
344
355
|
class LambdaConfig(CoreModel):
|
|
@@ -418,6 +429,15 @@ class RunpodConfig(CoreModel):
|
|
|
418
429
|
Optional[List[str]],
|
|
419
430
|
Field(description="The list of RunPod regions. Omit to use all regions"),
|
|
420
431
|
] = None
|
|
432
|
+
community_cloud: Annotated[
|
|
433
|
+
Optional[bool],
|
|
434
|
+
Field(
|
|
435
|
+
description=(
|
|
436
|
+
"Whether Community Cloud offers can be suggested in addition to Secure Cloud."
|
|
437
|
+
f" Defaults to `{str(RUNPOD_COMMUNITY_CLOUD_DEFAULT).lower()}`"
|
|
438
|
+
)
|
|
439
|
+
),
|
|
440
|
+
] = None
|
|
421
441
|
creds: Annotated[AnyRunpodCreds, Field(description="The credentials")]
|
|
422
442
|
|
|
423
443
|
|
|
@@ -517,6 +517,7 @@ async def delete_fleets(
|
|
|
517
517
|
.options(selectinload(FleetModel.instances))
|
|
518
518
|
.options(selectinload(FleetModel.runs))
|
|
519
519
|
.execution_options(populate_existing=True)
|
|
520
|
+
.order_by(FleetModel.id) # take locks in order
|
|
520
521
|
.with_for_update()
|
|
521
522
|
)
|
|
522
523
|
fleet_models = res.scalars().unique().all()
|
|
@@ -220,6 +220,7 @@ async def delete_gateways(
|
|
|
220
220
|
)
|
|
221
221
|
.options(selectinload(GatewayModel.gateway_compute))
|
|
222
222
|
.execution_options(populate_existing=True)
|
|
223
|
+
.order_by(GatewayModel.id) # take locks in order
|
|
223
224
|
.with_for_update()
|
|
224
225
|
)
|
|
225
226
|
gateway_models = res.scalars().all()
|
|
@@ -236,13 +236,14 @@ async def process_terminating_job(
|
|
|
236
236
|
logger.debug("%s: stopping container", fmt(job_model))
|
|
237
237
|
ssh_private_keys = get_instance_ssh_private_keys(instance_model)
|
|
238
238
|
await stop_container(job_model, jpd, ssh_private_keys)
|
|
239
|
-
volume_models: list[VolumeModel]
|
|
240
239
|
if jrd is not None and jrd.volume_names is not None:
|
|
241
|
-
|
|
242
|
-
session=session, project=instance_model.project, names=jrd.volume_names
|
|
243
|
-
)
|
|
240
|
+
volume_names = jrd.volume_names
|
|
244
241
|
else:
|
|
245
|
-
|
|
242
|
+
# Legacy jobs before job_runtime_data/blocks were introduced
|
|
243
|
+
volume_names = [va.volume.name for va in instance_model.volume_attachments]
|
|
244
|
+
volume_models = await list_project_volume_models(
|
|
245
|
+
session=session, project=instance_model.project, names=volume_names
|
|
246
|
+
)
|
|
246
247
|
if len(volume_models) > 0:
|
|
247
248
|
logger.info("Detaching volumes: %s", [v.name for v in volume_models])
|
|
248
249
|
all_volumes_detached = await _detach_volumes_from_job_instance(
|
|
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
|
|
|
302
303
|
jpd = get_or_error(get_job_provisioning_data(job_model))
|
|
303
304
|
jrd = get_job_runtime_data(job_model)
|
|
304
305
|
if jrd is not None and jrd.volume_names is not None:
|
|
305
|
-
|
|
306
|
-
session=session, project=instance_model.project, names=jrd.volume_names
|
|
307
|
-
)
|
|
306
|
+
volume_names = jrd.volume_names
|
|
308
307
|
else:
|
|
309
|
-
|
|
308
|
+
# Legacy jobs before job_runtime_data/blocks were introduced
|
|
309
|
+
volume_names = [va.volume.name for va in instance_model.volume_attachments]
|
|
310
|
+
volume_models = await list_project_volume_models(
|
|
311
|
+
session=session, project=instance_model.project, names=volume_names
|
|
312
|
+
)
|
|
310
313
|
logger.info("Detaching volumes: %s", [v.name for v in volume_models])
|
|
311
314
|
all_volumes_detached = await _detach_volumes_from_job_instance(
|
|
312
315
|
project=instance_model.project,
|
|
@@ -13,7 +13,11 @@ from dstack._internal.core.models.configurations import (
|
|
|
13
13
|
PythonVersion,
|
|
14
14
|
RunConfigurationType,
|
|
15
15
|
)
|
|
16
|
-
from dstack._internal.core.models.profiles import
|
|
16
|
+
from dstack._internal.core.models.profiles import (
|
|
17
|
+
DEFAULT_STOP_DURATION,
|
|
18
|
+
SpotPolicy,
|
|
19
|
+
UtilizationPolicy,
|
|
20
|
+
)
|
|
17
21
|
from dstack._internal.core.models.runs import (
|
|
18
22
|
AppSpec,
|
|
19
23
|
JobSpec,
|
|
@@ -113,6 +117,7 @@ class JobConfigurator(ABC):
|
|
|
113
117
|
single_branch=self._single_branch(),
|
|
114
118
|
max_duration=self._max_duration(),
|
|
115
119
|
stop_duration=self._stop_duration(),
|
|
120
|
+
utilization_policy=self._utilization_policy(),
|
|
116
121
|
registry_auth=self._registry_auth(),
|
|
117
122
|
requirements=self._requirements(),
|
|
118
123
|
retry=self._retry(),
|
|
@@ -201,6 +206,9 @@ class JobConfigurator(ABC):
|
|
|
201
206
|
# pydantic validator ensures this is int
|
|
202
207
|
return self.run_spec.merged_profile.stop_duration
|
|
203
208
|
|
|
209
|
+
def _utilization_policy(self) -> Optional[UtilizationPolicy]:
|
|
210
|
+
return self.run_spec.merged_profile.utilization_policy
|
|
211
|
+
|
|
204
212
|
def _registry_auth(self) -> Optional[RegistryAuth]:
|
|
205
213
|
return self.run_spec.configuration.registry_auth
|
|
206
214
|
|
|
@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
|
|
|
6
6
|
from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
|
|
7
7
|
from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
|
|
8
8
|
|
|
9
|
-
DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
|
|
10
|
-
|
|
11
9
|
INSTALL_IPYKERNEL = (
|
|
12
10
|
"(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
|
|
13
11
|
'echo "no pip, ipykernel was not installed"'
|
|
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
|
|
|
44
42
|
return False
|
|
45
43
|
|
|
46
44
|
def _default_max_duration(self) -> Optional[int]:
|
|
47
|
-
return
|
|
45
|
+
return None
|
|
48
46
|
|
|
49
47
|
def _spot_policy(self) -> SpotPolicy:
|
|
50
48
|
return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
|
|
@@ -5,8 +5,6 @@ from dstack._internal.core.models.profiles import SpotPolicy
|
|
|
5
5
|
from dstack._internal.core.models.runs import JobSpec
|
|
6
6
|
from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
|
|
7
7
|
|
|
8
|
-
DEFAULT_MAX_DURATION_SECONDS = 72 * 3600
|
|
9
|
-
|
|
10
8
|
|
|
11
9
|
class TaskJobConfigurator(JobConfigurator):
|
|
12
10
|
TYPE: RunConfigurationType = RunConfigurationType.TASK
|
|
@@ -29,7 +27,7 @@ class TaskJobConfigurator(JobConfigurator):
|
|
|
29
27
|
return True
|
|
30
28
|
|
|
31
29
|
def _default_max_duration(self) -> Optional[int]:
|
|
32
|
-
return
|
|
30
|
+
return None
|
|
33
31
|
|
|
34
32
|
def _spot_policy(self) -> SpotPolicy:
|
|
35
33
|
return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.models.logs import JobSubmissionLogs
|
|
6
|
+
from dstack._internal.server import settings
|
|
7
|
+
from dstack._internal.server.models import ProjectModel
|
|
8
|
+
from dstack._internal.server.schemas.logs import PollLogsRequest
|
|
9
|
+
from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
|
|
10
|
+
from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
|
|
11
|
+
from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
|
|
12
|
+
from dstack._internal.server.services.logs.filelog import FileLogStorage
|
|
13
|
+
from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
|
|
14
|
+
from dstack._internal.utils.common import run_async
|
|
15
|
+
from dstack._internal.utils.logging import get_logger
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
_log_storage: Optional[LogStorage] = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_log_storage() -> LogStorage:
|
|
24
|
+
global _log_storage
|
|
25
|
+
if _log_storage is not None:
|
|
26
|
+
return _log_storage
|
|
27
|
+
if settings.SERVER_CLOUDWATCH_LOG_GROUP:
|
|
28
|
+
if BOTO_AVAILABLE:
|
|
29
|
+
try:
|
|
30
|
+
_log_storage = CloudWatchLogStorage(
|
|
31
|
+
group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
|
|
32
|
+
region=settings.SERVER_CLOUDWATCH_LOG_REGION,
|
|
33
|
+
)
|
|
34
|
+
except LogStorageError as e:
|
|
35
|
+
logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
|
|
36
|
+
except Exception:
|
|
37
|
+
logger.exception("Got exception when initializing CloudWatch Logs storage")
|
|
38
|
+
else:
|
|
39
|
+
logger.debug("Using CloudWatch Logs storage")
|
|
40
|
+
else:
|
|
41
|
+
logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
|
|
42
|
+
elif settings.SERVER_GCP_LOGGING_PROJECT:
|
|
43
|
+
if GCP_LOGGING_AVAILABLE:
|
|
44
|
+
try:
|
|
45
|
+
_log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
|
|
46
|
+
except LogStorageError as e:
|
|
47
|
+
logger.error("Failed to initialize GCP Logs storage: %s", e)
|
|
48
|
+
except Exception:
|
|
49
|
+
logger.exception("Got exception when initializing GCP Logs storage")
|
|
50
|
+
else:
|
|
51
|
+
logger.debug("Using GCP Logs storage")
|
|
52
|
+
else:
|
|
53
|
+
logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
|
|
54
|
+
if _log_storage is None:
|
|
55
|
+
_log_storage = FileLogStorage()
|
|
56
|
+
logger.debug("Using file-based storage")
|
|
57
|
+
atexit.register(_log_storage.close)
|
|
58
|
+
return _log_storage
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def write_logs(
|
|
62
|
+
project: ProjectModel,
|
|
63
|
+
run_name: str,
|
|
64
|
+
job_submission_id: UUID,
|
|
65
|
+
runner_logs: List[RunnerLogEvent],
|
|
66
|
+
job_logs: List[RunnerLogEvent],
|
|
67
|
+
) -> None:
|
|
68
|
+
return get_log_storage().write_logs(
|
|
69
|
+
project=project,
|
|
70
|
+
run_name=run_name,
|
|
71
|
+
job_submission_id=job_submission_id,
|
|
72
|
+
runner_logs=runner_logs,
|
|
73
|
+
job_logs=job_logs,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
|
|
78
|
+
return await run_async(get_log_storage().poll_logs, project=project, request=request)
|