dstack 0.19.25rc1__py3-none-any.whl → 0.19.27__py3-none-any.whl
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +293 -58
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +35 -48
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +1 -8
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +9 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +191 -32
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +80 -3
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +28 -5
- dstack/_internal/core/services/configs/__init__.py +6 -3
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +86 -79
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_fleets.py +109 -13
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +22 -73
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_runs.py +3 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +67 -44
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +6 -2
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +11 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +37 -38
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +44 -4
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +53 -17
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +6 -8
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +7 -9
- dstack/_internal/server/services/jobs/configurators/service.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +3 -3
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +17 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
- dstack/_internal/server/testing/common.py +7 -4
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +62 -8
- dstack/api/_public/runs.py +19 -8
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/RECORD +160 -142
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/logs/filelog.py

@@ -48,7 +48,7 @@ class FileLogStorage(LogStorage):
     ) -> JobSubmissionLogs:
         start_line = 0
         if request.next_token:
-            start_line = self.
+            start_line = self._parse_next_token(request.next_token)

         logs = []
         next_token = None
@@ -97,7 +97,9 @@ class FileLogStorage(LogStorage):
     def _poll_logs_descending(
         self, log_file_path: Path, request: PollLogsRequest
     ) -> JobSubmissionLogs:
-        start_offset =
+        start_offset = None
+        if request.next_token is not None:
+            start_offset = self._parse_next_token(request.next_token)

         candidate_logs = []

@@ -123,12 +125,12 @@ class FileLogStorage(LogStorage):
         except FileNotFoundError:
             return JobSubmissionLogs(logs=[], next_token=None)

-        logs = [log for log,
+        logs = [log for log, _ in candidate_logs[: request.limit]]
         next_token = None
         if len(candidate_logs) > request.limit:
             # We fetched one more than the limit, so there are more pages.
             # The next token should point to the start of the last log we are returning.
-
+            _, last_log_offset = candidate_logs[request.limit - 1]
             next_token = str(last_log_offset)

         return JobSubmissionLogs(logs=logs, next_token=next_token)
@@ -245,8 +247,7 @@ class FileLogStorage(LogStorage):
             message=runner_log_event.message.decode(errors="replace"),
         )

-    def
-        next_token = request.next_token
+    def _parse_next_token(self, next_token: str) -> int:
         if next_token is None:
             return None
         try:
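The hunks above extract next-token parsing into `_parse_next_token` and fix descending-poll pagination: the storage reads `limit + 1` candidate logs and, if the extra one exists, returns the offset of the last returned log as `next_token`. A minimal sketch of this limit-plus-one pattern (the `poll`/`Page` names are illustrative, not dstack's API):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Page:
    items: list
    next_token: Optional[str]

def poll(entries: list, limit: int, next_token: Optional[str] = None) -> Page:
    start = int(next_token) if next_token is not None else 0
    window = entries[start : start + limit + 1]  # fetch one extra to detect another page
    token = str(start + limit) if len(window) > limit else None
    return Page(items=window[:limit], next_token=token)

page = poll(list(range(25)), limit=10)
while page.next_token is not None:  # walk all pages
    page = poll(list(range(25)), limit=10, next_token=page.next_token)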
dstack/_internal/server/services/logs/gcp.py

@@ -20,6 +20,9 @@ from dstack._internal.server.services.logs.base import (
 from dstack._internal.utils.common import batched
 from dstack._internal.utils.logging import get_logger

+logger = get_logger(__name__)
+
+
 GCP_LOGGING_AVAILABLE = True
 try:
     import google.api_core.exceptions
@@ -28,152 +31,151 @@ try:
     from google.cloud.logging_v2.types import ListLogEntriesRequest
 except ImportError:
     GCP_LOGGING_AVAILABLE = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        # TODO: GCP may return logs in random order when events have the same timestamp.
-        producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
-        stream_name = self._get_stream_name(
-            project_name=project.name,
-            run_name=request.run_name,
-            job_submission_id=request.job_submission_id,
-            producer=producer,
-        )
-        log_filters = [f'labels.stream = "{stream_name}"']
-        if request.start_time:
-            log_filters.append(f'timestamp > "{request.start_time.isoformat()}"')
-        if request.end_time:
-            log_filters.append(f'timestamp < "{request.end_time.isoformat()}"')
-        log_filter = " AND ".join(log_filters)
-
-        order_by = logging_v2.DESCENDING if request.descending else logging_v2.ASCENDING
-        try:
-            # Use low-level API to get access to next_page_token
-            request_obj = ListLogEntriesRequest(
-                resource_names=[f"projects/{self.client.project}"],
-                filter=log_filter,
-                order_by=order_by,
-                page_size=request.limit,
-                page_token=request.next_token,
-            )
-            response = self.client._logging_api._gapic_api.list_log_entries(request=request_obj)
-
-            logs = [
-                LogEvent(
-                    timestamp=entry.timestamp,
-                    message=entry.json_payload.get("message"),
-                    log_source=LogEventSource.STDOUT,
-                )
-                for entry in response.entries
-            ]
-            next_token = response.next_page_token or None
-        except google.api_core.exceptions.ResourceExhausted as e:
-            logger.warning("GCP Logging exception: %s", repr(e))
-            # GCP Logging has severely low quota of 60 reads/min for entries.list
-            raise ServerClientError(
-                "GCP Logging read request limit exceeded."
-                " It's recommended to increase default entries.list request quota from 60 per minute."
-            )
-        return JobSubmissionLogs(
-            logs=logs,
-            external_url=self._get_stream_extrnal_url(stream_name),
-            next_token=next_token if len(logs) > 0 else None,
-        )
-
-    def write_logs(
-        self,
-        project: ProjectModel,
-        run_name: str,
-        job_submission_id: UUID,
-        runner_logs: List[RunnerLogEvent],
-        job_logs: List[RunnerLogEvent],
-    ):
-        producers_with_logs = [(LogProducer.RUNNER, runner_logs), (LogProducer.JOB, job_logs)]
-        for producer, producer_logs in producers_with_logs:
+else:
+
+    class GCPLogStorage(LogStorage):
+        # Max expected message size from runner is 32KB.
+        # Max expected LogEntry size is 32KB + metadata < 50KB < 256KB limit.
+        # With MAX_BATCH_SIZE = 100, max write request size < 5MB < 10 MB limit.
+        # See: https://cloud.google.com/logging/quotas.
+        MAX_RUNNER_MESSAGE_SIZE = 32 * 1024
+        MAX_BATCH_SIZE = 100
+
+        # Use the same log name for all run logs so that it's easy to manage all dstack-related logs.
+        LOG_NAME = "dstack-run-logs"
+        # Logs from different jobs belong to different "streams".
+        # GCP Logging has no built-in concepts of streams, so we implement them with labels.
+        # It should be fast to filter by labels since labels are indexed by default
+        # (https://cloud.google.com/logging/docs/analyze/custom-index).
+
+        def __init__(self, project_id: str):
+            self.project_id = project_id
+            try:
+                self.client = logging_v2.Client(project=project_id)
+                self.logger = self.client.logger(name=self.LOG_NAME)
+                self.logger.list_entries(max_results=1)
+                # Python client doesn't seem to support dry_run,
+                # so emit an empty log to check permissions.
+                self.logger.log_empty()
+            except google.auth.exceptions.DefaultCredentialsError:
+                raise LogStorageError("Default credentials not found")
+            except google.api_core.exceptions.NotFound:
+                raise LogStorageError(f"Project {project_id} not found")
+            except google.api_core.exceptions.PermissionDenied:
+                raise LogStorageError("Insufficient permissions")
+
+        def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+            # TODO: GCP may return logs in random order when events have the same timestamp.
+            producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
             stream_name = self._get_stream_name(
                 project_name=project.name,
-                run_name=run_name,
-                job_submission_id=job_submission_id,
+                run_name=request.run_name,
+                job_submission_id=request.job_submission_id,
                 producer=producer,
             )
-
-
-
+            log_filters = [f'labels.stream = "{stream_name}"']
+            if request.start_time:
+                log_filters.append(f'timestamp > "{request.start_time.isoformat()}"')
+            if request.end_time:
+                log_filters.append(f'timestamp < "{request.end_time.isoformat()}"')
+            log_filter = " AND ".join(log_filters)
+
+            order_by = logging_v2.DESCENDING if request.descending else logging_v2.ASCENDING
+            try:
+                # Use low-level API to get access to next_page_token
+                request_obj = ListLogEntriesRequest(
+                    resource_names=[f"projects/{self.client.project}"],
+                    filter=log_filter,
+                    order_by=order_by,
+                    page_size=request.limit,
+                    page_token=request.next_token,
+                )
+                response = self.client._logging_api._gapic_api.list_log_entries(  # type: ignore[attr-defined]
+                    request=request_obj
+                )
+
+                logs = [
+                    LogEvent(
+                        timestamp=entry.timestamp,
+                        message=entry.json_payload.get("message"),
+                        log_source=LogEventSource.STDOUT,
+                    )
+                    for entry in response.entries
+                ]
+                next_token = response.next_page_token or None
+            except google.api_core.exceptions.ResourceExhausted as e:
+                logger.warning("GCP Logging exception: %s", repr(e))
+                # GCP Logging has severely low quota of 60 reads/min for entries.list
+                raise ServerClientError(
+                    "GCP Logging read request limit exceeded."
+                    " It's recommended to increase default entries.list request quota from 60 per minute."
+                )
+            return JobSubmissionLogs(
+                logs=logs,
+                external_url=self._get_stream_extrnal_url(stream_name),
+                next_token=next_token if len(logs) > 0 else None,
             )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        def write_logs(
+            self,
+            project: ProjectModel,
+            run_name: str,
+            job_submission_id: UUID,
+            runner_logs: List[RunnerLogEvent],
+            job_logs: List[RunnerLogEvent],
+        ):
+            producers_with_logs = [(LogProducer.RUNNER, runner_logs), (LogProducer.JOB, job_logs)]
+            for producer, producer_logs in producers_with_logs:
+                stream_name = self._get_stream_name(
+                    project_name=project.name,
+                    run_name=run_name,
+                    job_submission_id=job_submission_id,
+                    producer=producer,
+                )
+                self._write_logs_to_stream(
+                    stream_name=stream_name,
+                    logs=producer_logs,
+                )
+
+        def close(self):
+            self.client.close()
+
+        def _write_logs_to_stream(self, stream_name: str, logs: List[RunnerLogEvent]):
+            with self.logger.batch() as batcher:
+                for batch in batched(logs, self.MAX_BATCH_SIZE):
+                    for log in batch:
+                        message = log.message.decode(errors="replace")
+                        timestamp = unix_time_ms_to_datetime(log.timestamp)
+                        if len(log.message) > self.MAX_RUNNER_MESSAGE_SIZE:
+                            logger.error(
+                                "Stream %s: skipping event at %s, message exceeds max size: %d > %d",
+                                stream_name,
+                                timestamp.isoformat(),
+                                len(log.message),
+                                self.MAX_RUNNER_MESSAGE_SIZE,
+                            )
+                            continue
+                        batcher.log_struct(
+                            {
+                                "message": message,
+                            },
+                            labels={
+                                "stream": stream_name,
+                            },
+                            timestamp=timestamp,
                         )
-
-        batcher.log_struct(
-            {
-                "message": message,
-            },
-            labels={
-                "stream": stream_name,
-            },
-            timestamp=timestamp,
-        )
-        batcher.commit()
+                    batcher.commit()

-
-
-
-
+        def _get_stream_name(
+            self, project_name: str, run_name: str, job_submission_id: UUID, producer: LogProducer
+        ) -> str:
+            return f"{project_name}-{run_name}-{job_submission_id}-{producer.value}"

-
-
-
-
-
+        def _get_stream_extrnal_url(self, stream_name: str) -> str:
+            log_name_resource_name = self._get_log_name_resource_name()
+            query = f'logName="{log_name_resource_name}" AND labels.stream="{stream_name}"'
+            quoted_query = urllib.parse.quote(query, safe="")
+            return f"https://console.cloud.google.com/logs/query;query={quoted_query}?project={self.project_id}"

-
-
+        def _get_log_name_resource_name(self) -> str:
+            return f"projects/{self.project_id}/logs/{self.LOG_NAME}"
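The rewritten `_write_logs_to_stream` keeps write requests inside GCP quotas: events are grouped into batches of `MAX_BATCH_SIZE`, oversized messages are skipped, and one commit is issued per batch. A rough sketch of that chunked-write pattern (the local `batched` below stands in for `dstack._internal.utils.common.batched`, whose implementation isn't shown in this diff):

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def batched(it: Iterable[T], n: int) -> Iterator[List[T]]:
    iterator = iter(it)
    while chunk := list(islice(iterator, n)):
        yield chunk

MAX_BATCH_SIZE = 100
MAX_RUNNER_MESSAGE_SIZE = 32 * 1024  # 32KB per message keeps a LogEntry well under the 256KB limit

def write(messages: List[bytes]) -> None:
    for batch in batched(messages, MAX_BATCH_SIZE):
        to_send = [m for m in batch if len(m) <= MAX_RUNNER_MESSAGE_SIZE]  # skip oversized events
        ...  # one write request per batch stays well under the 10 MB request limit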
dstack/_internal/server/services/plugins.py

@@ -60,7 +60,7 @@ def load_plugins(enabled_plugins: list[str]):
     _PLUGINS.clear()
     entrypoints: dict[str, PluginEntrypoint] = {}
     plugins_to_load = enabled_plugins.copy()
-    for entrypoint in entry_points(group="dstack.plugins"):
+    for entrypoint in entry_points(group="dstack.plugins"):  # type: ignore[call-arg]
         if entrypoint.name not in enabled_plugins:
             logger.info(
                 ("Found not enabled plugin %s. Plugin will not be loaded."),
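The added `# type: ignore[call-arg]` works around older importlib.metadata stubs that lack the keyword form of `entry_points`. For reference, a sketch of entry-point plugin discovery (on Python 3.10+, `entry_points(group=...)` filters by group; the `discover` helper is illustrative, not dstack's loader):

from importlib.metadata import entry_points

def discover(enabled: list) -> dict:
    plugins = {}
    for ep in entry_points(group="dstack.plugins"):
        if ep.name in enabled:
            plugins[ep.name] = ep.load()  # import the object the entry point references
    return plugins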
dstack/_internal/server/services/projects.py

@@ -19,7 +19,7 @@ from dstack._internal.server.models import MemberModel, ProjectModel, UserModel
 from dstack._internal.server.schemas.projects import MemberSetting
 from dstack._internal.server.services import users
 from dstack._internal.server.services.backends import (
-
+    get_backend_config_without_creds_from_backend_model,
 )
 from dstack._internal.server.services.permissions import get_default_permissions
 from dstack._internal.server.settings import DEFAULT_PROJECT_NAME
@@ -313,7 +313,6 @@ async def add_project_members(
             member_num=None,
             commit=False,
         )
-        member_by_user_id[user_to_add.id] = None

     await session.commit()

@@ -544,9 +543,7 @@ def project_model_to_project(
                 b.type.value,
             )
             continue
-        backend_config =
-            configurator, b, include_creds=False
-        )
+        backend_config = get_backend_config_without_creds_from_backend_model(configurator, b)
         if isinstance(backend_config, DstackBackendConfig):
             for backend_type in backend_config.base_backends:
                 backends.append(
dstack/_internal/server/services/proxy/repo.py

@@ -74,6 +74,8 @@ class ServerProxyRepo(BaseProxyRepo):
         jpd: JobProvisioningData = JobProvisioningData.__response__.parse_raw(
             job.job_provisioning_data
         )
+        assert jpd.hostname is not None
+        assert jpd.ssh_port is not None
         if not jpd.dockerized:
             ssh_destination = f"{jpd.username}@{jpd.hostname}"
             ssh_port = jpd.ssh_port
@@ -140,7 +142,7 @@ class ServerProxyRepo(BaseProxyRepo):
         model_options_obj = service_spec.options.get("openai", {}).get("model")
         if model_spec is None or model_options_obj is None:
             continue
-        model_options = pydantic.parse_obj_as(AnyModel, model_options_obj)
+        model_options = pydantic.parse_obj_as(AnyModel, model_options_obj)  # type: ignore[arg-type]
         model = ChatModel(
             project_name=project_name,
             name=model_spec.name,
@@ -175,6 +177,8 @@ def _model_options_to_format_spec(model: AnyModel) -> AnyModelFormat:
     if model.format == "openai":
         return OpenAIChatModelFormat(prefix=model.prefix)
     elif model.format == "tgi":
+        assert model.chat_template is not None
+        assert model.eos_token is not None
         return TGIChatModelFormat(
             chat_template=model.chat_template,
             eos_token=model.eos_token,
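The asserts added in both hunks narrow Optional fields before use so that static type checkers accept the accesses. A tiny illustration of assert-based narrowing (the `Model` class is hypothetical):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Model:
    chat_template: Optional[str] = None

def render(model: Model) -> str:
    assert model.chat_template is not None  # narrows Optional[str] to str from here on
    return model.chat_template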
dstack/_internal/server/services/requirements/__init__.py

File without changes.
dstack/_internal/server/services/requirements/combine.py (new file)

@@ -0,0 +1,259 @@
+from typing import Callable, Optional, Protocol, TypeVar
+
+from pydantic import BaseModel
+from typing_extensions import Self
+
+from dstack._internal.core.models.profiles import Profile, SpotPolicy
+from dstack._internal.core.models.resources import (
+    CPUSpec,
+    DiskSpec,
+    GPUSpec,
+    Memory,
+    Range,
+    ResourcesSpec,
+)
+from dstack._internal.core.models.runs import Requirements
+from dstack._internal.utils.typing import SupportsRichComparison
+
+
+class CombineError(ValueError):
+    pass
+
+
+def combine_fleet_and_run_profiles(
+    fleet_profile: Profile, run_profile: Profile
+) -> Optional[Profile]:
+    """
+    Combines fleet and run profile parameters that affect offer selection or provisioning.
+    """
+    try:
+        return Profile(
+            backends=_intersect_lists_optional(fleet_profile.backends, run_profile.backends),
+            regions=_intersect_lists_optional(fleet_profile.regions, run_profile.regions),
+            availability_zones=_intersect_lists_optional(
+                fleet_profile.availability_zones, run_profile.availability_zones
+            ),
+            instance_types=_intersect_lists_optional(
+                fleet_profile.instance_types, run_profile.instance_types
+            ),
+            reservation=_get_single_value_optional(
+                fleet_profile.reservation, run_profile.reservation
+            ),
+            spot_policy=_combine_spot_policy_optional(
+                fleet_profile.spot_policy, run_profile.spot_policy
+            ),
+            max_price=_get_min_optional(fleet_profile.max_price, run_profile.max_price),
+            idle_duration=_combine_idle_duration_optional(
+                fleet_profile.idle_duration, run_profile.idle_duration
+            ),
+            tags=_combine_tags_optional(fleet_profile.tags, run_profile.tags),
+        )
+    except CombineError:
+        return None
+
+
+def combine_fleet_and_run_requirements(
+    fleet_requirements: Requirements, run_requirements: Requirements
+) -> Optional[Requirements]:
+    try:
+        return Requirements(
+            resources=_combine_resources(fleet_requirements.resources, run_requirements.resources),
+            max_price=_get_min_optional(fleet_requirements.max_price, run_requirements.max_price),
+            spot=_combine_spot_optional(fleet_requirements.spot, run_requirements.spot),
+            reservation=_get_single_value_optional(
+                fleet_requirements.reservation, run_requirements.reservation
+            ),
+        )
+    except CombineError:
+        return None
+
+
+_T = TypeVar("_T")
+_ModelT = TypeVar("_ModelT", bound=BaseModel)
+_CompT = TypeVar("_CompT", bound=SupportsRichComparison)
+
+
+class _SupportsCopy(Protocol):
+    def copy(self) -> Self: ...
+
+
+_CopyT = TypeVar("_CopyT", bound=_SupportsCopy)
+
+
+def _intersect_lists_optional(
+    list1: Optional[list[_T]], list2: Optional[list[_T]]
+) -> Optional[list[_T]]:
+    if list1 is None:
+        if list2 is None:
+            return None
+        return list2.copy()
+    if list2 is None:
+        return list1.copy()
+    return [x for x in list1 if x in list2]
+
+
+def _get_min(value1: _CompT, value2: _CompT) -> _CompT:
+    return min(value1, value2)
+
+
+def _get_min_optional(value1: Optional[_CompT], value2: Optional[_CompT]) -> Optional[_CompT]:
+    return _combine_optional(value1, value2, _get_min)
+
+
+def _get_single_value(value1: _T, value2: _T) -> _T:
+    if value1 == value2:
+        return value1
+    raise CombineError(f"Values {value1} and {value2} cannot be combined")
+
+
+def _get_single_value_optional(value1: Optional[_T], value2: Optional[_T]) -> Optional[_T]:
+    return _combine_optional(value1, value2, _get_single_value)
+
+
+def _combine_spot_policy(value1: SpotPolicy, value2: SpotPolicy) -> SpotPolicy:
+    if value1 == SpotPolicy.AUTO:
+        return value2
+    if value2 == SpotPolicy.AUTO:
+        return value1
+    if value1 == value2:
+        return value1
+    raise CombineError(f"spot_policy values {value1} and {value2} cannot be combined")
+
+
+def _combine_spot_policy_optional(
+    value1: Optional[SpotPolicy], value2: Optional[SpotPolicy]
+) -> Optional[SpotPolicy]:
+    return _combine_optional(value1, value2, _combine_spot_policy)
+
+
+def _combine_idle_duration(value1: int, value2: int) -> int:
+    if value1 < 0 and value2 >= 0 or value2 < 0 and value1 >= 0:
+        raise CombineError(f"idle_duration values {value1} and {value2} cannot be combined")
+    return min(value1, value2)
+
+
+def _combine_idle_duration_optional(value1: Optional[int], value2: Optional[int]) -> Optional[int]:
+    return _combine_optional(value1, value2, _combine_idle_duration)
+
+
+def _combine_tags_optional(
+    value1: Optional[dict[str, str]], value2: Optional[dict[str, str]]
+) -> Optional[dict[str, str]]:
+    return _combine_copy_optional(value1, value2, _combine_tags)
+
+
+def _combine_tags(value1: dict[str, str], value2: dict[str, str]) -> dict[str, str]:
+    return value1 | value2
+
+
+def _combine_resources(value1: ResourcesSpec, value2: ResourcesSpec) -> ResourcesSpec:
+    return ResourcesSpec(
+        cpu=_combine_cpu(value1.cpu, value2.cpu),  # type: ignore[attr-defined]
+        memory=_combine_memory(value1.memory, value2.memory),
+        shm_size=_combine_shm_size_optional(value1.shm_size, value2.shm_size),
+        gpu=_combine_gpu_optional(value1.gpu, value2.gpu),
+        disk=_combine_disk_optional(value1.disk, value2.disk),
+    )
+
+
+def _combine_cpu(value1: CPUSpec, value2: CPUSpec) -> CPUSpec:
+    return CPUSpec(
+        arch=_get_single_value_optional(value1.arch, value2.arch),
+        count=_combine_range(value1.count, value2.count),
+    )
+
+
+def _combine_memory(value1: Range[Memory], value2: Range[Memory]) -> Range[Memory]:
+    return _combine_range(value1, value2)
+
+
+def _combine_shm_size_optional(
+    value1: Optional[Memory], value2: Optional[Memory]
+) -> Optional[Memory]:
+    return _get_min_optional(value1, value2)
+
+
+def _combine_gpu(value1: GPUSpec, value2: GPUSpec) -> GPUSpec:
+    return GPUSpec(
+        vendor=_get_single_value_optional(value1.vendor, value2.vendor),
+        name=_intersect_lists_optional(value1.name, value2.name),
+        count=_combine_range(value1.count, value2.count),
+        memory=_combine_range_optional(value1.memory, value2.memory),
+        total_memory=_combine_range_optional(value1.total_memory, value2.total_memory),
+        compute_capability=_get_min_optional(value1.compute_capability, value2.compute_capability),
+    )
+
+
+def _combine_gpu_optional(
+    value1: Optional[GPUSpec], value2: Optional[GPUSpec]
+) -> Optional[GPUSpec]:
+    return _combine_models_optional(value1, value2, _combine_gpu)
+
+
+def _combine_disk(value1: DiskSpec, value2: DiskSpec) -> DiskSpec:
+    return DiskSpec(size=_combine_range(value1.size, value2.size))
+
+
+def _combine_disk_optional(
+    value1: Optional[DiskSpec], value2: Optional[DiskSpec]
+) -> Optional[DiskSpec]:
+    return _combine_models_optional(value1, value2, _combine_disk)
+
+
+def _combine_spot(value1: bool, value2: bool) -> bool:
+    if value1 != value2:
+        raise CombineError(f"spot values {value1} and {value2} cannot be combined")
+    return value1
+
+
+def _combine_spot_optional(value1: Optional[bool], value2: Optional[bool]) -> Optional[bool]:
+    return _combine_optional(value1, value2, _combine_spot)
+
+
+def _combine_range(value1: Range, value2: Range) -> Range:
+    res = value1.intersect(value2)
+    if res is None:
+        raise CombineError(f"Ranges {value1} and {value2} cannot be combined")
+    return res
+
+
+def _combine_range_optional(value1: Optional[Range], value2: Optional[Range]) -> Optional[Range]:
+    return _combine_models_optional(value1, value2, _combine_range)
+
+
+def _combine_optional(
+    value1: Optional[_T], value2: Optional[_T], combiner: Callable[[_T, _T], _T]
+) -> Optional[_T]:
+    if value1 is None:
+        return value2
+    if value2 is None:
+        return value1
+    return combiner(value1, value2)
+
+
+def _combine_models_optional(
+    value1: Optional[_ModelT],
+    value2: Optional[_ModelT],
+    combiner: Callable[[_ModelT, _ModelT], _ModelT],
+) -> Optional[_ModelT]:
+    if value1 is None:
+        if value2 is not None:
+            return value2.copy(deep=True)
+        return None
+    if value2 is None:
+        return value1.copy(deep=True)
+    return combiner(value1, value2)
+
+
+def _combine_copy_optional(
+    value1: Optional[_CopyT],
+    value2: Optional[_CopyT],
+    combiner: Callable[[_CopyT, _CopyT], _CopyT],
+) -> Optional[_CopyT]:
+    if value1 is None:
+        if value2 is not None:
+            return value2.copy()
+        return None
+    if value2 is None:
+        return value1.copy()
+    return combiner(value1, value2)
|