dstack 0.19.8__py3-none-any.whl → 0.19.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- dstack/_internal/cli/services/configurators/run.py +18 -11
- dstack/_internal/cli/utils/run.py +7 -2
- dstack/_internal/core/backends/cudo/compute.py +1 -1
- dstack/_internal/core/backends/nebius/fabrics.py +1 -0
- dstack/_internal/core/backends/nebius/models.py +1 -1
- dstack/_internal/core/models/resources.py +1 -1
- dstack/_internal/core/models/runs.py +19 -7
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +5 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +2 -2
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/version.py +1 -1
- {dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/METADATA +36 -29
- {dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/RECORD +25 -23
- {dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/cli/services/configurators/run.py
CHANGED

@@ -98,6 +98,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         print_run_plan(run_plan, max_offers=configurator_args.max_offers)
 
         confirm_message = "Submit a new run?"
+        if conf.name:
+            confirm_message = f"Submit the run [code]{conf.name}[/]?"
         stop_run_name = None
         if run_plan.current_resource is not None:
             changed_fields = []
@@ -130,11 +132,6 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
                     f"Active run [code]{conf.name}[/] already exists and cannot be updated in-place."
                 )
                 confirm_message = "Stop and override the run?"
-            else:
-                console.print(f"Finished run [code]{conf.name}[/] already exists.")
-                confirm_message = "Override the run?"
-        elif conf.name:
-            confirm_message = f"Submit the run [code]{conf.name}[/]?"
 
         if not command_args.yes and not confirm_ask(confirm_message):
             console.print("\nExiting...")
@@ -560,7 +557,9 @@ def print_finished_message(run: Run):
         console.print("[code]Done[/]")
         return
 
-    termination_reason, termination_reason_message =
+    termination_reason, termination_reason_message, exit_status = (
+        _get_run_termination_reason_and_exit_status(run)
+    )
    message = "Run failed due to unknown reason. Check CLI, server, and run logs."
    if run.status == RunStatus.TERMINATED:
        message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
@@ -572,13 +571,15 @@ def print_finished_message(run: Run):
             "Check CLI and server logs for more details."
         )
     elif termination_reason is not None:
+        exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
         error_details = (
             f"Error: {termination_reason_message}\n" if termination_reason_message else ""
         )
         message = (
             f"Run failed with error code {termination_reason.name}.\n"
+            f"{exit_status_details}"
             f"{error_details}"
-            "Check
+            f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
         )
     console.print(f"[error]{message}[/]")
 
@@ -589,14 +590,20 @@ def get_run_exit_code(run: Run) -> int:
     return 1
 
 
-def
+def _get_run_termination_reason_and_exit_status(
+    run: Run,
+) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
     if len(run._run.jobs) == 0:
-        return None, None
+        return None, None, None
     job = run._run.jobs[0]
     if len(job.job_submissions) == 0:
-        return None, None
+        return None, None, None
     job_submission = job.job_submissions[0]
-    return
+    return (
+        job_submission.termination_reason,
+        job_submission.termination_reason_message,
+        job_submission.exit_status,
+    )
 
 
 def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:

dstack/_internal/cli/utils/run.py
CHANGED

@@ -218,6 +218,11 @@ def _get_run_error(run: Run) -> str:
 
 
 def _get_job_error(job: Job) -> str:
-
+    job_submission = job.job_submissions[-1]
+    termination_reason = job_submission.termination_reason
+    exit_status = job_submission.exit_status
+    if termination_reason is None:
         return ""
-
+    if exit_status:
+        return f"{termination_reason.name} {exit_status}"
+    return termination_reason.name

dstack/_internal/core/backends/cudo/compute.py
CHANGED

@@ -147,7 +147,7 @@ class CudoCompute(
 
 
     def _get_image_id(cuda: bool) -> str:
-        image_name = "ubuntu-2204-nvidia-535-docker-
+        image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
         return image_name
 
 

dstack/_internal/core/backends/nebius/fabrics.py
CHANGED

@@ -20,6 +20,7 @@ INFINIBAND_FABRICS = [
     InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
     InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
     InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+    InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
 ]
 
 

dstack/_internal/core/backends/nebius/models.py
CHANGED

@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel
 
-DEFAULT_PROJECT_NAME_PREFIX = "default
+DEFAULT_PROJECT_NAME_PREFIX = "default"
 
 
 class NebiusServiceAccountCreds(CoreModel):

dstack/_internal/core/models/resources.py
CHANGED

@@ -126,7 +126,7 @@ class ComputeCapability(Tuple[int, int]):
 
 DEFAULT_CPU_COUNT = Range[int](min=2)
 DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
-DEFAULT_GPU_COUNT = Range[int](min=1
+DEFAULT_GPU_COUNT = Range[int](min=1)
 
 
 class CPUSpec(CoreModel):

dstack/_internal/core/models/runs.py
CHANGED

@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
     # Set by the server
     FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
     INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
+    INSTANCE_UNREACHABLE = "instance_unreachable"
     WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
     WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
     TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
         mapping = {
             self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
             self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
+            self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
             self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
     # List of volumes used by the job
-    volume_names: Optional[list[str]] = None  # None for backward
+    volume_names: Optional[list[str]] = None  # None for backward compatibility
     # Virtual shared offer
-    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward
+    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compatibility
 
 
 class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
+    exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
 
@@ -508,7 +511,9 @@ def _get_run_error(
         return ""
     if len(run_jobs) > 1:
         return run_termination_reason.name
-    run_job_termination_reason =
+    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
+        run_jobs
+    )
     # For failed runs, also show termination reason to provide more context.
     # For other run statuses, the job termination reason will duplicate run status.
     if run_job_termination_reason is not None and run_termination_reason in [
@@ -516,13 +521,20 @@ def _get_run_error(
         RunTerminationReason.SERVER_ERROR,
         RunTerminationReason.RETRY_LIMIT_EXCEEDED,
     ]:
+        if exit_status:
+            return (
+                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
+            )
         return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
     return run_termination_reason.name
 
 
-def
+def _get_run_job_termination_reason_and_exit_status(
+    run_jobs: List[Job],
+) -> tuple[Optional[JobTerminationReason], Optional[int]]:
     for job in run_jobs:
         if len(job.job_submissions) > 0:
-
-
-
+            job_submission = job.job_submissions[-1]
+            if job_submission.termination_reason is not None:
+                return job_submission.termination_reason, job_submission.exit_status
+    return None, None
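
For illustration, here is a hedged sketch of the error string the new `_get_run_error` logic produces for a single-job run. The reason names come from the diff above; the concrete scenario and exit code are hypothetical:

```python
# Hypothetical single-job run: retries were exhausted and the last job
# submission recorded a non-zero container exit code.
run_termination_reason_name = "RETRY_LIMIT_EXCEEDED"          # RunTerminationReason, from the diff
job_termination_reason_name = "CONTAINER_EXITED_WITH_ERROR"   # JobTerminationReason, from the diff
exit_status = 137                                             # hypothetical exit code

# Mirrors the string building added in _get_run_error above.
print(f"{run_termination_reason_name}\n({job_termination_reason_name} {exit_status})")
# RETRY_LIMIT_EXCEEDED
# (CONTAINER_EXITED_WITH_ERROR 137)
```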

dstack/_internal/server/background/tasks/process_metrics.py
CHANGED

@@ -42,10 +42,33 @@ async def collect_metrics():
 
 
 async def delete_metrics():
-
+    now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
+    running_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
+    )
+    finished_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
+    )
     async with get_session_ctx() as session:
-        await
-
+        await asyncio.gather(
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                    ),
+                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+                )
+            ),
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(
+                            JobModel.status.in_(JobStatus.finished_statuses())
+                        )
+                    ),
+                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+                )
+            ),
         )
         await session.commit()
 
@@ -134,9 +157,3 @@ def _pull_runner_metrics(
 ) -> Optional[MetricsResponse]:
     runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
     return runner_client.get_metrics()
-
-
-def _get_delete_metrics_cutoff() -> int:
-    now = int(get_current_datetime().timestamp() * 1_000_000)
-    cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
-    return cutoff
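
As a quick sanity check, a minimal sketch of the cutoff arithmetic used above: metric timestamps are stored in microseconds, so a TTL given in seconds is scaled by 1_000_000 before comparison (the timestamp value below is hypothetical):

```python
# Hypothetical "now" in microseconds since the epoch.
now_timestamp_micro = 1_700_000_000 * 1_000_000
running_ttl_seconds = 3600  # default SERVER_METRICS_RUNNING_TTL_SECONDS from the diff

cutoff = now_timestamp_micro - running_ttl_seconds * 1_000_000
# Points older than one hour fall below the cutoff and get deleted.
assert cutoff == (1_700_000_000 - 3600) * 1_000_000
```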

dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -1,6 +1,6 @@
 import asyncio
 from collections.abc import Iterable
-from datetime import timedelta
+from datetime import timedelta, timezone
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Minimum time before terminating active job in case of connectivity issues.
+# Should be sufficient to survive most problems caused by
+# the server network flickering and providers' glitches.
+JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
+
+
 async def process_running_jobs(batch_size: int = 1):
     tasks = []
     for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         user_ssh_key = run.run_spec.ssh_key_pub.strip()
         public_keys = [project.ssh_public_key.strip(), user_ssh_key]
         if job_provisioning_data.backend == BackendType.LOCAL:
-            # No need to update ~/.ssh/authorized_keys when running shim
+            # No need to update ~/.ssh/authorized_keys when running shim locally
             user_ssh_key = ""
         success = await common_utils.run_async(
             _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             run_model,
             job_model,
         )
-        if not success:
-            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
 
-        if
-
-
-
-
-
-
-
-
+        if success:
+            job_model.disconnected_at = None
+        else:
+            if job_model.termination_reason:
+                logger.warning(
+                    "%s: failed because shim/runner returned an error, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
+                job_model.status = JobStatus.TERMINATING
+                # job will be terminated and instance will be emptied by process_terminating_jobs
+            else:
+                # No job_model.termination_reason set means ssh connection failed
+                if job_model.disconnected_at is None:
+                    job_model.disconnected_at = common_utils.get_current_datetime()
+                if _should_terminate_job_due_to_disconnect(job_model):
+                    logger.warning(
+                        "%s: failed because instance is unreachable, age=%s",
+                        fmt(job_model),
+                        job_submission.age,
+                    )
+                    # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
+                    # when CLI <= 0.19.8 is no longer supported
+                    job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+                    job_model.status = JobStatus.TERMINATING
+                else:
+                    logger.warning(
+                        "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
+                        fmt(job_model),
+                        job_submission.age,
+                    )
 
     if (
         initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
-        # If task goes to terminated before the job is submitted to runner, then an error
+        # If task goes to terminated before the job is submitted to runner, then an error occurred
        if task.status == TaskStatus.TERMINATED:
            logger.warning(
                "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
     else:
         shim_status = shim_client.pull()  # raises error if shim is down, causes retry
 
-        # If shim goes to pending before the job is submitted to runner, then an error
+        # If shim goes to pending before the job is submitted to runner, then an error occurred
         if (
             shim_status.state == "pending"
             and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
         )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+        if (exit_status := latest_state_event.exit_status) is not None:
+            job_model.exit_status = exit_status
+            if exit_status != 0:
+                logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
     else:
         _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
     if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
     )
 
 
+def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
+    if job_model.disconnected_at is None:
+        return False
+    return (
+        common_utils.get_current_datetime()
+        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+    )
+
+
 async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
     policy = job.job_spec.utilization_policy
     if policy is None:
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
         return success_if_not_available
 
     runner_client.submit_job(
-
-
+        run=run,
+        job=job,
         cluster_info=cluster_info,
         secrets=secrets,
         repo_credentials=repo_credentials,
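
A minimal standalone sketch of the disconnect grace period introduced above, assuming the naive-UTC `disconnected_at` timestamps stored by `JobModel`; the helper name below is illustrative, not part of the package:

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

# Same constant as in the diff above.
JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)

def should_terminate_due_to_disconnect(disconnected_at: Optional[datetime]) -> bool:
    # None means the instance is reachable (or connectivity was restored).
    if disconnected_at is None:
        return False
    # disconnected_at is stored as a naive UTC datetime, so attach the UTC zone first.
    deadline = disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
    return datetime.now(timezone.utc) > deadline

# A disconnect first observed 3 minutes ago exceeds the 2-minute grace period.
assert should_terminate_due_to_disconnect(
    datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(minutes=3)
)
```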

dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py
ADDED

@@ -0,0 +1,100 @@
+"""Add JobModel.disconnected_at
+
+Revision ID: 20166748b60c
+Revises: 6c1a9d6530ee
+Create Date: 2025-05-13 16:24:32.496578
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "20166748b60c"
+down_revision = "6c1a9d6530ee"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
+            )
+        )
+
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "INSTANCE_UNREACHABLE",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("disconnected_at")
+
+    # ### end Alembic commands ###

dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py
ADDED

@@ -0,0 +1,26 @@
+"""Add JobModel.exit_status
+
+Revision ID: 6c1a9d6530ee
+Revises: 7ba3b59d7ca6
+Create Date: 2025-05-09 10:25:19.715852
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "6c1a9d6530ee"
+down_revision = "7ba3b59d7ca6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("exit_status")

dstack/_internal/server/models.py
CHANGED

@@ -382,6 +382,10 @@ class JobModel(BaseModel):
         Enum(JobTerminationReason)
     )
     termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
+    # `disconnected_at` stores the first time of connectivity issues with the instance.
+    # Resets every time connectivity is restored.
+    disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    exit_status: Mapped[Optional[int]] = mapped_column(Integer)
     job_spec_data: Mapped[str] = mapped_column(Text)
     job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
     runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
@@ -390,7 +394,7 @@ class JobModel(BaseModel):
     remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # `instance_assigned` means instance assignment was done.
-    # if `instance_assigned` is True and `instance` is None, no instance was
+    # if `instance_assigned` is True and `instance` is None, no instance was assigned.
     instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
     instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
         ForeignKey("instances.id", ondelete="CASCADE")

dstack/_internal/server/schemas/runner.py
CHANGED

@@ -7,7 +7,14 @@ from typing_extensions import Annotated
 
 from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
-from dstack._internal.core.models.runs import
+from dstack._internal.core.models.runs import (
+    ClusterInfo,
+    JobSpec,
+    JobStatus,
+    JobSubmission,
+    Run,
+    RunSpec,
+)
 from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
 
 
@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
     state: JobStatus
     termination_reason: Optional[str] = None
     termination_message: Optional[str] = None
+    exit_status: Optional[int] = None
 
 
 class LogEvent(CoreModel):
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):
 
 
 class SubmitBody(CoreModel):
-
-
+    run: Annotated[
+        Run,
         Field(
             include={
-                "
-                "
-
-
-
+                "id": True,
+                "run_spec": {
+                    "run_name",
+                    "repo_id",
+                    "repo_data",
+                    "configuration",
+                    "configuration_path",
+                },
             }
         ),
     ]
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
             }
         ),
     ]
+    job_submission: Annotated[
+        JobSubmission,
+        Field(
+            include={
+                "id",
+            }
+        ),
+    ]
     cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
     secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
     repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
+    # run_spec is deprecated in favor of run.run_spec
+    # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
+    run_spec: Annotated[
+        RunSpec,
+        Field(
+            include={
+                "run_name",
+                "repo_id",
+                "repo_data",
+                "configuration",
+                "configuration_path",
+            },
+        ),
+    ]
 
 
 class HealthcheckResponse(CoreModel):

dstack/_internal/server/services/jobs/__init__.py
CHANGED

@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         status=job_model.status,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
+        exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
     )

dstack/_internal/server/services/runner/client.py
CHANGED

@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
 from dstack._internal.core.models.resources import Memory
-from dstack._internal.core.models.runs import ClusterInfo,
+from dstack._internal.core.models.runs import ClusterInfo, Job, Run
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.schemas.runner import (
     GPUDevice,
@@ -72,8 +72,8 @@ class RunnerClient:
 
     def submit_job(
         self,
-
-
+        run: Run,
+        job: Job,
         cluster_info: ClusterInfo,
         secrets: Dict[str, str],
         repo_credentials: Optional[RemoteRepoCreds],
@@ -81,6 +81,7 @@ class RunnerClient:
     ):
         # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
         # variables to the runner without runner API modification.
+        job_spec = job.job_spec
         if instance_env is not None:
             if isinstance(instance_env, Env):
                 merged_env = instance_env.as_dict()
@@ -90,11 +91,13 @@ class RunnerClient:
             job_spec = job_spec.copy(deep=True)
             job_spec.env = merged_env
         body = SubmitBody(
-
+            run=run,
             job_spec=job_spec,
+            job_submission=job.job_submissions[-1],
             cluster_info=cluster_info,
             secrets=secrets,
             repo_credentials=repo_credentials,
+            run_spec=run.run_spec,
         )
         resp = requests.post(
             # use .json() to encode enums

dstack/_internal/server/services/runs.py
CHANGED

@@ -870,10 +870,10 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
     if (
         run_spec.merged_profile.utilization_policy is not None
         and run_spec.merged_profile.utilization_policy.time_window
-        > settings.
+        > settings.SERVER_METRICS_RUNNING_TTL_SECONDS
     ):
         raise ServerClientError(
-            f"Maximum utilization_policy.time_window is {settings.
+            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
     set_resources_defaults(run_spec.configuration.resources)
 

dstack/_internal/server/settings.py
CHANGED

@@ -1,4 +1,5 @@
 import os
+import warnings
 from pathlib import Path
 
 DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")
 
 SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")
 
-
+SERVER_METRICS_RUNNING_TTL_SECONDS: int
+_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
+if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
+    _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
+    if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
+        warnings.warn(
+            (
+                "DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
+                " use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
+            ),
+            DeprecationWarning,
+        )
+    else:
+        _SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
+SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
+del _SERVER_METRICS_RUNNING_TTL_SECONDS
+SERVER_METRICS_FINISHED_TTL_SECONDS = int(
+    os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
+)
 
 DEFAULT_PROJECT_NAME = "main"
 
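
A minimal sketch of how the retention knobs introduced above could be tuned when launching the server. The environment variable names and defaults come from the diff; the wrapper script itself is purely illustrative:

```python
import os
import subprocess

# Keep metrics for running jobs for 2 hours (default: 3600 seconds).
os.environ["DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS"] = "7200"
# Keep metrics for finished jobs for 3 days (default: 7 * 24 * 3600 seconds).
os.environ["DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS"] = str(3 * 24 * 3600)

# The settings module reads these variables at import time, so they must be
# set before the server process starts.
subprocess.run(["dstack", "server"])
```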

dstack/_internal/server/testing/common.py
CHANGED

@@ -302,6 +302,7 @@ async def create_job(
     job_num: int = 0,
     replica_num: int = 0,
     instance_assigned: bool = False,
+    disconnected_at: Optional[datetime] = None,
 ) -> JobModel:
     run_spec = RunSpec.parse_raw(run.run_spec)
     job_spec = (await get_job_specs_from_run_spec(run_spec, replica_num=replica_num))[0]
@@ -323,6 +324,7 @@ async def create_job(
         instance=instance,
         instance_assigned=instance_assigned,
         used_instance_id=instance.id if instance is not None else None,
+        disconnected_at=disconnected_at,
     )
     session.add(job)
     await session.commit()

dstack/api/server/_runs.py
CHANGED

@@ -115,6 +115,8 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             job_submissions_excludes["job_runtime_data"] = {
                 "offer": {"instance": {"resources": {"cpu_arch"}}}
             }
+            if all(js.exit_status is None for js in job_submissions):
+                job_submissions_excludes["exit_status"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
             latest_job_submission_excludes = {}
@@ -127,6 +129,8 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             latest_job_submission_excludes["job_runtime_data"] = {
                 "offer": {"instance": {"resources": {"cpu_arch"}}}
             }
+            if latest_job_submission.exit_status is None:
+                latest_job_submission_excludes["exit_status"] = True
     return {"plan": apply_plan_excludes}
 
 

dstack/version.py
CHANGED

{dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.8
+Version: 0.19.9
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -54,7 +54,7 @@ Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'all'
 Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'all'
 Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'all'
 Requires-Dist: backports-entry-points-selectable; extra == 'all'
-Requires-Dist: boto3; extra == 'all'
+Requires-Dist: boto3>=1.38.13; extra == 'all'
 Requires-Dist: botocore; extra == 'all'
 Requires-Dist: datacrunch; extra == 'all'
 Requires-Dist: docker>=6.0.0; extra == 'all'
@@ -90,7 +90,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'aws'
 Requires-Dist: apscheduler<4; extra == 'aws'
 Requires-Dist: asyncpg; extra == 'aws'
 Requires-Dist: backports-entry-points-selectable; extra == 'aws'
-Requires-Dist: boto3; extra == 'aws'
+Requires-Dist: boto3>=1.38.13; extra == 'aws'
 Requires-Dist: botocore; extra == 'aws'
 Requires-Dist: docker>=6.0.0; extra == 'aws'
 Requires-Dist: fastapi; extra == 'aws'
@@ -231,7 +231,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'lambda'
 Requires-Dist: apscheduler<4; extra == 'lambda'
 Requires-Dist: asyncpg; extra == 'lambda'
 Requires-Dist: backports-entry-points-selectable; extra == 'lambda'
-Requires-Dist: boto3; extra == 'lambda'
+Requires-Dist: boto3>=1.38.13; extra == 'lambda'
 Requires-Dist: botocore; extra == 'lambda'
 Requires-Dist: docker>=6.0.0; extra == 'lambda'
 Requires-Dist: fastapi; extra == 'lambda'
@@ -338,24 +338,27 @@ orchestration for ML teams across top clouds and on-prem clusters.
 
 #### Accelerators
 
-`dstack` supports `NVIDIA`, `AMD`, `Google TPU`,
+`dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
 
-##
+## Latest news ✨
 
-- [2025/
-- [2025/
-- [2025/
-- [
-- [
-- [2024/10] [dstack 0.18.18: Hardware metrics monitoring](https://github.com/dstackai/dstack/releases/tag/0.18.18)
+- [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
+- [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
+- [2025/04] [dstack 0.19.5: GCP A3 High clusters](https://github.com/dstackai/dstack/releases/tag/0.19.5)
+- [2025/04] [dstack 0.19.3: GCP A3 Mega clusters](https://github.com/dstackai/dstack/releases/tag/0.19.3)
+- [2025/03] [dstack 0.19.0: Prometheus](https://github.com/dstackai/dstack/releases/tag/0.19.0)
 
-##
+## How does it work?
+
+<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v8.svg" width="750" />
+
+### Installation
 
 > Before using `dstack` through CLI or API, set up a `dstack` server. If you already have a running `dstack` server, you only need to [set up the CLI](#set-up-the-cli).
 
-
+#### Set up the server
 
-
+##### (Optional) Configure backends
 
 To use `dstack` with cloud providers, configure backends
 via the `~/.dstack/server/config.yml` file.
@@ -365,21 +368,21 @@ For more details on how to configure backends, check [Backends](https://dstack.a
 > For using `dstack` with on-prem servers, create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh)
 > once the server is up.
 
-
+##### Start the server
 
 You can install the server on Linux, macOS, and Windows (via WSL 2). It requires Git and
 OpenSSH.
 
-#####
+##### uv
 
 ```shell
-$
+$ uv tool install "dstack[all]" -U
 ```
 
-#####
+##### pip
 
 ```shell
-$
+$ pip install "dstack[all]" -U
 ```
 
 Once it's installed, go ahead and start the server.
@@ -392,25 +395,28 @@ The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da"
 The server is running at http://127.0.0.1:3000/
 ```
 
-For more details on server configuration options, see the
+> For more details on server configuration options, see the
 [Server deployment](https://dstack.ai/docs/guides/server-deployment) guide.
 
-
+
+<details><summary>Set up the CLI</summary>
+
+#### Set up the CLI
 
 Once the server is up, you can access it via the `dstack` CLI.
 
 The CLI can be installed on Linux, macOS, and Windows. It requires Git and OpenSSH.
 
-#####
+##### uv
 
 ```shell
-$
+$ uv tool install dstack -U
 ```
 
-#####
+##### pip
 
 ```shell
-$
+$ pip install dstack -U
 ```
 
 To point the CLI to the `dstack` server, configure it
@@ -425,9 +431,9 @@ $ dstack config \
 Configuration is updated at ~/.dstack/config.yml
 ```
 
-
+</details>
 
-###
+### Define configurations
 
 `dstack` supports the following configurations:
 
@@ -440,7 +446,7 @@ Configuration is updated at ~/.dstack/config.yml
 
 Configuration can be defined as YAML files within your repo.
 
-###
+### Apply configurations
 
 Apply the configuration either via the `dstack apply` CLI command or through a programmatic API.
 
@@ -452,6 +458,7 @@ out-of-capacity errors, port-forwarding, and more — across clouds and on-p
 For additional information, see the following links:
 
 * [Docs](https://dstack.ai/docs)
+* [Examples](https://dstack.ai/examples)
 * [Discord](https://discord.gg/u8SmfwPpMd)
 
 ## Contributing

{dstack-0.19.8.dist-info → dstack-0.19.9.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/version.py,sha256=
+dstack/version.py,sha256=lJgn5pYWtoxN-mAxtEnIXX2nGXSQrLvGxKsa1F9aNgM,64
 dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
 dstack/_internal/settings.py,sha256=otvcNT0X5UnGZdoNIWNFZBohQRzLme9Zc6oiBzc1BEk,796
@@ -31,14 +31,14 @@ dstack/_internal/cli/services/configurators/__init__.py,sha256=z94VPBFqybP8Zpwy3
 dstack/_internal/cli/services/configurators/base.py,sha256=bGfde2zoma28lLE8MUACO4-NKT1CdJJQJoXrzjpz0mQ,3360
 dstack/_internal/cli/services/configurators/fleet.py,sha256=jm4tNH6QQVplLdboCTlvRYUee3nZ0UYb_qLTrvtYVYM,14049
 dstack/_internal/cli/services/configurators/gateway.py,sha256=czB2s89s7IowOmWnpDwWErPAUlW3FvFMizImhrkQiBM,8927
-dstack/_internal/cli/services/configurators/run.py,sha256=
+dstack/_internal/cli/services/configurators/run.py,sha256=nXNjFrM5YT6RFqPXJQa4MOiEsG6IFiANyGKP-PXILdc,25518
 dstack/_internal/cli/services/configurators/volume.py,sha256=riMXLQbgvHIIFwLKdHfad-_0iE9wE3G_rUmXU5P3ZS8,8519
 dstack/_internal/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/cli/utils/common.py,sha256=rfmzqrsgR3rXW3wj0vxDdvrhUUg2aIy4A6E9MZbd55g,1763
 dstack/_internal/cli/utils/fleet.py,sha256=ch-LN1X9boSm-rFLW4mAJRmz0XliLhH0LvKD2DqSt2g,3942
 dstack/_internal/cli/utils/gateway.py,sha256=qMYa1NTAT_O98x2_mSyWDRbiHj5fqt6xUXFh9NIUwAM,1502
 dstack/_internal/cli/utils/rich.py,sha256=Gx1MJU929kMKsbdo9qF7XHARNta2426Ssb-xMLVhwbQ,5710
-dstack/_internal/cli/utils/run.py,sha256
+dstack/_internal/cli/utils/run.py,sha256=nCQwAU3VDS8ec2oWNjRKi5xIGdwwKI_YNr8vgGyDPzQ,9202
 dstack/_internal/cli/utils/updates.py,sha256=sAPYYptkFzQnGaRjv7FV7HOj-Be3IXGe63xj-sVEpv4,2566
 dstack/_internal/cli/utils/volume.py,sha256=mU9I06dVMFbpjfkefxrZNoSWadKLoib3U14rHudNQN4,1975
 dstack/_internal/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ dstack/_internal/core/backends/base/offers.py,sha256=AzAAx5eSTaHv8CbWuGERTHS151x
 dstack/_internal/core/backends/cudo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/cudo/api_client.py,sha256=ygq1Gx7ZvwKaifdXtvzDSw4xR4ZH6UWd5J47BjuaGh0,3685
 dstack/_internal/core/backends/cudo/backend.py,sha256=i13YoAkUfIStc3Yyyt_3YmL30eVrKtrhwnE9_B1iBRI,546
-dstack/_internal/core/backends/cudo/compute.py,sha256=
+dstack/_internal/core/backends/cudo/compute.py,sha256=wGMdH4Me-IHuQ-U1_XiuOqtHT86AgHyofUi449eqijo,6466
 dstack/_internal/core/backends/cudo/configurator.py,sha256=pkAT1MtL6_yYvYoqCglvPE-DiUdL8-XEviyN1yUSYyw,2056
 dstack/_internal/core/backends/cudo/models.py,sha256=6sfEqY2hvTpIACkyT4mhD3D8K5TsW_pupys9nqtrgoI,1055
 dstack/_internal/core/backends/datacrunch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -109,8 +109,8 @@ dstack/_internal/core/backends/nebius/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 dstack/_internal/core/backends/nebius/backend.py,sha256=2XqZIbSR8VzlfOnuVklXlDxNmwAkQj7txQN8VXF1j2E,566
 dstack/_internal/core/backends/nebius/compute.py,sha256=OUNvDk9rU13NR0CADFqn7nQL8kxgSvL7kbNEV4cLzyQ,14761
 dstack/_internal/core/backends/nebius/configurator.py,sha256=ML2KCD6Ddxc2f6X1juxqKulUcOjF6uJk20_0Teyi65A,3072
-dstack/_internal/core/backends/nebius/fabrics.py,sha256=
-dstack/_internal/core/backends/nebius/models.py,sha256
+dstack/_internal/core/backends/nebius/fabrics.py,sha256=jC7ngUO54rXbyXI4hkl5_9GdBk7h4Ivyh88CH4S37ds,1546
+dstack/_internal/core/backends/nebius/models.py,sha256=UudYX32p-ZY-GWR83VEtY5dpZBaWhKXQIfn2nrBCq-4,4245
 dstack/_internal/core/backends/nebius/resources.py,sha256=hx_VqiaurGO0MYT2KEvMl9EYdcglBRQsWSY5kHKjR00,12163
 dstack/_internal/core/backends/oci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/oci/auth.py,sha256=8Cr18y_LOsyRP-16yfFpT70Cofpm0clB3KawS_7aRl4,717
@@ -165,8 +165,8 @@ dstack/_internal/core/models/metrics.py,sha256=Xb8hCXUL-ncQ3PMsErIUAJTe9gwh5jyrQ
 dstack/_internal/core/models/placement.py,sha256=WJVq5ENJykyRarQzL2EeYQag_9_jV7VSAtR_xoFvPVM,720
 dstack/_internal/core/models/profiles.py,sha256=seeysTuMv1vVUmpHAZgrMUGcbMtH7hSMFIvfx0Qk__0,10406
 dstack/_internal/core/models/projects.py,sha256=H5ZZRiyUEKifpTFAhl45KBi5ly7ooE0WmI329myK360,643
-dstack/_internal/core/models/resources.py,sha256=
-dstack/_internal/core/models/runs.py,sha256=
+dstack/_internal/core/models/resources.py,sha256=rsf6hAhi5bfSb_Z9VcS3UoEG0G8Ohl6ekyrOStLOAqw,14114
+dstack/_internal/core/models/runs.py,sha256=Xkv1kY68JA0eJUeCVJjM9YWRkfy6P1RGXslBIMtox2E,18985
 dstack/_internal/core/models/secrets.py,sha256=IQyemsNpSzqOCB-VlVTuc4gyPFmXXO4mhko0Ur0ey3I,221
 dstack/_internal/core/models/server.py,sha256=Hkc1v2s3KOiwslsWVmhUOAzcSeREoG-HD1SzSX9WUGg,152
 dstack/_internal/core/models/services.py,sha256=2Hpi7j0Q1shaf_0wd0C0044AJAmuYi-D3qx3PH849oI,3076
@@ -250,18 +250,18 @@ dstack/_internal/server/app.py,sha256=K2NojwUKdktdenrR61I21kXIMX6ars6zB9v6Ea-evz
 dstack/_internal/server/db.py,sha256=WjuqmjG3QAZmSMCeUaJ_ynbowlHuNAvYCZO649cTPHc,3210
 dstack/_internal/server/deps.py,sha256=31e8SU_ogPJWHIDLkgl7cuC_5V91xbJoLyAj17VanfM,670
 dstack/_internal/server/main.py,sha256=kztKhCYNoHSDyJJQScWfZXE0naNleJOCQULW6dd8SGw,109
-dstack/_internal/server/models.py,sha256=
-dstack/_internal/server/settings.py,sha256=
+dstack/_internal/server/models.py,sha256=GWl78Zl-_w1UyW9nB6DDS95Ko_osbLQtLb2DIi1JDLo,29633
+dstack/_internal/server/settings.py,sha256=XkLexvylNbU3iRM0KHnTX0fywLGczBHya5lVmeptbqY,4123
 dstack/_internal/server/background/__init__.py,sha256=8kTbhEHCeXTibsOlHY1HwqIO6gGb4q8fUa2fcDrah1c,3893
 dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/background/tasks/common.py,sha256=N7xSXbf2MoBWgbJ1e3AEzYBTf1Gn-pDXYND8Zr_YCJQ,970
 dstack/_internal/server/background/tasks/process_fleets.py,sha256=LsD3I7iCbQs_nexJ1kfnn1VPz3BffFmALRgpk5DHGO4,2985
 dstack/_internal/server/background/tasks/process_gateways.py,sha256=hoUI1CSqbHt_uMwnzTRAEDl-LBw0wUk_W4xobIbdvRc,7017
 dstack/_internal/server/background/tasks/process_instances.py,sha256=E1NLac4ubiF6jUr9M7rj8cjQf4zFZCBVr428UBgFeGI,42855
-dstack/_internal/server/background/tasks/process_metrics.py,sha256=
+dstack/_internal/server/background/tasks/process_metrics.py,sha256=IDGyVQtGLua_NoY8sLv0RH18iV_3-LUONj6cEI181QM,6136
 dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=gJ8Um3Vx-brazHVWdtVXPnov4rwvDr-0Vn1Voq1cYBQ,4108
 dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=9VoGFqdiXcVkCi_NV1VqQdqllDkB0bHIqOHKMIZK1Fg,5183
-dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=
+dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=jyuYOQfXIVGu5ugTennwSGXL_c6VDNJN96-Qrz7Hgic,36441
 dstack/_internal/server/background/tasks/process_runs.py,sha256=EI1W6HUyB-og3g8BDP_GsBrJjQ-Z3JvZHTuJf7CRKRM,17974
 dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=-XOApBgmn9ZyCoeXgnbp6cnsFT3uxE_-xqLtn1ez5dc,26603
 dstack/_internal/server/background/tasks/process_terminating_jobs.py,sha256=0Z3Q409RwSxOL_pgK8JktBthjtESEUH3ahwTLsTdYPk,3800
@@ -277,6 +277,7 @@ dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_fl
 dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py,sha256=sqYOR7ZoUifmRrZz2DBaO-D_Pgu20nup15yihg1FBcM,1417
 dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py,sha256=x4FdfUD4XI7nxcppnw5juFKksusYMA4EXNxs0UEETFE,649
 dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py,sha256=4-H_mGGSD6tI7H0HQ-pBs5wixMKdDro6KtLdH_QId28,831
+dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py,sha256=Vj7VPo67pAy6gHztSrbp9HmzAz0ckROeV6HxDfvRwOw,3085
 dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py,sha256=O5ZrwAXs1ubPrChyItCbuEeAPrlRF_ys1nVw4knAO5g,845
 dstack/_internal/server/migrations/versions/252d3743b641_.py,sha256=z3mMF3YCEg6ueoj746cDNBNlQSimmBOcVLNupOv2UuU,1246
 dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py,sha256=yKEQ1OdPZSaO8YxRd986mJjNc9CjeO6SqY5SmL6aGfE,5433
@@ -296,6 +297,7 @@ dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.
 dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py,sha256=PKqFqe6x6bkO8hrv73fes4uyBpzsemqwp3c-i5XzaS8,1195
 dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py,sha256=UyNT3d8Osa2MpuODQRdmmbHlt-qT-wHNNOUTzUEcLLI,2512
 dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py,sha256=0FlRf5Mis9ZIc7X6M9yqPtyFqR2SKCTUFfZeDbqAXBU,809
+dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py,sha256=aYZIGWhM38hZ6SkrUsFESOmOIVhSAIbA9dbyFKsLUr8,615
 dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py,sha256=nBURp4A6TpT13H1ccH4WyzkU2GGy3uDGKCNG88cAciw,1827
 dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py,sha256=o1A8nzrmMFcivBzaIMemCtRfCZ9mq1IuBko1CJXoVOo,2124
 dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py,sha256=FUvCRzOzkp8HjRYy0-kuOwvBGbwuCgyjXU7hD-AWRJs,1045
@@ -358,7 +360,7 @@ dstack/_internal/server/schemas/instances.py,sha256=MedAVbKa_1F9zhdSPrjMmL-Og5Dp
 dstack/_internal/server/schemas/logs.py,sha256=JGt39fBEFRjHhlGT1jIC6kwQhujxPO8uecjplzjTZXY,402
 dstack/_internal/server/schemas/projects.py,sha256=UmHtX0pkr3L_vPsosvRC9JneqBrYaJvVKd4OxhYySHc,566
 dstack/_internal/server/schemas/repos.py,sha256=Sit0Tqol79VOMGGp8ncZXLLsZ4INcF-pAA_jwRchYqA,666
-dstack/_internal/server/schemas/runner.py,sha256=
+dstack/_internal/server/schemas/runner.py,sha256=CqlP90yvPKaavAqxLOxewIMq3cHeT_NWfR0YO81JfU0,5151
 dstack/_internal/server/schemas/runs.py,sha256=XhlTnn67g4NWVmIJFQdy2yPK_EcnSIYOCCSex0XOIes,1341
 dstack/_internal/server/schemas/secrets.py,sha256=mfqLSM7PqxVQ-GIWB6RfPRUOvSvvaRv-JxXAYxZ6dyY,373
 dstack/_internal/server/schemas/users.py,sha256=FuDqwRVe3mOmv497vOZKjI0a_d4Wt2g4ZiCJcyfHEKA,495
@@ -381,7 +383,7 @@ dstack/_internal/server/services/projects.py,sha256=Je1iWZ-ArmyFxK1yMUzod5WRXyiI
 dstack/_internal/server/services/prometheus.py,sha256=xq5G-Q2BJup9lS2F6__0wUVTs-k1Gr3dYclGzo2WoWo,12474
 dstack/_internal/server/services/repos.py,sha256=f9ztN7jz_2gvD9hXF5sJwWDVyG2-NHRfjIdSukowPh8,9342
 dstack/_internal/server/services/resources.py,sha256=VRFOih_cMJdc0c2m9nSGsX8vWAJQV3M6N87aqS_JXfw,699
-dstack/_internal/server/services/runs.py,sha256=
+dstack/_internal/server/services/runs.py,sha256=K3rqzfqkUY9dhTO-2W9_PGH8gCdYhCJK-S-gbM5drx4,38559
 dstack/_internal/server/services/storage.py,sha256=6I0xI_3_RpJNbKZwHjDnjrEwXGdHfiaeb5li15T-M1I,1884
 dstack/_internal/server/services/users.py,sha256=W-5xL7zsHNjeG7BBK54RWGvIrBOrw-FF0NcG_z9qhoE,7466
 dstack/_internal/server/services/volumes.py,sha256=vfKY6eZp64I58Mfdvrk9Wig7deveD2Rw4ET1cbc1Sog,16238
@@ -396,7 +398,7 @@ dstack/_internal/server/services/gateways/__init__.py,sha256=Up8uFsEQDBE0yOXn7n5
 dstack/_internal/server/services/gateways/client.py,sha256=XIJX3fGBbZ_AG8qZMTSE8KAB_ojq5YJFa0OXoD_dofg,7493
 dstack/_internal/server/services/gateways/connection.py,sha256=ot3lV85XdmCT45vBWeyj57nLPcLPNm316zu3jMyeWjA,5625
 dstack/_internal/server/services/gateways/pool.py,sha256=0LclTl1tyx-doS78LeaAKjr-SMp98zuwh5f9s06JSd0,1914
-dstack/_internal/server/services/jobs/__init__.py,sha256=
+dstack/_internal/server/services/jobs/__init__.py,sha256=GU3vMC0SZKyvL564A7t_QRoDjf83-8GsUkguDWK5x6c,25578
 dstack/_internal/server/services/jobs/configurators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/services/jobs/configurators/base.py,sha256=QSIU1OoZ794HKdwjo5iXxGUvFE8a2-g_SoYjAQjXhcI,11354
 dstack/_internal/server/services/jobs/configurators/dev.py,sha256=ufN6Sd8TwIsjQYNZE32fkAqJI7o2zjgoZThbrP-bd7U,2378
@@ -420,7 +422,7 @@ dstack/_internal/server/services/proxy/routers/service_proxy.py,sha256=5oB-SX8f_
 dstack/_internal/server/services/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/services/proxy/services/service_proxy.py,sha256=4JrSxHqhBYqU1oENii89Db-bzkFWExYrOy-0mNEhWBs,4879
 dstack/_internal/server/services/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/_internal/server/services/runner/client.py,sha256=
+dstack/_internal/server/services/runner/client.py,sha256=jQDxv_Yaiwr2e3m1LqPCFtToB_GrsC2yVQfgXzyn6g8,15586
 dstack/_internal/server/services/runner/ssh.py,sha256=H-X0015ZPwYq5tc31ytFF1uNaUAr9itAsABI2oPJWrk,5017
 dstack/_internal/server/services/services/__init__.py,sha256=HQz72SNN8W9gUQ5INyO_Wd8TR9j3V6qoHFGEDEI920w,10862
 dstack/_internal/server/services/services/autoscalers.py,sha256=0o_w9La-ex_P3VKG88w_XN3hkLkzryv5l1cH3pkZyAE,4315
@@ -531,7 +533,7 @@ dstack/_internal/server/statics/static/media/logo.f602feeb138844eda97c8cb6414614
 dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg,sha256=KqFI05gQM135zC1plF1WBRF2F7CyKL7km97WKsZjAHI,319
 dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg,sha256=ZxFFBVZWuRLqmWH4zhwGLNtKjOzHj-5MGJRunFAtu1I,561
 dstack/_internal/server/testing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/_internal/server/testing/common.py,sha256=
+dstack/_internal/server/testing/common.py,sha256=uzmF9_xsiHkb8l8adljuYSpAMEH4hmRZAbv6-96rN58,31480
 dstack/_internal/server/testing/conf.py,sha256=-zhujfFjTHNfQDOK-hBck32By11c_kC0OeinB3esQGg,1902
 dstack/_internal/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/utils/common.py,sha256=PbjXtqYy1taKXpyG5ys8cIrz9MXqc9dBAsR_9D1brrk,1414
@@ -570,7 +572,7 @@ dstack/api/server/_logs.py,sha256=ng8QvFAIaoVOVChTK6Wuu5BeM6y7gAdx30KEYRsn9xA,50
 dstack/api/server/_metrics.py,sha256=OPb8sLhI_U605sHOPrELgy0_6cNFLJVfpvr-qkEukRM,670
 dstack/api/server/_projects.py,sha256=g6kNSU6jer8u7Kaut1I0Ft4wRMLBBCQShJf3fOB63hQ,1440
 dstack/api/server/_repos.py,sha256=bqsKuZWyiNLE8UAdSZrYNtk1J3Gu5MXXnTMIoM9jxpI,1770
-dstack/api/server/_runs.py,sha256=
+dstack/api/server/_runs.py,sha256=uVTY57BlBvB86pkKNjUh-Nc5AYNmfH9kfBNbgzTnTyw,7914
 dstack/api/server/_secrets.py,sha256=VqLfrIcmBJtPxNDRkXTG44H5SWoY788YJapScUukvdY,1576
 dstack/api/server/_users.py,sha256=XzhgGKc5Tsr0-xkz3T6rGyWZ1tO7aYNhLux2eE7dAoY,1738
 dstack/api/server/_volumes.py,sha256=xxOt8o5G-bhMh6wSvF4BDFNoqVEhlM4BXQr2KvX0pN0,1937
@@ -580,8 +582,8 @@ dstack/plugins/__init__.py,sha256=buT1pcyORLgVbl89ATkRWJPhvejriVz7sNBjvuZRCRE,40
 dstack/plugins/_base.py,sha256=-etiB-EozaJCg2wtmONfj8ic-K03qXvXyl_TIDp-kNE,2662
 dstack/plugins/_models.py,sha256=1Gw--mDQ1_0FFr9Zur9LE8UbMoWESUpTdHHt12AyIZo,341
 dstack/plugins/_utils.py,sha256=FqeWYb7zOrgZkO9Bd8caL5I81_TUEsysIzvxsULrmzk,392
-dstack-0.19.
-dstack-0.19.
-dstack-0.19.
-dstack-0.19.
-dstack-0.19.
+dstack-0.19.9.dist-info/METADATA,sha256=0gv_xHbluxlydceXCwjWo2m-CjyWGjNiR4gNpBKOpE0,20254
+dstack-0.19.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dstack-0.19.9.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
+dstack-0.19.9.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
+dstack-0.19.9.dist-info/RECORD,,