dstack 0.19.8__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

@@ -98,6 +98,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
98
98
  print_run_plan(run_plan, max_offers=configurator_args.max_offers)
99
99
 
100
100
  confirm_message = "Submit a new run?"
101
+ if conf.name:
102
+ confirm_message = f"Submit the run [code]{conf.name}[/]?"
101
103
  stop_run_name = None
102
104
  if run_plan.current_resource is not None:
103
105
  changed_fields = []
@@ -130,11 +132,6 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
130
132
  f"Active run [code]{conf.name}[/] already exists and cannot be updated in-place."
131
133
  )
132
134
  confirm_message = "Stop and override the run?"
133
- else:
134
- console.print(f"Finished run [code]{conf.name}[/] already exists.")
135
- confirm_message = "Override the run?"
136
- elif conf.name:
137
- confirm_message = f"Submit the run [code]{conf.name}[/]?"
138
135
 
139
136
  if not command_args.yes and not confirm_ask(confirm_message):
140
137
  console.print("\nExiting...")
@@ -560,7 +557,9 @@ def print_finished_message(run: Run):
560
557
  console.print("[code]Done[/]")
561
558
  return
562
559
 
563
- termination_reason, termination_reason_message = _get_run_termination_reason(run)
560
+ termination_reason, termination_reason_message, exit_status = (
561
+ _get_run_termination_reason_and_exit_status(run)
562
+ )
564
563
  message = "Run failed due to unknown reason. Check CLI, server, and run logs."
565
564
  if run.status == RunStatus.TERMINATED:
566
565
  message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
@@ -572,13 +571,15 @@ def print_finished_message(run: Run):
572
571
  "Check CLI and server logs for more details."
573
572
  )
574
573
  elif termination_reason is not None:
574
+ exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
575
575
  error_details = (
576
576
  f"Error: {termination_reason_message}\n" if termination_reason_message else ""
577
577
  )
578
578
  message = (
579
579
  f"Run failed with error code {termination_reason.name}.\n"
580
+ f"{exit_status_details}"
580
581
  f"{error_details}"
581
- "Check CLI, server, and run logs for more details."
582
+ f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
582
583
  )
583
584
  console.print(f"[error]{message}[/]")
584
585
 
@@ -589,14 +590,20 @@ def get_run_exit_code(run: Run) -> int:
589
590
  return 1
590
591
 
591
592
 
592
- def _get_run_termination_reason(run: Run) -> Tuple[Optional[JobTerminationReason], Optional[str]]:
593
+ def _get_run_termination_reason_and_exit_status(
594
+ run: Run,
595
+ ) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
593
596
  if len(run._run.jobs) == 0:
594
- return None, None
597
+ return None, None, None
595
598
  job = run._run.jobs[0]
596
599
  if len(job.job_submissions) == 0:
597
- return None, None
600
+ return None, None, None
598
601
  job_submission = job.job_submissions[0]
599
- return job_submission.termination_reason, job_submission.termination_reason_message
602
+ return (
603
+ job_submission.termination_reason,
604
+ job_submission.termination_reason_message,
605
+ job_submission.exit_status,
606
+ )
600
607
 
601
608
 
602
609
  def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:
@@ -218,6 +218,11 @@ def _get_run_error(run: Run) -> str:
218
218
 
219
219
 
220
220
  def _get_job_error(job: Job) -> str:
221
- if job.job_submissions[-1].termination_reason is None:
221
+ job_submission = job.job_submissions[-1]
222
+ termination_reason = job_submission.termination_reason
223
+ exit_status = job_submission.exit_status
224
+ if termination_reason is None:
222
225
  return ""
223
- return job.job_submissions[-1].termination_reason.name
226
+ if exit_status:
227
+ return f"{termination_reason.name} {exit_status}"
228
+ return termination_reason.name
@@ -147,7 +147,7 @@ class CudoCompute(
147
147
 
148
148
 
149
149
  def _get_image_id(cuda: bool) -> str:
150
- image_name = "ubuntu-2204-nvidia-535-docker-v20240214" if cuda else "ubuntu-2204"
150
+ image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
151
151
  return image_name
152
152
 
153
153
 
@@ -20,6 +20,7 @@ INFINIBAND_FABRICS = [
20
20
  InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
21
21
  InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
22
22
  InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
23
+ InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
23
24
  ]
24
25
 
25
26
 
@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
5
5
  from dstack._internal.core.backends.base.models import fill_data
6
6
  from dstack._internal.core.models.common import CoreModel
7
7
 
8
- DEFAULT_PROJECT_NAME_PREFIX = "default-project"
8
+ DEFAULT_PROJECT_NAME_PREFIX = "default"
9
9
 
10
10
 
11
11
  class NebiusServiceAccountCreds(CoreModel):
@@ -126,7 +126,7 @@ class ComputeCapability(Tuple[int, int]):
126
126
 
127
127
  DEFAULT_CPU_COUNT = Range[int](min=2)
128
128
  DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
129
- DEFAULT_GPU_COUNT = Range[int](min=1, max=1)
129
+ DEFAULT_GPU_COUNT = Range[int](min=1)
130
130
 
131
131
 
132
132
  class CPUSpec(CoreModel):
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
104
104
  # Set by the server
105
105
  FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
106
106
  INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
107
+ INSTANCE_UNREACHABLE = "instance_unreachable"
107
108
  WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
108
109
  WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
109
110
  TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
126
127
  mapping = {
127
128
  self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
128
129
  self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
130
+ self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
129
131
  self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
130
132
  self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
131
133
  self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
262
264
  # or not applicable (container-based backends)
263
265
  ports: Optional[dict[int, int]] = None
264
266
  # List of volumes used by the job
265
- volume_names: Optional[list[str]] = None # None for backward compalibility
267
+ volume_names: Optional[list[str]] = None # None for backward compatibility
266
268
  # Virtual shared offer
267
- offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
269
+ offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility
268
270
 
269
271
 
270
272
  class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
283
285
  status: JobStatus
284
286
  termination_reason: Optional[JobTerminationReason]
285
287
  termination_reason_message: Optional[str]
288
+ exit_status: Optional[int]
286
289
  job_provisioning_data: Optional[JobProvisioningData]
287
290
  job_runtime_data: Optional[JobRuntimeData]
288
291
 
@@ -508,7 +511,9 @@ def _get_run_error(
508
511
  return ""
509
512
  if len(run_jobs) > 1:
510
513
  return run_termination_reason.name
511
- run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
514
+ run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
515
+ run_jobs
516
+ )
512
517
  # For failed runs, also show termination reason to provide more context.
513
518
  # For other run statuses, the job termination reason will duplicate run status.
514
519
  if run_job_termination_reason is not None and run_termination_reason in [
@@ -516,13 +521,20 @@ def _get_run_error(
516
521
  RunTerminationReason.SERVER_ERROR,
517
522
  RunTerminationReason.RETRY_LIMIT_EXCEEDED,
518
523
  ]:
524
+ if exit_status:
525
+ return (
526
+ f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
527
+ )
519
528
  return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
520
529
  return run_termination_reason.name
521
530
 
522
531
 
523
- def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
532
+ def _get_run_job_termination_reason_and_exit_status(
533
+ run_jobs: List[Job],
534
+ ) -> tuple[Optional[JobTerminationReason], Optional[int]]:
524
535
  for job in run_jobs:
525
536
  if len(job.job_submissions) > 0:
526
- if job.job_submissions[-1].termination_reason is not None:
527
- return job.job_submissions[-1].termination_reason
528
- return None
537
+ job_submission = job.job_submissions[-1]
538
+ if job_submission.termination_reason is not None:
539
+ return job_submission.termination_reason, job_submission.exit_status
540
+ return None, None
@@ -42,10 +42,33 @@ async def collect_metrics():
42
42
 
43
43
 
44
44
  async def delete_metrics():
45
- cutoff = _get_delete_metrics_cutoff()
45
+ now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
46
+ running_timestamp_micro_cutoff = (
47
+ now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
48
+ )
49
+ finished_timestamp_micro_cutoff = (
50
+ now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
51
+ )
46
52
  async with get_session_ctx() as session:
47
- await session.execute(
48
- delete(JobMetricsPoint).where(JobMetricsPoint.timestamp_micro < cutoff)
53
+ await asyncio.gather(
54
+ session.execute(
55
+ delete(JobMetricsPoint).where(
56
+ JobMetricsPoint.job_id.in_(
57
+ select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
58
+ ),
59
+ JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
60
+ )
61
+ ),
62
+ session.execute(
63
+ delete(JobMetricsPoint).where(
64
+ JobMetricsPoint.job_id.in_(
65
+ select(JobModel.id).where(
66
+ JobModel.status.in_(JobStatus.finished_statuses())
67
+ )
68
+ ),
69
+ JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
70
+ )
71
+ ),
49
72
  )
50
73
  await session.commit()
51
74
 
@@ -134,9 +157,3 @@ def _pull_runner_metrics(
134
157
  ) -> Optional[MetricsResponse]:
135
158
  runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
136
159
  return runner_client.get_metrics()
137
-
138
-
139
- def _get_delete_metrics_cutoff() -> int:
140
- now = int(get_current_datetime().timestamp() * 1_000_000)
141
- cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
142
- return cutoff
@@ -1,6 +1,6 @@
1
1
  import asyncio
2
2
  from collections.abc import Iterable
3
- from datetime import timedelta
3
+ from datetime import timedelta, timezone
4
4
  from typing import Dict, List, Optional
5
5
 
6
6
  from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
71
71
  logger = get_logger(__name__)
72
72
 
73
73
 
74
+ # Minimum time before terminating active job in case of connectivity issues.
75
+ # Should be sufficient to survive most problems caused by
76
+ # the server network flickering and providers' glitches.
77
+ JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
78
+
79
+
74
80
  async def process_running_jobs(batch_size: int = 1):
75
81
  tasks = []
76
82
  for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
202
208
  user_ssh_key = run.run_spec.ssh_key_pub.strip()
203
209
  public_keys = [project.ssh_public_key.strip(), user_ssh_key]
204
210
  if job_provisioning_data.backend == BackendType.LOCAL:
205
- # No need to update ~/.ssh/authorized_keys when running shim localy
211
+ # No need to update ~/.ssh/authorized_keys when running shim locally
206
212
  user_ssh_key = ""
207
213
  success = await common_utils.run_async(
208
214
  _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
299
305
  run_model,
300
306
  job_model,
301
307
  )
302
- if not success:
303
- job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
304
308
 
305
- if not success: # kill the job
306
- logger.warning(
307
- "%s: failed because runner is not available or return an error, age=%s",
308
- fmt(job_model),
309
- job_submission.age,
310
- )
311
- job_model.status = JobStatus.TERMINATING
312
- if not job_model.termination_reason:
313
- job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
314
- # job will be terminated and instance will be emptied by process_terminating_jobs
309
+ if success:
310
+ job_model.disconnected_at = None
311
+ else:
312
+ if job_model.termination_reason:
313
+ logger.warning(
314
+ "%s: failed because shim/runner returned an error, age=%s",
315
+ fmt(job_model),
316
+ job_submission.age,
317
+ )
318
+ job_model.status = JobStatus.TERMINATING
319
+ # job will be terminated and instance will be emptied by process_terminating_jobs
320
+ else:
321
+ # No job_model.termination_reason set means ssh connection failed
322
+ if job_model.disconnected_at is None:
323
+ job_model.disconnected_at = common_utils.get_current_datetime()
324
+ if _should_terminate_job_due_to_disconnect(job_model):
325
+ logger.warning(
326
+ "%s: failed because instance is unreachable, age=%s",
327
+ fmt(job_model),
328
+ job_submission.age,
329
+ )
330
+ # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
331
+ # when CLI <= 0.19.8 is no longer supported
332
+ job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
333
+ job_model.status = JobStatus.TERMINATING
334
+ else:
335
+ logger.warning(
336
+ "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
337
+ fmt(job_model),
338
+ job_submission.age,
339
+ )
315
340
 
316
341
  if (
317
342
  initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
543
568
  if shim_client.is_api_v2_supported(): # raises error if shim is down, causes retry
544
569
  task = shim_client.get_task(job_model.id)
545
570
 
546
- # If task goes to terminated before the job is submitted to runner, then an error occured
571
+ # If task goes to terminated before the job is submitted to runner, then an error occurred
547
572
  if task.status == TaskStatus.TERMINATED:
548
573
  logger.warning(
549
574
  "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
572
597
  else:
573
598
  shim_status = shim_client.pull() # raises error if shim is down, causes retry
574
599
 
575
- # If shim goes to pending before the job is submitted to runner, then an error occured
600
+ # If shim goes to pending before the job is submitted to runner, then an error occurred
576
601
  if (
577
602
  shim_status.state == "pending"
578
603
  and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
651
676
  )
652
677
  if latest_state_event.termination_message:
653
678
  job_model.termination_reason_message = latest_state_event.termination_message
679
+ if (exit_status := latest_state_event.exit_status) is not None:
680
+ job_model.exit_status = exit_status
681
+ if exit_status != 0:
682
+ logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
654
683
  else:
655
684
  _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
656
685
  if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
688
717
  )
689
718
 
690
719
 
720
+ def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
721
+ if job_model.disconnected_at is None:
722
+ return False
723
+ return (
724
+ common_utils.get_current_datetime()
725
+ > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
726
+ )
727
+
728
+
691
729
  async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
692
730
  policy = job.job_spec.utilization_policy
693
731
  if policy is None:
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
818
856
  return success_if_not_available
819
857
 
820
858
  runner_client.submit_job(
821
- run_spec=run.run_spec,
822
- job_spec=job.job_spec,
859
+ run=run,
860
+ job=job,
823
861
  cluster_info=cluster_info,
824
862
  secrets=secrets,
825
863
  repo_credentials=repo_credentials,
@@ -0,0 +1,100 @@
1
+ """Add JobModel.disconnected_at
2
+
3
+ Revision ID: 20166748b60c
4
+ Revises: 6c1a9d6530ee
5
+ Create Date: 2025-05-13 16:24:32.496578
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+ from alembic_postgresql_enum import TableReference
12
+
13
+ import dstack._internal.server.models
14
+
15
+ # revision identifiers, used by Alembic.
16
+ revision = "20166748b60c"
17
+ down_revision = "6c1a9d6530ee"
18
+ branch_labels = None
19
+ depends_on = None
20
+
21
+
22
+ def upgrade() -> None:
23
+ # ### commands auto generated by Alembic - please adjust! ###
24
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
25
+ batch_op.add_column(
26
+ sa.Column(
27
+ "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
28
+ )
29
+ )
30
+
31
+ op.sync_enum_values(
32
+ enum_schema="public",
33
+ enum_name="jobterminationreason",
34
+ new_values=[
35
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
36
+ "INTERRUPTED_BY_NO_CAPACITY",
37
+ "INSTANCE_UNREACHABLE",
38
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
39
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
40
+ "TERMINATED_BY_USER",
41
+ "VOLUME_ERROR",
42
+ "GATEWAY_ERROR",
43
+ "SCALED_DOWN",
44
+ "DONE_BY_RUNNER",
45
+ "ABORTED_BY_USER",
46
+ "TERMINATED_BY_SERVER",
47
+ "INACTIVITY_DURATION_EXCEEDED",
48
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
49
+ "CONTAINER_EXITED_WITH_ERROR",
50
+ "PORTS_BINDING_FAILED",
51
+ "CREATING_CONTAINER_ERROR",
52
+ "EXECUTOR_ERROR",
53
+ "MAX_DURATION_EXCEEDED",
54
+ ],
55
+ affected_columns=[
56
+ TableReference(
57
+ table_schema="public", table_name="jobs", column_name="termination_reason"
58
+ )
59
+ ],
60
+ enum_values_to_rename=[],
61
+ )
62
+ # ### end Alembic commands ###
63
+
64
+
65
+ def downgrade() -> None:
66
+ # ### commands auto generated by Alembic - please adjust! ###
67
+ op.sync_enum_values(
68
+ enum_schema="public",
69
+ enum_name="jobterminationreason",
70
+ new_values=[
71
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
72
+ "INTERRUPTED_BY_NO_CAPACITY",
73
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
74
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
75
+ "TERMINATED_BY_USER",
76
+ "VOLUME_ERROR",
77
+ "GATEWAY_ERROR",
78
+ "SCALED_DOWN",
79
+ "DONE_BY_RUNNER",
80
+ "ABORTED_BY_USER",
81
+ "TERMINATED_BY_SERVER",
82
+ "INACTIVITY_DURATION_EXCEEDED",
83
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
84
+ "CONTAINER_EXITED_WITH_ERROR",
85
+ "PORTS_BINDING_FAILED",
86
+ "CREATING_CONTAINER_ERROR",
87
+ "EXECUTOR_ERROR",
88
+ "MAX_DURATION_EXCEEDED",
89
+ ],
90
+ affected_columns=[
91
+ TableReference(
92
+ table_schema="public", table_name="jobs", column_name="termination_reason"
93
+ )
94
+ ],
95
+ enum_values_to_rename=[],
96
+ )
97
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
98
+ batch_op.drop_column("disconnected_at")
99
+
100
+ # ### end Alembic commands ###
@@ -0,0 +1,26 @@
1
+ """Add JobModel.exit_status
2
+
3
+ Revision ID: 6c1a9d6530ee
4
+ Revises: 7ba3b59d7ca6
5
+ Create Date: 2025-05-09 10:25:19.715852
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = "6c1a9d6530ee"
14
+ down_revision = "7ba3b59d7ca6"
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade() -> None:
20
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
21
+ batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
22
+
23
+
24
+ def downgrade() -> None:
25
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
26
+ batch_op.drop_column("exit_status")
@@ -382,6 +382,10 @@ class JobModel(BaseModel):
382
382
  Enum(JobTerminationReason)
383
383
  )
384
384
  termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
385
+ # `disconnected_at` stores the first time of connectivity issues with the instance.
386
+ # Resets every time connectivity is restored.
387
+ disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
388
+ exit_status: Mapped[Optional[int]] = mapped_column(Integer)
385
389
  job_spec_data: Mapped[str] = mapped_column(Text)
386
390
  job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
387
391
  runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
@@ -390,7 +394,7 @@ class JobModel(BaseModel):
390
394
  remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
391
395
  volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
392
396
  # `instance_assigned` means instance assignment was done.
393
- # if `instance_assigned` is True and `instance` is None, no instance was assiged.
397
+ # if `instance_assigned` is True and `instance` is None, no instance was assigned.
394
398
  instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
395
399
  instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
396
400
  ForeignKey("instances.id", ondelete="CASCADE")
@@ -7,7 +7,14 @@ from typing_extensions import Annotated
7
7
 
8
8
  from dstack._internal.core.models.common import CoreModel, NetworkMode
9
9
  from dstack._internal.core.models.repos.remote import RemoteRepoCreds
10
- from dstack._internal.core.models.runs import ClusterInfo, JobSpec, JobStatus, RunSpec
10
+ from dstack._internal.core.models.runs import (
11
+ ClusterInfo,
12
+ JobSpec,
13
+ JobStatus,
14
+ JobSubmission,
15
+ Run,
16
+ RunSpec,
17
+ )
11
18
  from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
12
19
 
13
20
 
@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
16
23
  state: JobStatus
17
24
  termination_reason: Optional[str] = None
18
25
  termination_message: Optional[str] = None
26
+ exit_status: Optional[int] = None
19
27
 
20
28
 
21
29
  class LogEvent(CoreModel):
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):
38
46
 
39
47
 
40
48
  class SubmitBody(CoreModel):
41
- run_spec: Annotated[
42
- RunSpec,
49
+ run: Annotated[
50
+ Run,
43
51
  Field(
44
52
  include={
45
- "run_name",
46
- "repo_id",
47
- "repo_data",
48
- "configuration",
49
- "configuration_path",
53
+ "id": True,
54
+ "run_spec": {
55
+ "run_name",
56
+ "repo_id",
57
+ "repo_data",
58
+ "configuration",
59
+ "configuration_path",
60
+ },
50
61
  }
51
62
  ),
52
63
  ]
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
69
80
  }
70
81
  ),
71
82
  ]
83
+ job_submission: Annotated[
84
+ JobSubmission,
85
+ Field(
86
+ include={
87
+ "id",
88
+ }
89
+ ),
90
+ ]
72
91
  cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
73
92
  secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
74
93
  repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
94
+ # run_spec is deprecated in favor of run.run_spec
95
+ # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
96
+ run_spec: Annotated[
97
+ RunSpec,
98
+ Field(
99
+ include={
100
+ "run_name",
101
+ "repo_id",
102
+ "repo_data",
103
+ "configuration",
104
+ "configuration_path",
105
+ },
106
+ ),
107
+ ]
75
108
 
76
109
 
77
110
  class HealthcheckResponse(CoreModel):
@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
135
135
  status=job_model.status,
136
136
  termination_reason=job_model.termination_reason,
137
137
  termination_reason_message=job_model.termination_reason_message,
138
+ exit_status=job_model.exit_status,
138
139
  job_provisioning_data=job_provisioning_data,
139
140
  job_runtime_data=get_job_runtime_data(job_model),
140
141
  )
@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
12
12
  from dstack._internal.core.models.envs import Env
13
13
  from dstack._internal.core.models.repos.remote import RemoteRepoCreds
14
14
  from dstack._internal.core.models.resources import Memory
15
- from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec
15
+ from dstack._internal.core.models.runs import ClusterInfo, Job, Run
16
16
  from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
17
17
  from dstack._internal.server.schemas.runner import (
18
18
  GPUDevice,
@@ -72,8 +72,8 @@ class RunnerClient:
72
72
 
73
73
  def submit_job(
74
74
  self,
75
- run_spec: RunSpec,
76
- job_spec: JobSpec,
75
+ run: Run,
76
+ job: Job,
77
77
  cluster_info: ClusterInfo,
78
78
  secrets: Dict[str, str],
79
79
  repo_credentials: Optional[RemoteRepoCreds],
@@ -81,6 +81,7 @@ class RunnerClient:
81
81
  ):
82
82
  # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
83
83
  # variables to the runner without runner API modification.
84
+ job_spec = job.job_spec
84
85
  if instance_env is not None:
85
86
  if isinstance(instance_env, Env):
86
87
  merged_env = instance_env.as_dict()
@@ -90,11 +91,13 @@ class RunnerClient:
90
91
  job_spec = job_spec.copy(deep=True)
91
92
  job_spec.env = merged_env
92
93
  body = SubmitBody(
93
- run_spec=run_spec,
94
+ run=run,
94
95
  job_spec=job_spec,
96
+ job_submission=job.job_submissions[-1],
95
97
  cluster_info=cluster_info,
96
98
  secrets=secrets,
97
99
  repo_credentials=repo_credentials,
100
+ run_spec=run.run_spec,
98
101
  )
99
102
  resp = requests.post(
100
103
  # use .json() to encode enums
@@ -870,10 +870,10 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
870
870
  if (
871
871
  run_spec.merged_profile.utilization_policy is not None
872
872
  and run_spec.merged_profile.utilization_policy.time_window
873
- > settings.SERVER_METRICS_TTL_SECONDS
873
+ > settings.SERVER_METRICS_RUNNING_TTL_SECONDS
874
874
  ):
875
875
  raise ServerClientError(
876
- f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
876
+ f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
877
877
  )
878
878
  set_resources_defaults(run_spec.configuration.resources)
879
879
 
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import warnings
2
3
  from pathlib import Path
3
4
 
4
5
  DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")
45
46
 
46
47
  SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")
47
48
 
48
- SERVER_METRICS_TTL_SECONDS = int(os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS", 3600))
49
+ SERVER_METRICS_RUNNING_TTL_SECONDS: int
50
+ _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
51
+ if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
52
+ _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
53
+ if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
54
+ warnings.warn(
55
+ (
56
+ "DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
57
+ " use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
58
+ ),
59
+ DeprecationWarning,
60
+ )
61
+ else:
62
+ _SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
63
+ SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
64
+ del _SERVER_METRICS_RUNNING_TTL_SECONDS
65
+ SERVER_METRICS_FINISHED_TTL_SECONDS = int(
66
+ os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
67
+ )
49
68
 
50
69
  DEFAULT_PROJECT_NAME = "main"
51
70
 
@@ -302,6 +302,7 @@ async def create_job(
302
302
  job_num: int = 0,
303
303
  replica_num: int = 0,
304
304
  instance_assigned: bool = False,
305
+ disconnected_at: Optional[datetime] = None,
305
306
  ) -> JobModel:
306
307
  run_spec = RunSpec.parse_raw(run.run_spec)
307
308
  job_spec = (await get_job_specs_from_run_spec(run_spec, replica_num=replica_num))[0]
@@ -323,6 +324,7 @@ async def create_job(
323
324
  instance=instance,
324
325
  instance_assigned=instance_assigned,
325
326
  used_instance_id=instance.id if instance is not None else None,
327
+ disconnected_at=disconnected_at,
326
328
  )
327
329
  session.add(job)
328
330
  await session.commit()
@@ -115,6 +115,8 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
115
115
  job_submissions_excludes["job_runtime_data"] = {
116
116
  "offer": {"instance": {"resources": {"cpu_arch"}}}
117
117
  }
118
+ if all(js.exit_status is None for js in job_submissions):
119
+ job_submissions_excludes["exit_status"] = True
118
120
  latest_job_submission = current_resource.latest_job_submission
119
121
  if latest_job_submission is not None:
120
122
  latest_job_submission_excludes = {}
@@ -127,6 +129,8 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
127
129
  latest_job_submission_excludes["job_runtime_data"] = {
128
130
  "offer": {"instance": {"resources": {"cpu_arch"}}}
129
131
  }
132
+ if latest_job_submission.exit_status is None:
133
+ latest_job_submission_excludes["exit_status"] = True
130
134
  return {"plan": apply_plan_excludes}
131
135
 
132
136
 
dstack/version.py CHANGED
@@ -1,3 +1,3 @@
1
- __version__ = "0.19.8"
1
+ __version__ = "0.19.9"
2
2
  __is_release__ = True
3
3
  base_image = "0.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dstack
3
- Version: 0.19.8
3
+ Version: 0.19.9
4
4
  Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
5
5
  Project-URL: Homepage, https://dstack.ai
6
6
  Project-URL: Source, https://github.com/dstackai/dstack
@@ -54,7 +54,7 @@ Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'all'
54
54
  Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'all'
55
55
  Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'all'
56
56
  Requires-Dist: backports-entry-points-selectable; extra == 'all'
57
- Requires-Dist: boto3; extra == 'all'
57
+ Requires-Dist: boto3>=1.38.13; extra == 'all'
58
58
  Requires-Dist: botocore; extra == 'all'
59
59
  Requires-Dist: datacrunch; extra == 'all'
60
60
  Requires-Dist: docker>=6.0.0; extra == 'all'
@@ -90,7 +90,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'aws'
90
90
  Requires-Dist: apscheduler<4; extra == 'aws'
91
91
  Requires-Dist: asyncpg; extra == 'aws'
92
92
  Requires-Dist: backports-entry-points-selectable; extra == 'aws'
93
- Requires-Dist: boto3; extra == 'aws'
93
+ Requires-Dist: boto3>=1.38.13; extra == 'aws'
94
94
  Requires-Dist: botocore; extra == 'aws'
95
95
  Requires-Dist: docker>=6.0.0; extra == 'aws'
96
96
  Requires-Dist: fastapi; extra == 'aws'
@@ -231,7 +231,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'lambda'
231
231
  Requires-Dist: apscheduler<4; extra == 'lambda'
232
232
  Requires-Dist: asyncpg; extra == 'lambda'
233
233
  Requires-Dist: backports-entry-points-selectable; extra == 'lambda'
234
- Requires-Dist: boto3; extra == 'lambda'
234
+ Requires-Dist: boto3>=1.38.13; extra == 'lambda'
235
235
  Requires-Dist: botocore; extra == 'lambda'
236
236
  Requires-Dist: docker>=6.0.0; extra == 'lambda'
237
237
  Requires-Dist: fastapi; extra == 'lambda'
@@ -338,24 +338,27 @@ orchestration for ML teams across top clouds and on-prem clusters.
338
338
 
339
339
  #### Accelerators
340
340
 
341
- `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, and `Intel Gaudi` accelerators out of the box.
341
+ `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
342
342
 
343
- ## Major news ✨
343
+ ## Latest news ✨
344
344
 
345
- - [2025/02] [dstack 0.18.41: GPU blocks, Proxy jump, inactivity duration, and more](https://github.com/dstackai/dstack/releases/tag/0.18.41)
346
- - [2025/01] [dstack 0.18.38: Intel Gaudi](https://github.com/dstackai/dstack/releases/tag/0.18.38)
347
- - [2025/01] [dstack 0.18.35: Vultr](https://github.com/dstackai/dstack/releases/tag/0.18.35)
348
- - [2024/12] [dstack 0.18.30: AWS Capacity Reservations and Capacity Blocks](https://github.com/dstackai/dstack/releases/tag/0.18.30)
349
- - [2024/10] [dstack 0.18.21: Instance volumes](https://github.com/dstackai/dstack/releases/tag/0.18.21)
350
- - [2024/10] [dstack 0.18.18: Hardware metrics monitoring](https://github.com/dstackai/dstack/releases/tag/0.18.18)
345
+ - [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
346
+ - [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
347
+ - [2025/04] [dstack 0.19.5: GCP A3 High clusters](https://github.com/dstackai/dstack/releases/tag/0.19.5)
348
+ - [2025/04] [dstack 0.19.3: GCP A3 Mega clusters](https://github.com/dstackai/dstack/releases/tag/0.19.3)
349
+ - [2025/03] [dstack 0.19.0: Prometheus](https://github.com/dstackai/dstack/releases/tag/0.19.0)
351
350
 
352
- ## Installation
351
+ ## How does it work?
352
+
353
+ <img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v8.svg" width="750" />
354
+
355
+ ### Installation
353
356
 
354
357
  > Before using `dstack` through CLI or API, set up a `dstack` server. If you already have a running `dstack` server, you only need to [set up the CLI](#set-up-the-cli).
355
358
 
356
- ### Set up the server
359
+ #### Set up the server
357
360
 
358
- #### (Optional) Configure backends
361
+ ##### (Optional) Configure backends
359
362
 
360
363
  To use `dstack` with cloud providers, configure backends
361
364
  via the `~/.dstack/server/config.yml` file.
@@ -365,21 +368,21 @@ For more details on how to configure backends, check [Backends](https://dstack.a
365
368
  > For using `dstack` with on-prem servers, create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh)
366
369
  > once the server is up.
367
370
 
368
- #### Start the server
371
+ ##### Start the server
369
372
 
370
373
  You can install the server on Linux, macOS, and Windows (via WSL 2). It requires Git and
371
374
  OpenSSH.
372
375
 
373
- ##### pip
376
+ ##### uv
374
377
 
375
378
  ```shell
376
- $ pip install "dstack[all]" -U
379
+ $ uv tool install "dstack[all]" -U
377
380
  ```
378
381
 
379
- ##### uv
382
+ ##### pip
380
383
 
381
384
  ```shell
382
- $ uv tool install "dstack[all]" -U
385
+ $ pip install "dstack[all]" -U
383
386
  ```
384
387
 
385
388
  Once it's installed, go ahead and start the server.
@@ -392,25 +395,28 @@ The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da"
392
395
  The server is running at http://127.0.0.1:3000/
393
396
  ```
394
397
 
395
- For more details on server configuration options, see the
398
+ > For more details on server configuration options, see the
396
399
  [Server deployment](https://dstack.ai/docs/guides/server-deployment) guide.
397
400
 
398
- ### Set up the CLI
401
+
402
+ <details><summary>Set up the CLI</summary>
403
+
404
+ #### Set up the CLI
399
405
 
400
406
  Once the server is up, you can access it via the `dstack` CLI.
401
407
 
402
408
  The CLI can be installed on Linux, macOS, and Windows. It requires Git and OpenSSH.
403
409
 
404
- ##### pip
410
+ ##### uv
405
411
 
406
412
  ```shell
407
- $ pip install dstack -U
413
+ $ uv tool install dstack -U
408
414
  ```
409
415
 
410
- ##### uv
416
+ ##### pip
411
417
 
412
418
  ```shell
413
- $ uv tool install dstack -U
419
+ $ pip install dstack -U
414
420
  ```
415
421
 
416
422
  To point the CLI to the `dstack` server, configure it
@@ -425,9 +431,9 @@ $ dstack config \
425
431
  Configuration is updated at ~/.dstack/config.yml
426
432
  ```
427
433
 
428
- ## How does it work?
434
+ </details>
429
435
 
430
- ### 1. Define configurations
436
+ ### Define configurations
431
437
 
432
438
  `dstack` supports the following configurations:
433
439
 
@@ -440,7 +446,7 @@ Configuration is updated at ~/.dstack/config.yml
440
446
 
441
447
  Configuration can be defined as YAML files within your repo.
442
448
 
443
- ### 2. Apply configurations
449
+ ### Apply configurations
444
450
 
445
451
  Apply the configuration either via the `dstack apply` CLI command or through a programmatic API.
446
452
 
@@ -452,6 +458,7 @@ out-of-capacity errors, port-forwarding, and more &mdash; across clouds and on-p
452
458
  For additional information, see the following links:
453
459
 
454
460
  * [Docs](https://dstack.ai/docs)
461
+ * [Examples](https://dstack.ai/examples)
455
462
  * [Discord](https://discord.gg/u8SmfwPpMd)
456
463
 
457
464
  ## Contributing
@@ -1,5 +1,5 @@
1
1
  dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- dstack/version.py,sha256=oR0QmUkjAU0DEnlfWpUCO78TpWiiucCIz4_FvmSdXUE,64
2
+ dstack/version.py,sha256=lJgn5pYWtoxN-mAxtEnIXX2nGXSQrLvGxKsa1F9aNgM,64
3
3
  dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
5
5
  dstack/_internal/settings.py,sha256=otvcNT0X5UnGZdoNIWNFZBohQRzLme9Zc6oiBzc1BEk,796
@@ -31,14 +31,14 @@ dstack/_internal/cli/services/configurators/__init__.py,sha256=z94VPBFqybP8Zpwy3
31
31
  dstack/_internal/cli/services/configurators/base.py,sha256=bGfde2zoma28lLE8MUACO4-NKT1CdJJQJoXrzjpz0mQ,3360
32
32
  dstack/_internal/cli/services/configurators/fleet.py,sha256=jm4tNH6QQVplLdboCTlvRYUee3nZ0UYb_qLTrvtYVYM,14049
33
33
  dstack/_internal/cli/services/configurators/gateway.py,sha256=czB2s89s7IowOmWnpDwWErPAUlW3FvFMizImhrkQiBM,8927
34
- dstack/_internal/cli/services/configurators/run.py,sha256=ygfFWcZZ6nBXZUPmBtX5s0r0szOTjR8tNnErHsizDnk,25383
34
+ dstack/_internal/cli/services/configurators/run.py,sha256=nXNjFrM5YT6RFqPXJQa4MOiEsG6IFiANyGKP-PXILdc,25518
35
35
  dstack/_internal/cli/services/configurators/volume.py,sha256=riMXLQbgvHIIFwLKdHfad-_0iE9wE3G_rUmXU5P3ZS8,8519
36
36
  dstack/_internal/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  dstack/_internal/cli/utils/common.py,sha256=rfmzqrsgR3rXW3wj0vxDdvrhUUg2aIy4A6E9MZbd55g,1763
38
38
  dstack/_internal/cli/utils/fleet.py,sha256=ch-LN1X9boSm-rFLW4mAJRmz0XliLhH0LvKD2DqSt2g,3942
39
39
  dstack/_internal/cli/utils/gateway.py,sha256=qMYa1NTAT_O98x2_mSyWDRbiHj5fqt6xUXFh9NIUwAM,1502
40
40
  dstack/_internal/cli/utils/rich.py,sha256=Gx1MJU929kMKsbdo9qF7XHARNta2426Ssb-xMLVhwbQ,5710
41
- dstack/_internal/cli/utils/run.py,sha256=-zfOA_SqBOqHXuQIXnZrxhxt7iYOnsUjqZZ1TzVHmUE,9023
41
+ dstack/_internal/cli/utils/run.py,sha256=nCQwAU3VDS8ec2oWNjRKi5xIGdwwKI_YNr8vgGyDPzQ,9202
42
42
  dstack/_internal/cli/utils/updates.py,sha256=sAPYYptkFzQnGaRjv7FV7HOj-Be3IXGe63xj-sVEpv4,2566
43
43
  dstack/_internal/cli/utils/volume.py,sha256=mU9I06dVMFbpjfkefxrZNoSWadKLoib3U14rHudNQN4,1975
44
44
  dstack/_internal/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ dstack/_internal/core/backends/base/offers.py,sha256=AzAAx5eSTaHv8CbWuGERTHS151x
71
71
  dstack/_internal/core/backends/cudo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
72
  dstack/_internal/core/backends/cudo/api_client.py,sha256=ygq1Gx7ZvwKaifdXtvzDSw4xR4ZH6UWd5J47BjuaGh0,3685
73
73
  dstack/_internal/core/backends/cudo/backend.py,sha256=i13YoAkUfIStc3Yyyt_3YmL30eVrKtrhwnE9_B1iBRI,546
74
- dstack/_internal/core/backends/cudo/compute.py,sha256=xtA09zvcM6xpp6YdHK6W20GcXA4zOfuHnU0tbODUo14,6466
74
+ dstack/_internal/core/backends/cudo/compute.py,sha256=wGMdH4Me-IHuQ-U1_XiuOqtHT86AgHyofUi449eqijo,6466
75
75
  dstack/_internal/core/backends/cudo/configurator.py,sha256=pkAT1MtL6_yYvYoqCglvPE-DiUdL8-XEviyN1yUSYyw,2056
76
76
  dstack/_internal/core/backends/cudo/models.py,sha256=6sfEqY2hvTpIACkyT4mhD3D8K5TsW_pupys9nqtrgoI,1055
77
77
  dstack/_internal/core/backends/datacrunch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -109,8 +109,8 @@ dstack/_internal/core/backends/nebius/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
109
109
  dstack/_internal/core/backends/nebius/backend.py,sha256=2XqZIbSR8VzlfOnuVklXlDxNmwAkQj7txQN8VXF1j2E,566
110
110
  dstack/_internal/core/backends/nebius/compute.py,sha256=OUNvDk9rU13NR0CADFqn7nQL8kxgSvL7kbNEV4cLzyQ,14761
111
111
  dstack/_internal/core/backends/nebius/configurator.py,sha256=ML2KCD6Ddxc2f6X1juxqKulUcOjF6uJk20_0Teyi65A,3072
112
- dstack/_internal/core/backends/nebius/fabrics.py,sha256=09eXtzSWpK7Oxv4N1pOmXzBJgbi2d5yC6gvcizHhplg,1476
113
- dstack/_internal/core/backends/nebius/models.py,sha256=-qM-F_c2Hf4ZL5AXmtQiGA5q1PyGsCCPhmRFHEyx2dw,4253
112
+ dstack/_internal/core/backends/nebius/fabrics.py,sha256=jC7ngUO54rXbyXI4hkl5_9GdBk7h4Ivyh88CH4S37ds,1546
113
+ dstack/_internal/core/backends/nebius/models.py,sha256=UudYX32p-ZY-GWR83VEtY5dpZBaWhKXQIfn2nrBCq-4,4245
114
114
  dstack/_internal/core/backends/nebius/resources.py,sha256=hx_VqiaurGO0MYT2KEvMl9EYdcglBRQsWSY5kHKjR00,12163
115
115
  dstack/_internal/core/backends/oci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
116
  dstack/_internal/core/backends/oci/auth.py,sha256=8Cr18y_LOsyRP-16yfFpT70Cofpm0clB3KawS_7aRl4,717
@@ -165,8 +165,8 @@ dstack/_internal/core/models/metrics.py,sha256=Xb8hCXUL-ncQ3PMsErIUAJTe9gwh5jyrQ
165
165
  dstack/_internal/core/models/placement.py,sha256=WJVq5ENJykyRarQzL2EeYQag_9_jV7VSAtR_xoFvPVM,720
166
166
  dstack/_internal/core/models/profiles.py,sha256=seeysTuMv1vVUmpHAZgrMUGcbMtH7hSMFIvfx0Qk__0,10406
167
167
  dstack/_internal/core/models/projects.py,sha256=H5ZZRiyUEKifpTFAhl45KBi5ly7ooE0WmI329myK360,643
168
- dstack/_internal/core/models/resources.py,sha256=fDAcbXNYQHb9KopFVMLNXftC9OsJaOFiKP1cIrPDnps,14121
169
- dstack/_internal/core/models/runs.py,sha256=OMT69BfUUiaxRNUjTCZbWahHCTVBXXUF9jaYa6xgH38,18531
168
+ dstack/_internal/core/models/resources.py,sha256=rsf6hAhi5bfSb_Z9VcS3UoEG0G8Ohl6ekyrOStLOAqw,14114
169
+ dstack/_internal/core/models/runs.py,sha256=Xkv1kY68JA0eJUeCVJjM9YWRkfy6P1RGXslBIMtox2E,18985
170
170
  dstack/_internal/core/models/secrets.py,sha256=IQyemsNpSzqOCB-VlVTuc4gyPFmXXO4mhko0Ur0ey3I,221
171
171
  dstack/_internal/core/models/server.py,sha256=Hkc1v2s3KOiwslsWVmhUOAzcSeREoG-HD1SzSX9WUGg,152
172
172
  dstack/_internal/core/models/services.py,sha256=2Hpi7j0Q1shaf_0wd0C0044AJAmuYi-D3qx3PH849oI,3076
@@ -250,18 +250,18 @@ dstack/_internal/server/app.py,sha256=K2NojwUKdktdenrR61I21kXIMX6ars6zB9v6Ea-evz
250
250
  dstack/_internal/server/db.py,sha256=WjuqmjG3QAZmSMCeUaJ_ynbowlHuNAvYCZO649cTPHc,3210
251
251
  dstack/_internal/server/deps.py,sha256=31e8SU_ogPJWHIDLkgl7cuC_5V91xbJoLyAj17VanfM,670
252
252
  dstack/_internal/server/main.py,sha256=kztKhCYNoHSDyJJQScWfZXE0naNleJOCQULW6dd8SGw,109
253
- dstack/_internal/server/models.py,sha256=S0L7G_3q6akytdCZ1svukzCSY0kdwnGplbHwTAUBpwo,29351
254
- dstack/_internal/server/settings.py,sha256=1iqXWgvvsr19jXX1javGdPj6UhOfOHuuXSXgmGtjO2A,3335
253
+ dstack/_internal/server/models.py,sha256=GWl78Zl-_w1UyW9nB6DDS95Ko_osbLQtLb2DIi1JDLo,29633
254
+ dstack/_internal/server/settings.py,sha256=XkLexvylNbU3iRM0KHnTX0fywLGczBHya5lVmeptbqY,4123
255
255
  dstack/_internal/server/background/__init__.py,sha256=8kTbhEHCeXTibsOlHY1HwqIO6gGb4q8fUa2fcDrah1c,3893
256
256
  dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
257
257
  dstack/_internal/server/background/tasks/common.py,sha256=N7xSXbf2MoBWgbJ1e3AEzYBTf1Gn-pDXYND8Zr_YCJQ,970
258
258
  dstack/_internal/server/background/tasks/process_fleets.py,sha256=LsD3I7iCbQs_nexJ1kfnn1VPz3BffFmALRgpk5DHGO4,2985
259
259
  dstack/_internal/server/background/tasks/process_gateways.py,sha256=hoUI1CSqbHt_uMwnzTRAEDl-LBw0wUk_W4xobIbdvRc,7017
260
260
  dstack/_internal/server/background/tasks/process_instances.py,sha256=E1NLac4ubiF6jUr9M7rj8cjQf4zFZCBVr428UBgFeGI,42855
261
- dstack/_internal/server/background/tasks/process_metrics.py,sha256=acySfsacpYbTPV9Yivs-oU37z1S2sUdWhRHdJkfBcCA,5332
261
+ dstack/_internal/server/background/tasks/process_metrics.py,sha256=IDGyVQtGLua_NoY8sLv0RH18iV_3-LUONj6cEI181QM,6136
262
262
  dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=gJ8Um3Vx-brazHVWdtVXPnov4rwvDr-0Vn1Voq1cYBQ,4108
263
263
  dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=9VoGFqdiXcVkCi_NV1VqQdqllDkB0bHIqOHKMIZK1Fg,5183
264
- dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=U6JdkEnpIApbiSRLKxqjNwA9WFAZY2zZNXujofhUd_g,34719
264
+ dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=jyuYOQfXIVGu5ugTennwSGXL_c6VDNJN96-Qrz7Hgic,36441
265
265
  dstack/_internal/server/background/tasks/process_runs.py,sha256=EI1W6HUyB-og3g8BDP_GsBrJjQ-Z3JvZHTuJf7CRKRM,17974
266
266
  dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=-XOApBgmn9ZyCoeXgnbp6cnsFT3uxE_-xqLtn1ez5dc,26603
267
267
  dstack/_internal/server/background/tasks/process_terminating_jobs.py,sha256=0Z3Q409RwSxOL_pgK8JktBthjtESEUH3ahwTLsTdYPk,3800
@@ -277,6 +277,7 @@ dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_fl
277
277
  dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py,sha256=sqYOR7ZoUifmRrZz2DBaO-D_Pgu20nup15yihg1FBcM,1417
278
278
  dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py,sha256=x4FdfUD4XI7nxcppnw5juFKksusYMA4EXNxs0UEETFE,649
279
279
  dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py,sha256=4-H_mGGSD6tI7H0HQ-pBs5wixMKdDro6KtLdH_QId28,831
280
+ dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py,sha256=Vj7VPo67pAy6gHztSrbp9HmzAz0ckROeV6HxDfvRwOw,3085
280
281
  dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py,sha256=O5ZrwAXs1ubPrChyItCbuEeAPrlRF_ys1nVw4knAO5g,845
281
282
  dstack/_internal/server/migrations/versions/252d3743b641_.py,sha256=z3mMF3YCEg6ueoj746cDNBNlQSimmBOcVLNupOv2UuU,1246
282
283
  dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py,sha256=yKEQ1OdPZSaO8YxRd986mJjNc9CjeO6SqY5SmL6aGfE,5433
@@ -296,6 +297,7 @@ dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.
296
297
  dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py,sha256=PKqFqe6x6bkO8hrv73fes4uyBpzsemqwp3c-i5XzaS8,1195
297
298
  dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py,sha256=UyNT3d8Osa2MpuODQRdmmbHlt-qT-wHNNOUTzUEcLLI,2512
298
299
  dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py,sha256=0FlRf5Mis9ZIc7X6M9yqPtyFqR2SKCTUFfZeDbqAXBU,809
300
+ dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py,sha256=aYZIGWhM38hZ6SkrUsFESOmOIVhSAIbA9dbyFKsLUr8,615
299
301
  dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py,sha256=nBURp4A6TpT13H1ccH4WyzkU2GGy3uDGKCNG88cAciw,1827
300
302
  dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py,sha256=o1A8nzrmMFcivBzaIMemCtRfCZ9mq1IuBko1CJXoVOo,2124
301
303
  dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py,sha256=FUvCRzOzkp8HjRYy0-kuOwvBGbwuCgyjXU7hD-AWRJs,1045
@@ -358,7 +360,7 @@ dstack/_internal/server/schemas/instances.py,sha256=MedAVbKa_1F9zhdSPrjMmL-Og5Dp
358
360
  dstack/_internal/server/schemas/logs.py,sha256=JGt39fBEFRjHhlGT1jIC6kwQhujxPO8uecjplzjTZXY,402
359
361
  dstack/_internal/server/schemas/projects.py,sha256=UmHtX0pkr3L_vPsosvRC9JneqBrYaJvVKd4OxhYySHc,566
360
362
  dstack/_internal/server/schemas/repos.py,sha256=Sit0Tqol79VOMGGp8ncZXLLsZ4INcF-pAA_jwRchYqA,666
361
- dstack/_internal/server/schemas/runner.py,sha256=L9cG4n8bt_wJQhk9iWeIHEOlOPuoE0aDXlMEfPNZX7s,4424
363
+ dstack/_internal/server/schemas/runner.py,sha256=CqlP90yvPKaavAqxLOxewIMq3cHeT_NWfR0YO81JfU0,5151
362
364
  dstack/_internal/server/schemas/runs.py,sha256=XhlTnn67g4NWVmIJFQdy2yPK_EcnSIYOCCSex0XOIes,1341
363
365
  dstack/_internal/server/schemas/secrets.py,sha256=mfqLSM7PqxVQ-GIWB6RfPRUOvSvvaRv-JxXAYxZ6dyY,373
364
366
  dstack/_internal/server/schemas/users.py,sha256=FuDqwRVe3mOmv497vOZKjI0a_d4Wt2g4ZiCJcyfHEKA,495
@@ -381,7 +383,7 @@ dstack/_internal/server/services/projects.py,sha256=Je1iWZ-ArmyFxK1yMUzod5WRXyiI
381
383
  dstack/_internal/server/services/prometheus.py,sha256=xq5G-Q2BJup9lS2F6__0wUVTs-k1Gr3dYclGzo2WoWo,12474
382
384
  dstack/_internal/server/services/repos.py,sha256=f9ztN7jz_2gvD9hXF5sJwWDVyG2-NHRfjIdSukowPh8,9342
383
385
  dstack/_internal/server/services/resources.py,sha256=VRFOih_cMJdc0c2m9nSGsX8vWAJQV3M6N87aqS_JXfw,699
384
- dstack/_internal/server/services/runs.py,sha256=Wcvz65TYtWv2YWeSseNvKSnOD85roADh3N8UToHP3nc,38543
386
+ dstack/_internal/server/services/runs.py,sha256=K3rqzfqkUY9dhTO-2W9_PGH8gCdYhCJK-S-gbM5drx4,38559
385
387
  dstack/_internal/server/services/storage.py,sha256=6I0xI_3_RpJNbKZwHjDnjrEwXGdHfiaeb5li15T-M1I,1884
386
388
  dstack/_internal/server/services/users.py,sha256=W-5xL7zsHNjeG7BBK54RWGvIrBOrw-FF0NcG_z9qhoE,7466
387
389
  dstack/_internal/server/services/volumes.py,sha256=vfKY6eZp64I58Mfdvrk9Wig7deveD2Rw4ET1cbc1Sog,16238
@@ -396,7 +398,7 @@ dstack/_internal/server/services/gateways/__init__.py,sha256=Up8uFsEQDBE0yOXn7n5
396
398
  dstack/_internal/server/services/gateways/client.py,sha256=XIJX3fGBbZ_AG8qZMTSE8KAB_ojq5YJFa0OXoD_dofg,7493
397
399
  dstack/_internal/server/services/gateways/connection.py,sha256=ot3lV85XdmCT45vBWeyj57nLPcLPNm316zu3jMyeWjA,5625
398
400
  dstack/_internal/server/services/gateways/pool.py,sha256=0LclTl1tyx-doS78LeaAKjr-SMp98zuwh5f9s06JSd0,1914
399
- dstack/_internal/server/services/jobs/__init__.py,sha256=XR23KU--yX4sLszLoCCQDPKLw9cKgrHdkI9SgIHvwHY,25535
401
+ dstack/_internal/server/services/jobs/__init__.py,sha256=GU3vMC0SZKyvL564A7t_QRoDjf83-8GsUkguDWK5x6c,25578
400
402
  dstack/_internal/server/services/jobs/configurators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
401
403
  dstack/_internal/server/services/jobs/configurators/base.py,sha256=QSIU1OoZ794HKdwjo5iXxGUvFE8a2-g_SoYjAQjXhcI,11354
402
404
  dstack/_internal/server/services/jobs/configurators/dev.py,sha256=ufN6Sd8TwIsjQYNZE32fkAqJI7o2zjgoZThbrP-bd7U,2378
@@ -420,7 +422,7 @@ dstack/_internal/server/services/proxy/routers/service_proxy.py,sha256=5oB-SX8f_
420
422
  dstack/_internal/server/services/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
421
423
  dstack/_internal/server/services/proxy/services/service_proxy.py,sha256=4JrSxHqhBYqU1oENii89Db-bzkFWExYrOy-0mNEhWBs,4879
422
424
  dstack/_internal/server/services/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
423
- dstack/_internal/server/services/runner/client.py,sha256=bEA_1NvOJpznSuoVfVfvIJJ03zzAHj_lXRUbA82SbPY,15503
425
+ dstack/_internal/server/services/runner/client.py,sha256=jQDxv_Yaiwr2e3m1LqPCFtToB_GrsC2yVQfgXzyn6g8,15586
424
426
  dstack/_internal/server/services/runner/ssh.py,sha256=H-X0015ZPwYq5tc31ytFF1uNaUAr9itAsABI2oPJWrk,5017
425
427
  dstack/_internal/server/services/services/__init__.py,sha256=HQz72SNN8W9gUQ5INyO_Wd8TR9j3V6qoHFGEDEI920w,10862
426
428
  dstack/_internal/server/services/services/autoscalers.py,sha256=0o_w9La-ex_P3VKG88w_XN3hkLkzryv5l1cH3pkZyAE,4315
@@ -531,7 +533,7 @@ dstack/_internal/server/statics/static/media/logo.f602feeb138844eda97c8cb6414614
531
533
  dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg,sha256=KqFI05gQM135zC1plF1WBRF2F7CyKL7km97WKsZjAHI,319
532
534
  dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg,sha256=ZxFFBVZWuRLqmWH4zhwGLNtKjOzHj-5MGJRunFAtu1I,561
533
535
  dstack/_internal/server/testing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
534
- dstack/_internal/server/testing/common.py,sha256=gsSjGb6c7Kp-f345srXxca0EuUL-TV_So7irLr_iss8,31391
536
+ dstack/_internal/server/testing/common.py,sha256=uzmF9_xsiHkb8l8adljuYSpAMEH4hmRZAbv6-96rN58,31480
535
537
  dstack/_internal/server/testing/conf.py,sha256=-zhujfFjTHNfQDOK-hBck32By11c_kC0OeinB3esQGg,1902
536
538
  dstack/_internal/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
537
539
  dstack/_internal/server/utils/common.py,sha256=PbjXtqYy1taKXpyG5ys8cIrz9MXqc9dBAsR_9D1brrk,1414
@@ -570,7 +572,7 @@ dstack/api/server/_logs.py,sha256=ng8QvFAIaoVOVChTK6Wuu5BeM6y7gAdx30KEYRsn9xA,50
570
572
  dstack/api/server/_metrics.py,sha256=OPb8sLhI_U605sHOPrELgy0_6cNFLJVfpvr-qkEukRM,670
571
573
  dstack/api/server/_projects.py,sha256=g6kNSU6jer8u7Kaut1I0Ft4wRMLBBCQShJf3fOB63hQ,1440
572
574
  dstack/api/server/_repos.py,sha256=bqsKuZWyiNLE8UAdSZrYNtk1J3Gu5MXXnTMIoM9jxpI,1770
573
- dstack/api/server/_runs.py,sha256=qxKlHcW73HRglW5iogz5FYPtd85zb0vu2uKBuZx8BIc,7662
575
+ dstack/api/server/_runs.py,sha256=uVTY57BlBvB86pkKNjUh-Nc5AYNmfH9kfBNbgzTnTyw,7914
574
576
  dstack/api/server/_secrets.py,sha256=VqLfrIcmBJtPxNDRkXTG44H5SWoY788YJapScUukvdY,1576
575
577
  dstack/api/server/_users.py,sha256=XzhgGKc5Tsr0-xkz3T6rGyWZ1tO7aYNhLux2eE7dAoY,1738
576
578
  dstack/api/server/_volumes.py,sha256=xxOt8o5G-bhMh6wSvF4BDFNoqVEhlM4BXQr2KvX0pN0,1937
@@ -580,8 +582,8 @@ dstack/plugins/__init__.py,sha256=buT1pcyORLgVbl89ATkRWJPhvejriVz7sNBjvuZRCRE,40
580
582
  dstack/plugins/_base.py,sha256=-etiB-EozaJCg2wtmONfj8ic-K03qXvXyl_TIDp-kNE,2662
581
583
  dstack/plugins/_models.py,sha256=1Gw--mDQ1_0FFr9Zur9LE8UbMoWESUpTdHHt12AyIZo,341
582
584
  dstack/plugins/_utils.py,sha256=FqeWYb7zOrgZkO9Bd8caL5I81_TUEsysIzvxsULrmzk,392
583
- dstack-0.19.8.dist-info/METADATA,sha256=OZnwMM_G_MuMYJbZ2StSOZnkeV2pbEPRE4_qWv8jFoQ,20150
584
- dstack-0.19.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
585
- dstack-0.19.8.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
586
- dstack-0.19.8.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
587
- dstack-0.19.8.dist-info/RECORD,,
585
+ dstack-0.19.9.dist-info/METADATA,sha256=0gv_xHbluxlydceXCwjWo2m-CjyWGjNiR4gNpBKOpE0,20254
586
+ dstack-0.19.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
587
+ dstack-0.19.9.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
588
+ dstack-0.19.9.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
589
+ dstack-0.19.9.dist-info/RECORD,,