dstack 0.19.25rc1__py3-none-any.whl → 0.19.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (161)
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +293 -58
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +35 -48
  27. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  28. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  29. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  30. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  31. dstack/_internal/core/backends/aws/compute.py +6 -1
  32. dstack/_internal/core/backends/aws/configurator.py +11 -7
  33. dstack/_internal/core/backends/azure/configurator.py +11 -7
  34. dstack/_internal/core/backends/base/compute.py +33 -5
  35. dstack/_internal/core/backends/base/configurator.py +25 -13
  36. dstack/_internal/core/backends/base/offers.py +2 -0
  37. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  38. dstack/_internal/core/backends/configurators.py +15 -0
  39. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  40. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  41. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  42. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  43. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  44. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  45. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  46. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  47. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  48. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  49. dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
  50. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  51. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  52. dstack/_internal/core/backends/gcp/compute.py +32 -8
  53. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  54. dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
  55. dstack/_internal/core/backends/hotaisle/compute.py +1 -6
  56. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  57. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  58. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  59. dstack/_internal/core/backends/models.py +7 -0
  60. dstack/_internal/core/backends/nebius/compute.py +1 -8
  61. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  62. dstack/_internal/core/backends/nebius/resources.py +21 -11
  63. dstack/_internal/core/backends/oci/compute.py +4 -5
  64. dstack/_internal/core/backends/oci/configurator.py +11 -7
  65. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  66. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  67. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  68. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  69. dstack/_internal/core/backends/vultr/compute.py +1 -5
  70. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  71. dstack/_internal/core/compatibility/fleets.py +5 -0
  72. dstack/_internal/core/compatibility/gpus.py +13 -0
  73. dstack/_internal/core/compatibility/runs.py +9 -1
  74. dstack/_internal/core/models/backends/base.py +5 -1
  75. dstack/_internal/core/models/common.py +3 -3
  76. dstack/_internal/core/models/configurations.py +191 -32
  77. dstack/_internal/core/models/files.py +1 -1
  78. dstack/_internal/core/models/fleets.py +80 -3
  79. dstack/_internal/core/models/profiles.py +41 -11
  80. dstack/_internal/core/models/resources.py +46 -42
  81. dstack/_internal/core/models/runs.py +28 -5
  82. dstack/_internal/core/services/configs/__init__.py +6 -3
  83. dstack/_internal/core/services/profiles.py +2 -2
  84. dstack/_internal/core/services/repos.py +86 -79
  85. dstack/_internal/core/services/ssh/ports.py +1 -1
  86. dstack/_internal/proxy/lib/deps.py +6 -2
  87. dstack/_internal/server/app.py +22 -17
  88. dstack/_internal/server/background/tasks/process_fleets.py +109 -13
  89. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  90. dstack/_internal/server/background/tasks/process_instances.py +22 -73
  91. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  92. dstack/_internal/server/background/tasks/process_running_jobs.py +12 -4
  93. dstack/_internal/server/background/tasks/process_runs.py +3 -1
  94. dstack/_internal/server/background/tasks/process_submitted_jobs.py +67 -44
  95. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  96. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  97. dstack/_internal/server/db.py +8 -4
  98. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  99. dstack/_internal/server/models.py +6 -2
  100. dstack/_internal/server/routers/gpus.py +1 -6
  101. dstack/_internal/server/schemas/runner.py +11 -0
  102. dstack/_internal/server/services/backends/__init__.py +14 -8
  103. dstack/_internal/server/services/backends/handlers.py +6 -1
  104. dstack/_internal/server/services/docker.py +5 -5
  105. dstack/_internal/server/services/fleets.py +37 -38
  106. dstack/_internal/server/services/gateways/__init__.py +2 -0
  107. dstack/_internal/server/services/gateways/client.py +5 -2
  108. dstack/_internal/server/services/gateways/connection.py +1 -1
  109. dstack/_internal/server/services/gpus.py +50 -49
  110. dstack/_internal/server/services/instances.py +44 -4
  111. dstack/_internal/server/services/jobs/__init__.py +15 -4
  112. dstack/_internal/server/services/jobs/configurators/base.py +53 -17
  113. dstack/_internal/server/services/jobs/configurators/dev.py +9 -4
  114. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +6 -8
  115. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +7 -9
  116. dstack/_internal/server/services/jobs/configurators/service.py +1 -3
  117. dstack/_internal/server/services/jobs/configurators/task.py +3 -3
  118. dstack/_internal/server/services/locking.py +5 -5
  119. dstack/_internal/server/services/logging.py +10 -2
  120. dstack/_internal/server/services/logs/__init__.py +8 -6
  121. dstack/_internal/server/services/logs/aws.py +330 -327
  122. dstack/_internal/server/services/logs/filelog.py +7 -6
  123. dstack/_internal/server/services/logs/gcp.py +141 -139
  124. dstack/_internal/server/services/plugins.py +1 -1
  125. dstack/_internal/server/services/projects.py +2 -5
  126. dstack/_internal/server/services/proxy/repo.py +5 -1
  127. dstack/_internal/server/services/requirements/__init__.py +0 -0
  128. dstack/_internal/server/services/requirements/combine.py +259 -0
  129. dstack/_internal/server/services/runner/client.py +7 -0
  130. dstack/_internal/server/services/runs.py +17 -1
  131. dstack/_internal/server/services/services/__init__.py +8 -2
  132. dstack/_internal/server/services/services/autoscalers.py +2 -0
  133. dstack/_internal/server/services/ssh.py +2 -1
  134. dstack/_internal/server/services/storage/__init__.py +5 -6
  135. dstack/_internal/server/services/storage/gcs.py +49 -49
  136. dstack/_internal/server/services/storage/s3.py +52 -52
  137. dstack/_internal/server/statics/index.html +1 -1
  138. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
  139. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
  140. dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
  141. dstack/_internal/server/testing/common.py +7 -4
  142. dstack/_internal/server/utils/logging.py +3 -3
  143. dstack/_internal/server/utils/provisioning.py +3 -3
  144. dstack/_internal/utils/json_schema.py +3 -1
  145. dstack/_internal/utils/path.py +8 -1
  146. dstack/_internal/utils/ssh.py +7 -0
  147. dstack/_internal/utils/typing.py +14 -0
  148. dstack/api/_public/repos.py +62 -8
  149. dstack/api/_public/runs.py +19 -8
  150. dstack/api/server/__init__.py +17 -19
  151. dstack/api/server/_gpus.py +2 -1
  152. dstack/api/server/_group.py +4 -3
  153. dstack/api/server/_repos.py +20 -3
  154. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  155. dstack/version.py +1 -1
  156. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
  157. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/RECORD +160 -142
  158. dstack/api/huggingface/__init__.py +0 -73
  159. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
  160. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
  161. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_fleets.py

@@ -1,11 +1,13 @@
 from datetime import timedelta
 from typing import List
+from uuid import UUID
 
 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, load_only
 
-from dstack._internal.core.models.fleets import FleetStatus
+from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
     RunModel,
 )
 from dstack._internal.server.services.fleets import (
+    create_fleet_instance_model,
     get_fleet_spec,
+    get_next_instance_num,
     is_fleet_empty,
     is_fleet_in_use,
 )
@@ -65,31 +69,111 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
     res = await session.execute(
         select(FleetModel)
         .where(FleetModel.id.in_(fleet_ids))
-        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
         .options(
-            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
+            joinedload(FleetModel.project),
         )
         .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
     fleet_models = list(res.unique().scalars().all())
 
+    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
     deleted_fleets_ids = []
-    now = get_current_datetime()
     for fleet_model in fleet_models:
+        _consolidate_fleet_state_with_spec(session, fleet_model)
         deleted = _autodelete_fleet(fleet_model)
         if deleted:
             deleted_fleets_ids.append(fleet_model.id)
-        fleet_model.last_processed_at = now
+        fleet_model.last_processed_at = get_current_datetime()
+    await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
+    await session.commit()
 
-    await session.execute(
-        update(PlacementGroupModel)
-        .where(
-            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
+
+def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
+    if fleet_model.status == FleetStatus.TERMINATING:
+        return
+    fleet_spec = get_fleet_spec(fleet_model)
+    if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
+        # Only explicitly created cloud fleets are consolidated.
+        return
+    if not _is_fleet_ready_for_consolidation(fleet_model):
+        return
+    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
+    if added_instances:
+        fleet_model.consolidation_attempt += 1
+    else:
+        # The fleet is already consolidated or consolidation is in progress.
+        # We reset consolidation_attempt in both cases for simplicity.
+        # The second case does not need reset but is ok to do since
+        # it means consolidation is longer than delay, so it won't happen too often.
+        # TODO: Reset consolidation_attempt on fleet in-place update.
+        fleet_model.consolidation_attempt = 0
+    fleet_model.last_consolidated_at = get_current_datetime()
+
+
+def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
+    consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
+    last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
+    duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
+    return duration_since_last_consolidation >= consolidation_retry_delay
+
+
+# We use exponentially increasing consolidation retry delays so that
+# consolidation does not happen too often. In particular, this prevents
+# retrying instance provisioning constantly in case of no offers.
+# TODO: Adjust delays.
+_CONSOLIDATION_RETRY_DELAYS = [
+    timedelta(seconds=30),
+    timedelta(minutes=1),
+    timedelta(minutes=2),
+    timedelta(minutes=5),
+    timedelta(minutes=10),
+]
+
+
+def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
+    if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
+        return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
+    return _CONSOLIDATION_RETRY_DELAYS[-1]
+
+
+def _maintain_fleet_nodes_min(
+    session: AsyncSession,
+    fleet_model: FleetModel,
+    fleet_spec: FleetSpec,
+) -> bool:
+    """
+    Ensures the fleet has at least `nodes.min` instances.
+    Returns `True` if retried or added new instances and `False` otherwise.
+    """
+    assert fleet_spec.configuration.nodes is not None
+    for instance in fleet_model.instances:
+        # Delete terminated but not deleted instances since
+        # they are going to be replaced with new pending instances.
+        if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
+            # It's safe to modify instances without instance lock since
+            # no other task modifies already terminated instances.
+            instance.deleted = True
+            instance.deleted_at = get_current_datetime()
+    active_instances = [i for i in fleet_model.instances if not i.deleted]
+    active_instances_num = len(active_instances)
+    if active_instances_num >= fleet_spec.configuration.nodes.min:
+        return False
+    nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
+    for i in range(nodes_missing):
+        instance_model = create_fleet_instance_model(
+            session=session,
+            project=fleet_model.project,
+            # TODO: Store fleet.user and pass it instead of the project owner.
+            username=fleet_model.project.owner.name,
+            spec=fleet_spec,
+            instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
         )
-        .values(fleet_deleted=True)
-    )
-    await session.commit()
+        active_instances.append(instance_model)
+        fleet_model.instances.append(instance_model)
+    logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
+    return True
 
 
 def _autodelete_fleet(fleet_model: FleetModel) -> bool:
@@ -100,7 +184,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     if (
         fleet_model.status != FleetStatus.TERMINATING
         and fleet_spec.configuration.nodes is not None
-        and (fleet_spec.configuration.nodes.min is None or fleet_spec.configuration.nodes.min == 0)
+        and fleet_spec.configuration.nodes.min == 0
     ):
         # Empty fleets that allow 0 nodes should not be auto-deleted
         return False
@@ -110,3 +194,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     fleet_model.deleted = True
     logger.info("Fleet %s deleted", fleet_model.name)
     return True
+
+
+async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
+    if len(fleets_ids) == 0:
+        return
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
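
Note: the new consolidation backoff is a capped lookup table rather than a computed exponential. A standalone sketch of the same selection behavior (illustrative names, not dstack's API; the delay values are copied from the hunk above):

from datetime import timedelta

# Delay table mirroring _CONSOLIDATION_RETRY_DELAYS: 30s growing to a 10-minute cap.
DELAYS = [
    timedelta(seconds=30),
    timedelta(minutes=1),
    timedelta(minutes=2),
    timedelta(minutes=5),
    timedelta(minutes=10),
]

def retry_delay(attempt: int) -> timedelta:
    # Attempts past the end of the table keep reusing the last (largest) delay.
    return DELAYS[min(attempt, len(DELAYS) - 1)]

assert retry_delay(0) == timedelta(seconds=30)
assert retry_delay(2) == timedelta(minutes=2)
assert retry_delay(99) == timedelta(minutes=10)  # capped, never grows further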

dstack/_internal/server/background/tasks/process_gateways.py

@@ -49,8 +49,8 @@ async def process_gateways():
         if gateway_model is None:
             return
         lockset.add(gateway_model.id)
+    gateway_model_id = gateway_model.id
     try:
-        gateway_model_id = gateway_model.id
         initial_status = gateway_model.status
         if initial_status == GatewayStatus.SUBMITTED:
             await _process_submitted_gateway(session=session, gateway_model=gateway_model)
@@ -165,6 +165,9 @@ async def _process_provisioning_gateway(
     )
     gateway_model = res.unique().scalar_one()
 
+    # Provisioning gateways must have compute.
+    assert gateway_model.gateway_compute is not None
+
     # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
     # - cannot delete the gateway before it is provisioned because the DB model is locked
     # - connection retry counter is reset on server restart
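
Note: this release applies the same locking pattern here and in process_instances.py, process_running_jobs.py, and process_runs.py: the id used by the finally cleanup is bound before entering try, so the cleanup can never reference an unbound local if the body raises on its first statement. A minimal illustration (lockset and handle are stand-ins, not dstack code):

lockset: set[int] = set()

def handle(obj_id: int) -> None:
    raise RuntimeError("fails on the first line of the try body")

def process(obj_id: int) -> None:
    lockset.add(obj_id)
    captured_id = obj_id  # bound before try, so finally always sees it
    try:
        handle(obj_id)
    finally:
        lockset.discard(captured_id)  # safe even if handle() raised immediately

try:
    process(42)
except RuntimeError:
    pass
assert 42 not in lockset  # the lock is released despite the early failure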

dstack/_internal/server/background/tasks/process_instances.py

@@ -53,14 +53,12 @@ from dstack._internal.core.models.placement import (
     PlacementStrategy,
 )
 from dstack._internal.core.models.profiles import (
-    RetryEvent,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
     Retry,
 )
-from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -85,8 +83,10 @@ from dstack._internal.server.services.instances import (
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
+    remove_dangling_tasks_from_instance,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
     get_fleet_placement_group_models,
@@ -181,8 +181,8 @@ async def _process_next_instance():
         if instance is None:
             return
         lockset.add(instance.id)
+    instance_model_id = instance.id
     try:
-        instance_model_id = instance.id
         await _process_instance(session=session, instance=instance)
     finally:
         lockset.difference_update([instance_model_id])
@@ -325,7 +325,6 @@ async def _add_remote(instance: InstanceModel) -> None:
             e,
         )
         instance.status = InstanceStatus.PENDING
-        instance.last_retry_at = get_current_datetime()
         return
 
     instance_type = host_info_to_instance_type(host_info, cpu_arch)
@@ -393,6 +392,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         return
 
     region = instance.region
+    assert region is not None  # always set for ssh instances
     jpd = JobProvisioningData(
         backend=BackendType.REMOTE,
         instance_type=instance_type,
@@ -423,7 +423,6 @@ async def _add_remote(instance: InstanceModel) -> None:
     instance.offer = instance_offer.json()
     instance.job_provisioning_data = jpd.json()
     instance.started_at = get_current_datetime()
-    instance.last_retry_at = get_current_datetime()
 
 
 def _deploy_instance(
@@ -490,29 +489,6 @@ def _deploy_instance(
 
 
 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
-    if instance.last_retry_at is not None:
-        last_retry = instance.last_retry_at
-        if get_current_datetime() < last_retry + timedelta(minutes=1):
-            return
-
-    if (
-        instance.profile is None
-        or instance.requirements is None
-        or instance.instance_configuration is None
-    ):
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "Empty profile, requirements or instance_configuration"
-        instance.last_retry_at = get_current_datetime()
-        logger.warning(
-            "Empty profile, requirements or instance_configuration. Terminate instance: %s",
-            instance.name,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
-        return
-
     if _need_to_wait_fleet_provisioning(instance):
         logger.debug("Waiting for the first instance in the fleet to be provisioned")
         return
@@ -526,7 +502,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         instance.termination_reason = (
            f"Error to parse profile, requirements or instance_configuration: {e}"
         )
-        instance.last_retry_at = get_current_datetime()
         logger.warning(
             "Error to parse profile, requirements or instance_configuration. Terminate instance: %s",
             instance.name,
@@ -537,24 +512,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         )
         return
 
-    retry = get_retry(profile)
-    should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events
-
-    if retry is not None:
-        retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
-        if get_current_datetime() > retry_duration_deadline:
-            instance.status = InstanceStatus.TERMINATED
-            instance.termination_reason = "Retry duration expired"
-            logger.warning(
-                "Retry duration expired. Terminating instance %s",
-                instance.name,
-                extra={
-                    "instance_name": instance.name,
-                    "instance_status": InstanceStatus.TERMINATED.value,
-                },
-            )
-            return
-
     placement_group_models = []
     placement_group_model = None
     if instance.fleet_id:
@@ -592,15 +549,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         exclude_not_available=True,
     )
 
-    if not offers and should_retry:
-        instance.last_retry_at = get_current_datetime()
-        logger.debug(
-            "No offers for instance %s. Next retry",
-            instance.name,
-            extra={"instance_name": instance.name},
-        )
-        return
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -678,7 +626,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
        instance.offer = instance_offer.json()
        instance.total_blocks = instance_offer.total_blocks
        instance.started_at = get_current_datetime()
-        instance.last_retry_at = get_current_datetime()
 
        logger.info(
            "Created instance %s",
@@ -699,21 +646,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         )
         return
 
-    instance.last_retry_at = get_current_datetime()
-
-    if not should_retry:
-        _mark_terminated(instance, "All offers failed" if offers else "No offers found")
-        if (
-            instance.fleet
-            and _is_fleet_master_instance(instance)
-            and _is_cloud_cluster(instance.fleet)
-        ):
-            # Do not attempt to deploy other instances, as they won't determine the correct cluster
-            # backend, region, and placement group without a successfully deployed master instance
-            for sibling_instance in instance.fleet.instances:
-                if sibling_instance.id == instance.id:
-                    continue
-                _mark_terminated(sibling_instance, "Master instance failed to start")
+    _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+    if (
+        instance.fleet
+        and _is_fleet_master_instance(instance)
+        and _is_cloud_cluster(instance.fleet)
+    ):
+        # Do not attempt to deploy other instances, as they won't determine the correct cluster
+        # backend, region, and placement group without a successfully deployed master instance
+        for sibling_instance in instance.fleet.instances:
+            if sibling_instance.id == instance.id:
+                continue
+            _mark_terminated(sibling_instance, "Master instance failed to start")
 
 
 def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
@@ -788,6 +732,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
         ssh_private_keys,
         job_provisioning_data,
         None,
+        instance=instance,
         check_instance_health=check_instance_health,
     )
     if instance_check is False:
@@ -934,7 +879,7 @@ async def _wait_for_instance_provisioning_data(
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _check_instance_inner(
-    ports: Dict[int, int], *, check_instance_health: bool = False
+    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
 ) -> InstanceCheck:
     instance_health_response: Optional[InstanceHealthResponse] = None
     shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
@@ -954,6 +899,10 @@ def _check_instance_inner(
             args = (method.__func__.__name__, e.__class__.__name__, e)
             logger.exception(template, *args)
             return InstanceCheck(reachable=False, message=template % args)
+    try:
+        remove_dangling_tasks_from_instance(shim_client, instance)
+    except Exception as e:
+        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)
     return runner_client.healthcheck_response_to_instance_check(
         healthcheck_response, instance_health_response
     )
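
Note: the dangling-task cleanup added to _check_instance_inner is wrapped in a broad try/except so that a cleanup failure degrades to a log line instead of failing the health check. A minimal sketch of that best-effort pattern (Shim is a stand-in, not the dstack shim client):

import logging

logger = logging.getLogger("check")

class Shim:
    def healthcheck(self) -> str:
        return "ok"

    def remove_dangling_tasks(self) -> None:
        raise RuntimeError("cleanup failed")

def check_instance(shim: Shim) -> str:
    status = shim.healthcheck()  # the primary result we must always return
    try:
        shim.remove_dangling_tasks()  # opportunistic housekeeping
    except Exception as e:
        # Log and continue: cleanup must never turn a healthy check into a failure.
        logger.exception("error removing dangling tasks: %s", e)
    return status

assert check_instance(Shim()) == "ok"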

dstack/_internal/server/background/tasks/process_probes.py

@@ -120,7 +120,7 @@ async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool:
             method=probe_spec.method,
             url="http://dstack" + probe_spec.url,
             headers=[(h.name, h.value) for h in probe_spec.headers],
-            data=probe_spec.body,
+            content=probe_spec.body,
             timeout=probe_spec.timeout,
             follow_redirects=False,
         )
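
Note: the data= to content= change matches httpx's request API, where content= carries a raw request body and data= is reserved for form-encoded fields (passing raw str/bytes via data= is deprecated in current httpx). A hedged sketch, assuming the probe client is httpx (the URL and body below are hypothetical):

import httpx

async def send_probe(client: httpx.AsyncClient, body: bytes) -> bool:
    resp = await client.request(
        method="GET",
        url="http://dstack/health",  # hypothetical probe target
        content=body,                # raw body, as in the fixed call above
        timeout=5.0,
        follow_redirects=False,
    )
    return 200 <= resp.status_code < 300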

dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -41,6 +41,7 @@ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, Vol
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProbeModel,
@@ -128,9 +129,8 @@ async def _process_next_running_job():
         if job_model is None:
             return
         lockset.add(job_model.id)
-
+    job_model_id = job_model.id
     try:
-        job_model_id = job_model.id
         await _process_running_job(session=session, job_model=job_model)
     finally:
         lockset.difference_update([job_model_id])
@@ -152,6 +152,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(joinedload(RunModel.jobs))
     )
     run_model = res.unique().scalar_one()
@@ -170,6 +171,11 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
+    volumes = []
+    secrets = {}
+    cluster_info = None
+    repo_creds = None
+
     initial_status = job_model.status
     if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
         # Wait until all other jobs in the replica are provisioned
@@ -257,6 +263,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                     user_ssh_key,
                 )
             else:
+                assert cluster_info is not None
                 logger.debug(
                     "%s: process provisioning job without shim, age=%s",
                     fmt(job_model),
@@ -275,7 +282,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 repo=repo_model,
                 code_hash=_get_repo_code_hash(run, job),
             )
-
             success = await common_utils.run_async(
                 _submit_job_to_runner,
                 server_ssh_private_keys,
@@ -309,6 +315,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     else:  # fails are not acceptable
         if initial_status == JobStatus.PULLING:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
             )
@@ -341,7 +348,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 server_ssh_private_keys,
                 job_provisioning_data,
             )
-        elif initial_status == JobStatus.RUNNING:
+        else:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
@@ -632,6 +639,7 @@ def _process_pulling_with_shim(
     is successful
     """
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    job_runtime_data = None
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
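
Note: pre-initializing volumes, secrets, cluster_info, and repo_creds before the branches, then asserting cluster_info is not None on the paths that use it, appears intended to guard against possibly-unbound locals at runtime while also narrowing Optional types for static checkers. A small self-contained sketch of the pattern (illustrative names only):

from typing import Optional

class ClusterInfo:
    def __init__(self, master_addr: str) -> None:
        self.master_addr = master_addr

def process(provisioning: bool) -> Optional[str]:
    cluster_info: Optional[ClusterInfo] = None  # bound on every path
    if provisioning:
        cluster_info = ClusterInfo("10.0.0.1")
    # ... branches that never touch cluster_info may run in between ...
    if provisioning:
        assert cluster_info is not None  # narrows Optional[ClusterInfo] to ClusterInfo
        return cluster_info.master_addr
    return None

assert process(True) == "10.0.0.1"
assert process(False) is None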

dstack/_internal/server/background/tasks/process_runs.py

@@ -21,6 +21,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProjectModel,
@@ -129,8 +130,8 @@ async def _process_next_run():
         job_ids = [j.id for j in run_model.jobs]
         run_lockset.add(run_model.id)
         job_lockset.update(job_ids)
+    run_model_id = run_model.id
     try:
-        run_model_id = run_model.id
         await _process_run(session=session, run_model=run_model)
     finally:
         run_lockset.difference_update([run_model_id])
@@ -145,6 +146,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
         .execution_options(populate_existing=True)
         .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
         .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(
             selectinload(RunModel.jobs)
             .joinedload(JobModel.instance)
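
Note: the added .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) eager-loads the related fleet while fetching only the two columns these tasks read. A self-contained SQLAlchemy 2.0 sketch of that loader combination (toy models, not dstack's):

from typing import Optional

from sqlalchemy import ForeignKey, create_engine, select
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    Session,
    joinedload,
    mapped_column,
    relationship,
)

class Base(DeclarativeBase):
    pass

class Fleet(Base):
    __tablename__ = "fleets"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]
    spec: Mapped[str]  # a wide column we deliberately avoid fetching

class Run(Base):
    __tablename__ = "runs"
    id: Mapped[int] = mapped_column(primary_key=True)
    fleet_id: Mapped[Optional[int]] = mapped_column(ForeignKey("fleets.id"))
    fleet: Mapped[Optional[Fleet]] = relationship()

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Run(fleet=Fleet(name="fleet-1", spec="...")))
    session.commit()
    # Eager-load the related fleet, but only its id and name columns.
    stmt = select(Run).options(joinedload(Run.fleet).load_only(Fleet.id, Fleet.name))
    run = session.execute(stmt).unique().scalar_one()
    print(run.fleet.name)  # already loaded; no extra SELECT needed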