dstack 0.19.25rc1__py3-none-any.whl → 0.19.27__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +293 -58
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +35 -48
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +1 -8
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +9 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +191 -32
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +80 -3
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +28 -5
- dstack/_internal/core/services/configs/__init__.py +6 -3
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +86 -79
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_fleets.py +109 -13
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +22 -73
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_runs.py +3 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +67 -44
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +6 -2
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +11 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +37 -38
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +44 -4
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +53 -17
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +6 -8
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +7 -9
- dstack/_internal/server/services/jobs/configurators/service.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +3 -3
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +17 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
- dstack/_internal/server/testing/common.py +7 -4
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +62 -8
- dstack/api/_public/runs.py +19 -8
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/RECORD +160 -142
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_fleets.py

@@ -1,11 +1,13 @@
 from datetime import timedelta
 from typing import List
+from uuid import UUID
 
 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, load_only
 
-from dstack._internal.core.models.fleets import FleetStatus
+from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -15,7 +17,9 @@ from dstack._internal.server.models import (
     RunModel,
 )
 from dstack._internal.server.services.fleets import (
+    create_fleet_instance_model,
     get_fleet_spec,
+    get_next_instance_num,
     is_fleet_empty,
     is_fleet_in_use,
 )
@@ -65,31 +69,111 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel])
     res = await session.execute(
         select(FleetModel)
         .where(FleetModel.id.in_(fleet_ids))
-        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
         .options(
-            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
+            joinedload(FleetModel.project),
         )
         .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
     fleet_models = list(res.unique().scalars().all())
 
+    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
     deleted_fleets_ids = []
-    now = get_current_datetime()
     for fleet_model in fleet_models:
+        _consolidate_fleet_state_with_spec(session, fleet_model)
         deleted = _autodelete_fleet(fleet_model)
         if deleted:
             deleted_fleets_ids.append(fleet_model.id)
-        fleet_model.last_processed_at =
+        fleet_model.last_processed_at = get_current_datetime()
+    await _update_deleted_fleets_placement_groups(session, deleted_fleets_ids)
+    await session.commit()
 
-
-
-
-
+
+def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
+    if fleet_model.status == FleetStatus.TERMINATING:
+        return
+    fleet_spec = get_fleet_spec(fleet_model)
+    if fleet_spec.configuration.nodes is None or fleet_spec.autocreated:
+        # Only explicitly created cloud fleets are consolidated.
+        return
+    if not _is_fleet_ready_for_consolidation(fleet_model):
+        return
+    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
+    if added_instances:
+        fleet_model.consolidation_attempt += 1
+    else:
+        # The fleet is already consolidated or consolidation is in progress.
+        # We reset consolidation_attempt in both cases for simplicity.
+        # The second case does not need reset but is ok to do since
+        # it means consolidation is longer than delay, so it won't happen too often.
+        # TODO: Reset consolidation_attempt on fleet in-place update.
+        fleet_model.consolidation_attempt = 0
+    fleet_model.last_consolidated_at = get_current_datetime()
+
+
+def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
+    consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
+    last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at
+    duration_since_last_consolidation = get_current_datetime() - last_consolidated_at
+    return duration_since_last_consolidation >= consolidation_retry_delay
+
+
+# We use exponentially increasing consolidation retry delays so that
+# consolidation does not happen too often. In particular, this prevents
+# retrying instance provisioning constantly in case of no offers.
+# TODO: Adjust delays.
+_CONSOLIDATION_RETRY_DELAYS = [
+    timedelta(seconds=30),
+    timedelta(minutes=1),
+    timedelta(minutes=2),
+    timedelta(minutes=5),
+    timedelta(minutes=10),
+]
+
+
+def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
+    if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS):
+        return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt]
+    return _CONSOLIDATION_RETRY_DELAYS[-1]
+
+
+def _maintain_fleet_nodes_min(
+    session: AsyncSession,
+    fleet_model: FleetModel,
+    fleet_spec: FleetSpec,
+) -> bool:
+    """
+    Ensures the fleet has at least `nodes.min` instances.
+    Returns `True` if retried or added new instances and `False` otherwise.
+    """
+    assert fleet_spec.configuration.nodes is not None
+    for instance in fleet_model.instances:
+        # Delete terminated but not deleted instances since
+        # they are going to be replaced with new pending instances.
+        if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
+            # It's safe to modify instances without instance lock since
+            # no other task modifies already terminated instances.
+            instance.deleted = True
+            instance.deleted_at = get_current_datetime()
+    active_instances = [i for i in fleet_model.instances if not i.deleted]
+    active_instances_num = len(active_instances)
+    if active_instances_num >= fleet_spec.configuration.nodes.min:
+        return False
+    nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
+    for i in range(nodes_missing):
+        instance_model = create_fleet_instance_model(
+            session=session,
+            project=fleet_model.project,
+            # TODO: Store fleet.user and pass it instead of the project owner.
+            username=fleet_model.project.owner.name,
+            spec=fleet_spec,
+            instance_num=get_next_instance_num({i.instance_num for i in active_instances}),
         )
-        .
-
-
+        active_instances.append(instance_model)
+        fleet_model.instances.append(instance_model)
+    logger.info("Added %s instances to fleet %s", nodes_missing, fleet_model.name)
+    return True
 
 
 def _autodelete_fleet(fleet_model: FleetModel) -> bool:
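The consolidation retry delay grows with each attempt and is capped by the last entry of the table above. The short standalone Python sketch below reuses the same delay table to show which delay applies to each attempt number; it is only an illustration, not dstack code.

from datetime import timedelta

# Same schedule as _CONSOLIDATION_RETRY_DELAYS in the hunk above; attempts past
# the end of the table keep reusing the last (largest) delay.
CONSOLIDATION_RETRY_DELAYS = [
    timedelta(seconds=30),
    timedelta(minutes=1),
    timedelta(minutes=2),
    timedelta(minutes=5),
    timedelta(minutes=10),
]

def get_consolidation_retry_delay(attempt: int) -> timedelta:
    if attempt < len(CONSOLIDATION_RETRY_DELAYS):
        return CONSOLIDATION_RETRY_DELAYS[attempt]
    return CONSOLIDATION_RETRY_DELAYS[-1]

for attempt in range(7):
    print(attempt, get_consolidation_retry_delay(attempt))
# Attempts 5 and 6 both print 0:10:00: the backoff is capped at ten minutes.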
@@ -100,7 +184,7 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     if (
         fleet_model.status != FleetStatus.TERMINATING
         and fleet_spec.configuration.nodes is not None
-        and
+        and fleet_spec.configuration.nodes.min == 0
     ):
         # Empty fleets that allow 0 nodes should not be auto-deleted
         return False
@@ -110,3 +194,15 @@ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     fleet_model.deleted = True
     logger.info("Fleet %s deleted", fleet_model.name)
     return True
+
+
+async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
+    if len(fleets_ids) == 0:
+        return
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
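The placement-group cleanup added above issues one bulk UPDATE instead of touching rows one by one. The sketch below shows roughly what that statement looks like, using SQLAlchemy Core table stubs in place of dstack's PlacementGroupModel; the column names come from the diff, while the table name and the ids are made up.

import sqlalchemy as sa

# Minimal stand-in for PlacementGroupModel: only the columns the statement touches.
placement_groups = sa.table(
    "placement_groups",
    sa.column("fleet_id"),
    sa.column("fleet_deleted"),
)

deleted_fleet_ids = ["11111111-1111-1111-1111-111111111111"]
stmt = (
    sa.update(placement_groups)
    .where(placement_groups.c.fleet_id.in_(deleted_fleet_ids))
    .values(fleet_deleted=True)
)
# One round trip marks every placement group that belonged to the deleted fleets.
print(stmt)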
dstack/_internal/server/background/tasks/process_gateways.py

@@ -49,8 +49,8 @@ async def process_gateways():
     if gateway_model is None:
         return
     lockset.add(gateway_model.id)
+    gateway_model_id = gateway_model.id
     try:
-        gateway_model_id = gateway_model.id
         initial_status = gateway_model.status
         if initial_status == GatewayStatus.SUBMITTED:
             await _process_submitted_gateway(session=session, gateway_model=gateway_model)
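The one-line move in this hunk (and the matching hunks for instances, running jobs, and runs below) captures the model id before entering try, so the finally clause that releases the lockset entry can never reference an unbound name if the body fails early. A minimal, self-contained sketch of that pattern with illustrative names, not dstack's actual API:

from dataclasses import dataclass
from uuid import UUID, uuid4

lockset: set[UUID] = set()

@dataclass
class Model:
    id: UUID

def process_one(model: Model) -> None:
    lockset.add(model.id)
    # Capture the id before entering `try`: `finally` always runs, so the name
    # it references must already be bound even if the body raises immediately.
    model_id = model.id
    try:
        raise RuntimeError("processing failed")  # stand-in for the real work
    finally:
        lockset.difference_update([model_id])

try:
    process_one(Model(id=uuid4()))
except RuntimeError:
    pass
assert not lockset  # the lock entry was released despite the failure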
@@ -165,6 +165,9 @@ async def _process_provisioning_gateway(
     )
     gateway_model = res.unique().scalar_one()
 
+    # Provisioning gateways must have compute.
+    assert gateway_model.gateway_compute is not None
+
     # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
     # - cannot delete the gateway before it is provisioned because the DB model is locked
     # - connection retry counter is reset on server restart
dstack/_internal/server/background/tasks/process_instances.py

@@ -53,14 +53,12 @@ from dstack._internal.core.models.placement import (
     PlacementStrategy,
 )
 from dstack._internal.core.models.profiles import (
-    RetryEvent,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
     Retry,
 )
-from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -85,8 +83,10 @@ from dstack._internal.server.services.instances import (
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
+    remove_dangling_tasks_from_instance,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
     get_fleet_placement_group_models,
@@ -181,8 +181,8 @@ async def _process_next_instance():
     if instance is None:
         return
     lockset.add(instance.id)
+    instance_model_id = instance.id
     try:
-        instance_model_id = instance.id
         await _process_instance(session=session, instance=instance)
     finally:
         lockset.difference_update([instance_model_id])
@@ -325,7 +325,6 @@ async def _add_remote(instance: InstanceModel) -> None:
             e,
         )
         instance.status = InstanceStatus.PENDING
-        instance.last_retry_at = get_current_datetime()
         return
 
     instance_type = host_info_to_instance_type(host_info, cpu_arch)
@@ -393,6 +392,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         return
 
     region = instance.region
+    assert region is not None  # always set for ssh instances
     jpd = JobProvisioningData(
         backend=BackendType.REMOTE,
         instance_type=instance_type,
@@ -423,7 +423,6 @@ async def _add_remote(instance: InstanceModel) -> None:
     instance.offer = instance_offer.json()
     instance.job_provisioning_data = jpd.json()
     instance.started_at = get_current_datetime()
-    instance.last_retry_at = get_current_datetime()
 
 
 def _deploy_instance(
@@ -490,29 +489,6 @@ def _deploy_instance(
 
 
 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
-    if instance.last_retry_at is not None:
-        last_retry = instance.last_retry_at
-        if get_current_datetime() < last_retry + timedelta(minutes=1):
-            return
-
-    if (
-        instance.profile is None
-        or instance.requirements is None
-        or instance.instance_configuration is None
-    ):
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "Empty profile, requirements or instance_configuration"
-        instance.last_retry_at = get_current_datetime()
-        logger.warning(
-            "Empty profile, requirements or instance_configuration. Terminate instance: %s",
-            instance.name,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
-        return
-
     if _need_to_wait_fleet_provisioning(instance):
         logger.debug("Waiting for the first instance in the fleet to be provisioned")
         return
@@ -526,7 +502,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         instance.termination_reason = (
             f"Error to parse profile, requirements or instance_configuration: {e}"
         )
-        instance.last_retry_at = get_current_datetime()
         logger.warning(
             "Error to parse profile, requirements or instance_configuration. Terminate instance: %s",
             instance.name,
@@ -537,24 +512,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         )
         return
 
-    retry = get_retry(profile)
-    should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events
-
-    if retry is not None:
-        retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
-        if get_current_datetime() > retry_duration_deadline:
-            instance.status = InstanceStatus.TERMINATED
-            instance.termination_reason = "Retry duration expired"
-            logger.warning(
-                "Retry duration expired. Terminating instance %s",
-                instance.name,
-                extra={
-                    "instance_name": instance.name,
-                    "instance_status": InstanceStatus.TERMINATED.value,
-                },
-            )
-            return
-
     placement_group_models = []
     placement_group_model = None
     if instance.fleet_id:
@@ -592,15 +549,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         exclude_not_available=True,
     )
 
-    if not offers and should_retry:
-        instance.last_retry_at = get_current_datetime()
-        logger.debug(
-            "No offers for instance %s. Next retry",
-            instance.name,
-            extra={"instance_name": instance.name},
-        )
-        return
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -678,7 +626,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         instance.offer = instance_offer.json()
         instance.total_blocks = instance_offer.total_blocks
         instance.started_at = get_current_datetime()
-        instance.last_retry_at = get_current_datetime()
 
         logger.info(
             "Created instance %s",
@@ -699,21 +646,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         )
         return
 
-    instance
-
-
-
-
-
-
-
-
-
-
-
-        if sibling_instance.id == instance.id:
-            continue
-        _mark_terminated(sibling_instance, "Master instance failed to start")
+    _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+    if (
+        instance.fleet
+        and _is_fleet_master_instance(instance)
+        and _is_cloud_cluster(instance.fleet)
+    ):
+        # Do not attempt to deploy other instances, as they won't determine the correct cluster
+        # backend, region, and placement group without a successfully deployed master instance
+        for sibling_instance in instance.fleet.instances:
+            if sibling_instance.id == instance.id:
+                continue
+            _mark_terminated(sibling_instance, "Master instance failed to start")
 
 
 def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
@@ -788,6 +732,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
         ssh_private_keys,
         job_provisioning_data,
         None,
+        instance=instance,
        check_instance_health=check_instance_health,
     )
     if instance_check is False:
@@ -934,7 +879,7 @@ async def _wait_for_instance_provisioning_data(
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _check_instance_inner(
-    ports: Dict[int, int], *, check_instance_health: bool = False
+    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
 ) -> InstanceCheck:
     instance_health_response: Optional[InstanceHealthResponse] = None
     shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
@@ -954,6 +899,10 @@ def _check_instance_inner(
             args = (method.__func__.__name__, e.__class__.__name__, e)
             logger.exception(template, *args)
             return InstanceCheck(reachable=False, message=template % args)
+    try:
+        remove_dangling_tasks_from_instance(shim_client, instance)
+    except Exception as e:
+        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)
     return runner_client.healthcheck_response_to_instance_check(
         healthcheck_response, instance_health_response
     )
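The last hunk above treats dangling-task removal as best-effort: a failure is logged with the instance context but does not change the health-check result. A small self-contained sketch of that pattern (all names and the simulated failure are made up):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

def cleanup_dangling_tasks() -> None:
    raise RuntimeError("shim returned 500")  # simulated cleanup failure

def check_instance() -> bool:
    healthy = True  # outcome of the actual health check would go here
    try:
        cleanup_dangling_tasks()
    except Exception as e:
        # Log and continue: the cleanup is advisory, the check result still stands.
        logger.exception("error removing dangling tasks: %s", e)
    return healthy

print(check_instance())  # True, despite the failed cleanup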
dstack/_internal/server/background/tasks/process_probes.py

@@ -120,7 +120,7 @@ async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool:
         method=probe_spec.method,
         url="http://dstack" + probe_spec.url,
         headers=[(h.name, h.value) for h in probe_spec.headers],
-
+        content=probe_spec.body,
         timeout=probe_spec.timeout,
         follow_redirects=False,
     )
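The one-line change above forwards the probe's configured body as the request content, next to method, url, headers, timeout, and follow_redirects. Those parameter names suggest an httpx-style client, so the sketch below builds a comparable request with httpx; the URL, header, and body values are placeholders, and the real probe code may construct its client differently.

import httpx

client = httpx.Client()
request = client.build_request(
    method="POST",
    url="http://dstack/health",   # placeholder, mirroring the "http://dstack" prefix above
    headers=[("x-probe", "1")],
    content=b'{"ping": true}',    # the request body that probes can now send
    timeout=5.0,
)
print(request.method, request.url, request.headers.get("content-length"))
client.close()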
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -41,6 +41,7 @@ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, Vol
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProbeModel,
@@ -128,9 +129,8 @@ async def _process_next_running_job():
     if job_model is None:
         return
     lockset.add(job_model.id)
-
+    job_model_id = job_model.id
     try:
-        job_model_id = job_model.id
         await _process_running_job(session=session, job_model=job_model)
     finally:
         lockset.difference_update([job_model_id])
@@ -152,6 +152,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(joinedload(RunModel.jobs))
     )
     run_model = res.unique().scalar_one()
@@ -170,6 +171,11 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
+    volumes = []
+    secrets = {}
+    cluster_info = None
+    repo_creds = None
+
     initial_status = job_model.status
     if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
         # Wait until all other jobs in the replica are provisioned
@@ -257,6 +263,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 user_ssh_key,
             )
         else:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process provisioning job without shim, age=%s",
                 fmt(job_model),
@@ -275,7 +282,6 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 repo=repo_model,
                 code_hash=_get_repo_code_hash(run, job),
             )
-
             success = await common_utils.run_async(
                 _submit_job_to_runner,
                 server_ssh_private_keys,
@@ -309,6 +315,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
     else:  # fails are not acceptable
         if initial_status == JobStatus.PULLING:
+            assert cluster_info is not None
             logger.debug(
                 "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
             )
@@ -341,7 +348,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 server_ssh_private_keys,
                 job_provisioning_data,
             )
-
+        else:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
@@ -632,6 +639,7 @@ def _process_pulling_with_shim(
     is successful
     """
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    job_runtime_data = None
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
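Several hunks in this file pre-bind optional locals (volumes, secrets, cluster_info, repo_creds, job_runtime_data) before the status branches and then assert cluster_info is not None on the paths that require it. The sketch below shows the general pattern with made-up values: binding the names up front keeps every branch safe to read them, and the assert documents the invariant and narrows the Optional type for checkers.

from typing import Optional

def describe_job(provisioning: bool) -> str:
    # Bind optional locals before branching, as in the hunks above, so every
    # later branch can read them without risking an unbound name.
    volumes: list[str] = []
    cluster_info: Optional[str] = None
    if provisioning:
        cluster_info = "master=10.0.0.1"
        volumes.append("vol-1")
    if cluster_info is not None:
        return f"provisioning with {cluster_info}, volumes={volumes}"
    return f"running, volumes={volumes}"

print(describe_job(True))
print(describe_job(False))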
dstack/_internal/server/background/tasks/process_runs.py

@@ -21,6 +21,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProjectModel,
@@ -129,8 +130,8 @@ async def _process_next_run():
     job_ids = [j.id for j in run_model.jobs]
     run_lockset.add(run_model.id)
     job_lockset.update(job_ids)
+    run_model_id = run_model.id
     try:
-        run_model_id = run_model.id
         await _process_run(session=session, run_model=run_model)
     finally:
         run_lockset.difference_update([run_model_id])
@@ -145,6 +146,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
         .execution_options(populate_existing=True)
         .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
         .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(
             selectinload(RunModel.jobs)
             .joinedload(JobModel.instance)
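Both process_runs.py and process_running_jobs.py now eager-load the run's fleet while fetching only its id and name. The self-contained sketch below reproduces that loader-option pattern (joinedload chained with load_only) against throwaway models and an in-memory SQLite database; the Fleet and Run classes here are illustrative, not dstack's.

import sqlalchemy as sa
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, joinedload, mapped_column, relationship

class Base(DeclarativeBase):
    pass

class Fleet(Base):
    __tablename__ = "fleets"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]
    spec: Mapped[str]  # a wide column we do not want to fetch here

class Run(Base):
    __tablename__ = "runs"
    id: Mapped[int] = mapped_column(primary_key=True)
    fleet_id: Mapped[int] = mapped_column(sa.ForeignKey("fleets.id"))
    fleet: Mapped["Fleet"] = relationship()

engine = sa.create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Run(fleet=Fleet(name="my-fleet", spec="...")))
    session.commit()
    stmt = sa.select(Run).options(
        # Join the fleet in the same SELECT, but load only its id and name columns.
        joinedload(Run.fleet).load_only(Fleet.id, Fleet.name)
    )
    run = session.execute(stmt).unique().scalar_one()
    print(run.fleet.name)  # prints "my-fleet" without loading Fleet.spec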