dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +30 -9
- dstack/_internal/core/backends/aws/compute.py +94 -53
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +32 -24
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +23 -3
- dstack/_internal/core/models/volumes.py +26 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +73 -35
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +27 -23
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +32 -3
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +217 -45
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +96 -10
- dstack/_internal/server/services/pools.py +98 -14
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +9 -6
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +130 -61
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +109 -1
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -15,10 +15,7 @@ from dstack._internal.core.models.fleets import (
     FleetStatus,
     InstanceGroupPlacement,
 )
-from dstack._internal.core.models.instances import
-    InstanceOfferWithAvailability,
-    InstanceStatus,
-)
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
 from dstack._internal.core.models.profiles import (
     DEFAULT_POOL_NAME,
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
@@ -26,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     Profile,
     TerminationPolicy,
 )
+from dstack._internal.core.models.resources import Memory
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -45,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
@@ -52,28 +51,31 @@ from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
 )
 from dstack._internal.server.services.jobs import (
+    check_can_attach_job_volumes,
     find_job,
     get_instances_ids_with_detaching_volumes,
+    get_job_configured_volume_models,
+    get_job_configured_volumes,
+    get_job_runtime_data,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.pools import (
     filter_pool_instances,
+    get_instance_offer,
     get_instance_provisioning_data,
+    get_shared_pool_instances_with_offers,
 )
 from dstack._internal.server.services.runs import (
-    check_can_attach_run_volumes,
     check_run_spec_requires_instance_mounts,
-    get_offer_volumes,
-    get_run_volume_models,
-    get_run_volumes,
     run_model_to_run,
 )
 from dstack._internal.server.services.volumes import (
     volume_model_to_volume,
 )
 from dstack._internal.utils import common as common_utils
+from dstack._internal.utils import env as env_utils
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
@@ -152,17 +154,21 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         await session.commit()
         return
     try:
-        volume_models = await
+        volume_models = await get_job_configured_volume_models(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-        volumes = await
+        volumes = await get_job_configured_volumes(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-
+        check_can_attach_job_volumes(volumes)
     except ServerClientError as e:
         logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
         job_model.status = JobStatus.TERMINATING
@@ -186,12 +192,12 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             .where(
                 InstanceModel.pool_id == pool.id,
                 InstanceModel.deleted == False,
-                InstanceModel.
+                InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
-            .options(lazyload(InstanceModel.
+            .options(lazyload(InstanceModel.jobs))
             .with_for_update()
         )
-        pool_instances = list(res.scalars().all())
+        pool_instances = list(res.unique().scalars().all())
         instances_ids = sorted([i.id for i in pool_instances])
         if get_db().dialect_name == "sqlite":
             # Start new transaction to see commited changes after lock
@@ -202,14 +208,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
         # Refetch after lock
         res = await session.execute(
-            select(InstanceModel)
+            select(InstanceModel)
+            .where(
                 InstanceModel.id.not_in(detaching_instances_ids),
                 InstanceModel.id.in_(instances_ids),
                 InstanceModel.deleted == False,
-                InstanceModel.
+                InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
+            .execution_options(populate_existing=True)
         )
-        pool_instances = list(res.scalars().all())
+        pool_instances = list(res.unique().scalars().all())
         instance = await _assign_job_to_pool_instance(
             session=session,
             pool_instances=pool_instances,
@@ -221,8 +229,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             volumes=volumes,
         )
         job_model.instance_assigned = True
-        if instance is not None:
-            job_model.job_runtime_data = _prepare_job_runtime_data(job, instance).json()
         job_model.last_processed_at = common_utils.get_current_datetime()
         await session.commit()
         return
@@ -231,10 +237,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         res = await session.execute(
             select(InstanceModel)
             .where(InstanceModel.id == job_model.instance.id)
-            .options(selectinload(InstanceModel.
+            .options(selectinload(InstanceModel.volume_attachments))
             .execution_options(populate_existing=True)
         )
-        instance = res.scalar_one()
+        instance = res.unique().scalar_one()
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
@@ -290,7 +296,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             offer=offer,
             instance_num=instance_num,
         )
-        job_model.job_runtime_data = _prepare_job_runtime_data(
+        job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
         instance.fleet_id = fleet_model.id
         logger.info(
             "The job %s created the new instance %s",
@@ -351,30 +357,50 @@ async def _assign_job_to_pool_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> Optional[InstanceModel]:
+    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]]
     profile = run_spec.merged_profile
-
+    multinode = job.job_spec.jobs_per_replica > 1
+    nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
         profile=profile,
         requirements=job.job_spec.requirements,
         status=InstanceStatus.IDLE,
         fleet_model=fleet_model,
-        multinode=
+        multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
         volumes=volumes,
+        shared=False,
     )
-
+    instances_with_offers = [
+        (instance, common_utils.get_or_error(get_instance_offer(instance)))
+        for instance in nonshared_instances
+    ]
+    if not multinode:
+        shared_instances_with_offers = get_shared_pool_instances_with_offers(
+            pool_instances=pool_instances,
+            profile=profile,
+            requirements=job.job_spec.requirements,
+            idle_only=True,
+            fleet_model=fleet_model,
+            volumes=volumes,
+        )
+        instances_with_offers.extend(shared_instances_with_offers)
+
+    if len(instances_with_offers) == 0:
         return None
-
-
-
+
+    instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
+    instance, offer = instances_with_offers[0]
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
-    instance.
+    instance.busy_blocks += offer.blocks
+
     logger.info(
         "The job %s switched instance %s status to BUSY",
         job_model.job_name,
@@ -385,8 +411,10 @@ async def _assign_job_to_pool_instance(
         },
     )
     logger.info("%s: now is provisioning on '%s'", fmt(job_model), instance.name)
-    job_model.
+    job_model.instance = instance
     job_model.used_instance_id = instance.id
+    job_model.job_provisioning_data = instance.job_provisioning_data
+    job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
     return instance


@@ -431,7 +459,7 @@ async def _run_job_on_new_instance(
         offer.region,
         offer.price,
     )
-    offer_volumes =
+    offer_volumes = _get_offer_volumes(volumes, offer)
     try:
         job_provisioning_data = await common_utils.run_async(
             backend.compute().run_job,
@@ -549,29 +577,64 @@ def _create_instance_model_for_job(
         offer=offer.json(),
         termination_policy=termination_policy,
         termination_idle_time=termination_idle_time,
-
+        jobs=[job_model],
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-
+        volume_attachments=[],
+        total_blocks=1,
+        busy_blocks=1,
     )
     return instance


-def _prepare_job_runtime_data(
-    if
-
-
-
-
-
+def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
+    if offer.total_blocks == 1:
+        if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
+            network_mode = NetworkMode.BRIDGE
+        else:
+            network_mode = NetworkMode.HOST
+        return JobRuntimeData(
+            network_mode=network_mode,
+            offer=offer,
+        )
+    return JobRuntimeData(
+        network_mode=NetworkMode.BRIDGE,
+        offer=offer,
+        cpu=offer.instance.resources.cpus,
+        gpu=len(offer.instance.resources.gpus),
+        memory=Memory(offer.instance.resources.memory_mib / 1024),
+    )

-    if not is_shared_instance:
-        return JobRuntimeData(network_mode=NetworkMode.HOST)

-
-
+def _get_offer_volumes(
+    volumes: List[List[Volume]],
+    offer: InstanceOfferWithAvailability,
+) -> List[Volume]:
+    """
+    Returns volumes suitable for the offer for each mount point.
+    """
+    offer_volumes = []
+    for mount_point_volumes in volumes:
+        offer_volumes.append(_get_offer_mount_point_volume(mount_point_volumes, offer))
+    return offer_volumes
+
+
+def _get_offer_mount_point_volume(
+    volumes: List[Volume],
+    offer: InstanceOfferWithAvailability,
+) -> Volume:
+    """
+    Returns the first suitable volume for the offer among possible mount point volumes.
+    """
+    for volume in volumes:
+        if (
+            volume.configuration.backend != offer.backend
+            or volume.configuration.region != offer.region
+        ):
+            continue
+        return volume
+    raise ServerClientError("Failed to find an eligible volume for the mount point")


 async def _attach_volumes(
@@ -586,6 +649,8 @@ async def _attach_volumes(
         project=project,
         backend_type=job_provisioning_data.backend,
     )
+    job_runtime_data = common_utils.get_or_error(get_job_runtime_data(job_model))
+    job_runtime_data.volume_names = []
     logger.info("Attaching volumes: %s", [[v.name for v in vs] for vs in volume_models])
     for mount_point_volume_models in volume_models:
         for volume_model in mount_point_volume_models:
@@ -604,6 +669,7 @@ async def _attach_volumes(
                     instance=instance,
                     instance_id=job_provisioning_data.instance_id,
                 )
+                job_runtime_data.volume_names.append(volume.name)
                 break  # attach next mount point
             except (ServerClientError, BackendError) as e:
                 logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
@@ -620,6 +686,8 @@ async def _attach_volumes(
                 # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
                 job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
                 job_model.termination_reason_message = "Failed to attach volume"
+            finally:
+                job_model.job_runtime_data = job_runtime_data.json()


 async def _attach_volume(
@@ -629,14 +697,18 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    volume = volume_model_to_volume(volume_model)
+    # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
-    volume = volume_model_to_volume(volume_model)
     attachment_data = await common_utils.run_async(
         backend.compute().attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
-
-
+    volume_attachment_model = VolumeAttachmentModel(
+        volume=volume_model,
+        attachment_data=attachment_data.json(),
+    )
+    instance.volume_attachments.append(volume_attachment_model)
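The hunks above replace the one-job-per-instance assumption with block accounting: an instance advertises `total_blocks`, each assigned job consumes `offer.blocks`, and an instance is only considered while `total_blocks > busy_blocks`. A minimal sketch of that accounting, assuming only the fields visible in the diff (the `InstanceBlocks` helper below is illustrative, not part of dstack):

```python
# Illustrative sketch of the block accounting introduced above.
# "InstanceBlocks" is not a dstack class; total_blocks, busy_blocks, and
# offer.blocks are the fields this release introduces.
from dataclasses import dataclass


@dataclass
class InstanceBlocks:
    total_blocks: int  # capacity of the instance, in blocks
    busy_blocks: int   # blocks already taken by assigned jobs

    def has_free_blocks(self) -> bool:
        # Analogous to the SQL filter: InstanceModel.total_blocks > InstanceModel.busy_blocks
        return self.total_blocks > self.busy_blocks

    def assign(self, offer_blocks: int) -> None:
        # Analogous to: instance.busy_blocks += offer.blocks
        if self.busy_blocks + offer_blocks > self.total_blocks:
            raise ValueError("not enough free blocks on the instance")
        self.busy_blocks += offer_blocks


instance = InstanceBlocks(total_blocks=4, busy_blocks=1)
if instance.has_free_blocks():
    instance.assign(2)  # busy_blocks becomes 3; one block remains free
```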
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,13 @@ from sqlalchemy.orm import joinedload, lazyload

 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,
@@ -52,7 +58,7 @@ async def _process_next_terminating_job():
                 InstanceModel.id == job_model.used_instance_id,
                 InstanceModel.id.not_in(instance_lockset),
             )
-            .options(lazyload(InstanceModel.
+            .options(lazyload(InstanceModel.jobs))
             .with_for_update(skip_locked=True)
         )
         instance_model = res.scalar()
@@ -80,7 +86,12 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.attachments),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
         .where(VolumeModel.id == volume_model.id)
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
         .options(joinedload(VolumeModel.user))
+        .options(
+            joinedload(VolumeModel.attachments)
+            .joinedload(VolumeAttachmentModel.instance)
+            .joinedload(InstanceModel.fleet)
+        )
         .execution_options(populate_existing=True)
     )
     volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py
ADDED

@@ -0,0 +1,71 @@
+"""Reverse Job-Instance relationship
+
+Revision ID: 1338b788b612
+Revises: 51d45659d574
+Create Date: 2025-01-16 14:59:19.113534
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1338b788b612"
+down_revision = "51d45659d574"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True
+            )
+        )
+        batch_op.create_foreign_key(
+            batch_op.f("fk_jobs_instance_id_instances"),
+            "instances",
+            ["instance_id"],
+            ["id"],
+            ondelete="CASCADE",
+        )
+
+    op.execute("""
+        UPDATE jobs AS j
+        SET instance_id = (
+            SELECT i.id
+            FROM instances AS i
+            WHERE i.job_id = j.id
+        )
+    """)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_constraint("fk_instances_job_id_jobs", type_="foreignkey")
+        batch_op.drop_column("job_id")
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True)
+        )
+        batch_op.create_foreign_key("fk_instances_job_id_jobs", "jobs", ["job_id"], ["id"])
+
+    # This migration is not fully reversible - we cannot assign multiple jobs to a single instance,
+    # thus LIMIT 1
+    op.execute("""
+        UPDATE instances AS i
+        SET job_id = (
+            SELECT j.id
+            FROM jobs j
+            WHERE j.instance_id = i.id
+            ORDER by j.submitted_at DESC
+            LIMIT 1
+        )
+    """)
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_constraint(batch_op.f("fk_jobs_instance_id_instances"), type_="foreignkey")
+        batch_op.drop_column("instance_id")
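The migration moves the foreign key from `instances.job_id` to `jobs.instance_id`, so a single instance can be referenced by many jobs. A simplified sketch of what the reversed ORM mapping could look like, assuming standard SQLAlchemy 2.0 declarative models (the actual definitions in `dstack/_internal/server/models.py` are not reproduced here and differ in detail):

```python
# Hypothetical, simplified mappings illustrating the reversed relationship.
import uuid
from typing import List, Optional

from sqlalchemy import ForeignKey
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
from sqlalchemy_utils import UUIDType


class Base(DeclarativeBase):
    pass


class InstanceModel(Base):
    __tablename__ = "instances"
    id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), primary_key=True)
    # Replaces the old single-job link: one instance can now host many jobs.
    jobs: Mapped[List["JobModel"]] = relationship(back_populates="instance")


class JobModel(Base):
    __tablename__ = "jobs"
    id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), primary_key=True)
    instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        UUIDType(binary=False), ForeignKey("instances.id", ondelete="CASCADE"), nullable=True
    )
    instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="jobs")
```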
dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py
ADDED

@@ -0,0 +1,32 @@
+"""Add JobModel.inactivity_secs
+
+Revision ID: 1e76fb0dde87
+Revises: 63c3f19cb184
+Create Date: 2025-02-11 23:37:58.823710
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1e76fb0dde87"
+down_revision = "63c3f19cb184"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("inactivity_secs", sa.Integer(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("inactivity_secs")
+
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py
ADDED

@@ -0,0 +1,43 @@
+"""Add InstanceModel blocks fields
+
+Revision ID: 51d45659d574
+Revises: da574e93fee0
+Create Date: 2025-02-04 11:10:41.626273
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "51d45659d574"
+down_revision = "da574e93fee0"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("total_blocks", sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column("busy_blocks", sa.Integer(), nullable=True))
+
+    op.execute("""
+        UPDATE instances
+        SET total_blocks = 1
+    """)
+    op.execute("""
+        UPDATE instances
+        SET busy_blocks = CASE
+            WHEN job_id IS NOT NULL THEN 1
+            ELSE 0
+        END
+    """)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column("busy_blocks", existing_type=sa.INTEGER(), nullable=False)
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_column("busy_blocks")
+        batch_op.drop_column("total_blocks")
dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py
ADDED

@@ -0,0 +1,83 @@
+"""Add JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+
+Revision ID: 63c3f19cb184
+Revises: 1338b788b612
+Create Date: 2025-02-11 22:30:47.289393
+
+"""
+
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "63c3f19cb184"
+down_revision = "1338b788b612"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py
ADDED

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###
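With this migration, per-attachment data lives on the `volumes_attachments` association table rather than on the volume itself, matching the `VolumeAttachmentModel(volume=..., attachment_data=...)` usage in the `process_submitted_jobs.py` hunks above. A hypothetical, simplified sketch of such an association object, assuming the column layout implied here (the real model is defined in `dstack/_internal/server/models.py`):

```python
# Hypothetical sketch of the volumes_attachments association object.
import uuid
from typing import Optional

from sqlalchemy import ForeignKey, Text
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy_utils import UUIDType


class Base(DeclarativeBase):
    pass


class VolumeAttachmentModel(Base):
    __tablename__ = "volumes_attachments"
    id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), primary_key=True)
    volume_id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), ForeignKey("volumes.id"))
    # Renamed from the misspelled "instace_id" column by this migration
    instance_id: Mapped[uuid.UUID] = mapped_column(
        UUIDType(binary=False), ForeignKey("instances.id")
    )
    # Backend-specific attachment details (JSON text), previously stored on the volume
    attachment_data: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
```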