dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (104)
  1. dstack/_internal/cli/commands/apply.py +8 -5
  2. dstack/_internal/cli/services/configurators/base.py +4 -2
  3. dstack/_internal/cli/services/configurators/fleet.py +21 -9
  4. dstack/_internal/cli/services/configurators/gateway.py +15 -0
  5. dstack/_internal/cli/services/configurators/run.py +6 -5
  6. dstack/_internal/cli/services/configurators/volume.py +15 -0
  7. dstack/_internal/cli/services/repos.py +3 -3
  8. dstack/_internal/cli/utils/fleet.py +44 -33
  9. dstack/_internal/cli/utils/run.py +27 -7
  10. dstack/_internal/cli/utils/volume.py +30 -9
  11. dstack/_internal/core/backends/aws/compute.py +94 -53
  12. dstack/_internal/core/backends/aws/resources.py +22 -12
  13. dstack/_internal/core/backends/azure/compute.py +2 -0
  14. dstack/_internal/core/backends/base/compute.py +20 -2
  15. dstack/_internal/core/backends/gcp/compute.py +32 -24
  16. dstack/_internal/core/backends/gcp/resources.py +0 -15
  17. dstack/_internal/core/backends/oci/compute.py +10 -5
  18. dstack/_internal/core/backends/oci/resources.py +23 -26
  19. dstack/_internal/core/backends/remote/provisioning.py +65 -27
  20. dstack/_internal/core/backends/runpod/compute.py +1 -0
  21. dstack/_internal/core/models/backends/azure.py +3 -1
  22. dstack/_internal/core/models/configurations.py +24 -1
  23. dstack/_internal/core/models/fleets.py +46 -0
  24. dstack/_internal/core/models/instances.py +5 -1
  25. dstack/_internal/core/models/pools.py +4 -1
  26. dstack/_internal/core/models/profiles.py +10 -4
  27. dstack/_internal/core/models/runs.py +23 -3
  28. dstack/_internal/core/models/volumes.py +26 -0
  29. dstack/_internal/core/services/ssh/attach.py +92 -53
  30. dstack/_internal/core/services/ssh/tunnel.py +58 -31
  31. dstack/_internal/proxy/gateway/routers/registry.py +2 -0
  32. dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
  33. dstack/_internal/proxy/gateway/services/registry.py +4 -0
  34. dstack/_internal/proxy/lib/models.py +3 -0
  35. dstack/_internal/proxy/lib/services/service_connection.py +8 -1
  36. dstack/_internal/server/background/tasks/process_instances.py +73 -35
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -9
  38. dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
  39. dstack/_internal/server/background/tasks/process_runs.py +2 -12
  40. dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
  41. dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
  42. dstack/_internal/server/background/tasks/process_volumes.py +11 -1
  43. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  44. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  45. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  46. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  47. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  48. dstack/_internal/server/models.py +27 -23
  49. dstack/_internal/server/routers/runs.py +1 -0
  50. dstack/_internal/server/schemas/runner.py +1 -0
  51. dstack/_internal/server/services/backends/configurators/azure.py +34 -8
  52. dstack/_internal/server/services/config.py +9 -0
  53. dstack/_internal/server/services/fleets.py +32 -3
  54. dstack/_internal/server/services/gateways/client.py +9 -1
  55. dstack/_internal/server/services/jobs/__init__.py +217 -45
  56. dstack/_internal/server/services/jobs/configurators/base.py +47 -2
  57. dstack/_internal/server/services/offers.py +96 -10
  58. dstack/_internal/server/services/pools.py +98 -14
  59. dstack/_internal/server/services/proxy/repo.py +17 -3
  60. dstack/_internal/server/services/runner/client.py +9 -6
  61. dstack/_internal/server/services/runner/ssh.py +33 -5
  62. dstack/_internal/server/services/runs.py +48 -179
  63. dstack/_internal/server/services/services/__init__.py +9 -1
  64. dstack/_internal/server/services/volumes.py +68 -9
  65. dstack/_internal/server/statics/index.html +1 -1
  66. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
  67. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
  68. dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
  69. dstack/_internal/server/testing/common.py +130 -61
  70. dstack/_internal/utils/common.py +22 -8
  71. dstack/_internal/utils/env.py +14 -0
  72. dstack/_internal/utils/ssh.py +1 -1
  73. dstack/api/server/_fleets.py +25 -1
  74. dstack/api/server/_runs.py +23 -2
  75. dstack/api/server/_volumes.py +12 -1
  76. dstack/version.py +1 -1
  77. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
  78. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
  79. tests/_internal/cli/services/configurators/test_profile.py +3 -3
  80. tests/_internal/core/services/ssh/test_tunnel.py +56 -4
  81. tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
  82. tests/_internal/server/background/tasks/test_process_instances.py +138 -20
  83. tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
  84. tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
  85. tests/_internal/server/background/tasks/test_process_runs.py +27 -3
  86. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
  87. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
  88. tests/_internal/server/routers/test_fleets.py +15 -2
  89. tests/_internal/server/routers/test_pools.py +6 -0
  90. tests/_internal/server/routers/test_runs.py +27 -0
  91. tests/_internal/server/routers/test_volumes.py +9 -2
  92. tests/_internal/server/services/jobs/__init__.py +0 -0
  93. tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
  94. tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
  95. tests/_internal/server/services/runner/test_client.py +22 -3
  96. tests/_internal/server/services/test_offers.py +167 -0
  97. tests/_internal/server/services/test_pools.py +109 -1
  98. tests/_internal/server/services/test_runs.py +5 -41
  99. tests/_internal/utils/test_common.py +21 -0
  100. tests/_internal/utils/test_env.py +38 -0
  101. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
  102. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
  103. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
  104. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -15,10 +15,7 @@ from dstack._internal.core.models.fleets import (
     FleetStatus,
     InstanceGroupPlacement,
 )
-from dstack._internal.core.models.instances import (
-    InstanceOfferWithAvailability,
-    InstanceStatus,
-)
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
 from dstack._internal.core.models.profiles import (
     DEFAULT_POOL_NAME,
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
@@ -26,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     Profile,
     TerminationPolicy,
 )
+from dstack._internal.core.models.resources import Memory
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -45,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
@@ -52,28 +51,31 @@ from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
 )
 from dstack._internal.server.services.jobs import (
+    check_can_attach_job_volumes,
     find_job,
     get_instances_ids_with_detaching_volumes,
+    get_job_configured_volume_models,
+    get_job_configured_volumes,
+    get_job_runtime_data,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.pools import (
     filter_pool_instances,
+    get_instance_offer,
     get_instance_provisioning_data,
+    get_shared_pool_instances_with_offers,
 )
 from dstack._internal.server.services.runs import (
-    check_can_attach_run_volumes,
     check_run_spec_requires_instance_mounts,
-    get_offer_volumes,
-    get_run_volume_models,
-    get_run_volumes,
     run_model_to_run,
 )
 from dstack._internal.server.services.volumes import (
     volume_model_to_volume,
 )
 from dstack._internal.utils import common as common_utils
+from dstack._internal.utils import env as env_utils
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -152,17 +154,21 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         await session.commit()
         return
     try:
-        volume_models = await get_run_volume_models(
+        volume_models = await get_job_configured_volume_models(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-        volumes = await get_run_volumes(
+        volumes = await get_job_configured_volumes(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-        check_can_attach_run_volumes(run_spec=run_spec, volumes=volumes)
+        check_can_attach_job_volumes(volumes)
     except ServerClientError as e:
         logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
         job_model.status = JobStatus.TERMINATING
@@ -186,12 +192,12 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         .where(
             InstanceModel.pool_id == pool.id,
             InstanceModel.deleted == False,
-            InstanceModel.job_id.is_(None),
+            InstanceModel.total_blocks > InstanceModel.busy_blocks,
         )
-        .options(lazyload(InstanceModel.job))
+        .options(lazyload(InstanceModel.jobs))
         .with_for_update()
     )
-    pool_instances = list(res.scalars().all())
+    pool_instances = list(res.unique().scalars().all())
     instances_ids = sorted([i.id for i in pool_instances])
     if get_db().dialect_name == "sqlite":
         # Start new transaction to see commited changes after lock
@@ -202,14 +208,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
         # Refetch after lock
         res = await session.execute(
-            select(InstanceModel).where(
+            select(InstanceModel)
+            .where(
                 InstanceModel.id.not_in(detaching_instances_ids),
                 InstanceModel.id.in_(instances_ids),
                 InstanceModel.deleted == False,
-                InstanceModel.job_id.is_(None),
+                InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
+            .execution_options(populate_existing=True)
         )
-        pool_instances = list(res.scalars().all())
+        pool_instances = list(res.unique().scalars().all())
     instance = await _assign_job_to_pool_instance(
         session=session,
         pool_instances=pool_instances,
@@ -221,8 +229,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         volumes=volumes,
     )
     job_model.instance_assigned = True
-    if instance is not None:
-        job_model.job_runtime_data = _prepare_job_runtime_data(job, instance).json()
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()
     return
@@ -231,10 +237,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         res = await session.execute(
             select(InstanceModel)
             .where(InstanceModel.id == job_model.instance.id)
-            .options(selectinload(InstanceModel.volumes))
+            .options(selectinload(InstanceModel.volume_attachments))
             .execution_options(populate_existing=True)
         )
-        instance = res.scalar_one()
+        instance = res.unique().scalar_one()
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
@@ -290,7 +296,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             offer=offer,
             instance_num=instance_num,
         )
-        job_model.job_runtime_data = _prepare_job_runtime_data(job, instance).json()
+        job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
         instance.fleet_id = fleet_model.id
         logger.info(
             "The job %s created the new instance %s",
@@ -351,30 +357,50 @@ async def _assign_job_to_pool_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> Optional[InstanceModel]:
+    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]]
     profile = run_spec.merged_profile
-    relevant_instances = filter_pool_instances(
+    multinode = job.job_spec.jobs_per_replica > 1
+    nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
         profile=profile,
         requirements=job.job_spec.requirements,
         status=InstanceStatus.IDLE,
         fleet_model=fleet_model,
-        multinode=job.job_spec.jobs_per_replica > 1,
+        multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
         volumes=volumes,
+        shared=False,
     )
-    if len(relevant_instances) == 0:
+    instances_with_offers = [
+        (instance, common_utils.get_or_error(get_instance_offer(instance)))
+        for instance in nonshared_instances
+    ]
+    if not multinode:
+        shared_instances_with_offers = get_shared_pool_instances_with_offers(
+            pool_instances=pool_instances,
+            profile=profile,
+            requirements=job.job_spec.requirements,
+            idle_only=True,
+            fleet_model=fleet_model,
+            volumes=volumes,
+        )
+        instances_with_offers.extend(shared_instances_with_offers)
+
+    if len(instances_with_offers) == 0:
         return None
-    sorted_instances = sorted(relevant_instances, key=lambda instance: instance.price)
-    instance = sorted_instances[0]
-    # Reload InstanceModel with volumes
+
+    instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
+    instance, offer = instances_with_offers[0]
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.volumes))
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
-    instance.job = job_model
+    instance.busy_blocks += offer.blocks
+
     logger.info(
         "The job %s switched instance %s status to BUSY",
         job_model.job_name,
@@ -385,8 +411,10 @@
         },
     )
     logger.info("%s: now is provisioning on '%s'", fmt(job_model), instance.name)
-    job_model.job_provisioning_data = instance.job_provisioning_data
+    job_model.instance = instance
     job_model.used_instance_id = instance.id
+    job_model.job_provisioning_data = instance.job_provisioning_data
+    job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
     return instance
 
 
@@ -431,7 +459,7 @@ async def _run_job_on_new_instance(
         offer.region,
         offer.price,
     )
-    offer_volumes = get_offer_volumes(volumes, offer)
+    offer_volumes = _get_offer_volumes(volumes, offer)
     try:
         job_provisioning_data = await common_utils.run_async(
             backend.compute().run_job,
@@ -549,29 +577,64 @@ def _create_instance_model_for_job(
         offer=offer.json(),
         termination_policy=termination_policy,
         termination_idle_time=termination_idle_time,
-        job=job_model,
+        jobs=[job_model],
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-        volumes=[],
+        volume_attachments=[],
+        total_blocks=1,
+        busy_blocks=1,
     )
     return instance
 
 
-def _prepare_job_runtime_data(job: Job, instance: InstanceModel) -> JobRuntimeData:
-    if job.job_spec.jobs_per_replica > 1:
-        # multi-node runs require host network mode for inter-node communication and occupy
-        # the entire instance
-        return JobRuntimeData(network_mode=NetworkMode.HOST)
-
-    # TODO: replace with a real computed value depending on the instance
-    is_shared_instance = True
+def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
+    if offer.total_blocks == 1:
+        if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
+            network_mode = NetworkMode.BRIDGE
+        else:
+            network_mode = NetworkMode.HOST
+        return JobRuntimeData(
+            network_mode=network_mode,
+            offer=offer,
+        )
+    return JobRuntimeData(
+        network_mode=NetworkMode.BRIDGE,
+        offer=offer,
+        cpu=offer.instance.resources.cpus,
+        gpu=len(offer.instance.resources.gpus),
+        memory=Memory(offer.instance.resources.memory_mib / 1024),
+    )
 
-    if not is_shared_instance:
-        return JobRuntimeData(network_mode=NetworkMode.HOST)
 
-    # TODO: slice CPU/GPU/Memory resources depending on the instance
-    return JobRuntimeData(network_mode=NetworkMode.BRIDGE)
+def _get_offer_volumes(
+    volumes: List[List[Volume]],
+    offer: InstanceOfferWithAvailability,
+) -> List[Volume]:
+    """
+    Returns volumes suitable for the offer for each mount point.
+    """
+    offer_volumes = []
+    for mount_point_volumes in volumes:
+        offer_volumes.append(_get_offer_mount_point_volume(mount_point_volumes, offer))
+    return offer_volumes
+
+
+def _get_offer_mount_point_volume(
+    volumes: List[Volume],
+    offer: InstanceOfferWithAvailability,
+) -> Volume:
+    """
+    Returns the first suitable volume for the offer among possible mount point volumes.
+    """
+    for volume in volumes:
+        if (
+            volume.configuration.backend != offer.backend
+            or volume.configuration.region != offer.region
+        ):
+            continue
+        return volume
+    raise ServerClientError("Failed to find an eligible volume for the mount point")
 
 
 async def _attach_volumes(
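
Note: the rewritten _prepare_job_runtime_data above derives network mode and resource limits from the offer instead of the instance, and consults the new DSTACK_FORCE_BRIDGE_NETWORK variable via env_utils.get_bool (added in dstack/_internal/utils/env.py per the file list). The helper's implementation is not included in this diff; a minimal sketch of such a boolean-env parser could look like the following — illustrative only, and the accepted value spellings are assumptions:

    import os


    def get_bool(name: str, default: bool = False) -> bool:
        # Illustrative sketch: parse common truthy/falsy spellings of an env var.
        value = os.environ.get(name)
        if value is None:
            return default
        value = value.strip().lower()
        if value in ("1", "true", "yes", "on"):
            return True
        if value in ("0", "false", "no", "off"):
            return False
        raise ValueError(f"invalid boolean value for {name}: {value!r}")
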
@@ -586,6 +649,8 @@ async def _attach_volumes(
         project=project,
         backend_type=job_provisioning_data.backend,
     )
+    job_runtime_data = common_utils.get_or_error(get_job_runtime_data(job_model))
+    job_runtime_data.volume_names = []
     logger.info("Attaching volumes: %s", [[v.name for v in vs] for vs in volume_models])
     for mount_point_volume_models in volume_models:
         for volume_model in mount_point_volume_models:
@@ -604,6 +669,7 @@
                         instance=instance,
                         instance_id=job_provisioning_data.instance_id,
                     )
+                    job_runtime_data.volume_names.append(volume.name)
                     break  # attach next mount point
                 except (ServerClientError, BackendError) as e:
                     logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
@@ -620,6 +686,8 @@
             # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
             job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
             job_model.termination_reason_message = "Failed to attach volume"
+    finally:
+        job_model.job_runtime_data = job_runtime_data.json()
 
 
 async def _attach_volume(
  async def _attach_volume(
@@ -629,14 +697,18 @@ async def _attach_volume(
629
697
  instance: InstanceModel,
630
698
  instance_id: str,
631
699
  ):
700
+ volume = volume_model_to_volume(volume_model)
701
+ # Refresh only to check if the volume wasn't deleted before the lock
632
702
  await session.refresh(volume_model)
633
703
  if volume_model.deleted:
634
704
  raise ServerClientError("Cannot attach a deleted volume")
635
- volume = volume_model_to_volume(volume_model)
636
705
  attachment_data = await common_utils.run_async(
637
706
  backend.compute().attach_volume,
638
707
  volume=volume,
639
708
  instance_id=instance_id,
640
709
  )
641
- volume_model.volume_attachment_data = attachment_data.json()
642
- instance.volumes.append(volume_model)
710
+ volume_attachment_model = VolumeAttachmentModel(
711
+ volume=volume_model,
712
+ attachment_data=attachment_data.json(),
713
+ )
714
+ instance.volume_attachments.append(volume_attachment_model)
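
Note: volume attachments are now recorded through the new VolumeAttachmentModel association object rather than by appending VolumeModel directly to InstanceModel.volumes. Combining the usage above with the volumes_attachments migrations below (51d45659d574, a751ef183f27), the mapping plausibly looks like this sketch; everything beyond the volumes_attachments table name and the instance_id/attachment_data columns is an assumption, not taken from this diff:

    import uuid
    from typing import List, Optional

    from sqlalchemy import ForeignKey, Text
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


    class Base(DeclarativeBase):
        pass


    class VolumeModel(Base):
        __tablename__ = "volumes"
        id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)
        attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
            back_populates="volume"
        )


    class InstanceModel(Base):
        __tablename__ = "instances"
        id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)
        volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
            back_populates="instance"
        )


    class VolumeAttachmentModel(Base):
        __tablename__ = "volumes_attachments"
        volume_id: Mapped[uuid.UUID] = mapped_column(
            ForeignKey("volumes.id"), primary_key=True
        )
        instance_id: Mapped[uuid.UUID] = mapped_column(
            ForeignKey("instances.id"), primary_key=True
        )
        # attachment_data moved here from VolumeModel (see migration a751ef183f27 below)
        attachment_data: Mapped[Optional[str]] = mapped_column(Text)

        volume: Mapped[VolumeModel] = relationship(back_populates="attachments")
        instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments")
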
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,13 @@ from sqlalchemy.orm import joinedload, lazyload
 
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,
@@ -52,7 +58,7 @@ async def _process_next_terminating_job():
                 InstanceModel.id == job_model.used_instance_id,
                 InstanceModel.id.not_in(instance_lockset),
             )
-            .options(lazyload(InstanceModel.job))
+            .options(lazyload(InstanceModel.jobs))
             .with_for_update(skip_locked=True)
         )
         instance_model = res.scalar()
@@ -80,7 +86,12 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volumes).joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.attachments),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
         .where(VolumeModel.id == volume_model.id)
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
         .options(joinedload(VolumeModel.user))
+        .options(
+            joinedload(VolumeModel.attachments)
+            .joinedload(VolumeAttachmentModel.instance)
+            .joinedload(InstanceModel.fleet)
+        )
         .execution_options(populate_existing=True)
     )
     volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py

@@ -0,0 +1,71 @@
+"""Reverse Job-Instance relationship
+
+Revision ID: 1338b788b612
+Revises: 51d45659d574
+Create Date: 2025-01-16 14:59:19.113534
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1338b788b612"
+down_revision = "51d45659d574"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True
+            )
+        )
+        batch_op.create_foreign_key(
+            batch_op.f("fk_jobs_instance_id_instances"),
+            "instances",
+            ["instance_id"],
+            ["id"],
+            ondelete="CASCADE",
+        )
+
+    op.execute("""
+        UPDATE jobs AS j
+        SET instance_id = (
+            SELECT i.id
+            FROM instances AS i
+            WHERE i.job_id = j.id
+        )
+    """)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_constraint("fk_instances_job_id_jobs", type_="foreignkey")
+        batch_op.drop_column("job_id")
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True)
+        )
+        batch_op.create_foreign_key("fk_instances_job_id_jobs", "jobs", ["job_id"], ["id"])
+
+    # This migration is not fully reversible - we cannot assign multiple jobs to a single instance,
+    # thus LIMIT 1
+    op.execute("""
+        UPDATE instances AS i
+        SET job_id = (
+            SELECT j.id
+            FROM jobs j
+            WHERE j.instance_id = i.id
+            ORDER by j.submitted_at DESC
+            LIMIT 1
+        )
+    """)
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_constraint(batch_op.f("fk_jobs_instance_id_instances"), type_="foreignkey")
+        batch_op.drop_column("instance_id")
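
Note: after this migration the foreign key lives on jobs.instance_id, so a single instance can host multiple jobs — the prerequisite for the total_blocks/busy_blocks scheduling introduced alongside it. In ORM terms the relationship flips roughly as in this sketch; it is illustrative only, as the actual declarations live in dstack/_internal/server/models.py, which this diff only summarizes:

    import uuid
    from typing import List, Optional

    from sqlalchemy import ForeignKey
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


    class Base(DeclarativeBase):
        pass


    class InstanceModel(Base):
        __tablename__ = "instances"
        id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)
        # Before: instances.job_id allowed at most one job per instance.
        # After: the one-to-many points the other way.
        jobs: Mapped[List["JobModel"]] = relationship(back_populates="instance")


    class JobModel(Base):
        __tablename__ = "jobs"
        id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)
        instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
            ForeignKey("instances.id", ondelete="CASCADE"), nullable=True
        )
        instance: Mapped[Optional[InstanceModel]] = relationship(back_populates="jobs")
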
dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py

@@ -0,0 +1,32 @@
+"""Add JobModel.inactivity_secs
+
+Revision ID: 1e76fb0dde87
+Revises: 63c3f19cb184
+Create Date: 2025-02-11 23:37:58.823710
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1e76fb0dde87"
+down_revision = "63c3f19cb184"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("inactivity_secs", sa.Integer(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("inactivity_secs")
+
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py

@@ -0,0 +1,43 @@
+"""Add InstanceModel blocks fields
+
+Revision ID: 51d45659d574
+Revises: da574e93fee0
+Create Date: 2025-02-04 11:10:41.626273
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "51d45659d574"
+down_revision = "da574e93fee0"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("total_blocks", sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column("busy_blocks", sa.Integer(), nullable=True))
+
+    op.execute("""
+        UPDATE instances
+        SET total_blocks = 1
+    """)
+    op.execute("""
+        UPDATE instances
+        SET busy_blocks = CASE
+            WHEN job_id IS NOT NULL THEN 1
+            ELSE 0
+        END
+    """)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column("busy_blocks", existing_type=sa.INTEGER(), nullable=False)
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_column("busy_blocks")
+        batch_op.drop_column("total_blocks")
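
Note: total_blocks/busy_blocks are what let several jobs share one instance: an instance remains schedulable while total_blocks > busy_blocks (the filter used in _process_submitted_job above), and _assign_job_to_pool_instance adds the assigned offer's block count to busy_blocks. A toy illustration of the accounting, with made-up numbers:

    from dataclasses import dataclass


    @dataclass
    class Instance:
        total_blocks: int  # e.g. one block per GPU on a shared instance
        busy_blocks: int = 0

        @property
        def has_capacity(self) -> bool:
            # Mirrors the new query filter: total_blocks > busy_blocks
            return self.total_blocks > self.busy_blocks

        def assign(self, blocks: int) -> None:
            if blocks > self.total_blocks - self.busy_blocks:
                raise ValueError("not enough free blocks")
            self.busy_blocks += blocks


    instance = Instance(total_blocks=8)  # hypothetical instance split into 8 blocks
    instance.assign(2)                   # a job takes a 2-block slice
    assert instance.has_capacity         # 6 blocks remain for other jobs
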
dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py

@@ -0,0 +1,83 @@
+"""Add JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+
+Revision ID: 63c3f19cb184
+Revises: 1338b788b612
+Create Date: 2025-02-11 22:30:47.289393
+
+"""
+
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "63c3f19cb184"
+down_revision = "1338b788b612"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###