dstack 0.19.25rc1__py3-none-any.whl → 0.19.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +195 -58
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +51 -47
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/compute.py +1 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +1 -0
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +172 -27
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +5 -1
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/configs/__init__.py +6 -3
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +5 -3
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +10 -2
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_runs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +10 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +14 -13
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +41 -1
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +7 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
- dstack/_internal/server/services/jobs/configurators/service.py +1 -0
- dstack/_internal/server/services/jobs/configurators/task.py +3 -0
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +1 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/testing/common.py +1 -1
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +21 -2
- dstack/api/_public/runs.py +5 -7
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED

@@ -5,9 +5,9 @@ import uuid
 from datetime import datetime, timedelta
 from typing import List, Optional, Tuple
 
-from sqlalchemy import and_, or_, select
+from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import contains_eager, joinedload, load_only, selectinload
+from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload
 
 from dstack._internal.core.backends.base.backend import Backend
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport

@@ -53,6 +53,7 @@ from dstack._internal.server.models import (
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
 from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
+    get_fleet_requirements,
 )
 from dstack._internal.server.services.instances import (
     filter_pool_instances,

@@ -71,6 +72,10 @@ from dstack._internal.server.services.jobs import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
+from dstack._internal.server.services.requirements.combine import (
+    combine_fleet_and_run_profiles,
+    combine_fleet_and_run_requirements,
+)
 from dstack._internal.server.services.runs import (
     check_run_spec_requires_instance_mounts,
     run_model_to_run,

@@ -148,8 +153,8 @@ async def _process_next_submitted_job():
         if job_model is None:
             return
         lockset.add(job_model.id)
+        job_model_id = job_model.id
         try:
-            job_model_id = job_model.id
             await _process_submitted_job(session=session, job_model=job_model)
         finally:
             lockset.difference_update([job_model_id])

@@ -245,8 +250,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     ]
     if run_model.fleet is not None:
         fleet_filters.append(FleetModel.id == run_model.fleet_id)
-    if run_spec.
-        fleet_filters.append(FleetModel.name.in_(run_spec.
+    if run_spec.merged_profile.fleets is not None:
+        fleet_filters.append(FleetModel.name.in_(run_spec.merged_profile.fleets))
 
     instance_filters = [
         InstanceModel.deleted == False,

@@ -264,9 +269,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             [i.id for i in f.instances] for f in fleet_models_with_instances
         )
     )
-    fleet_models = fleet_models_with_instances + fleet_models_without_instances
-    fleets_ids = [f.id for f in fleet_models]
-
     if get_db().dialect_name == "sqlite":
         # Start new transaction to see committed changes after lock
         await session.commit()

@@ -275,13 +277,15 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         InstanceModel.__tablename__, instances_ids
     ):
         if get_db().dialect_name == "sqlite":
-
+            fleets_with_instances_ids = [f.id for f in fleet_models_with_instances]
+            fleet_models_with_instances = await _refetch_fleet_models_with_instances(
                 session=session,
-                fleets_ids=
+                fleets_ids=fleets_with_instances_ids,
                 instances_ids=instances_ids,
                 fleet_filters=fleet_filters,
                 instance_filters=instance_filters,
             )
+        fleet_models = fleet_models_with_instances + fleet_models_without_instances
         fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
             fleet_models=fleet_models,
             run_model=run_model,

@@ -290,7 +294,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         master_job_provisioning_data=master_job_provisioning_data,
         volumes=volumes,
     )
-    if fleet_model is None and run_spec.
+    if fleet_model is None and run_spec.merged_profile.fleets is not None:
         # Run cannot create new fleets when fleets are specified
         logger.debug("%s: failed to use specified fleets", fmt(job_model))
         job_model.status = JobStatus.TERMINATING

@@ -361,6 +365,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         project=project,
         run=run,
     )
+    # FIXME: Fleet is not locked which may lead to duplicate instance_num.
+    # This is currently hard to fix without locking the fleet for entire provisioning duration.
+    # Processing should be done in multiple steps so that
+    # InstanceModel is created before provisioning.
     instance_num = await _get_next_instance_num(
         session=session,
         fleet_model=fleet_model,

@@ -438,14 +446,21 @@ async def _select_fleet_models(
             *fleet_filters,
             FleetModel.id.not_in(fleet_models_with_instances_ids),
         )
-        .where(
-
+        .where(
+            or_(
+                InstanceModel.id.is_(None),
+                not_(and_(*instance_filters)),
+            )
+        )
+        # Load empty list of instances so that downstream code
+        # knows this fleet has no instances eligible for offers.
+        .options(noload(FleetModel.instances))
     )
     fleet_models_without_instances = list(res.unique().scalars().all())
     return fleet_models_with_instances, fleet_models_without_instances
 
 
-async def
+async def _refetch_fleet_models_with_instances(
     session: AsyncSession,
     fleets_ids: list[uuid.UUID],
     instances_ids: list[uuid.UUID],

@@ -460,13 +475,8 @@ async def _refetch_fleet_models(
             *fleet_filters,
         )
         .where(
-
-
-            and_(
-                InstanceModel.id.in_(instances_ids),
-                *instance_filters,
-            ),
-        )
+            InstanceModel.id.in_(instances_ids),
+            *instance_filters,
         )
         .options(contains_eager(FleetModel.instances))
         .execution_options(populate_existing=True)

@@ -533,7 +543,7 @@ def _find_optimal_fleet_with_offers(
                 fleet_priority,
             )
         )
-    if run_spec.
+    if run_spec.merged_profile.fleets is None and all(
         t[2] == 0 for t in candidate_fleets_with_offers
     ):
         # If fleets are not specified and no fleets have available offers, create a new fleet.

@@ -646,6 +656,8 @@ async def _run_job_on_new_instance(
 ) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
     if volumes is None:
         volumes = []
+    profile = run.run_spec.merged_profile
+    requirements = job.job_spec.requirements
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)

@@ -654,13 +666,26 @@ async def _run_job_on_new_instance(
                 "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
             )
             return None
+        profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
+        if profile is None:
+            logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
+            return None
+        fleet_requirements = get_fleet_requirements(fleet.spec)
+        requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
+        if requirements is None:
+            logger.debug(
+                "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
+            )
+            return None
+        # TODO: Respect fleet provisioning properties such as tags
+
     multinode = job.job_spec.jobs_per_replica > 1 or (
         fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     )
     offers = await get_offers_by_requirements(
         project=project,
-        profile=
-        requirements=
+        profile=profile,
+        requirements=requirements,
         exclude_not_available=True,
         multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,

@@ -752,25 +777,11 @@ def _create_fleet_model_for_job(
 
 
 async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int:
-
-
-
-
-
-    ):
-    fleet_model = (
-        (
-            await session.execute(
-                select(FleetModel)
-                .where(FleetModel.id == fleet_model.id)
-                .options(joinedload(FleetModel.instances))
-                .execution_options(populate_existing=True)
-            )
-        )
-        .unique()
-        .scalar_one()
-    )
-    return len(fleet_model.instances)
+    res = await session.execute(
+        select(func.count(InstanceModel.id)).where(InstanceModel.fleet_id == fleet_model.id)
+    )
+    instance_count = res.scalar_one()
+    return instance_count
 
 
 def _create_instance_model_for_job(
dstack/_internal/server/background/tasks/process_terminating_jobs.py
CHANGED

@@ -75,9 +75,9 @@ async def _process_next_terminating_job():
             return
         instance_lockset.add(instance_model.id)
         job_lockset.add(job_model.id)
+        job_model_id = job_model.id
+        instance_model_id = job_model.used_instance_id
         try:
-            job_model_id = job_model.id
-            instance_model_id = job_model.used_instance_id
             await _process_job(
                 session=session,
                 job_model=job_model,
dstack/_internal/server/background/tasks/process_volumes.py
CHANGED

@@ -42,8 +42,8 @@ async def process_submitted_volumes():
         if volume_model is None:
             return
         lockset.add(volume_model.id)
+        volume_model_id = volume_model.id
         try:
-            volume_model_id = volume_model.id
             await _process_submitted_volume(session=session, volume_model=volume_model)
         finally:
             lockset.difference_update([volume_model_id])

dstack/_internal/server/db.py
CHANGED
@@ -4,8 +4,12 @@ from typing import Optional
 from alembic import command, config
 from sqlalchemy import AsyncAdaptedQueuePool, event
 from sqlalchemy.engine.interfaces import DBAPIConnection
-from sqlalchemy.ext.asyncio import
-
+from sqlalchemy.ext.asyncio import (
+    AsyncEngine,
+    AsyncSession,
+    async_sessionmaker,
+    create_async_engine,
+)
 from sqlalchemy.pool import ConnectionPoolEntry
 
 from dstack._internal.server import settings

@@ -26,8 +30,8 @@ class Database:
             pool_size=settings.DB_POOL_SIZE,
             max_overflow=settings.DB_MAX_OVERFLOW,
         )
-        self.session_maker =
-            bind=self.engine,
+        self.session_maker = async_sessionmaker(
+            bind=self.engine,  # type: ignore[assignment]
             expire_on_commit=False,
             class_=AsyncSession,
         )
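For reference, a minimal standalone sketch of the async_sessionmaker pattern the hunk above migrates to. The in-memory SQLite URL, the aiosqlite driver, and the sample query are illustrative assumptions, not dstack's actual settings.

import asyncio

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine

# Illustrative URL; dstack builds its engine from its settings module instead.
engine = create_async_engine("sqlite+aiosqlite://")
session_maker = async_sessionmaker(
    bind=engine,
    expire_on_commit=False,  # keep ORM objects usable after commit, as in the hunk above
    class_=AsyncSession,
)


async def main() -> None:
    # Sessions produced by the factory are async context managers.
    async with session_maker() as session:
        result = await session.execute(text("SELECT 1"))
        print(result.scalar_one())  # 1


asyncio.run(main())
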
dstack/_internal/server/models.py
CHANGED

@@ -622,6 +622,7 @@ class InstanceModel(BaseModel):
     backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
     backend_data: Mapped[Optional[str]] = mapped_column(Text)
 
+    # Not set for cloud fleets that haven't been provisioning
     offer: Mapped[Optional[str]] = mapped_column(Text)
     region: Mapped[Optional[str]] = mapped_column(String(2000))
     price: Mapped[Optional[float]] = mapped_column(Float)
dstack/_internal/server/routers/gpus.py
CHANGED

@@ -1,9 +1,7 @@
 from typing import Tuple
 
 from fastapi import APIRouter, Depends
-from sqlalchemy.ext.asyncio import AsyncSession
 
-from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
 from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse
 from dstack._internal.server.security.permissions import ProjectMember

@@ -20,10 +18,7 @@ project_router = APIRouter(
 @project_router.post("/list", response_model=ListGpusResponse, response_model_exclude_none=True)
 async def list_gpus(
     body: ListGpusRequest,
-    session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> ListGpusResponse:
     _, project = user_project
-    return await list_gpus_grouped(
-        session=session, project=project, run_spec=body.run_spec, group_by=body.group_by
-    )
+    return await list_gpus_grouped(project=project, run_spec=body.run_spec, group_by=body.group_by)
dstack/_internal/server/schemas/runner.py
CHANGED

@@ -159,6 +159,16 @@ class GPUDevice(CoreModel):
     path_in_container: str
 
 
+class TaskListItem(CoreModel):
+    id: str
+    status: TaskStatus
+
+
+class TaskListResponse(CoreModel):
+    ids: Optional[list[str]] = None  # returned by pre-0.19.26 shim
+    tasks: Optional[list[TaskListItem]] = None  # returned by 0.19.26+ shim
+
+
 class TaskInfoResponse(CoreModel):
     id: str
     status: TaskStatus
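The optional ids/tasks pair above keeps the schema compatible with both shim generations. A hedged sketch of how a caller might normalize the two response shapes; the helper name and the plain-dict inputs are illustrative, not part of dstack's API.

from typing import Optional


def normalize_task_ids(ids: Optional[list[str]], tasks: Optional[list[dict]]) -> list[str]:
    # 0.19.26+ shim: prefer the richer `tasks` objects (id plus status).
    if tasks is not None:
        return [task["id"] for task in tasks]
    # pre-0.19.26 shim: fall back to the bare list of ids.
    if ids is not None:
        return list(ids)
    return []


# Both response shapes yield the same list of task ids.
assert normalize_task_ids(ids=["a", "b"], tasks=None) == ["a", "b"]
assert normalize_task_ids(ids=None, tasks=[{"id": "a", "status": "running"}]) == ["a"]
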
dstack/_internal/server/services/backends/__init__.py
CHANGED

@@ -17,8 +17,8 @@ from dstack._internal.core.backends.configurators import (
 )
 from dstack._internal.core.backends.local.backend import LocalBackend
 from dstack._internal.core.backends.models import (
-    AnyBackendConfig,
     AnyBackendConfigWithCreds,
+    AnyBackendConfigWithoutCreds,
 )
 from dstack._internal.core.errors import (
     BackendError,

@@ -126,19 +126,25 @@ async def get_backend_config(
             )
             continue
         if backend_model.type == backend_type:
-            return
-                configurator, backend_model, include_creds=True
-            )
+            return get_backend_config_with_creds_from_backend_model(configurator, backend_model)
     return None
 
 
-def
+def get_backend_config_with_creds_from_backend_model(
+    configurator: Configurator,
+    backend_model: BackendModel,
+) -> AnyBackendConfigWithCreds:
+    backend_record = get_stored_backend_record(backend_model)
+    backend_config = configurator.get_backend_config_with_creds(backend_record)
+    return backend_config
+
+
+def get_backend_config_without_creds_from_backend_model(
     configurator: Configurator,
     backend_model: BackendModel,
-
-) -> AnyBackendConfig:
+) -> AnyBackendConfigWithoutCreds:
     backend_record = get_stored_backend_record(backend_model)
-    backend_config = configurator.
+    backend_config = configurator.get_backend_config_without_creds(backend_record)
     return backend_config
 
 
dstack/_internal/server/services/backends/handlers.py
CHANGED

@@ -55,7 +55,11 @@ async def _check_active_instances(
     )
     for fleet_model in fleet_models:
         for instance in fleet_model.instances:
-            if
+            if (
+                instance.status.is_active()
+                and instance.backend is not None
+                and instance.backend in backends_types
+            ):
                 if error:
                     msg = (
                         f"Backend {instance.backend.value} has active instances."

@@ -83,6 +87,7 @@ async def _check_active_volumes(
         if (
             volume_model.status.is_active()
             and volume_model.provisioning_data is not None
+            and volume_model.provisioning_data.backend is not None
             and volume_model.provisioning_data.backend in backends_types
         ):
             if error:
dstack/_internal/server/services/docker.py
CHANGED

@@ -32,15 +32,15 @@ class DXFAuthAdapter:
 
 
 class DockerImage(CoreModel):
-    class Config(CoreModel.Config):
-        frozen = True
-
     image: str
     registry: Optional[str]
     repo: str
     tag: str
     digest: Optional[str]
 
+    class Config(CoreModel.Config):
+        frozen = True
+
 
 class ImageConfig(CoreModel):
     user: Annotated[Optional[str], Field(alias="User")] = None

@@ -77,7 +77,7 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) ->
     registry_client = PatchedDXF(
         host=image.registry or DEFAULT_REGISTRY,
         repo=image.repo,
-        auth=DXFAuthAdapter(registry_auth),
+        auth=DXFAuthAdapter(registry_auth),  # type: ignore[assignment]
         timeout=REGISTRY_REQUEST_TIMEOUT,
     )
 

@@ -88,7 +88,7 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) ->
     )
     manifest = ImageManifest.__response__.parse_raw(manifest_resp)
     config_stream = registry_client.pull_blob(manifest.config.digest)
-    config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE)
+    config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE)  # type: ignore[arg-type]
     if config_resp is None:
         raise DockerRegistryError(
             f"Image config object exceeds the size limit of {MAX_CONFIG_OBJECT_SIZE} bytes"
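Moving Config(frozen=True) below the fields only reorders the class body; the model stays immutable and hashable. A minimal sketch of what frozen buys, assuming pydantic v1 semantics where frozen = True also generates __hash__; the model and field names are illustrative.

from typing import Optional

from pydantic import BaseModel


class Image(BaseModel):
    image: str
    tag: Optional[str] = None

    class Config:
        frozen = True  # immutable, and hashable, so instances can be used as dict keys


img = Image(image="python", tag="3.12")
cache = {img: "resolved-config"}
print(cache[img])  # "resolved-config"
# img.tag = "3.13" would raise an error because the model is frozen.
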
dstack/_internal/server/services/fleets.py
CHANGED

@@ -279,7 +279,7 @@ async def get_plan(
     offers_with_backends = await get_create_instance_offers(
         project=project,
         profile=effective_spec.merged_profile,
-        requirements=
+        requirements=get_fleet_requirements(effective_spec),
         fleet_spec=effective_spec,
         blocks=effective_spec.configuration.blocks,
     )

@@ -458,7 +458,7 @@ async def create_fleet_instance_model(
     instance_num: int,
 ) -> InstanceModel:
     profile = spec.merged_profile
-    requirements =
+    requirements = get_fleet_requirements(spec)
     instance_model = await instances_services.create_instance_model(
         session=session,
         project=project,

@@ -504,6 +504,7 @@ async def create_fleet_ssh_instance_model(
         raise ServerClientError("ssh key or user not specified")
 
     if proxy_jump is not None:
+        assert proxy_jump.ssh_key is not None
         ssh_proxy = SSHConnectionParams(
             hostname=proxy_jump.hostname,
             port=proxy_jump.port or 22,

@@ -643,6 +644,17 @@ def is_fleet_empty(fleet_model: FleetModel) -> bool:
     return len(active_instances) == 0
 
 
+def get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
+    profile = fleet_spec.merged_profile
+    requirements = Requirements(
+        resources=fleet_spec.configuration.resources or ResourcesSpec(),
+        max_price=profile.max_price,
+        spot=get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND),
+        reservation=fleet_spec.configuration.reservation,
+    )
+    return requirements
+
+
 async def _create_fleet(
     session: AsyncSession,
     project: ProjectModel,

@@ -1003,17 +1015,6 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[
             instance.status = InstanceStatus.TERMINATING
 
 
-def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
-    profile = fleet_spec.merged_profile
-    requirements = Requirements(
-        resources=fleet_spec.configuration.resources or ResourcesSpec(),
-        max_price=profile.max_price,
-        spot=get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND),
-        reservation=fleet_spec.configuration.reservation,
-    )
-    return requirements
-
-
 def _get_next_instance_num(instance_nums: set[int]) -> int:
     if not instance_nums:
         return 0
dstack/_internal/server/services/gateways/__init__.py
CHANGED

@@ -93,6 +93,8 @@ async def create_gateway_compute(
     backend_id: Optional[uuid.UUID] = None,
 ) -> GatewayComputeModel:
     assert isinstance(backend_compute, ComputeWithGatewaySupport)
+    assert configuration.name is not None
+
     private_bytes, public_bytes = generate_rsa_key_pair_bytes()
     gateway_ssh_private_key = private_bytes.decode()
     gateway_ssh_public_key = public_bytes.decode()
dstack/_internal/server/services/gateways/client.py
CHANGED

@@ -7,7 +7,7 @@ from pydantic import parse_obj_as
 
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import GatewayError
-from dstack._internal.core.models.configurations import RateLimit
+from dstack._internal.core.models.configurations import RateLimit
 from dstack._internal.core.models.instances import SSHConnectionParams
 from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port
 from dstack._internal.proxy.gateway.schemas.stats import ServiceStats

@@ -85,7 +85,7 @@ class GatewayClient:
         ssh_head_proxy: Optional[SSHConnectionParams],
         ssh_head_proxy_private_key: Optional[str],
     ):
-        assert
+        assert run.run_spec.configuration.type == "service"
         payload = {
             "job_id": job_submission.id.hex,
             "app_port": get_service_port(job_spec, run.run_spec.configuration),

@@ -93,6 +93,9 @@ class GatewayClient:
             "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
         }
         jpd = job_submission.job_provisioning_data
+        assert jpd is not None
+        assert jpd.hostname is not None
+        assert jpd.ssh_port is not None
         if not jpd.dockerized:
             payload.update(
                 {
dstack/_internal/server/services/gateways/connection.py
CHANGED

@@ -67,7 +67,7 @@ class GatewayConnection:
             # reverse_forwarded_sockets are added later in .open()
         )
         self.tunnel_id = uuid.uuid4()
-        self._client = GatewayClient(uds=self.gateway_socket_path)
+        self._client = GatewayClient(uds=str(self.gateway_socket_path))
 
     @staticmethod
     def _init_symlink_dir(connection_dir: Path) -> Tuple[TemporaryDirectory, Path]: