dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/gateway.py +15 -3
- dstack/_internal/cli/commands/logs.py +0 -22
- dstack/_internal/cli/commands/stats.py +8 -17
- dstack/_internal/cli/main.py +1 -5
- dstack/_internal/cli/services/configurators/fleet.py +4 -39
- dstack/_internal/cli/services/configurators/run.py +22 -20
- dstack/_internal/cli/services/profile.py +34 -83
- dstack/_internal/cli/utils/gateway.py +1 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/__init__.py +56 -39
- dstack/_internal/core/backends/aws/__init__.py +0 -25
- dstack/_internal/core/backends/aws/auth.py +1 -10
- dstack/_internal/core/backends/aws/backend.py +26 -0
- dstack/_internal/core/backends/aws/compute.py +21 -45
- dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +1 -1
- dstack/_internal/core/backends/azure/__init__.py +0 -20
- dstack/_internal/core/backends/azure/auth.py +2 -11
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +14 -28
- dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
- dstack/_internal/core/backends/azure/models.py +89 -0
- dstack/_internal/core/backends/base/__init__.py +0 -12
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +153 -33
- dstack/_internal/core/backends/base/configurator.py +105 -0
- dstack/_internal/core/backends/base/models.py +14 -0
- dstack/_internal/core/backends/configurators.py +138 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -15
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +8 -26
- dstack/_internal/core/backends/cudo/configurator.py +72 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
- dstack/_internal/core/backends/datacrunch/backend.py +16 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -25
- dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
- dstack/_internal/core/backends/datacrunch/models.py +38 -0
- dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
- dstack/_internal/core/backends/gcp/__init__.py +0 -16
- dstack/_internal/core/backends/gcp/auth.py +2 -11
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +14 -44
- dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
- dstack/_internal/core/backends/gcp/models.py +125 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +16 -5
- dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
- dstack/_internal/core/backends/kubernetes/models.py +72 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
- dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -13
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +16 -2
- dstack/_internal/core/backends/models.py +128 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -15
- dstack/_internal/core/backends/oci/auth.py +1 -5
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +9 -23
- dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
- dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
- dstack/_internal/core/backends/oci/region.py +1 -1
- dstack/_internal/core/backends/runpod/__init__.py +0 -15
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +28 -6
- dstack/_internal/core/backends/runpod/configurator.py +59 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/__init__.py +0 -15
- dstack/_internal/core/backends/tensordock/backend.py +16 -0
- dstack/_internal/core/backends/tensordock/compute.py +8 -27
- dstack/_internal/core/backends/tensordock/configurator.py +68 -0
- dstack/_internal/core/backends/tensordock/models.py +38 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -15
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -2
- dstack/_internal/core/backends/vastai/configurator.py +66 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -15
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +10 -24
- dstack/_internal/core/backends/vultr/configurator.py +64 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/models/backends/__init__.py +0 -184
- dstack/_internal/core/models/backends/base.py +0 -19
- dstack/_internal/core/models/configurations.py +22 -16
- dstack/_internal/core/models/envs.py +4 -3
- dstack/_internal/core/models/fleets.py +17 -22
- dstack/_internal/core/models/gateways.py +3 -3
- dstack/_internal/core/models/instances.py +24 -0
- dstack/_internal/core/models/profiles.py +85 -45
- dstack/_internal/core/models/projects.py +1 -1
- dstack/_internal/core/models/repos/base.py +0 -5
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/repos/remote.py +26 -12
- dstack/_internal/core/models/repos/virtual.py +1 -1
- dstack/_internal/core/models/resources.py +45 -76
- dstack/_internal/core/models/runs.py +21 -19
- dstack/_internal/core/models/volumes.py +1 -3
- dstack/_internal/core/services/profiles.py +7 -16
- dstack/_internal/core/services/repos.py +0 -4
- dstack/_internal/server/app.py +11 -4
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_gateways.py +4 -8
- dstack/_internal/server/background/tasks/process_instances.py +14 -9
- dstack/_internal/server/background/tasks/process_metrics.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
- dstack/_internal/server/background/tasks/process_volumes.py +5 -2
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/models.py +59 -9
- dstack/_internal/server/routers/backends.py +14 -23
- dstack/_internal/server/routers/instances.py +3 -4
- dstack/_internal/server/routers/metrics.py +31 -10
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/routers/repos.py +1 -2
- dstack/_internal/server/routers/runs.py +13 -59
- dstack/_internal/server/schemas/gateways.py +14 -23
- dstack/_internal/server/schemas/projects.py +7 -2
- dstack/_internal/server/schemas/repos.py +2 -38
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/schemas/runs.py +1 -24
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/__init__.py +85 -158
- dstack/_internal/server/services/config.py +53 -567
- dstack/_internal/server/services/fleets.py +9 -103
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/{pools.py → instances.py} +22 -329
- dstack/_internal/server/services/jobs/__init__.py +9 -6
- dstack/_internal/server/services/jobs/configurators/base.py +25 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/metrics.py +131 -72
- dstack/_internal/server/services/offers.py +1 -1
- dstack/_internal/server/services/projects.py +23 -14
- dstack/_internal/server/services/prometheus.py +245 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +67 -31
- dstack/_internal/server/services/volumes.py +9 -4
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +75 -32
- dstack/_internal/utils/json_schema.py +6 -0
- dstack/_internal/utils/ssh.py +2 -1
- dstack/api/__init__.py +4 -0
- dstack/api/_public/__init__.py +16 -20
- dstack/api/_public/backends.py +1 -1
- dstack/api/_public/repos.py +36 -36
- dstack/api/_public/runs.py +170 -83
- dstack/api/server/__init__.py +11 -13
- dstack/api/server/_backends.py +12 -16
- dstack/api/server/_fleets.py +15 -55
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_repos.py +1 -4
- dstack/api/server/_runs.py +21 -96
- dstack/api/server/_volumes.py +10 -5
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
- tests/_internal/cli/services/configurators/test_profile.py +6 -6
- tests/_internal/core/backends/aws/test_configurator.py +35 -0
- tests/_internal/core/backends/aws/test_resources.py +1 -1
- tests/_internal/core/backends/azure/test_configurator.py +61 -0
- tests/_internal/core/backends/cudo/__init__.py +0 -0
- tests/_internal/core/backends/cudo/test_configurator.py +37 -0
- tests/_internal/core/backends/datacrunch/__init__.py +0 -0
- tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
- tests/_internal/core/backends/gcp/test_configurator.py +42 -0
- tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
- tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
- tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
- tests/_internal/core/backends/oci/test_configurator.py +55 -0
- tests/_internal/core/backends/runpod/__init__.py +0 -0
- tests/_internal/core/backends/runpod/test_configurator.py +33 -0
- tests/_internal/core/backends/tensordock/__init__.py +0 -0
- tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
- tests/_internal/core/backends/vastai/__init__.py +0 -0
- tests/_internal/core/backends/vastai/test_configurator.py +33 -0
- tests/_internal/core/backends/vultr/__init__.py +0 -0
- tests/_internal/core/backends/vultr/test_configurator.py +33 -0
- tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
- tests/_internal/server/background/tasks/test_process_instances.py +49 -48
- tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
- tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
- tests/_internal/server/background/tasks/test_process_runs.py +8 -22
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
- tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
- tests/_internal/server/routers/test_backends.py +6 -764
- tests/_internal/server/routers/test_fleets.py +2 -26
- tests/_internal/server/routers/test_gateways.py +27 -3
- tests/_internal/server/routers/test_instances.py +0 -10
- tests/_internal/server/routers/test_metrics.py +42 -0
- tests/_internal/server/routers/test_projects.py +56 -0
- tests/_internal/server/routers/test_prometheus.py +333 -0
- tests/_internal/server/routers/test_repos.py +0 -15
- tests/_internal/server/routers/test_runs.py +83 -275
- tests/_internal/server/routers/test_volumes.py +2 -3
- tests/_internal/server/services/backends/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
- tests/_internal/server/services/test_config.py +7 -4
- tests/_internal/server/services/test_fleets.py +1 -4
- tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
- tests/_internal/server/services/test_metrics.py +167 -0
- tests/_internal/server/services/test_repos.py +1 -14
- tests/_internal/server/services/test_runs.py +0 -4
- dstack/_internal/cli/commands/pool.py +0 -581
- dstack/_internal/cli/commands/run.py +0 -75
- dstack/_internal/core/backends/aws/config.py +0 -18
- dstack/_internal/core/backends/azure/config.py +0 -12
- dstack/_internal/core/backends/base/config.py +0 -5
- dstack/_internal/core/backends/cudo/config.py +0 -9
- dstack/_internal/core/backends/datacrunch/config.py +0 -9
- dstack/_internal/core/backends/gcp/config.py +0 -22
- dstack/_internal/core/backends/kubernetes/config.py +0 -6
- dstack/_internal/core/backends/lambdalabs/config.py +0 -9
- dstack/_internal/core/backends/nebius/__init__.py +0 -15
- dstack/_internal/core/backends/nebius/api_client.py +0 -319
- dstack/_internal/core/backends/nebius/compute.py +0 -220
- dstack/_internal/core/backends/nebius/config.py +0 -6
- dstack/_internal/core/backends/nebius/types.py +0 -37
- dstack/_internal/core/backends/oci/config.py +0 -6
- dstack/_internal/core/backends/runpod/config.py +0 -9
- dstack/_internal/core/backends/tensordock/config.py +0 -9
- dstack/_internal/core/backends/vastai/config.py +0 -6
- dstack/_internal/core/backends/vultr/config.py +0 -9
- dstack/_internal/core/models/backends/aws.py +0 -86
- dstack/_internal/core/models/backends/azure.py +0 -68
- dstack/_internal/core/models/backends/cudo.py +0 -43
- dstack/_internal/core/models/backends/datacrunch.py +0 -44
- dstack/_internal/core/models/backends/gcp.py +0 -67
- dstack/_internal/core/models/backends/kubernetes.py +0 -40
- dstack/_internal/core/models/backends/lambdalabs.py +0 -43
- dstack/_internal/core/models/backends/nebius.py +0 -54
- dstack/_internal/core/models/backends/runpod.py +0 -40
- dstack/_internal/core/models/backends/tensordock.py +0 -44
- dstack/_internal/core/models/backends/vastai.py +0 -43
- dstack/_internal/core/models/backends/vultr.py +0 -40
- dstack/_internal/core/models/pools.py +0 -43
- dstack/_internal/server/routers/pools.py +0 -142
- dstack/_internal/server/schemas/pools.py +0 -38
- dstack/_internal/server/services/backends/configurators/base.py +0 -72
- dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
- dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
- dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
- dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
- dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
- dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
- dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
- dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
- dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
- dstack/api/_public/pools.py +0 -41
- dstack/api/_public/resources.py +0 -105
- dstack/api/server/_pools.py +0 -63
- tests/_internal/server/routers/test_pools.py +0 -612
- /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import ipaddress
|
|
2
1
|
import uuid
|
|
3
2
|
from collections.abc import Container, Iterable
|
|
4
3
|
from datetime import datetime, timezone
|
|
@@ -14,14 +13,10 @@ from dstack._internal.core.backends.base.offers import (
|
|
|
14
13
|
offer_to_catalog_item,
|
|
15
14
|
requirements_to_query_filter,
|
|
16
15
|
)
|
|
17
|
-
from dstack._internal.core.errors import (
|
|
18
|
-
ResourceExistsError,
|
|
19
|
-
ResourceNotExistsError,
|
|
20
|
-
ServerClientError,
|
|
21
|
-
)
|
|
22
16
|
from dstack._internal.core.models.backends.base import BackendType
|
|
23
17
|
from dstack._internal.core.models.envs import Env
|
|
24
18
|
from dstack._internal.core.models.instances import (
|
|
19
|
+
Instance,
|
|
25
20
|
InstanceAvailability,
|
|
26
21
|
InstanceConfiguration,
|
|
27
22
|
InstanceOffer,
|
|
@@ -33,10 +28,8 @@ from dstack._internal.core.models.instances import (
|
|
|
33
28
|
SSHConnectionParams,
|
|
34
29
|
SSHKey,
|
|
35
30
|
)
|
|
36
|
-
from dstack._internal.core.models.pools import Instance, Pool, PoolInstances
|
|
37
31
|
from dstack._internal.core.models.profiles import (
|
|
38
|
-
|
|
39
|
-
DEFAULT_POOL_TERMINATION_IDLE_TIME,
|
|
32
|
+
DEFAULT_FLEET_TERMINATION_IDLE_TIME,
|
|
40
33
|
Profile,
|
|
41
34
|
TerminationPolicy,
|
|
42
35
|
)
|
|
@@ -47,198 +40,17 @@ from dstack._internal.core.services.profiles import get_termination
|
|
|
47
40
|
from dstack._internal.server.models import (
|
|
48
41
|
FleetModel,
|
|
49
42
|
InstanceModel,
|
|
50
|
-
PoolModel,
|
|
51
43
|
ProjectModel,
|
|
52
44
|
UserModel,
|
|
53
45
|
)
|
|
54
|
-
from dstack._internal.server.services.locking import get_locker
|
|
55
46
|
from dstack._internal.server.services.offers import generate_shared_offer
|
|
56
47
|
from dstack._internal.server.services.projects import list_project_models, list_user_project_models
|
|
57
48
|
from dstack._internal.utils import common as common_utils
|
|
58
|
-
from dstack._internal.utils import random_names
|
|
59
|
-
from dstack._internal.utils.common import get_current_datetime
|
|
60
49
|
from dstack._internal.utils.logging import get_logger
|
|
61
50
|
|
|
62
51
|
logger = get_logger(__name__)
|
|
63
52
|
|
|
64
53
|
|
|
65
|
-
async def list_project_pools(session: AsyncSession, project: ProjectModel) -> List[Pool]:
|
|
66
|
-
pools = await list_project_pool_models(session=session, project=project)
|
|
67
|
-
if len(pools) == 0:
|
|
68
|
-
pool = await get_or_create_pool_by_name(session, project, DEFAULT_POOL_NAME)
|
|
69
|
-
pools.append(pool)
|
|
70
|
-
return [pool_model_to_pool(p) for p in pools]
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
async def get_pool(
|
|
74
|
-
session: AsyncSession,
|
|
75
|
-
project: ProjectModel,
|
|
76
|
-
pool_name: str,
|
|
77
|
-
select_deleted: bool = False,
|
|
78
|
-
load_instance_fleets: bool = False,
|
|
79
|
-
) -> Optional[PoolModel]:
|
|
80
|
-
filters = [
|
|
81
|
-
PoolModel.name == pool_name,
|
|
82
|
-
PoolModel.project_id == project.id,
|
|
83
|
-
]
|
|
84
|
-
if not select_deleted:
|
|
85
|
-
filters.append(PoolModel.deleted == False)
|
|
86
|
-
query = select(PoolModel).where(*filters)
|
|
87
|
-
if load_instance_fleets:
|
|
88
|
-
query = query.options(joinedload(PoolModel.instances, InstanceModel.fleet))
|
|
89
|
-
res = await session.scalars(query)
|
|
90
|
-
return res.one_or_none()
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
async def get_or_create_pool_by_name(
|
|
94
|
-
session: AsyncSession,
|
|
95
|
-
project: ProjectModel,
|
|
96
|
-
pool_name: Optional[str],
|
|
97
|
-
load_instance_fleets: bool = False,
|
|
98
|
-
) -> PoolModel:
|
|
99
|
-
if pool_name is None:
|
|
100
|
-
if project.default_pool_id is not None:
|
|
101
|
-
return await get_default_pool_or_error(session, project, load_instance_fleets)
|
|
102
|
-
default_pool = await get_pool(
|
|
103
|
-
session, project, DEFAULT_POOL_NAME, load_instance_fleets=load_instance_fleets
|
|
104
|
-
)
|
|
105
|
-
if default_pool is not None:
|
|
106
|
-
await set_default_pool(session, project, DEFAULT_POOL_NAME)
|
|
107
|
-
return default_pool
|
|
108
|
-
return await create_pool(session, project, DEFAULT_POOL_NAME)
|
|
109
|
-
pool = await get_pool(session, project, pool_name, load_instance_fleets=load_instance_fleets)
|
|
110
|
-
if pool is not None:
|
|
111
|
-
return pool
|
|
112
|
-
return await create_pool(session, project, pool_name)
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
async def get_default_pool_or_error(
|
|
116
|
-
session: AsyncSession, project: ProjectModel, load_instance_fleets: bool = False
|
|
117
|
-
) -> PoolModel:
|
|
118
|
-
query = select(PoolModel).where(PoolModel.id == project.default_pool_id)
|
|
119
|
-
if load_instance_fleets:
|
|
120
|
-
query = query.options(joinedload(PoolModel.instances, InstanceModel.fleet))
|
|
121
|
-
res = await session.execute(query)
|
|
122
|
-
return res.scalar_one()
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
async def create_pool(session: AsyncSession, project: ProjectModel, name: str) -> PoolModel:
|
|
126
|
-
pool = await get_pool(session, project, name)
|
|
127
|
-
if pool is not None:
|
|
128
|
-
raise ResourceExistsError()
|
|
129
|
-
pool = PoolModel(
|
|
130
|
-
name=name,
|
|
131
|
-
project_id=project.id,
|
|
132
|
-
)
|
|
133
|
-
session.add(pool)
|
|
134
|
-
await session.commit()
|
|
135
|
-
await session.refresh(pool)
|
|
136
|
-
if project.default_pool_id is None:
|
|
137
|
-
await set_default_pool(session, project, pool.name)
|
|
138
|
-
return pool
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
async def list_project_pool_models(
|
|
142
|
-
session: AsyncSession, project: ProjectModel, select_deleted: bool = False
|
|
143
|
-
) -> List[PoolModel]:
|
|
144
|
-
filters = [PoolModel.project_id == project.id]
|
|
145
|
-
if not select_deleted:
|
|
146
|
-
filters.append(PoolModel.deleted == select_deleted)
|
|
147
|
-
pools = await session.execute(
|
|
148
|
-
select(PoolModel).where(*filters).options(joinedload(PoolModel.instances))
|
|
149
|
-
)
|
|
150
|
-
return list(pools.scalars().unique().all())
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
async def set_default_pool(session: AsyncSession, project: ProjectModel, pool_name: str):
|
|
154
|
-
pool = await get_pool(session, project, pool_name)
|
|
155
|
-
if pool is None:
|
|
156
|
-
raise ResourceNotExistsError("Pool not found")
|
|
157
|
-
project.default_pool = pool
|
|
158
|
-
await session.commit()
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
async def delete_pool(session: AsyncSession, project: ProjectModel, pool_name: str) -> None:
|
|
162
|
-
# TODO force delete
|
|
163
|
-
pool = await get_pool(session, project, pool_name)
|
|
164
|
-
if pool is None:
|
|
165
|
-
raise ResourceNotExistsError("Pool not found")
|
|
166
|
-
|
|
167
|
-
pool_instances = get_pool_instances(pool)
|
|
168
|
-
for instance in pool_instances:
|
|
169
|
-
if instance.status != InstanceStatus.TERMINATED:
|
|
170
|
-
raise ServerClientError("Cannot delete pool with running instances")
|
|
171
|
-
|
|
172
|
-
pool.deleted = True
|
|
173
|
-
pool.deleted_at = get_current_datetime()
|
|
174
|
-
if project.default_pool_id == pool.id:
|
|
175
|
-
project.default_pool_id = None
|
|
176
|
-
await session.commit()
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
def pool_model_to_pool(pool_model: PoolModel) -> Pool:
|
|
180
|
-
total = 0
|
|
181
|
-
available = 0
|
|
182
|
-
for instance in pool_model.instances:
|
|
183
|
-
if not instance.deleted:
|
|
184
|
-
total += 1
|
|
185
|
-
if instance.status.is_available():
|
|
186
|
-
available += 1
|
|
187
|
-
return Pool(
|
|
188
|
-
name=pool_model.name,
|
|
189
|
-
default=pool_model.project.default_pool_id == pool_model.id,
|
|
190
|
-
created_at=pool_model.created_at.replace(tzinfo=timezone.utc),
|
|
191
|
-
total_instances=total,
|
|
192
|
-
available_instances=available,
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
async def remove_instance(
|
|
197
|
-
session: AsyncSession,
|
|
198
|
-
project: ProjectModel,
|
|
199
|
-
pool_name: str,
|
|
200
|
-
instance_name: str,
|
|
201
|
-
force: bool,
|
|
202
|
-
):
|
|
203
|
-
# This is a buggy function since it doesn't lock instances (and never did correctly).
|
|
204
|
-
# No need to fix it since it's deprecated.
|
|
205
|
-
pool = await get_pool(session, project, pool_name)
|
|
206
|
-
if pool is None:
|
|
207
|
-
raise ResourceNotExistsError("Pool not found")
|
|
208
|
-
terminated = False
|
|
209
|
-
for instance in pool.instances:
|
|
210
|
-
if instance.name == instance_name:
|
|
211
|
-
if force or not instance.jobs:
|
|
212
|
-
instance.status = InstanceStatus.TERMINATING
|
|
213
|
-
terminated = True
|
|
214
|
-
await session.commit()
|
|
215
|
-
if not terminated:
|
|
216
|
-
raise ResourceNotExistsError("Could not find instance to terminate")
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
async def show_pool_instances(
|
|
220
|
-
session: AsyncSession, project: ProjectModel, pool_name: Optional[str]
|
|
221
|
-
) -> PoolInstances:
|
|
222
|
-
if pool_name is not None:
|
|
223
|
-
pool = await get_pool(session, project, pool_name, load_instance_fleets=True)
|
|
224
|
-
if pool is None:
|
|
225
|
-
raise ResourceNotExistsError("Pool not found")
|
|
226
|
-
else:
|
|
227
|
-
pool = await get_or_create_pool_by_name(
|
|
228
|
-
session, project, pool_name, load_instance_fleets=True
|
|
229
|
-
)
|
|
230
|
-
pool_instances = get_pool_instances(pool)
|
|
231
|
-
instances = list(map(instance_model_to_instance, pool_instances))
|
|
232
|
-
return PoolInstances(
|
|
233
|
-
name=pool.name,
|
|
234
|
-
instances=instances,
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def get_pool_instances(pool: PoolModel) -> List[InstanceModel]:
|
|
239
|
-
return [instance for instance in pool.instances if not instance.deleted]
|
|
240
|
-
|
|
241
|
-
|
|
242
54
|
def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
243
55
|
instance = Instance(
|
|
244
56
|
id=instance_model.id,
|
|
@@ -315,120 +127,6 @@ def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, O
|
|
|
315
127
|
return host_private_key, proxy_private_keys[0]
|
|
316
128
|
|
|
317
129
|
|
|
318
|
-
async def generate_instance_name(
|
|
319
|
-
session: AsyncSession,
|
|
320
|
-
project: ProjectModel,
|
|
321
|
-
pool_name: str,
|
|
322
|
-
) -> str:
|
|
323
|
-
# FIXME: The locking is not correct since concurrently commited changes
|
|
324
|
-
# are not visible due to SQLite repeatable reads
|
|
325
|
-
lock, _ = get_locker().get_lockset(f"instance_names_{project.name}")
|
|
326
|
-
async with lock:
|
|
327
|
-
pool_instances = []
|
|
328
|
-
pool = await get_pool(session, project, pool_name)
|
|
329
|
-
if pool is not None:
|
|
330
|
-
pool_instances = get_pool_instances(pool)
|
|
331
|
-
names = {g.name for g in pool_instances}
|
|
332
|
-
while True:
|
|
333
|
-
name = f"{random_names.generate_name()}"
|
|
334
|
-
if name not in names:
|
|
335
|
-
return name
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
async def add_remote(
|
|
339
|
-
session: AsyncSession,
|
|
340
|
-
project: ProjectModel,
|
|
341
|
-
pool_name: Optional[str],
|
|
342
|
-
instance_name: Optional[str],
|
|
343
|
-
instance_network: Optional[str],
|
|
344
|
-
region: Optional[str],
|
|
345
|
-
host: str,
|
|
346
|
-
port: int,
|
|
347
|
-
ssh_user: str,
|
|
348
|
-
ssh_keys: List[SSHKey],
|
|
349
|
-
) -> Instance:
|
|
350
|
-
if instance_network is not None:
|
|
351
|
-
try:
|
|
352
|
-
interface = ipaddress.IPv4Interface(instance_network)
|
|
353
|
-
instance_network = str(interface.network)
|
|
354
|
-
except ipaddress.AddressValueError:
|
|
355
|
-
raise ServerClientError("Failed to parse network value")
|
|
356
|
-
|
|
357
|
-
# Check instance in all instances
|
|
358
|
-
pools = await list_project_pool_models(session, project)
|
|
359
|
-
for pool in pools:
|
|
360
|
-
for instance in pool.instances:
|
|
361
|
-
if instance.deleted:
|
|
362
|
-
continue
|
|
363
|
-
if instance.remote_connection_info is not None:
|
|
364
|
-
rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
|
|
365
|
-
if rci.host == host and rci.port == port and rci.ssh_user == ssh_user:
|
|
366
|
-
return instance_model_to_instance(instance)
|
|
367
|
-
|
|
368
|
-
pool_model = await get_or_create_pool_by_name(session, project, pool_name)
|
|
369
|
-
pool_model_name = pool_model.name
|
|
370
|
-
if instance_name is None:
|
|
371
|
-
instance_name = await generate_instance_name(session, project, pool_model_name)
|
|
372
|
-
|
|
373
|
-
# TODO: doc - will overwrite after remote connected
|
|
374
|
-
instance_resource = Resources(cpus=2, memory_mib=8, gpus=[], spot=False)
|
|
375
|
-
instance_type = InstanceType(name="ssh", resources=instance_resource)
|
|
376
|
-
|
|
377
|
-
host_region = region if region is not None else "remote"
|
|
378
|
-
|
|
379
|
-
remote = JobProvisioningData(
|
|
380
|
-
backend=BackendType.REMOTE,
|
|
381
|
-
instance_type=instance_type,
|
|
382
|
-
instance_id=instance_name,
|
|
383
|
-
hostname=host,
|
|
384
|
-
region=host_region,
|
|
385
|
-
internal_ip=None,
|
|
386
|
-
instance_network=instance_network,
|
|
387
|
-
price=0,
|
|
388
|
-
username=ssh_user,
|
|
389
|
-
ssh_port=port,
|
|
390
|
-
dockerized=True,
|
|
391
|
-
backend_data="",
|
|
392
|
-
ssh_proxy=None,
|
|
393
|
-
)
|
|
394
|
-
offer = InstanceOfferWithAvailability(
|
|
395
|
-
backend=BackendType.REMOTE,
|
|
396
|
-
instance=instance_type,
|
|
397
|
-
region=host_region,
|
|
398
|
-
price=0.0,
|
|
399
|
-
availability=InstanceAvailability.AVAILABLE,
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
ssh_connection_info = RemoteConnectionInfo(
|
|
403
|
-
host=host, port=port, ssh_user=ssh_user, ssh_keys=ssh_keys
|
|
404
|
-
).json()
|
|
405
|
-
|
|
406
|
-
im = InstanceModel(
|
|
407
|
-
id=uuid.uuid4(),
|
|
408
|
-
name=instance_name,
|
|
409
|
-
instance_num=0,
|
|
410
|
-
project=project,
|
|
411
|
-
pool=pool_model,
|
|
412
|
-
backend=BackendType.REMOTE,
|
|
413
|
-
created_at=common_utils.get_current_datetime(),
|
|
414
|
-
started_at=common_utils.get_current_datetime(),
|
|
415
|
-
status=InstanceStatus.PENDING,
|
|
416
|
-
unreachable=False,
|
|
417
|
-
job_provisioning_data=remote.json(),
|
|
418
|
-
remote_connection_info=ssh_connection_info,
|
|
419
|
-
offer=offer.json(),
|
|
420
|
-
region=offer.region,
|
|
421
|
-
price=offer.price,
|
|
422
|
-
termination_policy=TerminationPolicy.DONT_DESTROY,
|
|
423
|
-
termination_idle_time=0,
|
|
424
|
-
)
|
|
425
|
-
session.add(im)
|
|
426
|
-
await session.commit()
|
|
427
|
-
|
|
428
|
-
instance = instance_model_to_instance(im)
|
|
429
|
-
return instance
|
|
430
|
-
|
|
431
|
-
|
|
432
130
|
def filter_pool_instances(
|
|
433
131
|
pool_instances: List[InstanceModel],
|
|
434
132
|
profile: Profile,
|
|
@@ -483,8 +181,6 @@ def filter_pool_instances(
|
|
|
483
181
|
continue
|
|
484
182
|
if instance.unreachable:
|
|
485
183
|
continue
|
|
486
|
-
if profile.instance_name is not None and instance.name != profile.instance_name:
|
|
487
|
-
continue
|
|
488
184
|
if status is not None and instance.status != status:
|
|
489
185
|
continue
|
|
490
186
|
jpd = get_instance_provisioning_data(instance)
|
|
@@ -567,11 +263,24 @@ def get_shared_pool_instances_with_offers(
|
|
|
567
263
|
return instances_with_offers
|
|
568
264
|
|
|
569
265
|
|
|
570
|
-
async def
|
|
266
|
+
async def get_pool_instances(
    session: AsyncSession,
    project: ProjectModel,
) -> List[InstanceModel]:
    """Load all instances of ``project`` that are not marked as deleted.

    Args:
        session: Async database session used to run the query.
        project: Project whose instances are selected (matched by ``project.id``).

    Returns:
        A list of the matching ``InstanceModel`` rows.
    """
    # `== False` builds a SQL expression here; it is not a Python truthiness test.
    query = select(InstanceModel).where(
        InstanceModel.project_id == project.id,
        InstanceModel.deleted == False,
    )
    result = await session.execute(query)
    return list(result.unique().scalars().all())
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
async def list_projects_instance_models(
|
|
571
281
|
session: AsyncSession,
|
|
572
282
|
projects: List[ProjectModel],
|
|
573
283
|
fleet_ids: Optional[Iterable[uuid.UUID]],
|
|
574
|
-
pool: Optional[PoolModel],
|
|
575
284
|
only_active: bool,
|
|
576
285
|
prev_created_at: Optional[datetime],
|
|
577
286
|
prev_id: Optional[uuid.UUID],
|
|
@@ -583,8 +292,6 @@ async def list_pools_instance_models(
|
|
|
583
292
|
]
|
|
584
293
|
if fleet_ids is not None:
|
|
585
294
|
filters.append(InstanceModel.fleet_id.in_(fleet_ids))
|
|
586
|
-
if pool is not None:
|
|
587
|
-
filters.append(InstanceModel.pool_id == pool.id)
|
|
588
295
|
if only_active:
|
|
589
296
|
filters.extend(
|
|
590
297
|
[
|
|
@@ -628,18 +335,17 @@ async def list_pools_instance_models(
|
|
|
628
335
|
.where(*filters)
|
|
629
336
|
.order_by(*order_by)
|
|
630
337
|
.limit(limit)
|
|
631
|
-
.options(joinedload(InstanceModel.
|
|
338
|
+
.options(joinedload(InstanceModel.fleet))
|
|
632
339
|
)
|
|
633
340
|
instance_models = list(res.unique().scalars().all())
|
|
634
341
|
return instance_models
|
|
635
342
|
|
|
636
343
|
|
|
637
|
-
async def
|
|
344
|
+
async def list_user_instances(
|
|
638
345
|
session: AsyncSession,
|
|
639
346
|
user: UserModel,
|
|
640
347
|
project_names: Optional[Container[str]],
|
|
641
348
|
fleet_ids: Optional[Iterable[uuid.UUID]],
|
|
642
|
-
pool_name: Optional[str],
|
|
643
349
|
only_active: bool,
|
|
644
350
|
prev_created_at: Optional[datetime],
|
|
645
351
|
prev_id: Optional[uuid.UUID],
|
|
@@ -653,24 +359,15 @@ async def list_user_pool_instances(
|
|
|
653
359
|
if not projects:
|
|
654
360
|
return []
|
|
655
361
|
|
|
656
|
-
pool = None
|
|
657
362
|
if project_names is not None:
|
|
658
363
|
projects = [proj for proj in projects if proj.name in project_names]
|
|
659
364
|
if len(projects) == 0:
|
|
660
365
|
return []
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
session=session,
|
|
664
|
-
project=projects[0],
|
|
665
|
-
pool_name=pool_name,
|
|
666
|
-
select_deleted=(not only_active),
|
|
667
|
-
)
|
|
668
|
-
|
|
669
|
-
instance_models = await list_pools_instance_models(
|
|
366
|
+
|
|
367
|
+
instance_models = await list_projects_instance_models(
|
|
670
368
|
session=session,
|
|
671
369
|
projects=projects,
|
|
672
370
|
fleet_ids=fleet_ids,
|
|
673
|
-
pool=pool,
|
|
674
371
|
only_active=only_active,
|
|
675
372
|
prev_created_at=prev_created_at,
|
|
676
373
|
prev_id=prev_id,
|
|
@@ -699,7 +396,6 @@ async def create_instance_model(
|
|
|
699
396
|
session: AsyncSession,
|
|
700
397
|
project: ProjectModel,
|
|
701
398
|
user: UserModel,
|
|
702
|
-
pool: PoolModel,
|
|
703
399
|
profile: Profile,
|
|
704
400
|
requirements: Requirements,
|
|
705
401
|
instance_name: str,
|
|
@@ -709,7 +405,7 @@ async def create_instance_model(
|
|
|
709
405
|
blocks: Union[Literal["auto"], int],
|
|
710
406
|
) -> InstanceModel:
|
|
711
407
|
termination_policy, termination_idle_time = get_termination(
|
|
712
|
-
profile,
|
|
408
|
+
profile, DEFAULT_FLEET_TERMINATION_IDLE_TIME
|
|
713
409
|
)
|
|
714
410
|
instance_id = uuid.uuid4()
|
|
715
411
|
project_ssh_key = SSHKey(
|
|
@@ -730,7 +426,6 @@ async def create_instance_model(
|
|
|
730
426
|
name=instance_name,
|
|
731
427
|
instance_num=instance_num,
|
|
732
428
|
project=project,
|
|
733
|
-
pool=pool,
|
|
734
429
|
created_at=common_utils.get_current_datetime(),
|
|
735
430
|
status=InstanceStatus.PENDING,
|
|
736
431
|
unreachable=False,
|
|
@@ -748,7 +443,6 @@ async def create_instance_model(
|
|
|
748
443
|
|
|
749
444
|
async def create_ssh_instance_model(
|
|
750
445
|
project: ProjectModel,
|
|
751
|
-
pool: PoolModel,
|
|
752
446
|
instance_name: str,
|
|
753
447
|
instance_num: int,
|
|
754
448
|
internal_ip: Optional[str],
|
|
@@ -805,7 +499,6 @@ async def create_ssh_instance_model(
|
|
|
805
499
|
name=instance_name,
|
|
806
500
|
instance_num=instance_num,
|
|
807
501
|
project=project,
|
|
808
|
-
pool=pool,
|
|
809
502
|
backend=BackendType.REMOTE,
|
|
810
503
|
created_at=common_utils.get_current_datetime(),
|
|
811
504
|
started_at=common_utils.get_current_datetime(),
|
|
@@ -10,7 +10,8 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
10
10
|
from sqlalchemy.orm import joinedload
|
|
11
11
|
|
|
12
12
|
import dstack._internal.server.services.backends as backends_services
|
|
13
|
-
from dstack._internal.core.backends.base import Backend
|
|
13
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
14
|
+
from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
|
|
14
15
|
from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
|
|
15
16
|
from dstack._internal.core.errors import (
|
|
16
17
|
BackendError,
|
|
@@ -42,6 +43,7 @@ from dstack._internal.server.models import (
|
|
|
42
43
|
)
|
|
43
44
|
from dstack._internal.server.services import services
|
|
44
45
|
from dstack._internal.server.services import volumes as volumes_services
|
|
46
|
+
from dstack._internal.server.services.instances import get_instance_ssh_private_keys
|
|
45
47
|
from dstack._internal.server.services.jobs.configurators.base import (
|
|
46
48
|
JobConfigurator,
|
|
47
49
|
interpolate_job_volumes,
|
|
@@ -50,7 +52,6 @@ from dstack._internal.server.services.jobs.configurators.dev import DevEnvironme
|
|
|
50
52
|
from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
|
|
51
53
|
from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
|
|
52
54
|
from dstack._internal.server.services.logging import fmt
|
|
53
|
-
from dstack._internal.server.services.pools import get_instance_ssh_private_keys
|
|
54
55
|
from dstack._internal.server.services.runner import client
|
|
55
56
|
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
56
57
|
from dstack._internal.server.services.volumes import (
|
|
@@ -461,24 +462,26 @@ async def _detach_volume_from_job_instance(
|
|
|
461
462
|
if volume.provisioning_data is None or not volume.provisioning_data.detachable:
|
|
462
463
|
# Backends without `detach_volume` detach volumes automatically
|
|
463
464
|
return detached
|
|
465
|
+
compute = backend.compute()
|
|
466
|
+
assert isinstance(compute, ComputeWithVolumeSupport)
|
|
464
467
|
try:
|
|
465
468
|
if job_model.volumes_detached_at is None:
|
|
466
469
|
# We haven't tried detaching volumes yet, try soft detach first
|
|
467
470
|
await run_async(
|
|
468
|
-
|
|
471
|
+
compute.detach_volume,
|
|
469
472
|
volume=volume,
|
|
470
473
|
instance_id=jpd.instance_id,
|
|
471
474
|
force=False,
|
|
472
475
|
)
|
|
473
476
|
# For some backends, the volume may be detached immediately
|
|
474
477
|
detached = await run_async(
|
|
475
|
-
|
|
478
|
+
compute.is_volume_detached,
|
|
476
479
|
volume=volume,
|
|
477
480
|
instance_id=jpd.instance_id,
|
|
478
481
|
)
|
|
479
482
|
else:
|
|
480
483
|
detached = await run_async(
|
|
481
|
-
|
|
484
|
+
compute.is_volume_detached,
|
|
482
485
|
volume=volume,
|
|
483
486
|
instance_id=jpd.instance_id,
|
|
484
487
|
)
|
|
@@ -489,7 +492,7 @@ async def _detach_volume_from_job_instance(
|
|
|
489
492
|
instance_model.name,
|
|
490
493
|
)
|
|
491
494
|
await run_async(
|
|
492
|
-
|
|
495
|
+
compute.detach_volume,
|
|
493
496
|
volume=volume,
|
|
494
497
|
instance_id=jpd.instance_id,
|
|
495
498
|
force=True,
|
|
@@ -13,10 +13,15 @@ from dstack._internal.core.models.configurations import (
|
|
|
13
13
|
PythonVersion,
|
|
14
14
|
RunConfigurationType,
|
|
15
15
|
)
|
|
16
|
-
from dstack._internal.core.models.profiles import
|
|
16
|
+
from dstack._internal.core.models.profiles import (
|
|
17
|
+
DEFAULT_STOP_DURATION,
|
|
18
|
+
SpotPolicy,
|
|
19
|
+
UtilizationPolicy,
|
|
20
|
+
)
|
|
17
21
|
from dstack._internal.core.models.runs import (
|
|
18
22
|
AppSpec,
|
|
19
23
|
JobSpec,
|
|
24
|
+
JobSSHKey,
|
|
20
25
|
Requirements,
|
|
21
26
|
Retry,
|
|
22
27
|
RunSpec,
|
|
@@ -26,6 +31,7 @@ from dstack._internal.core.models.volumes import MountPoint, VolumeMountPoint
|
|
|
26
31
|
from dstack._internal.core.services.profiles import get_retry
|
|
27
32
|
from dstack._internal.core.services.ssh.ports import filter_reserved_ports
|
|
28
33
|
from dstack._internal.server.services.docker import ImageConfig, get_image_config
|
|
34
|
+
from dstack._internal.utils import crypto
|
|
29
35
|
from dstack._internal.utils.common import run_async
|
|
30
36
|
from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
|
|
31
37
|
|
|
@@ -53,6 +59,8 @@ class JobConfigurator(ABC):
|
|
|
53
59
|
TYPE: RunConfigurationType
|
|
54
60
|
|
|
55
61
|
_image_config: Optional[ImageConfig] = None
|
|
62
|
+
# JobSSHKey should be shared for all jobs in a replica for inter-node communication.
|
|
63
|
+
_job_ssh_key: Optional[JobSSHKey] = None
|
|
56
64
|
|
|
57
65
|
def __init__(self, run_spec: RunSpec):
|
|
58
66
|
self.run_spec = run_spec
|
|
@@ -113,11 +121,13 @@ class JobConfigurator(ABC):
|
|
|
113
121
|
single_branch=self._single_branch(),
|
|
114
122
|
max_duration=self._max_duration(),
|
|
115
123
|
stop_duration=self._stop_duration(),
|
|
124
|
+
utilization_policy=self._utilization_policy(),
|
|
116
125
|
registry_auth=self._registry_auth(),
|
|
117
126
|
requirements=self._requirements(),
|
|
118
127
|
retry=self._retry(),
|
|
119
128
|
working_dir=self._working_dir(),
|
|
120
129
|
volumes=self._volumes(job_num),
|
|
130
|
+
ssh_key=self._ssh_key(jobs_per_replica),
|
|
121
131
|
)
|
|
122
132
|
return job_spec
|
|
123
133
|
|
|
@@ -201,6 +211,9 @@ class JobConfigurator(ABC):
|
|
|
201
211
|
# pydantic validator ensures this is int
|
|
202
212
|
return self.run_spec.merged_profile.stop_duration
|
|
203
213
|
|
|
214
|
+
def _utilization_policy(self) -> Optional[UtilizationPolicy]:
|
|
215
|
+
return self.run_spec.merged_profile.utilization_policy
|
|
216
|
+
|
|
204
217
|
def _registry_auth(self) -> Optional[RegistryAuth]:
|
|
205
218
|
return self.run_spec.configuration.registry_auth
|
|
206
219
|
|
|
@@ -230,6 +243,17 @@ class JobConfigurator(ABC):
|
|
|
230
243
|
def _volumes(self, job_num: int) -> List[MountPoint]:
|
|
231
244
|
return interpolate_job_volumes(self.run_spec.configuration.volumes, job_num)
|
|
232
245
|
|
|
246
|
+
def _ssh_key(self, jobs_per_replica: int) -> Optional[JobSSHKey]:
|
|
247
|
+
if jobs_per_replica < 2:
|
|
248
|
+
return None
|
|
249
|
+
if self._job_ssh_key is None:
|
|
250
|
+
private, public = crypto.generate_rsa_key_pair_bytes(comment="dstack_job")
|
|
251
|
+
self._job_ssh_key = JobSSHKey(
|
|
252
|
+
private=private.decode(),
|
|
253
|
+
public=public.decode(),
|
|
254
|
+
)
|
|
255
|
+
return self._job_ssh_key
|
|
256
|
+
|
|
233
257
|
|
|
234
258
|
def interpolate_job_volumes(
|
|
235
259
|
run_volumes: List[Union[MountPoint, str]],
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from typing import List, Optional
|
|
2
2
|
|
|
3
|
+
from dstack._internal.core.errors import ServerClientError
|
|
3
4
|
from dstack._internal.core.models.configurations import PortMapping, RunConfigurationType
|
|
4
5
|
from dstack._internal.core.models.profiles import SpotPolicy
|
|
5
6
|
from dstack._internal.core.models.runs import RunSpec
|
|
6
7
|
from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
|
|
8
|
+
from dstack._internal.server.services.jobs.configurators.extensions.cursor import CursorDesktop
|
|
7
9
|
from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
|
|
8
10
|
|
|
9
11
|
INSTALL_IPYKERNEL = (
|
|
@@ -16,7 +18,13 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
|
|
|
16
18
|
TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT
|
|
17
19
|
|
|
18
20
|
def __init__(self, run_spec: RunSpec):
|
|
19
|
-
|
|
21
|
+
if run_spec.configuration.ide == "vscode":
|
|
22
|
+
__class = VSCodeDesktop
|
|
23
|
+
elif run_spec.configuration.ide == "cursor":
|
|
24
|
+
__class = CursorDesktop
|
|
25
|
+
else:
|
|
26
|
+
raise ServerClientError(f"Unsupported IDE: {run_spec.configuration.ide}")
|
|
27
|
+
self.ide = __class(
|
|
20
28
|
run_name=run_spec.run_name,
|
|
21
29
|
version=run_spec.configuration.version,
|
|
22
30
|
extensions=["ms-python.python", "ms-toolsai.jupyter"],
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CursorDesktop:
    """Generates shell commands that set up and advertise a Cursor remote session.

    Attributes are stored as given:
        run_name: Name of the run; used as the SSH host in the ``cursor://`` link.
        version: Cursor server release identifier. Although annotated as ``str``,
            a ``None`` value is tolerated and disables installation.
        extensions: Extension identifiers to install into the Cursor server.
    """

    def __init__(
        self,
        run_name: str,
        version: str,
        extensions: List[str],
    ):
        self.run_name = run_name
        self.version = version
        self.extensions = extensions

    def get_install_commands(self) -> List[str]:
        """Return shell commands that download and unpack the Cursor server.

        Returns:
            An empty list when no version is set. Otherwise the commands that
            detect the CPU architecture, download the matching server archive,
            unpack it, remove the archive and, if any extensions are configured,
            install them with ``cursor-server``.
        """
        # The extension installer is located inside the unpacked server directory,
        # so without a version there is nothing to download and nothing to run.
        if self.version is None:
            return []
        # `$arch` is left for the shell to expand; the first command below sets it.
        url = f"https://cursor.blob.core.windows.net/remote-releases/{self.version}/vscode-reh-linux-$arch.tar.gz"
        archive = "vscode-reh-linux-$arch.tar.gz"
        target = f'~/.cursor-server/cli/servers/"Stable-{self.version}"/server'
        commands = [
            'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi',
            "mkdir -p /tmp",
            f'wget -q --show-progress "{url}" -O "/tmp/{archive}"',
            f"mkdir -vp {target}",
            f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"',
            f'rm "/tmp/{archive}"',
        ]
        if self.extensions:
            extensions = " ".join(f'--install-extension "{name}"' for name in self.extensions)
            commands.append(f'PATH="$PATH":{target}/bin cursor-server {extensions}')
        return commands

    def get_print_readme_commands(self) -> List[str]:
        """Return ``echo`` commands that print the link for opening the run in Cursor."""
        return [
            # This class emits a cursor:// link, so the hint names Cursor rather than VS Code.
            "echo To open in Cursor, use link below:",
            "echo ''",
            f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'",  # TODO use $REPO_DIR
            "echo ''",
        ]
|