dstack 0.19.25rc1__py3-none-any.whl → 0.19.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dstack/_internal/cli/commands/__init__.py +2 -2
- dstack/_internal/cli/commands/apply.py +3 -61
- dstack/_internal/cli/commands/attach.py +1 -1
- dstack/_internal/cli/commands/completion.py +1 -1
- dstack/_internal/cli/commands/delete.py +2 -2
- dstack/_internal/cli/commands/fleet.py +1 -1
- dstack/_internal/cli/commands/gateway.py +2 -2
- dstack/_internal/cli/commands/init.py +56 -24
- dstack/_internal/cli/commands/logs.py +1 -1
- dstack/_internal/cli/commands/metrics.py +1 -1
- dstack/_internal/cli/commands/offer.py +45 -7
- dstack/_internal/cli/commands/project.py +2 -2
- dstack/_internal/cli/commands/secrets.py +2 -2
- dstack/_internal/cli/commands/server.py +1 -1
- dstack/_internal/cli/commands/stop.py +1 -1
- dstack/_internal/cli/commands/volume.py +1 -1
- dstack/_internal/cli/main.py +2 -2
- dstack/_internal/cli/services/completion.py +2 -2
- dstack/_internal/cli/services/configurators/__init__.py +6 -2
- dstack/_internal/cli/services/configurators/base.py +6 -7
- dstack/_internal/cli/services/configurators/fleet.py +1 -3
- dstack/_internal/cli/services/configurators/gateway.py +2 -4
- dstack/_internal/cli/services/configurators/run.py +195 -58
- dstack/_internal/cli/services/configurators/volume.py +2 -4
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/cli/services/repos.py +51 -47
- dstack/_internal/core/backends/aws/configurator.py +11 -7
- dstack/_internal/core/backends/azure/configurator.py +11 -7
- dstack/_internal/core/backends/base/configurator.py +25 -13
- dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
- dstack/_internal/core/backends/cudo/configurator.py +11 -7
- dstack/_internal/core/backends/datacrunch/compute.py +5 -1
- dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
- dstack/_internal/core/backends/gcp/configurator.py +11 -7
- dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
- dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
- dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/compute.py +1 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -7
- dstack/_internal/core/backends/nebius/resources.py +21 -11
- dstack/_internal/core/backends/oci/configurator.py +11 -7
- dstack/_internal/core/backends/runpod/configurator.py +11 -7
- dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
- dstack/_internal/core/backends/tensordock/configurator.py +13 -7
- dstack/_internal/core/backends/vastai/configurator.py +11 -7
- dstack/_internal/core/backends/vultr/configurator.py +11 -4
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/runs.py +1 -0
- dstack/_internal/core/models/common.py +3 -3
- dstack/_internal/core/models/configurations.py +172 -27
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +5 -1
- dstack/_internal/core/models/profiles.py +41 -11
- dstack/_internal/core/models/resources.py +46 -42
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/configs/__init__.py +6 -3
- dstack/_internal/core/services/profiles.py +2 -2
- dstack/_internal/core/services/repos.py +5 -3
- dstack/_internal/core/services/ssh/ports.py +1 -1
- dstack/_internal/proxy/lib/deps.py +6 -2
- dstack/_internal/server/app.py +22 -17
- dstack/_internal/server/background/tasks/process_gateways.py +4 -1
- dstack/_internal/server/background/tasks/process_instances.py +10 -2
- dstack/_internal/server/background/tasks/process_probes.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_runs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/db.py +8 -4
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gpus.py +1 -6
- dstack/_internal/server/schemas/runner.py +10 -0
- dstack/_internal/server/services/backends/__init__.py +14 -8
- dstack/_internal/server/services/backends/handlers.py +6 -1
- dstack/_internal/server/services/docker.py +5 -5
- dstack/_internal/server/services/fleets.py +14 -13
- dstack/_internal/server/services/gateways/__init__.py +2 -0
- dstack/_internal/server/services/gateways/client.py +5 -2
- dstack/_internal/server/services/gateways/connection.py +1 -1
- dstack/_internal/server/services/gpus.py +50 -49
- dstack/_internal/server/services/instances.py +41 -1
- dstack/_internal/server/services/jobs/__init__.py +15 -4
- dstack/_internal/server/services/jobs/configurators/base.py +7 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
- dstack/_internal/server/services/jobs/configurators/service.py +1 -0
- dstack/_internal/server/services/jobs/configurators/task.py +3 -0
- dstack/_internal/server/services/locking.py +5 -5
- dstack/_internal/server/services/logging.py +10 -2
- dstack/_internal/server/services/logs/__init__.py +8 -6
- dstack/_internal/server/services/logs/aws.py +330 -327
- dstack/_internal/server/services/logs/filelog.py +7 -6
- dstack/_internal/server/services/logs/gcp.py +141 -139
- dstack/_internal/server/services/plugins.py +1 -1
- dstack/_internal/server/services/projects.py +2 -5
- dstack/_internal/server/services/proxy/repo.py +5 -1
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +259 -0
- dstack/_internal/server/services/runner/client.py +7 -0
- dstack/_internal/server/services/runs.py +1 -1
- dstack/_internal/server/services/services/__init__.py +8 -2
- dstack/_internal/server/services/services/autoscalers.py +2 -0
- dstack/_internal/server/services/ssh.py +2 -1
- dstack/_internal/server/services/storage/__init__.py +5 -6
- dstack/_internal/server/services/storage/gcs.py +49 -49
- dstack/_internal/server/services/storage/s3.py +52 -52
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/testing/common.py +1 -1
- dstack/_internal/server/utils/logging.py +3 -3
- dstack/_internal/server/utils/provisioning.py +3 -3
- dstack/_internal/utils/json_schema.py +3 -1
- dstack/_internal/utils/typing.py +14 -0
- dstack/api/_public/repos.py +21 -2
- dstack/api/_public/runs.py +5 -7
- dstack/api/server/__init__.py +17 -19
- dstack/api/server/_gpus.py +2 -1
- dstack/api/server/_group.py +4 -3
- dstack/api/server/_repos.py +20 -3
- dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
- dstack/version.py +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
- dstack/api/huggingface/__init__.py +0 -73
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.25rc1.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gpus.py

@@ -1,8 +1,8 @@
 from typing import Dict, List, Literal, Optional, Tuple

-from sqlalchemy.ext.asyncio import AsyncSession
-
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.errors import ServerClientError
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.resources import Range
@@ -15,10 +15,43 @@ from dstack._internal.server.schemas.gpus import (
     ListGpusResponse,
 )
 from dstack._internal.server.services.offers import get_offers_by_requirements
+from dstack._internal.utils.common import get_or_error
+
+
+async def list_gpus_grouped(
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(project=project, run_spec=run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+    group_by_set = set(group_by) if group_by else set()
+    if "region" in group_by_set and "backend" not in group_by_set:
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)


 async def _get_gpu_offers(
-    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+    project: ProjectModel, run_spec: RunSpec
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     """Fetches all available instance offers that match the run spec's GPU requirements."""
     profile = run_spec.merged_profile
@@ -28,7 +61,6 @@ async def _get_gpu_offers(
         spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
         reservation=profile.reservation,
     )
-
     return await get_offers_by_requirements(
         project=project,
         profile=profile,
@@ -45,10 +77,10 @@ def _process_offers_into_backend_gpus(
     offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
 ) -> List[BackendGpus]:
     """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
-    backend_data: Dict[
+    backend_data: Dict[BackendType, Dict] = {}

-    for
-        backend_type = backend
+    for _, offer in offers:
+        backend_type = offer.backend
         if backend_type not in backend_data:
             backend_data[backend_type] = {"gpus": {}, "regions": set()}

@@ -111,7 +143,7 @@ def _process_offers_into_backend_gpus(
     return backend_gpus_list


-def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: BackendType):
     """Updates an existing GpuGroup with new data from another GPU offer."""
     spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"

@@ -122,6 +154,12 @@ def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
     if row.backends and backend_type not in row.backends:
         row.backends.append(backend_type)

+    # FIXME: Consider using non-optional range
+    assert row.count.min is not None
+    assert row.count.max is not None
+    assert row.price.min is not None
+    assert row.price.max is not None
+
     row.count.min = min(row.count.min, gpu.count)
     row.count.max = max(row.count.max, gpu.count)
     per_gpu_price = gpu.price / gpu.count
@@ -194,7 +232,7 @@ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.name,
             g.memory_mib,
         ),
@@ -229,7 +267,7 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.name,
             g.memory_mib,
@@ -299,7 +337,7 @@ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.count.min,
             g.name,
             g.memory_mib,
@@ -344,47 +382,10 @@ def _get_gpus_grouped_by_backend_region_and_count(
             not any(av.is_available() for av in g.availability),
             g.price.min,
             g.price.max,
-            g.backend.value,
+            get_or_error(g.backend).value,
             g.region,
             g.count.min,
             g.name,
             g.memory_mib,
         ),
     )
-
-
-async def list_gpus_grouped(
-    session: AsyncSession,
-    project: ProjectModel,
-    run_spec: RunSpec,
-    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
-) -> ListGpusResponse:
-    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
-    offers = await _get_gpu_offers(session, project, run_spec)
-    backend_gpus = _process_offers_into_backend_gpus(offers)
-
-    group_by_set = set(group_by) if group_by else set()
-
-    if "region" in group_by_set and "backend" not in group_by_set:
-        from dstack._internal.core.errors import ServerClientError
-
-        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
-
-    # Determine grouping strategy based on combination
-    has_backend = "backend" in group_by_set
-    has_region = "region" in group_by_set
-    has_count = "count" in group_by_set
-    if has_backend and has_region and has_count:
-        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
-    elif has_backend and has_count:
-        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
-    elif has_backend and has_region:
-        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
-    elif has_backend:
-        gpus = _get_gpus_grouped_by_backend(backend_gpus)
-    elif has_count:
-        gpus = _get_gpus_grouped_by_count(backend_gpus)
-    else:
-        gpus = _get_gpus_with_no_grouping(backend_gpus)
-
-    return ListGpusResponse(gpus=gpus)
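The new `list_gpus_grouped` (moved to the top of the module, replacing the old bottom-of-file definition) validates the `group_by` combination before dispatching to a dedicated aggregation function. For illustration only, here is the same rule reduced to a self-contained sketch; the returned strategy names are stand-ins for the `_get_gpus_grouped_by_*` functions above:

```python
from typing import List, Literal, Optional

GroupBy = Literal["backend", "region", "count"]


def select_grouping_strategy(group_by: Optional[List[GroupBy]] = None) -> str:
    group_by_set = set(group_by) if group_by else set()
    # "region" is only meaningful within a backend, hence the guard
    if "region" in group_by_set and "backend" not in group_by_set:
        raise ValueError("Cannot group by 'region' without also grouping by 'backend'")
    has_backend = "backend" in group_by_set
    has_region = "region" in group_by_set
    has_count = "count" in group_by_set
    if has_backend and has_region and has_count:
        return "backend_region_count"
    if has_backend and has_count:
        return "backend_count"
    if has_backend and has_region:
        return "backend_region"
    if has_backend:
        return "backend"
    if has_count:
        return "count"
    return "none"


assert select_grouping_strategy(["backend", "count"]) == "backend_count"
assert select_grouping_strategy() == "none"
```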
dstack/_internal/server/services/instances.py

@@ -39,6 +39,7 @@ from dstack._internal.core.models.profiles import (
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings as server_settings
 from dstack._internal.server.models import (
     FleetModel,
     InstanceHealthCheckModel,
@@ -47,9 +48,11 @@ from dstack._internal.server.models import (
     UserModel,
 )
 from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
-from dstack._internal.server.schemas.runner import InstanceHealthResponse
+from dstack._internal.server.schemas.runner import InstanceHealthResponse, TaskStatus
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import generate_shared_offer
 from dstack._internal.server.services.projects import list_user_project_models
+from dstack._internal.server.services.runner.client import ShimClient
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.logging import get_logger

@@ -633,3 +636,40 @@ async def create_ssh_instance_model(
         busy_blocks=0,
     )
     return im
+
+
+def remove_dangling_tasks_from_instance(shim_client: ShimClient, instance: InstanceModel) -> None:
+    if not shim_client.is_api_v2_supported():
+        return
+    assigned_to_instance_job_ids = {str(j.id) for j in instance.jobs}
+    task_list_response = shim_client.list_tasks()
+    tasks: list[tuple[str, Optional[TaskStatus]]]
+    if task_list_response.tasks is not None:
+        tasks = [(t.id, t.status) for t in task_list_response.tasks]
+    elif task_list_response.ids is not None:
+        # compatibility with pre-0.19.26 shim
+        tasks = [(t_id, None) for t_id in task_list_response.ids]
+    else:
+        raise ValueError("Unexpected task list response, neither `tasks` nor `ids` is set")
+    for task_id, task_status in tasks:
+        if task_id in assigned_to_instance_job_ids:
+            continue
+        should_terminate = task_status != TaskStatus.TERMINATED
+        should_remove = not server_settings.SERVER_KEEP_SHIM_TASKS
+        if not (should_terminate or should_remove):
+            continue
+        logger.warning(
+            "%s: dangling task found, id=%s, status=%s. Terminating and/or removing",
+            fmt(instance),
+            task_id,
+            task_status or "<unknown>",
+        )
+        if should_terminate:
+            shim_client.terminate_task(
+                task_id=task_id,
+                reason=None,
+                message=None,
+                timeout=0,
+            )
+        if should_remove:
+            shim_client.remove_task(task_id=task_id)
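The new `remove_dangling_tasks_from_instance` tolerates both shim API generations: newer shims return rich `tasks` entries, while pre-0.19.26 shims return bare `ids`, in which case statuses degrade to `None` instead of failing. A self-contained sketch of that normalization step, with hypothetical stand-in types (not dstack's schemas):

```python
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Tuple


class TaskStatus(str, Enum):
    RUNNING = "running"
    TERMINATED = "terminated"


@dataclass
class TaskInfo:  # hypothetical stand-in for the shim's task entry
    id: str
    status: TaskStatus


@dataclass
class TaskListResponse:
    tasks: Optional[List[TaskInfo]] = None  # rich field, newer shims
    ids: Optional[List[str]] = None  # legacy field, pre-0.19.26 shims


def normalize(resp: TaskListResponse) -> List[Tuple[str, Optional[TaskStatus]]]:
    if resp.tasks is not None:
        return [(t.id, t.status) for t in resp.tasks]
    if resp.ids is not None:
        # legacy response: ids only, statuses unknown
        return [(t_id, None) for t_id in resp.ids]
    raise ValueError("neither `tasks` nor `ids` is set")


assert normalize(TaskListResponse(ids=["a"])) == [("a", None)]
```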
dstack/_internal/server/services/jobs/__init__.py

@@ -256,7 +256,16 @@ async def process_terminating_job(
     if jpd is not None:
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
-        await stop_container(job_model, jpd, ssh_private_keys)
+        if not await stop_container(job_model, jpd, ssh_private_keys):
+            # The dangling container can be removed later during instance processing
+            logger.warning(
+                (
+                    "%s: could not stop container, possibly due to a communication error."
+                    " See debug logs for details."
+                    " Ignoring, can attempt to remove the container later"
+                ),
+                fmt(job_model),
+            )
     if jrd is not None and jrd.volume_names is not None:
         volume_names = jrd.volume_names
     else:
@@ -378,21 +387,22 @@ async def stop_container(
     job_model: JobModel,
     job_provisioning_data: JobProvisioningData,
     ssh_private_keys: tuple[str, Optional[str]],
-):
+) -> bool:
     if job_provisioning_data.dockerized:
         # send a request to the shim to terminate the docker container
         # SSHError and RequestException are caught in the `runner_ssh_tunner` decorator
-        await run_async(
+        return await run_async(
             _shim_submit_stop,
             ssh_private_keys,
             job_provisioning_data,
             None,
             job_model,
         )
+    return True


 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT])
-def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
+def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel) -> bool:
     shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])

     resp = shim_client.healthcheck()
@@ -418,6 +428,7 @@ def _shim_submit_stop(ports: Dict[int, int], job_model: JobModel):
         shim_client.remove_task(task_id=job_model.id)
     else:
         shim_client.stop(force=True)
+    return True


 def group_jobs_by_replica_latest(jobs: List[JobModel]) -> Iterable[Tuple[int, List[JobModel]]]:
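`stop_container` now reports success as a `bool`: `_shim_submit_stop` returns `True` on completion, and, per the comment in the hunk, SSH and request errors are caught by the `runner_ssh_tunnel` decorator, which is what allows a falsy result to reach the caller's warning path. A minimal sketch of that error-to-bool pattern, using a hypothetical decorator in place of the real one:

```python
import functools
from typing import Callable


def errors_to_false(fn: Callable[..., bool]) -> Callable[..., bool]:
    """Hypothetical: turn transport errors into False so callers can retry later."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs) -> bool:
        try:
            return fn(*args, **kwargs)
        except (ConnectionError, TimeoutError):  # stand-ins for SSH/request errors
            return False

    return wrapper


@errors_to_false
def stop_remote_container() -> bool:
    # ... talk to the shim over an SSH tunnel ...
    return True


assert stop_remote_container() is True
```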
dstack/_internal/server/services/jobs/configurators/base.py

@@ -3,7 +3,7 @@ import sys
 import threading
 from abc import ABC, abstractmethod
 from pathlib import PurePosixPath
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional

 from cachetools import TTLCache, cached

@@ -179,6 +179,7 @@ class JobConfigurator(ABC):

     async def _commands(self) -> List[str]:
         if self.run_spec.configuration.entrypoint is not None:  # docker-like format
+            assert self.run_spec.configuration.type != "dev-environment"
             entrypoint = shlex.split(self.run_spec.configuration.entrypoint)
             commands = self.run_spec.configuration.commands
         elif shell_commands := self._shell_commands():
@@ -258,19 +259,17 @@ class JobConfigurator(ABC):
         return self.run_spec.configuration.single_branch

     def _max_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.max_duration
+        if self.run_spec.merged_profile.max_duration is None:
             return self._default_max_duration()
-        if self.run_spec.merged_profile.max_duration
+        if self.run_spec.merged_profile.max_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.max_duration

     def _stop_duration(self) -> Optional[int]:
-        if self.run_spec.merged_profile.stop_duration
+        if self.run_spec.merged_profile.stop_duration is None:
             return DEFAULT_STOP_DURATION
-        if self.run_spec.merged_profile.stop_duration
+        if self.run_spec.merged_profile.stop_duration == "off":
             return None
-        # pydantic validator ensures this is int
         return self.run_spec.merged_profile.stop_duration

     def _utilization_policy(self) -> Optional[UtilizationPolicy]:
@@ -328,7 +327,7 @@ class JobConfigurator(ABC):


 def interpolate_job_volumes(
-    run_volumes: List[Union[MountPoint, str]],
+    run_volumes: List[MountPoint],
     job_num: int,
 ) -> List[MountPoint]:
     if len(run_volumes) == 0:
@@ -343,9 +342,6 @@ def interpolate_job_volumes(
     )
     job_volumes = []
     for mount_point in run_volumes:
-        if isinstance(mount_point, str):
-            # pydantic validator ensures strings are converted to MountPoint
-            continue
         if not isinstance(mount_point, VolumeMountPoint):
             job_volumes.append(mount_point.copy())
             continue
dstack/_internal/server/services/jobs/configurators/dev.py

@@ -18,6 +18,8 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT

     def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]):
+        assert run_spec.configuration.type == "dev-environment"
+
         if run_spec.configuration.ide == "vscode":
             __class = VSCodeDesktop
         elif run_spec.configuration.ide == "cursor":
@@ -32,6 +34,8 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         super().__init__(run_spec=run_spec, secrets=secrets)

     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "dev-environment"
+
         commands = self.ide.get_install_commands()
         commands.append(INSTALL_IPYKERNEL)
         commands += self.run_spec.configuration.setup
@@ -56,4 +60,5 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND

     def _ports(self) -> List[PortMapping]:
+        assert self.run_spec.configuration.type == "dev-environment"
         return self.run_spec.configuration.ports
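The asserts on `configuration.type` added here (and in the service and task configurators below) read as type-narrowing guards: the run configuration is a tagged union, and comparing its `Literal`-typed discriminator lets a type checker narrow to the specific variant before variant-specific fields such as `ports` are accessed. A reduced sketch with hypothetical config classes:

```python
from dataclasses import dataclass, field
from typing import List, Literal, Union


@dataclass
class TaskConfig:  # hypothetical stand-in for dstack's task configuration
    type: Literal["task"] = "task"
    ports: List[int] = field(default_factory=list)


@dataclass
class ServiceConfig:  # hypothetical stand-in for the service configuration
    type: Literal["service"] = "service"
    port: int = 80


AnyConfig = Union[TaskConfig, ServiceConfig]


def task_ports(conf: AnyConfig) -> List[int]:
    # comparing the Literal discriminator narrows conf to TaskConfig
    assert conf.type == "task"
    return conf.ports


assert task_ports(TaskConfig(ports=[8000])) == [8000]
```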
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional

 from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR

@@ -6,8 +6,8 @@ from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 class CursorDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name
dstack/_internal/server/services/jobs/configurators/extensions/vscode.py

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional

 from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR

@@ -6,8 +6,8 @@ from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 class VSCodeDesktop:
     def __init__(
         self,
-        run_name: str,
-        version: str,
+        run_name: Optional[str],
+        version: Optional[str],
         extensions: List[str],
     ):
         self.run_name = run_name
dstack/_internal/server/services/jobs/configurators/service.py

@@ -9,6 +9,7 @@ class ServiceJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.SERVICE

     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "service"
         return self.run_spec.configuration.commands

     def _default_single_branch(self) -> bool:
dstack/_internal/server/services/jobs/configurators/task.py

@@ -10,6 +10,7 @@ class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK

     async def get_job_specs(self, replica_num: int) -> List[JobSpec]:
+        assert self.run_spec.configuration.type == "task"
         job_specs = []
         for job_num in range(self.run_spec.configuration.nodes):
             job_spec = await self._get_job_spec(
@@ -21,6 +22,7 @@ class TaskJobConfigurator(JobConfigurator):
         return job_specs

     def _shell_commands(self) -> List[str]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.commands

     def _default_single_branch(self) -> bool:
@@ -33,6 +35,7 @@ class TaskJobConfigurator(JobConfigurator):
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND

     def _ports(self) -> List[PortMapping]:
+        assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.ports

     def _working_dir(self) -> Optional[str]:
dstack/_internal/server/services/locking.py

@@ -23,13 +23,13 @@ T = TypeVar("T")


 class Lockset(Protocol[T]):
-    def __contains__(self, item: T) -> bool: ...
+    def __contains__(self, item: T, /) -> bool: ...
     def __iter__(self) -> Iterator[T]: ...
     def __len__(self) -> int: ...
-    def add(self, item: T) -> None: ...
-    def discard(self, item: T) -> None: ...
-    def update(self, other: Iterable[T]) -> None: ...
-    def difference_update(self, other: Iterable[T]) -> None: ...
+    def add(self, item: T, /) -> None: ...
+    def discard(self, item: T, /) -> None: ...
+    def update(self, other: Iterable[T], /) -> None: ...
+    def difference_update(self, other: Iterable[T], /) -> None: ...


 class ResourceLocker:
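The `Lockset` protocol methods gain positional-only markers (`/`). Built-in `set` methods take their arguments positionally only, so without the markers the protocol would demand keyword-argument support that `set` does not provide, and `set[T]` would not satisfy the protocol under strict structural checking. A self-contained sketch of the idea (stand-alone, not dstack's module):

```python
from typing import Iterator, Protocol, TypeVar

T = TypeVar("T")


class Lockset(Protocol[T]):
    def __contains__(self, item: T, /) -> bool: ...
    def __iter__(self) -> Iterator[T]: ...
    def __len__(self) -> int: ...
    def add(self, item: T, /) -> None: ...
    def discard(self, item: T, /) -> None: ...


locked: Lockset[str] = set()  # structurally compatible thanks to the `/` markers
locked.add("job-42")
assert "job-42" in locked
```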
dstack/_internal/server/services/logging.py

@@ -1,14 +1,22 @@
 from typing import Union

-from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel
+from dstack._internal.server.models import (
+    GatewayModel,
+    InstanceModel,
+    JobModel,
+    ProbeModel,
+    RunModel,
+)


-def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, InstanceModel, GatewayModel, ProbeModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
     if isinstance(model, JobModel):
         return f"job({model.id.hex[:6]}){model.job_name}"
+    if isinstance(model, InstanceModel):
+        return f"instance({model.id.hex[:6]}){model.name}"
     if isinstance(model, GatewayModel):
         return f"gateway({model.id.hex[:6]}){model.name}"
     if isinstance(model, ProbeModel):
dstack/_internal/server/services/logs/__init__.py

@@ -7,14 +7,14 @@ from dstack._internal.server import settings
 from dstack._internal.server.models import ProjectModel
 from dstack._internal.server.schemas.logs import PollLogsRequest
 from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
-from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
+from dstack._internal.server.services.logs import aws as aws_logs
+from dstack._internal.server.services.logs import gcp as gcp_logs
 from dstack._internal.server.services.logs.base import (
     LogStorage,
     LogStorageError,
     b64encode_raw_message,
 )
 from dstack._internal.server.services.logs.filelog import FileLogStorage
-from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
 from dstack._internal.utils.common import run_async
 from dstack._internal.utils.logging import get_logger

@@ -29,9 +29,9 @@ def get_log_storage() -> LogStorage:
     if _log_storage is not None:
         return _log_storage
     if settings.SERVER_CLOUDWATCH_LOG_GROUP:
-        if BOTO_AVAILABLE:
+        if aws_logs.BOTO_AVAILABLE:
             try:
-                _log_storage = CloudWatchLogStorage(
+                _log_storage = aws_logs.CloudWatchLogStorage(
                     group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
                     region=settings.SERVER_CLOUDWATCH_LOG_REGION,
                 )
@@ -44,9 +44,11 @@ def get_log_storage() -> LogStorage:
         else:
             logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
     elif settings.SERVER_GCP_LOGGING_PROJECT:
-        if GCP_LOGGING_AVAILABLE:
+        if gcp_logs.GCP_LOGGING_AVAILABLE:
             try:
-                _log_storage = GCPLogStorage(
+                _log_storage = gcp_logs.GCPLogStorage(
+                    project_id=settings.SERVER_GCP_LOGGING_PROJECT
+                )
             except LogStorageError as e:
                 logger.error("Failed to initialize GCP Logs storage: %s", e)
             except Exception: