dstack 0.19.27__py3-none-any.whl → 0.19.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +11 -8
- dstack/_internal/cli/commands/apply.py +6 -3
- dstack/_internal/cli/commands/completion.py +3 -1
- dstack/_internal/cli/commands/config.py +1 -0
- dstack/_internal/cli/commands/init.py +2 -2
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/commands/project.py +1 -0
- dstack/_internal/cli/commands/server.py +2 -2
- dstack/_internal/cli/main.py +1 -1
- dstack/_internal/cli/services/configurators/base.py +2 -4
- dstack/_internal/cli/services/configurators/fleet.py +4 -5
- dstack/_internal/cli/services/configurators/gateway.py +3 -5
- dstack/_internal/cli/services/configurators/run.py +51 -27
- dstack/_internal/cli/services/configurators/volume.py +3 -5
- dstack/_internal/core/compatibility/runs.py +2 -0
- dstack/_internal/core/models/common.py +67 -43
- dstack/_internal/core/models/configurations.py +88 -62
- dstack/_internal/core/models/fleets.py +41 -24
- dstack/_internal/core/models/instances.py +5 -5
- dstack/_internal/core/models/profiles.py +66 -47
- dstack/_internal/core/models/repos/remote.py +21 -16
- dstack/_internal/core/models/resources.py +69 -65
- dstack/_internal/core/models/runs.py +17 -9
- dstack/_internal/server/app.py +5 -0
- dstack/_internal/server/background/tasks/process_fleets.py +8 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +32 -12
- dstack/_internal/server/models.py +6 -5
- dstack/_internal/server/schemas/gateways.py +10 -9
- dstack/_internal/server/services/backends/handlers.py +2 -0
- dstack/_internal/server/services/docker.py +8 -7
- dstack/_internal/server/services/projects.py +52 -1
- dstack/_internal/server/settings.py +46 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-56191c63d516fd0041c4.css → main-5e0d56245c4bd241ec27.css} +1 -1
- dstack/_internal/server/statics/{main-4eecc75fbe64067eb1bc.js → main-a2a16772fbf11a14d191.js} +70 -100
- dstack/_internal/server/statics/{main-4eecc75fbe64067eb1bc.js.map → main-a2a16772fbf11a14d191.js.map} +1 -1
- dstack/_internal/utils/env.py +85 -11
- dstack/version.py +1 -1
- {dstack-0.19.27.dist-info → dstack-0.19.28.dist-info}/METADATA +1 -1
- {dstack-0.19.27.dist-info → dstack-0.19.28.dist-info}/RECORD +43 -44
- dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
- {dstack-0.19.27.dist-info → dstack-0.19.28.dist-info}/WHEEL +0 -0
- {dstack-0.19.27.dist-info → dstack-0.19.28.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.27.dist-info → dstack-0.19.28.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -7,7 +7,7 @@ from pydantic import Field, parse_obj_as, root_validator, validator
|
|
|
7
7
|
from pydantic.generics import GenericModel
|
|
8
8
|
from typing_extensions import Annotated
|
|
9
9
|
|
|
10
|
-
from dstack._internal.core.models.common import CoreModel
|
|
10
|
+
from dstack._internal.core.models.common import CoreConfig, CoreModel, generate_dual_core_model
|
|
11
11
|
from dstack._internal.utils.common import pretty_resources
|
|
12
12
|
from dstack._internal.utils.json_schema import add_extra_schema_types
|
|
13
13
|
from dstack._internal.utils.logging import get_logger
|
|
@@ -129,21 +129,22 @@ DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
|
|
|
129
129
|
DEFAULT_GPU_COUNT = Range[int](min=1)
|
|
130
130
|
|
|
131
131
|
|
|
132
|
-
class
|
|
132
|
+
class CPUSpecConfig(CoreConfig):
|
|
133
|
+
@staticmethod
|
|
134
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
135
|
+
add_extra_schema_types(
|
|
136
|
+
schema["properties"]["count"],
|
|
137
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class CPUSpec(generate_dual_core_model(CPUSpecConfig)):
|
|
133
142
|
arch: Annotated[
|
|
134
143
|
Optional[gpuhunt.CPUArchitecture],
|
|
135
144
|
Field(description="The CPU architecture, one of: `x86`, `arm`"),
|
|
136
145
|
] = None
|
|
137
146
|
count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
|
|
138
147
|
|
|
139
|
-
class Config(CoreModel.Config):
|
|
140
|
-
@staticmethod
|
|
141
|
-
def schema_extra(schema: Dict[str, Any]):
|
|
142
|
-
add_extra_schema_types(
|
|
143
|
-
schema["properties"]["count"],
|
|
144
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
145
|
-
)
|
|
146
|
-
|
|
147
148
|
@classmethod
|
|
148
149
|
def __get_validators__(cls):
|
|
149
150
|
yield cls.parse
|
|
@@ -190,7 +191,28 @@ class CPUSpec(CoreModel):
|
|
|
190
191
|
return v
|
|
191
192
|
|
|
192
193
|
|
|
193
|
-
class
|
|
194
|
+
class GPUSpecConfig(CoreConfig):
|
|
195
|
+
@staticmethod
|
|
196
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
197
|
+
add_extra_schema_types(
|
|
198
|
+
schema["properties"]["count"],
|
|
199
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
200
|
+
)
|
|
201
|
+
add_extra_schema_types(
|
|
202
|
+
schema["properties"]["name"],
|
|
203
|
+
extra_types=[{"type": "string"}],
|
|
204
|
+
)
|
|
205
|
+
add_extra_schema_types(
|
|
206
|
+
schema["properties"]["memory"],
|
|
207
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
208
|
+
)
|
|
209
|
+
add_extra_schema_types(
|
|
210
|
+
schema["properties"]["total_memory"],
|
|
211
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class GPUSpec(generate_dual_core_model(GPUSpecConfig)):
|
|
194
216
|
vendor: Annotated[
|
|
195
217
|
Optional[gpuhunt.AcceleratorVendor],
|
|
196
218
|
Field(
|
|
@@ -218,26 +240,6 @@ class GPUSpec(CoreModel):
|
|
|
218
240
|
Field(description="The minimum compute capability of the GPU (e.g., `7.5`)"),
|
|
219
241
|
] = None
|
|
220
242
|
|
|
221
|
-
class Config(CoreModel.Config):
|
|
222
|
-
@staticmethod
|
|
223
|
-
def schema_extra(schema: Dict[str, Any]):
|
|
224
|
-
add_extra_schema_types(
|
|
225
|
-
schema["properties"]["count"],
|
|
226
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
227
|
-
)
|
|
228
|
-
add_extra_schema_types(
|
|
229
|
-
schema["properties"]["name"],
|
|
230
|
-
extra_types=[{"type": "string"}],
|
|
231
|
-
)
|
|
232
|
-
add_extra_schema_types(
|
|
233
|
-
schema["properties"]["memory"],
|
|
234
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
235
|
-
)
|
|
236
|
-
add_extra_schema_types(
|
|
237
|
-
schema["properties"]["total_memory"],
|
|
238
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
239
|
-
)
|
|
240
|
-
|
|
241
243
|
@classmethod
|
|
242
244
|
def __get_validators__(cls):
|
|
243
245
|
yield cls.parse
|
|
@@ -317,16 +319,17 @@ class GPUSpec(CoreModel):
|
|
|
317
319
|
return gpuhunt.AcceleratorVendor.cast(v)
|
|
318
320
|
|
|
319
321
|
|
|
320
|
-
class
|
|
321
|
-
|
|
322
|
+
class DiskSpecConfig(CoreConfig):
|
|
323
|
+
@staticmethod
|
|
324
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
325
|
+
add_extra_schema_types(
|
|
326
|
+
schema["properties"]["size"],
|
|
327
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
328
|
+
)
|
|
322
329
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
add_extra_schema_types(
|
|
327
|
-
schema["properties"]["size"],
|
|
328
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
329
|
-
)
|
|
330
|
+
|
|
331
|
+
class DiskSpec(generate_dual_core_model(DiskSpecConfig)):
|
|
332
|
+
size: Annotated[Range[Memory], Field(description="Disk size")]
|
|
330
333
|
|
|
331
334
|
@classmethod
|
|
332
335
|
def __get_validators__(cls):
|
|
@@ -343,7 +346,32 @@ class DiskSpec(CoreModel):
|
|
|
343
346
|
DEFAULT_DISK = DiskSpec(size=Range[Memory](min=Memory.parse("100GB"), max=None))
|
|
344
347
|
|
|
345
348
|
|
|
346
|
-
class
|
|
349
|
+
class ResourcesSpecConfig(CoreConfig):
|
|
350
|
+
@staticmethod
|
|
351
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
352
|
+
add_extra_schema_types(
|
|
353
|
+
schema["properties"]["cpu"],
|
|
354
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
355
|
+
)
|
|
356
|
+
add_extra_schema_types(
|
|
357
|
+
schema["properties"]["memory"],
|
|
358
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
359
|
+
)
|
|
360
|
+
add_extra_schema_types(
|
|
361
|
+
schema["properties"]["shm_size"],
|
|
362
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
363
|
+
)
|
|
364
|
+
add_extra_schema_types(
|
|
365
|
+
schema["properties"]["gpu"],
|
|
366
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
367
|
+
)
|
|
368
|
+
add_extra_schema_types(
|
|
369
|
+
schema["properties"]["disk"],
|
|
370
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)):
|
|
347
375
|
# TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
|
|
348
376
|
cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
|
|
349
377
|
CPUSpec()
|
|
@@ -362,30 +390,6 @@ class ResourcesSpec(CoreModel):
|
|
|
362
390
|
gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
|
|
363
391
|
disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
|
|
364
392
|
|
|
365
|
-
class Config(CoreModel.Config):
|
|
366
|
-
@staticmethod
|
|
367
|
-
def schema_extra(schema: Dict[str, Any]):
|
|
368
|
-
add_extra_schema_types(
|
|
369
|
-
schema["properties"]["cpu"],
|
|
370
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
371
|
-
)
|
|
372
|
-
add_extra_schema_types(
|
|
373
|
-
schema["properties"]["memory"],
|
|
374
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
375
|
-
)
|
|
376
|
-
add_extra_schema_types(
|
|
377
|
-
schema["properties"]["shm_size"],
|
|
378
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
379
|
-
)
|
|
380
|
-
add_extra_schema_types(
|
|
381
|
-
schema["properties"]["gpu"],
|
|
382
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
383
|
-
)
|
|
384
|
-
add_extra_schema_types(
|
|
385
|
-
schema["properties"]["disk"],
|
|
386
|
-
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
387
|
-
)
|
|
388
|
-
|
|
389
393
|
def pretty_format(self) -> str:
|
|
390
394
|
# TODO: Remove in 0.20. Use self.cpu directly
|
|
391
395
|
cpu = parse_obj_as(CPUSpec, self.cpu)
|
|
@@ -1,13 +1,20 @@
|
|
|
1
1
|
from datetime import datetime, timedelta
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Any, Dict, List, Literal, Optional
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
6
|
from pydantic import UUID4, Field, root_validator
|
|
7
7
|
from typing_extensions import Annotated
|
|
8
8
|
|
|
9
9
|
from dstack._internal.core.models.backends.base import BackendType
|
|
10
|
-
from dstack._internal.core.models.common import
|
|
10
|
+
from dstack._internal.core.models.common import (
|
|
11
|
+
ApplyAction,
|
|
12
|
+
CoreConfig,
|
|
13
|
+
CoreModel,
|
|
14
|
+
NetworkMode,
|
|
15
|
+
RegistryAuth,
|
|
16
|
+
generate_dual_core_model,
|
|
17
|
+
)
|
|
11
18
|
from dstack._internal.core.models.configurations import (
|
|
12
19
|
DEFAULT_PROBE_METHOD,
|
|
13
20
|
LEGACY_REPO_DIR,
|
|
@@ -385,7 +392,14 @@ class Job(CoreModel):
|
|
|
385
392
|
job_submissions: List[JobSubmission]
|
|
386
393
|
|
|
387
394
|
|
|
388
|
-
class
|
|
395
|
+
class RunSpecConfig(CoreConfig):
|
|
396
|
+
@staticmethod
|
|
397
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
398
|
+
prop = schema.get("properties", {})
|
|
399
|
+
prop.pop("merged_profile", None)
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
class RunSpec(generate_dual_core_model(RunSpecConfig)):
|
|
389
403
|
# TODO: run_name, working_dir are redundant here since they already passed in configuration
|
|
390
404
|
run_name: Annotated[
|
|
391
405
|
Optional[str],
|
|
@@ -458,12 +472,6 @@ class RunSpec(CoreModel):
|
|
|
458
472
|
# TODO: make merged_profile a computed field after migrating to pydanticV2
|
|
459
473
|
merged_profile: Annotated[Profile, Field(exclude=True)] = None
|
|
460
474
|
|
|
461
|
-
class Config(CoreModel.Config):
|
|
462
|
-
@staticmethod
|
|
463
|
-
def schema_extra(schema: Dict[str, Any], model: Type) -> None:
|
|
464
|
-
prop = schema.get("properties", {})
|
|
465
|
-
prop.pop("merged_profile", None)
|
|
466
|
-
|
|
467
475
|
@root_validator
|
|
468
476
|
def _merged_profile(cls, values) -> Dict:
|
|
469
477
|
if values.get("profile") is None:
|
dstack/_internal/server/app.py
CHANGED
|
@@ -160,6 +160,11 @@ async def lifespan(app: FastAPI):
|
|
|
160
160
|
logger.info("Background processing is disabled")
|
|
161
161
|
PROBES_SCHEDULER.start()
|
|
162
162
|
dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
|
|
163
|
+
logger.info(
|
|
164
|
+
"Job network mode: %s (%d)",
|
|
165
|
+
settings.JOB_NETWORK_MODE.name,
|
|
166
|
+
settings.JOB_NETWORK_MODE.value,
|
|
167
|
+
)
|
|
163
168
|
logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
|
|
164
169
|
logger.info(
|
|
165
170
|
f"The dstack server {dstack_version} is running at {SERVER_URL}",
|
|
@@ -177,6 +177,14 @@ def _maintain_fleet_nodes_min(
|
|
|
177
177
|
|
|
178
178
|
|
|
179
179
|
def _autodelete_fleet(fleet_model: FleetModel) -> bool:
|
|
180
|
+
if fleet_model.project.deleted:
|
|
181
|
+
# It used to be possible to delete project with active resources:
|
|
182
|
+
# https://github.com/dstackai/dstack/issues/3077
|
|
183
|
+
fleet_model.status = FleetStatus.TERMINATED
|
|
184
|
+
fleet_model.deleted = True
|
|
185
|
+
logger.info("Fleet %s deleted due to deleted project", fleet_model.name)
|
|
186
|
+
return True
|
|
187
|
+
|
|
180
188
|
if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
|
|
181
189
|
return False
|
|
182
190
|
|
|
@@ -5,7 +5,7 @@ import uuid
|
|
|
5
5
|
from datetime import datetime, timedelta
|
|
6
6
|
from typing import List, Optional, Tuple
|
|
7
7
|
|
|
8
|
-
from sqlalchemy import and_, not_, or_, select
|
|
8
|
+
from sqlalchemy import and_, func, not_, or_, select
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
10
|
from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload
|
|
11
11
|
|
|
@@ -54,6 +54,7 @@ from dstack._internal.server.models import (
|
|
|
54
54
|
from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
|
|
55
55
|
from dstack._internal.server.services.fleets import (
|
|
56
56
|
fleet_model_to_fleet,
|
|
57
|
+
generate_fleet_name,
|
|
57
58
|
get_fleet_requirements,
|
|
58
59
|
get_next_instance_num,
|
|
59
60
|
)
|
|
@@ -71,7 +72,7 @@ from dstack._internal.server.services.jobs import (
|
|
|
71
72
|
get_job_configured_volumes,
|
|
72
73
|
get_job_runtime_data,
|
|
73
74
|
)
|
|
74
|
-
from dstack._internal.server.services.locking import get_locker
|
|
75
|
+
from dstack._internal.server.services.locking import get_locker, string_to_lock_id
|
|
75
76
|
from dstack._internal.server.services.logging import fmt
|
|
76
77
|
from dstack._internal.server.services.offers import get_offers_by_requirements
|
|
77
78
|
from dstack._internal.server.services.requirements.combine import (
|
|
@@ -87,7 +88,6 @@ from dstack._internal.server.services.volumes import (
|
|
|
87
88
|
)
|
|
88
89
|
from dstack._internal.server.utils import sentry_utils
|
|
89
90
|
from dstack._internal.utils import common as common_utils
|
|
90
|
-
from dstack._internal.utils import env as env_utils
|
|
91
91
|
from dstack._internal.utils.logging import get_logger
|
|
92
92
|
|
|
93
93
|
logger = get_logger(__name__)
|
|
@@ -188,6 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
188
188
|
run_spec = run.run_spec
|
|
189
189
|
profile = run_spec.merged_profile
|
|
190
190
|
job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
|
|
191
|
+
multinode = job.job_spec.jobs_per_replica > 1
|
|
191
192
|
|
|
192
193
|
# Master job chooses fleet for the run.
|
|
193
194
|
# Due to two-step processing, it's saved to job_model.fleet.
|
|
@@ -310,6 +311,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
310
311
|
session=session,
|
|
311
312
|
instances_with_offers=fleet_instances_with_offers,
|
|
312
313
|
job_model=job_model,
|
|
314
|
+
multinode=multinode,
|
|
313
315
|
)
|
|
314
316
|
job_model.fleet = fleet_model
|
|
315
317
|
job_model.instance_assigned = True
|
|
@@ -363,7 +365,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
363
365
|
job_model.job_provisioning_data = job_provisioning_data.json()
|
|
364
366
|
job_model.status = JobStatus.PROVISIONING
|
|
365
367
|
if fleet_model is None:
|
|
366
|
-
fleet_model = _create_fleet_model_for_job(
|
|
368
|
+
fleet_model = await _create_fleet_model_for_job(
|
|
369
|
+
session=session,
|
|
367
370
|
project=project,
|
|
368
371
|
run=run,
|
|
369
372
|
)
|
|
@@ -385,7 +388,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
385
388
|
offer=offer,
|
|
386
389
|
instance_num=instance_num,
|
|
387
390
|
)
|
|
388
|
-
job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
|
|
391
|
+
job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
|
|
389
392
|
# Both this task and process_fleets can add instances to fleets.
|
|
390
393
|
# TODO: Ensure this does not violate nodes.max when it's enforced.
|
|
391
394
|
instance.fleet_id = fleet_model.id
|
|
@@ -614,6 +617,7 @@ async def _assign_job_to_fleet_instance(
|
|
|
614
617
|
session: AsyncSession,
|
|
615
618
|
instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]],
|
|
616
619
|
job_model: JobModel,
|
|
620
|
+
multinode: bool,
|
|
617
621
|
) -> Optional[InstanceModel]:
|
|
618
622
|
if len(instances_with_offers) == 0:
|
|
619
623
|
return None
|
|
@@ -643,7 +647,7 @@ async def _assign_job_to_fleet_instance(
|
|
|
643
647
|
job_model.instance = instance
|
|
644
648
|
job_model.used_instance_id = instance.id
|
|
645
649
|
job_model.job_provisioning_data = instance.job_provisioning_data
|
|
646
|
-
job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
|
|
650
|
+
job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
|
|
647
651
|
return instance
|
|
648
652
|
|
|
649
653
|
|
|
@@ -752,7 +756,8 @@ def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
|
|
|
752
756
|
return True
|
|
753
757
|
|
|
754
758
|
|
|
755
|
-
def _create_fleet_model_for_job(
|
|
759
|
+
async def _create_fleet_model_for_job(
|
|
760
|
+
session: AsyncSession,
|
|
756
761
|
project: ProjectModel,
|
|
757
762
|
run: Run,
|
|
758
763
|
) -> FleetModel:
|
|
@@ -760,9 +765,19 @@ def _create_fleet_model_for_job(
|
|
|
760
765
|
if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
|
|
761
766
|
placement = InstanceGroupPlacement.CLUSTER
|
|
762
767
|
nodes = _get_nodes_required_num_for_run(run.run_spec)
|
|
768
|
+
|
|
769
|
+
lock_namespace = f"fleet_names_{project.name}"
|
|
770
|
+
# TODO: Lock fleet names on SQLite.
|
|
771
|
+
# Needs some refactoring so that the lock is released after commit.
|
|
772
|
+
if get_db().dialect_name == "postgresql":
|
|
773
|
+
await session.execute(
|
|
774
|
+
select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace)))
|
|
775
|
+
)
|
|
776
|
+
fleet_name = await generate_fleet_name(session=session, project=project)
|
|
777
|
+
|
|
763
778
|
spec = FleetSpec(
|
|
764
779
|
configuration=FleetConfiguration(
|
|
765
|
-
name=
|
|
780
|
+
name=fleet_name,
|
|
766
781
|
placement=placement,
|
|
767
782
|
reservation=run.run_spec.configuration.reservation,
|
|
768
783
|
nodes=FleetNodesSpec(
|
|
@@ -776,7 +791,7 @@ def _create_fleet_model_for_job(
|
|
|
776
791
|
)
|
|
777
792
|
fleet_model = FleetModel(
|
|
778
793
|
id=uuid.uuid4(),
|
|
779
|
-
name=
|
|
794
|
+
name=fleet_name,
|
|
780
795
|
project=project,
|
|
781
796
|
status=FleetStatus.ACTIVE,
|
|
782
797
|
spec=spec.json(),
|
|
@@ -839,12 +854,17 @@ def _create_instance_model_for_job(
|
|
|
839
854
|
return instance
|
|
840
855
|
|
|
841
856
|
|
|
842
|
-
def _prepare_job_runtime_data(
|
|
857
|
+
def _prepare_job_runtime_data(
|
|
858
|
+
offer: InstanceOfferWithAvailability, multinode: bool
|
|
859
|
+
) -> JobRuntimeData:
|
|
843
860
|
if offer.blocks == offer.total_blocks:
|
|
844
|
-
if
|
|
861
|
+
if settings.JOB_NETWORK_MODE == settings.JobNetworkMode.FORCED_BRIDGE:
|
|
845
862
|
network_mode = NetworkMode.BRIDGE
|
|
846
|
-
|
|
863
|
+
elif settings.JOB_NETWORK_MODE == settings.JobNetworkMode.HOST_WHEN_POSSIBLE:
|
|
847
864
|
network_mode = NetworkMode.HOST
|
|
865
|
+
else:
|
|
866
|
+
assert settings.JOB_NETWORK_MODE == settings.JobNetworkMode.HOST_FOR_MULTINODE_ONLY
|
|
867
|
+
network_mode = NetworkMode.HOST if multinode else NetworkMode.BRIDGE
|
|
848
868
|
return JobRuntimeData(
|
|
849
869
|
network_mode=network_mode,
|
|
850
870
|
offer=offer,
|
|
@@ -24,7 +24,7 @@ from sqlalchemy_utils import UUIDType
|
|
|
24
24
|
|
|
25
25
|
from dstack._internal.core.errors import DstackError
|
|
26
26
|
from dstack._internal.core.models.backends.base import BackendType
|
|
27
|
-
from dstack._internal.core.models.common import
|
|
27
|
+
from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model
|
|
28
28
|
from dstack._internal.core.models.fleets import FleetStatus
|
|
29
29
|
from dstack._internal.core.models.gateways import GatewayStatus
|
|
30
30
|
from dstack._internal.core.models.health import HealthStatus
|
|
@@ -71,7 +71,11 @@ class NaiveDateTime(TypeDecorator):
|
|
|
71
71
|
return value.replace(tzinfo=timezone.utc)
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
class
|
|
74
|
+
class DecryptedStringConfig(CoreConfig):
|
|
75
|
+
arbitrary_types_allowed = True
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DecryptedString(generate_dual_core_model(DecryptedStringConfig)):
|
|
75
79
|
"""
|
|
76
80
|
A type for representing plaintext strings encrypted with `EncryptedString`.
|
|
77
81
|
Besides the string, stores information if the decryption was successful.
|
|
@@ -84,9 +88,6 @@ class DecryptedString(CoreModel):
|
|
|
84
88
|
decrypted: bool = True
|
|
85
89
|
exc: Optional[Exception] = None
|
|
86
90
|
|
|
87
|
-
class Config(CoreModel.Config):
|
|
88
|
-
arbitrary_types_allowed = True
|
|
89
|
-
|
|
90
91
|
def get_plaintext_or_error(self) -> str:
|
|
91
92
|
if self.decrypted and self.plaintext is not None:
|
|
92
93
|
return self.plaintext
|
|
@@ -3,24 +3,25 @@ from typing import Annotated, Any, Dict, List, Optional
|
|
|
3
3
|
from pydantic import Field
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.backends.base import BackendType
|
|
6
|
-
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
from dstack._internal.core.models.common import CoreConfig, CoreModel, generate_dual_core_model
|
|
7
7
|
from dstack._internal.core.models.gateways import GatewayConfiguration
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
class
|
|
10
|
+
class CreateGatewayRequestConfig(CoreConfig):
|
|
11
|
+
@staticmethod
|
|
12
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
13
|
+
del schema["properties"]["name"]
|
|
14
|
+
del schema["properties"]["backend_type"]
|
|
15
|
+
del schema["properties"]["region"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CreateGatewayRequest(generate_dual_core_model(CreateGatewayRequestConfig)):
|
|
11
19
|
configuration: GatewayConfiguration
|
|
12
20
|
# Deprecated and unused. Left for compatibility with 0.18 clients.
|
|
13
21
|
name: Annotated[Optional[str], Field(exclude=True)] = None
|
|
14
22
|
backend_type: Annotated[Optional[BackendType], Field(exclude=True)] = None
|
|
15
23
|
region: Annotated[Optional[str], Field(exclude=True)] = None
|
|
16
24
|
|
|
17
|
-
class Config(CoreModel.Config):
|
|
18
|
-
@staticmethod
|
|
19
|
-
def schema_extra(schema: Dict[str, Any]) -> None:
|
|
20
|
-
del schema["properties"]["name"]
|
|
21
|
-
del schema["properties"]["backend_type"]
|
|
22
|
-
del schema["properties"]["region"]
|
|
23
|
-
|
|
24
25
|
|
|
25
26
|
class GetGatewayRequest(CoreModel):
|
|
26
27
|
name: str
|
|
@@ -9,7 +9,11 @@ from pydantic import Field, ValidationError, validator
|
|
|
9
9
|
from typing_extensions import Annotated
|
|
10
10
|
|
|
11
11
|
from dstack._internal.core.errors import DockerRegistryError
|
|
12
|
-
from dstack._internal.core.models.common import
|
|
12
|
+
from dstack._internal.core.models.common import (
|
|
13
|
+
CoreModel,
|
|
14
|
+
FrozenCoreModel,
|
|
15
|
+
RegistryAuth,
|
|
16
|
+
)
|
|
13
17
|
from dstack._internal.server.utils.common import join_byte_stream_checked
|
|
14
18
|
from dstack._internal.utils.dxf import PatchedDXF
|
|
15
19
|
|
|
@@ -31,15 +35,12 @@ class DXFAuthAdapter:
|
|
|
31
35
|
)
|
|
32
36
|
|
|
33
37
|
|
|
34
|
-
class DockerImage(
|
|
38
|
+
class DockerImage(FrozenCoreModel):
|
|
35
39
|
image: str
|
|
36
|
-
registry: Optional[str]
|
|
40
|
+
registry: Optional[str] = None
|
|
37
41
|
repo: str
|
|
38
42
|
tag: str
|
|
39
|
-
digest: Optional[str]
|
|
40
|
-
|
|
41
|
-
class Config(CoreModel.Config):
|
|
42
|
-
frozen = True
|
|
43
|
+
digest: Optional[str] = None
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
class ImageConfig(CoreModel):
|
|
@@ -14,8 +14,16 @@ from dstack._internal.core.backends.dstack.models import (
|
|
|
14
14
|
from dstack._internal.core.backends.models import BackendInfo
|
|
15
15
|
from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
|
|
16
16
|
from dstack._internal.core.models.projects import Member, MemberPermissions, Project
|
|
17
|
+
from dstack._internal.core.models.runs import RunStatus
|
|
17
18
|
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
18
|
-
from dstack._internal.server.models import
|
|
19
|
+
from dstack._internal.server.models import (
|
|
20
|
+
FleetModel,
|
|
21
|
+
MemberModel,
|
|
22
|
+
ProjectModel,
|
|
23
|
+
RunModel,
|
|
24
|
+
UserModel,
|
|
25
|
+
VolumeModel,
|
|
26
|
+
)
|
|
19
27
|
from dstack._internal.server.schemas.projects import MemberSetting
|
|
20
28
|
from dstack._internal.server.services import users
|
|
21
29
|
from dstack._internal.server.services.backends import (
|
|
@@ -178,6 +186,19 @@ async def delete_projects(
|
|
|
178
186
|
raise ForbiddenError()
|
|
179
187
|
if all(name in projects_names for name in user_project_names):
|
|
180
188
|
raise ServerClientError("Cannot delete the only project")
|
|
189
|
+
|
|
190
|
+
res = await session.execute(
|
|
191
|
+
select(ProjectModel.id).where(ProjectModel.name.in_(projects_names))
|
|
192
|
+
)
|
|
193
|
+
project_ids = res.scalars().all()
|
|
194
|
+
if len(project_ids) != len(projects_names):
|
|
195
|
+
raise ServerClientError("Failed to delete non-existent projects")
|
|
196
|
+
|
|
197
|
+
for project_id in project_ids:
|
|
198
|
+
# FIXME: The checks are not under lock,
|
|
199
|
+
# so there can be dangling active resources due to race conditions.
|
|
200
|
+
await _check_project_has_active_resources(session=session, project_id=project_id)
|
|
201
|
+
|
|
181
202
|
timestamp = str(int(get_current_datetime().timestamp()))
|
|
182
203
|
new_project_name = "_deleted_" + timestamp + ProjectModel.name
|
|
183
204
|
await session.execute(
|
|
@@ -614,6 +635,36 @@ def _is_project_admin(
|
|
|
614
635
|
return False
|
|
615
636
|
|
|
616
637
|
|
|
638
|
+
async def _check_project_has_active_resources(session: AsyncSession, project_id: uuid.UUID):
|
|
639
|
+
res = await session.execute(
|
|
640
|
+
select(RunModel.run_name).where(
|
|
641
|
+
RunModel.project_id == project_id,
|
|
642
|
+
RunModel.status.not_in(RunStatus.finished_statuses()),
|
|
643
|
+
)
|
|
644
|
+
)
|
|
645
|
+
run_names = list(res.scalars().all())
|
|
646
|
+
if len(run_names) > 0:
|
|
647
|
+
raise ServerClientError(f"Failed to delete project with active runs: {run_names}")
|
|
648
|
+
res = await session.execute(
|
|
649
|
+
select(FleetModel.name).where(
|
|
650
|
+
FleetModel.project_id == project_id,
|
|
651
|
+
FleetModel.deleted.is_(False),
|
|
652
|
+
)
|
|
653
|
+
)
|
|
654
|
+
fleet_names = list(res.scalars().all())
|
|
655
|
+
if len(fleet_names) > 0:
|
|
656
|
+
raise ServerClientError(f"Failed to delete project with active fleets: {fleet_names}")
|
|
657
|
+
res = await session.execute(
|
|
658
|
+
select(VolumeModel.name).where(
|
|
659
|
+
VolumeModel.project_id == project_id,
|
|
660
|
+
VolumeModel.deleted.is_(False),
|
|
661
|
+
)
|
|
662
|
+
)
|
|
663
|
+
volume_names = list(res.scalars().all())
|
|
664
|
+
if len(volume_names) > 0:
|
|
665
|
+
raise ServerClientError(f"Failed to delete project with active volumes: {volume_names}")
|
|
666
|
+
|
|
667
|
+
|
|
617
668
|
async def remove_project_members(
|
|
618
669
|
session: AsyncSession,
|
|
619
670
|
user: UserModel,
|
|
@@ -4,8 +4,14 @@ Environment variables read by the dstack server. Documented in reference/environ
|
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
6
|
import warnings
|
|
7
|
+
from enum import Enum
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
|
|
10
|
+
from dstack._internal.utils.env import environ
|
|
11
|
+
from dstack._internal.utils.logging import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
9
15
|
DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
|
|
10
16
|
|
|
11
17
|
SERVER_DIR_PATH = Path(os.getenv("DSTACK_SERVER_DIR", DSTACK_DIR_PATH / "server"))
|
|
@@ -136,3 +142,43 @@ UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_UPDATE_DEFAULT_PROJECT") is not None
|
|
|
136
142
|
DO_NOT_UPDATE_DEFAULT_PROJECT = os.getenv("DSTACK_DO_NOT_UPDATE_DEFAULT_PROJECT") is not None
|
|
137
143
|
SKIP_GATEWAY_UPDATE = os.getenv("DSTACK_SKIP_GATEWAY_UPDATE") is not None
|
|
138
144
|
ENABLE_PROMETHEUS_METRICS = os.getenv("DSTACK_ENABLE_PROMETHEUS_METRICS") is not None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class JobNetworkMode(Enum):
|
|
148
|
+
# "host" for multinode runs only, "bridge" otherwise. Opt-in new default
|
|
149
|
+
HOST_FOR_MULTINODE_ONLY = 1
|
|
150
|
+
# "bridge" if the job occupies only a part of the instance, "host" otherwise. Current default
|
|
151
|
+
HOST_WHEN_POSSIBLE = 2
|
|
152
|
+
# Always "bridge", even for multinode runs. Same as legacy DSTACK_FORCE_BRIDGE_NETWORK=true
|
|
153
|
+
FORCED_BRIDGE = 3
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _get_job_network_mode() -> JobNetworkMode:
|
|
157
|
+
# Current default
|
|
158
|
+
mode = JobNetworkMode.HOST_WHEN_POSSIBLE
|
|
159
|
+
bridge_var = "DSTACK_FORCE_BRIDGE_NETWORK"
|
|
160
|
+
force_bridge = environ.get_bool(bridge_var)
|
|
161
|
+
mode_var = "DSTACK_SERVER_JOB_NETWORK_MODE"
|
|
162
|
+
mode_from_env = environ.get_enum(mode_var, JobNetworkMode, value_type=int)
|
|
163
|
+
if mode_from_env is not None:
|
|
164
|
+
if force_bridge is not None:
|
|
165
|
+
logger.warning(
|
|
166
|
+
f"{bridge_var} is deprecated since 0.19.27 and ignored when {mode_var} is set"
|
|
167
|
+
)
|
|
168
|
+
return mode_from_env
|
|
169
|
+
if force_bridge is not None:
|
|
170
|
+
if force_bridge:
|
|
171
|
+
mode = JobNetworkMode.FORCED_BRIDGE
|
|
172
|
+
logger.warning(
|
|
173
|
+
(
|
|
174
|
+
f"{bridge_var} is deprecated since 0.19.27."
|
|
175
|
+
f" Set {mode_var} to {mode.value} and remove {bridge_var}"
|
|
176
|
+
)
|
|
177
|
+
)
|
|
178
|
+
else:
|
|
179
|
+
logger.warning(f"{bridge_var} is deprecated since 0.19.27. Remove {bridge_var}")
|
|
180
|
+
return mode
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
JOB_NETWORK_MODE = _get_job_network_mode()
|
|
184
|
+
del _get_job_network_mode
|