dstack 0.19.23rc1__py3-none-any.whl → 0.19.25__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to their respective public registries. The information is provided for informational purposes only.
Potentially problematic release.
- dstack/_internal/cli/commands/apply.py +14 -2
- dstack/_internal/cli/commands/init.py +47 -2
- dstack/_internal/cli/commands/offer.py +68 -60
- dstack/_internal/cli/services/configurators/run.py +35 -10
- dstack/_internal/cli/services/repos.py +6 -24
- dstack/_internal/cli/utils/common.py +7 -0
- dstack/_internal/cli/utils/gpu.py +210 -0
- dstack/_internal/cli/utils/run.py +33 -0
- dstack/_internal/core/backends/aws/compute.py +1 -4
- dstack/_internal/core/backends/base/compute.py +0 -4
- dstack/_internal/core/backends/gcp/compute.py +1 -4
- dstack/_internal/core/backends/nebius/compute.py +1 -4
- dstack/_internal/core/models/common.py +1 -1
- dstack/_internal/core/models/config.py +3 -1
- dstack/_internal/core/models/configurations.py +16 -14
- dstack/_internal/core/models/fleets.py +2 -2
- dstack/_internal/core/models/instances.py +4 -1
- dstack/_internal/core/models/profiles.py +2 -2
- dstack/_internal/core/models/repos/remote.py +2 -2
- dstack/_internal/core/models/resources.py +4 -4
- dstack/_internal/core/models/runs.py +13 -9
- dstack/_internal/core/services/configs/__init__.py +8 -7
- dstack/_internal/proxy/gateway/services/registry.py +2 -0
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +10 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -46
- dstack/_internal/server/background/tasks/process_runs.py +16 -15
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/models.py +24 -13
- dstack/_internal/server/routers/gpus.py +29 -0
- dstack/_internal/server/schemas/gateways.py +1 -1
- dstack/_internal/server/schemas/gpus.py +66 -0
- dstack/_internal/server/services/docker.py +1 -1
- dstack/_internal/server/services/gpus.py +390 -0
- dstack/_internal/server/services/jobs/__init__.py +3 -1
- dstack/_internal/server/services/offers.py +48 -31
- dstack/_internal/server/services/probes.py +5 -1
- dstack/_internal/server/services/proxy/repo.py +1 -0
- dstack/_internal/server/services/repos.py +1 -1
- dstack/_internal/server/services/runs.py +15 -12
- dstack/_internal/server/services/secrets.py +1 -1
- dstack/_internal/server/services/services/__init__.py +60 -41
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
- dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js → main-d151b300fcac3933213d.js} +20 -23
- dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js.map → main-d151b300fcac3933213d.js.map} +1 -1
- dstack/_internal/server/testing/common.py +7 -2
- dstack/api/_public/repos.py +8 -7
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_gpus.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/METADATA +1 -1
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/RECORD +60 -51
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/WHEEL +0 -0
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -1,17 +1,20 @@
 import asyncio
+import itertools
+import math
 import uuid
 from datetime import datetime, timedelta
 from typing import List, Optional, Tuple

-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, load_only, selectinload
+from sqlalchemy.orm import contains_eager, joinedload, load_only, selectinload

 from dstack._internal.core.backends.base.backend import Backend
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.common import NetworkMode
 from dstack._internal.core.models.fleets import (
+    Fleet,
     FleetConfiguration,
     FleetSpec,
     FleetStatus,
@@ -23,7 +26,7 @@ from dstack._internal.core.models.profiles import (
     CreationPolicy,
     TerminationPolicy,
 )
-from dstack._internal.core.models.resources import Memory
+from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -157,7 +160,10 @@ async def _process_next_submitted_job():
 async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     # Refetch to load related attributes.
     res = await session.execute(
-        select(JobModel)
+        select(JobModel)
+        .where(JobModel.id == job_model.id)
+        .options(joinedload(JobModel.instance))
+        .options(joinedload(JobModel.fleet).joinedload(FleetModel.instances))
     )
     job_model = res.unique().scalar_one()
     res = await session.execute(
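Note on the chained joinedload() options added here: under SQLAlchemy's AsyncSession, related objects are not lazy-loaded on attribute access, so everything the rest of the processing touches (the job's instance, its fleet, and that fleet's instances) has to be loaded eagerly in the refetch. A minimal, self-contained sketch of the pattern, not dstack code, assuming SQLAlchemy 2.0 with the aiosqlite driver and made-up Fleet/Instance models:

import asyncio

from sqlalchemy import ForeignKey, select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, joinedload, mapped_column, relationship
from sqlalchemy.pool import StaticPool


class Base(DeclarativeBase):
    pass


class Fleet(Base):
    __tablename__ = "fleets"
    id: Mapped[int] = mapped_column(primary_key=True)
    instances: Mapped[list["Instance"]] = relationship(back_populates="fleet")


class Instance(Base):
    __tablename__ = "instances"
    id: Mapped[int] = mapped_column(primary_key=True)
    fleet_id: Mapped[int] = mapped_column(ForeignKey("fleets.id"))
    fleet: Mapped["Fleet"] = relationship(back_populates="instances")


async def main() -> None:
    # In-memory SQLite needs a single shared connection, hence StaticPool.
    engine = create_async_engine("sqlite+aiosqlite://", poolclass=StaticPool)
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    new_session = async_sessionmaker(engine, expire_on_commit=False)
    async with new_session() as session:
        session.add(Fleet(id=1, instances=[Instance(id=10), Instance(id=11)]))
        await session.commit()

    async with new_session() as session:
        # Load the fleet and its instances in one query; without the joinedload()
        # option, touching fleet.instances below would raise, because AsyncSession
        # does not perform implicit lazy loading.
        res = await session.execute(
            select(Fleet).where(Fleet.id == 1).options(joinedload(Fleet.instances))
        )
        fleet = res.unique().scalar_one()
        print(sorted(i.id for i in fleet.instances))  # [10, 11]


asyncio.run(main())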
@@ -176,6 +182,12 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)

+    # Master job chooses fleet for the run.
+    # Due to two-step processing, it's saved to job_model.fleet.
+    # Other jobs just inherit fleet from run_model.fleet.
+    # If master job chooses no fleet, the new fleet will be created.
+    fleet_model = run_model.fleet or job_model.fleet
+
     master_job = find_job(run.jobs, job_model.replica_num, 0)
     master_job_provisioning_data = None
     if job.job_spec.job_num != 0:
@@ -223,54 +235,80 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     # Then, the job runs on the assigned instance or a new instance is provisioned.
     # This is needed to avoid holding instances lock for a long time.
     if not job_model.instance_assigned:
-        #
-
-
-
-
-
-
+        # If another job freed the instance but is still trying to detach volumes,
+        # do not provision on it to prevent attaching volumes that are currently detaching.
+        detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
+
+        fleet_filters = [
+            FleetModel.project_id == project.id,
+            FleetModel.deleted == False,
+        ]
+        if run_model.fleet is not None:
+            fleet_filters.append(FleetModel.id == run_model.fleet_id)
+        if run_spec.configuration.fleets is not None:
+            fleet_filters.append(FleetModel.name.in_(run_spec.configuration.fleets))
+
+        instance_filters = [
+            InstanceModel.deleted == False,
+            InstanceModel.total_blocks > InstanceModel.busy_blocks,
+            InstanceModel.id.not_in(detaching_instances_ids),
+        ]
+
+        fleet_models_with_instances, fleet_models_without_instances = await _select_fleet_models(
+            session=session,
+            fleet_filters=fleet_filters,
+            instance_filters=instance_filters,
+        )
+        instances_ids = sorted(
+            itertools.chain.from_iterable(
+                [i.id for i in f.instances] for f in fleet_models_with_instances
             )
-            .order_by(InstanceModel.id)  # take locks in order
-            .with_for_update(key_share=True)
         )
-
-
+        fleet_models = fleet_models_with_instances + fleet_models_without_instances
+        fleets_ids = [f.id for f in fleet_models]
+
         if get_db().dialect_name == "sqlite":
             # Start new transaction to see committed changes after lock
             await session.commit()
+
         async with get_locker(get_db().dialect_name).lock_ctx(
             InstanceModel.__tablename__, instances_ids
         ):
-
-
-
-
-
-
-
-                InstanceModel.id.not_in(detaching_instances_ids),
-                InstanceModel.id.in_(instances_ids),
-                InstanceModel.deleted == False,
-                InstanceModel.total_blocks > InstanceModel.busy_blocks,
+            if get_db().dialect_name == "sqlite":
+                fleet_models = await _refetch_fleet_models(
+                    session=session,
+                    fleets_ids=fleets_ids,
+                    instances_ids=instances_ids,
+                    fleet_filters=fleet_filters,
+                    instance_filters=instance_filters,
                 )
-
-
-
-
-            instance = await _assign_job_to_pool_instance(
-                session=session,
-                pool_instances=pool_instances,
-                run_spec=run_spec,
-                job_model=job_model,
+            fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+                fleet_models=fleet_models,
+                run_model=run_model,
+                run_spec=run.run_spec,
                 job=job,
-                fleet_model=run_model.fleet,
                 master_job_provisioning_data=master_job_provisioning_data,
                 volumes=volumes,
             )
+            if fleet_model is None and run_spec.configuration.fleets is not None:
+                # Run cannot create new fleets when fleets are specified
+                logger.debug("%s: failed to use specified fleets", fmt(job_model))
+                job_model.status = JobStatus.TERMINATING
+                job_model.termination_reason = (
+                    JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+                )
+                job_model.last_processed_at = common_utils.get_current_datetime()
+                await session.commit()
+                return
+            instance = await _assign_job_to_fleet_instance(
+                session=session,
+                instances_with_offers=fleet_instances_with_offers,
+                job_model=job_model,
+            )
+            job_model.fleet = fleet_model
             job_model.instance_assigned = True
             job_model.last_processed_at = common_utils.get_current_datetime()
-            if len(
+            if len(instances_ids) > 0:
                 await session.commit()
                 return
     # If no instances were locked, we can proceed in the same transaction.
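The fleet_filters/instance_filters lists built above are reused verbatim by _select_fleet_models and _refetch_fleet_models further down, which works because SQLAlchemy criteria are plain expression objects that can be collected in a list and unpacked into .where(). A small standalone sketch of that pattern (not dstack code; the table and columns are made up):

from sqlalchemy import Boolean, Column, Integer, MetaData, Table, select

metadata = MetaData()
instances = Table(
    "instances",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("deleted", Boolean),
    Column("total_blocks", Integer),
    Column("busy_blocks", Integer),
)

# Criteria are plain expression objects, so they can be collected (conditionally) in a list...
instance_filters = [
    instances.c.deleted == False,  # noqa: E712
    instances.c.total_blocks > instances.c.busy_blocks,
]

# ...and unpacked into a single WHERE clause, joined with AND.
query = select(instances.c.id).where(*instance_filters).order_by(instances.c.id)
print(query)  # prints a SELECT with both criteria ANDed in the WHERE clause

Sorting the collected instance ids (and ordering the locking query by InstanceModel.id, as the "take locks in order" comment notes) keeps every worker acquiring row locks in the same order, the usual way to avoid deadlocks between concurrent job processors.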
@@ -297,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         # Create a new cloud instance
         run_job_result = await _run_job_on_new_instance(
             project=project,
-            fleet_model=
+            fleet_model=fleet_model,
             job_model=job_model,
             run=run,
             job=job,
@@ -318,11 +356,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_provisioning_data, offer = run_job_result
         job_model.job_provisioning_data = job_provisioning_data.json()
         job_model.status = JobStatus.PROVISIONING
-        fleet_model
-
-
-
-
+        if fleet_model is None:
+            fleet_model = _create_fleet_model_for_job(
+                project=project,
+                run=run,
+            )
         instance_num = await _get_next_instance_num(
             session=session,
             fleet_model=fleet_model,
@@ -376,16 +414,156 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         await session.commit()


-async def
+async def _select_fleet_models(
+    session: AsyncSession, fleet_filters: list, instance_filters: list
+) -> tuple[list[FleetModel], list[FleetModel]]:
+    # Selecting fleets in two queries since Postgres does not allow
+    # locking nullable side of an outer join. So, first lock instances with inner join.
+    # Then select left out fleets without instances.
+    res = await session.execute(
+        select(FleetModel)
+        .join(FleetModel.instances)
+        .where(*fleet_filters)
+        .where(*instance_filters)
+        .options(contains_eager(FleetModel.instances))
+        .order_by(InstanceModel.id)  # take locks in order
+        .with_for_update(key_share=True, of=InstanceModel)
+    )
+    fleet_models_with_instances = list(res.unique().scalars().all())
+    fleet_models_with_instances_ids = [f.id for f in fleet_models_with_instances]
+    res = await session.execute(
+        select(FleetModel)
+        .outerjoin(FleetModel.instances)
+        .where(
+            *fleet_filters,
+            FleetModel.id.not_in(fleet_models_with_instances_ids),
+        )
+        .where(InstanceModel.id.is_(None))
+        .options(contains_eager(FleetModel.instances))  # loading empty relation
+    )
+    fleet_models_without_instances = list(res.unique().scalars().all())
+    return fleet_models_with_instances, fleet_models_without_instances
+
+
+async def _refetch_fleet_models(
     session: AsyncSession,
-
+    fleets_ids: list[uuid.UUID],
+    instances_ids: list[uuid.UUID],
+    fleet_filters: list,
+    instance_filters: list,
+) -> list[FleetModel]:
+    res = await session.execute(
+        select(FleetModel)
+        .outerjoin(FleetModel.instances)
+        .where(
+            FleetModel.id.in_(fleets_ids),
+            *fleet_filters,
+        )
+        .where(
+            or_(
+                InstanceModel.id.is_(None),
+                and_(
+                    InstanceModel.id.in_(instances_ids),
+                    *instance_filters,
+                ),
+            )
+        )
+        .options(contains_eager(FleetModel.instances))
+        .execution_options(populate_existing=True)
+    )
+    fleet_models = list(res.unique().scalars().all())
+    return fleet_models
+
+
+def _find_optimal_fleet_with_offers(
+    fleet_models: list[FleetModel],
+    run_model: RunModel,
+    run_spec: RunSpec,
+    job: Job,
+    master_job_provisioning_data: Optional[JobProvisioningData],
+    volumes: Optional[list[list[Volume]]],
+) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
+    if run_model.fleet is not None:
+        # Using the fleet that was already chosen by the master job
+        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+            fleet_model=run_model.fleet,
+            run_spec=run_spec,
+            job=job,
+            master_job_provisioning_data=master_job_provisioning_data,
+            volumes=volumes,
+        )
+        return run_model.fleet, fleet_instances_with_offers
+
+    if len(fleet_models) == 0:
+        return None, []
+
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    # The current strategy is to first consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest offer.
+    # Fallback to fleet with the cheapest offer among all fleets with offers.
+    candidate_fleets_with_offers: list[
+        tuple[
+            Optional[FleetModel],
+            list[tuple[InstanceModel, InstanceOfferWithAvailability]],
+            int,
+            tuple[int, float],
+        ]
+    ] = []
+    for candidate_fleet_model in fleet_models:
+        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+            fleet_model=candidate_fleet_model,
+            run_spec=run_spec,
+            job=job,
+            master_job_provisioning_data=master_job_provisioning_data,
+            volumes=volumes,
+        )
+        fleet_available_offers = [
+            o for _, o in fleet_instances_with_offers if o.availability.is_available()
+        ]
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_available_offers)
+        fleet_cheapest_offer = math.inf
+        if len(fleet_available_offers) > 0:
+            fleet_cheapest_offer = fleet_available_offers[0].price
+        fleet_priority = (not fleet_has_available_capacity, fleet_cheapest_offer)
+        candidate_fleets_with_offers.append(
+            (
+                candidate_fleet_model,
+                fleet_instances_with_offers,
+                len(fleet_available_offers),
+                fleet_priority,
+            )
+        )
+    if run_spec.configuration.fleets is None and all(
+        t[2] == 0 for t in candidate_fleets_with_offers
+    ):
+        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # This is for compatibility with non-fleet-first UX when runs created new fleets
+        # if there are no instances to reuse.
+        return None, []
+    candidate_fleets_with_offers.sort(key=lambda t: t[-1])
+    return candidate_fleets_with_offers[0][:2]
+
+
+def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
+    nodes_required_num = 1
+    if run_spec.configuration.type == "task":
+        nodes_required_num = run_spec.configuration.nodes
+    elif (
+        run_spec.configuration.type == "service"
+        and run_spec.configuration.replicas.min is not None
+    ):
+        nodes_required_num = run_spec.configuration.replicas.min
+    return nodes_required_num
+
+
+def _get_fleet_instances_with_offers(
+    fleet_model: FleetModel,
     run_spec: RunSpec,
-    job_model: JobModel,
     job: Job,
-    fleet_model: Optional[FleetModel],
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
-) ->
+) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
+    pool_instances = fleet_model.instances
     instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]]
     profile = run_spec.merged_profile
     multinode = job.job_spec.jobs_per_replica > 1
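The fleet ranking in _find_optimal_fleet_with_offers relies on Python's tuple ordering: the priority key is (not fleet_has_available_capacity, fleet_cheapest_offer), and since False sorts before True, fleets that can already accommodate the run win over fleets that would need extra provisioning, with price as the tie-breaker. A toy illustration (not dstack code; fleet names and prices are made up):

import math

# (fleet name, has enough available offers for the run, cheapest available offer price)
candidates = [
    ("fleet-a", True, 1.20),
    ("fleet-b", True, 0.35),
    ("fleet-c", False, math.inf),  # no available offers at all
    ("fleet-d", False, 0.20),      # one cheap offer, but not enough for the whole run
]

# Same shape of key as fleet_priority above: capacity first, then price.
ranked = sorted(candidates, key=lambda c: (not c[1], c[2]))
print([name for name, _, _ in ranked])
# ['fleet-b', 'fleet-a', 'fleet-d', 'fleet-c']

Fleets with no available offers get math.inf as their cheapest price, so they always sort last within their group, matching the fallback described in the strategy comment.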
@@ -414,7 +592,15 @@ async def _assign_job_to_pool_instance(
         volumes=volumes,
     )
     instances_with_offers.extend(shared_instances_with_offers)
+    instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
+    return instances_with_offers

+
+async def _assign_job_to_fleet_instance(
+    session: AsyncSession,
+    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]],
+    job_model: JobModel,
+) -> Optional[InstanceModel]:
     if len(instances_with_offers) == 0:
         return None

@@ -463,6 +649,11 @@ async def _run_job_on_new_instance(
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
+        if not _check_can_create_new_instance_in_fleet(fleet):
+            logger.debug(
+                "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
+            )
+            return None
     multinode = job.job_spec.jobs_per_replica > 1 or (
         fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     )
@@ -522,13 +713,20 @@ async def _run_job_on_new_instance(
         return None


-def
+def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
+    if fleet.spec.configuration.ssh_config is not None:
+        return False
+    # TODO: Respect nodes.max
+    # Ensure concurrent provisioning does not violate nodes.max
+    # E.g. lock fleet and split instance model creation
+    # and instance provisioning into separate transactions.
+    return True
+
+
+def _create_fleet_model_for_job(
     project: ProjectModel,
-    run_model: RunModel,
     run: Run,
 ) -> FleetModel:
-    if run_model.fleet is not None:
-        return run_model.fleet
     placement = InstanceGroupPlacement.ANY
     if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
         placement = InstanceGroupPlacement.CLUSTER
@@ -537,6 +735,7 @@ def _get_or_create_fleet_model_for_job(
             name=run.run_spec.run_name,
             placement=placement,
             reservation=run.run_spec.configuration.reservation,
+            nodes=Range(min=_get_nodes_required_num_for_run(run.run_spec), max=None),
         ),
         profile=run.run_spec.merged_profile,
         autocreated=True,
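Taken together with _create_fleet_model_for_job above, an autocreated fleet now records how many nodes the run needs (its nodes range starts at the required minimum with no upper bound) and uses cluster placement only for multi-node tasks. A rough standalone sketch of that derivation (not dstack code; NodesRange and the helper names below are hypothetical stand-ins):

from dataclasses import dataclass
from typing import Optional


@dataclass
class NodesRange:  # hypothetical stand-in for dstack's Range model
    min: int
    max: Optional[int]


def nodes_required(conf_type: str, nodes: int = 1, min_replicas: Optional[int] = None) -> int:
    # Mirrors _get_nodes_required_num_for_run: tasks need `nodes`, services their min replicas.
    if conf_type == "task":
        return nodes
    if conf_type == "service" and min_replicas is not None:
        return min_replicas
    return 1


def autocreated_fleet_shape(conf_type: str, nodes: int = 1) -> tuple[str, NodesRange]:
    # Multi-node tasks get cluster placement; everything else gets "any".
    placement = "cluster" if conf_type == "task" and nodes > 1 else "any"
    return placement, NodesRange(min=nodes_required(conf_type, nodes), max=None)


print(autocreated_fleet_shape("task", nodes=4))  # ('cluster', NodesRange(min=4, max=None))
print(autocreated_fleet_shape("service"))        # ('any', NodesRange(min=1, max=None))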
dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py (new file)
@@ -0,0 +1,28 @@
+"""Add JobModel.registered
+
+Revision ID: 3d7f6c2ec000
+Revises: 74a1f55209bd
+Create Date: 2025-08-11 13:23:39.530103
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "3d7f6c2ec000"
+down_revision = "74a1f55209bd"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("registered", sa.Boolean(), server_default=sa.false(), nullable=False)
+        )
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("registered")