dstack 0.19.23rc1__py3-none-any.whl → 0.19.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic.

Files changed (60)
  1. dstack/_internal/cli/commands/apply.py +14 -2
  2. dstack/_internal/cli/commands/init.py +47 -2
  3. dstack/_internal/cli/commands/offer.py +68 -60
  4. dstack/_internal/cli/services/configurators/run.py +35 -10
  5. dstack/_internal/cli/services/repos.py +6 -24
  6. dstack/_internal/cli/utils/common.py +7 -0
  7. dstack/_internal/cli/utils/gpu.py +210 -0
  8. dstack/_internal/cli/utils/run.py +33 -0
  9. dstack/_internal/core/backends/aws/compute.py +1 -4
  10. dstack/_internal/core/backends/base/compute.py +0 -4
  11. dstack/_internal/core/backends/gcp/compute.py +1 -4
  12. dstack/_internal/core/backends/nebius/compute.py +1 -4
  13. dstack/_internal/core/models/common.py +1 -1
  14. dstack/_internal/core/models/config.py +3 -1
  15. dstack/_internal/core/models/configurations.py +16 -14
  16. dstack/_internal/core/models/fleets.py +2 -2
  17. dstack/_internal/core/models/instances.py +4 -1
  18. dstack/_internal/core/models/profiles.py +2 -2
  19. dstack/_internal/core/models/repos/remote.py +2 -2
  20. dstack/_internal/core/models/resources.py +4 -4
  21. dstack/_internal/core/models/runs.py +13 -9
  22. dstack/_internal/core/services/configs/__init__.py +8 -7
  23. dstack/_internal/proxy/gateway/services/registry.py +2 -0
  24. dstack/_internal/server/app.py +2 -0
  25. dstack/_internal/server/background/tasks/process_fleets.py +10 -2
  26. dstack/_internal/server/background/tasks/process_running_jobs.py +66 -46
  27. dstack/_internal/server/background/tasks/process_runs.py +16 -15
  28. dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
  29. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  30. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  31. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  32. dstack/_internal/server/models.py +24 -13
  33. dstack/_internal/server/routers/gpus.py +29 -0
  34. dstack/_internal/server/schemas/gateways.py +1 -1
  35. dstack/_internal/server/schemas/gpus.py +66 -0
  36. dstack/_internal/server/services/docker.py +1 -1
  37. dstack/_internal/server/services/gpus.py +390 -0
  38. dstack/_internal/server/services/jobs/__init__.py +3 -1
  39. dstack/_internal/server/services/offers.py +48 -31
  40. dstack/_internal/server/services/probes.py +5 -1
  41. dstack/_internal/server/services/proxy/repo.py +1 -0
  42. dstack/_internal/server/services/repos.py +1 -1
  43. dstack/_internal/server/services/runs.py +15 -12
  44. dstack/_internal/server/services/secrets.py +1 -1
  45. dstack/_internal/server/services/services/__init__.py +60 -41
  46. dstack/_internal/server/statics/index.html +1 -1
  47. dstack/_internal/server/statics/logo-notext.svg +116 -0
  48. dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
  49. dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js → main-d151b300fcac3933213d.js} +20 -23
  50. dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js.map → main-d151b300fcac3933213d.js.map} +1 -1
  51. dstack/_internal/server/testing/common.py +7 -2
  52. dstack/api/_public/repos.py +8 -7
  53. dstack/api/server/__init__.py +6 -0
  54. dstack/api/server/_gpus.py +22 -0
  55. dstack/version.py +1 -1
  56. {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/METADATA +1 -1
  57. {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/RECORD +60 -51
  58. {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,484 @@
1
+ """Store enums as strings
2
+
3
+ Revision ID: 74a1f55209bd
4
+ Revises: 728b1488b1b4
5
+ Create Date: 2025-08-06 13:49:28.785378
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+ from sqlalchemy.dialects import postgresql
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = "74a1f55209bd"
15
+ down_revision = "728b1488b1b4"
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
+ def upgrade() -> None:
21
+ # ### commands auto generated by Alembic - please adjust! ###
22
+ with op.batch_alter_table("users", schema=None) as batch_op:
23
+ batch_op.alter_column(
24
+ "global_role",
25
+ existing_type=postgresql.ENUM("ADMIN", "USER", name="globalrole"),
26
+ type_=sa.String(length=100),
27
+ existing_nullable=False,
28
+ )
29
+
30
+ with op.batch_alter_table("members", schema=None) as batch_op:
31
+ batch_op.alter_column(
32
+ "project_role",
33
+ existing_type=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"),
34
+ type_=sa.String(length=100),
35
+ existing_nullable=False,
36
+ )
37
+
38
+ with op.batch_alter_table("repos", schema=None) as batch_op:
39
+ batch_op.alter_column(
40
+ "type",
41
+ existing_type=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"),
42
+ type_=sa.String(length=100),
43
+ existing_nullable=False,
44
+ )
45
+
46
+ with op.batch_alter_table("runs", schema=None) as batch_op:
47
+ batch_op.alter_column(
48
+ "status",
49
+ existing_type=postgresql.ENUM(
50
+ "PENDING",
51
+ "SUBMITTED",
52
+ "PROVISIONING",
53
+ "RUNNING",
54
+ "TERMINATING",
55
+ "TERMINATED",
56
+ "FAILED",
57
+ "DONE",
58
+ name="runstatus",
59
+ ),
60
+ type_=sa.String(length=100),
61
+ existing_nullable=False,
62
+ )
63
+ batch_op.alter_column(
64
+ "termination_reason",
65
+ existing_type=postgresql.ENUM(
66
+ "ALL_JOBS_DONE",
67
+ "JOB_FAILED",
68
+ "RETRY_LIMIT_EXCEEDED",
69
+ "STOPPED_BY_USER",
70
+ "ABORTED_BY_USER",
71
+ "SERVER_ERROR",
72
+ name="runterminationreason",
73
+ ),
74
+ type_=sa.String(length=100),
75
+ existing_nullable=True,
76
+ )
77
+
78
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
79
+ batch_op.alter_column(
80
+ "status",
81
+ existing_type=postgresql.ENUM(
82
+ "SUBMITTED",
83
+ "PROVISIONING",
84
+ "PULLING",
85
+ "RUNNING",
86
+ "TERMINATING",
87
+ "TERMINATED",
88
+ "ABORTED",
89
+ "FAILED",
90
+ "DONE",
91
+ name="jobstatus",
92
+ ),
93
+ type_=sa.String(length=100),
94
+ existing_nullable=False,
95
+ )
96
+ batch_op.alter_column(
97
+ "termination_reason",
98
+ existing_type=postgresql.ENUM(
99
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
100
+ "INTERRUPTED_BY_NO_CAPACITY",
101
+ "INSTANCE_UNREACHABLE",
102
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
103
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
104
+ "TERMINATED_BY_USER",
105
+ "VOLUME_ERROR",
106
+ "GATEWAY_ERROR",
107
+ "SCALED_DOWN",
108
+ "DONE_BY_RUNNER",
109
+ "ABORTED_BY_USER",
110
+ "TERMINATED_BY_SERVER",
111
+ "INACTIVITY_DURATION_EXCEEDED",
112
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
113
+ "CONTAINER_EXITED_WITH_ERROR",
114
+ "PORTS_BINDING_FAILED",
115
+ "CREATING_CONTAINER_ERROR",
116
+ "EXECUTOR_ERROR",
117
+ "MAX_DURATION_EXCEEDED",
118
+ name="jobterminationreason",
119
+ ),
120
+ type_=sa.String(length=100),
121
+ existing_nullable=True,
122
+ )
123
+
124
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
125
+ batch_op.alter_column(
126
+ "status",
127
+ existing_type=postgresql.ENUM(
128
+ "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus"
129
+ ),
130
+ type_=sa.String(length=100),
131
+ existing_nullable=False,
132
+ )
133
+
134
+ with op.batch_alter_table("gateways", schema=None) as batch_op:
135
+ batch_op.alter_column(
136
+ "status",
137
+ existing_type=postgresql.ENUM(
138
+ "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus"
139
+ ),
140
+ type_=sa.String(length=100),
141
+ existing_nullable=False,
142
+ )
143
+
144
+ with op.batch_alter_table("instances", schema=None) as batch_op:
145
+ batch_op.alter_column(
146
+ "status",
147
+ existing_type=postgresql.ENUM(
148
+ "PENDING",
149
+ "PROVISIONING",
150
+ "IDLE",
151
+ "BUSY",
152
+ "TERMINATING",
153
+ "TERMINATED",
154
+ name="instancestatus",
155
+ ),
156
+ type_=sa.String(length=100),
157
+ existing_nullable=False,
158
+ )
159
+
160
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
161
+ batch_op.alter_column(
162
+ "status",
163
+ existing_type=postgresql.ENUM(
164
+ "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus"
165
+ ),
166
+ type_=sa.String(length=100),
167
+ existing_nullable=False,
168
+ )
169
+
170
+ sa.Enum("ADMIN", "USER", name="globalrole").drop(op.get_bind())
171
+ sa.Enum(
172
+ "ALL_JOBS_DONE",
173
+ "JOB_FAILED",
174
+ "RETRY_LIMIT_EXCEEDED",
175
+ "STOPPED_BY_USER",
176
+ "ABORTED_BY_USER",
177
+ "SERVER_ERROR",
178
+ name="runterminationreason",
179
+ ).drop(op.get_bind())
180
+ sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").drop(
181
+ op.get_bind()
182
+ )
183
+ sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").drop(
184
+ op.get_bind()
185
+ )
186
+ sa.Enum(
187
+ "PENDING",
188
+ "SUBMITTED",
189
+ "PROVISIONING",
190
+ "RUNNING",
191
+ "TERMINATING",
192
+ "TERMINATED",
193
+ "FAILED",
194
+ "DONE",
195
+ name="runstatus",
196
+ ).drop(op.get_bind())
197
+ sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").drop(op.get_bind())
198
+ sa.Enum(
199
+ "SUBMITTED",
200
+ "PROVISIONING",
201
+ "PULLING",
202
+ "RUNNING",
203
+ "TERMINATING",
204
+ "TERMINATED",
205
+ "ABORTED",
206
+ "FAILED",
207
+ "DONE",
208
+ name="jobstatus",
209
+ ).drop(op.get_bind())
210
+ sa.Enum(
211
+ "PENDING",
212
+ "PROVISIONING",
213
+ "IDLE",
214
+ "BUSY",
215
+ "TERMINATING",
216
+ "TERMINATED",
217
+ name="instancestatus",
218
+ ).drop(op.get_bind())
219
+ sa.Enum("SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus").drop(
220
+ op.get_bind()
221
+ )
222
+ sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").drop(op.get_bind())
223
+ sa.Enum(
224
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
225
+ "INTERRUPTED_BY_NO_CAPACITY",
226
+ "INSTANCE_UNREACHABLE",
227
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
228
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
229
+ "TERMINATED_BY_USER",
230
+ "VOLUME_ERROR",
231
+ "GATEWAY_ERROR",
232
+ "SCALED_DOWN",
233
+ "DONE_BY_RUNNER",
234
+ "ABORTED_BY_USER",
235
+ "TERMINATED_BY_SERVER",
236
+ "INACTIVITY_DURATION_EXCEEDED",
237
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
238
+ "CONTAINER_EXITED_WITH_ERROR",
239
+ "PORTS_BINDING_FAILED",
240
+ "CREATING_CONTAINER_ERROR",
241
+ "EXECUTOR_ERROR",
242
+ "MAX_DURATION_EXCEEDED",
243
+ name="jobterminationreason",
244
+ ).drop(op.get_bind())
245
+ # ### end Alembic commands ###
246
+
247
+
248
+ def downgrade() -> None:
249
+ # ### commands auto generated by Alembic - please adjust! ###
250
+ sa.Enum(
251
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
252
+ "INTERRUPTED_BY_NO_CAPACITY",
253
+ "INSTANCE_UNREACHABLE",
254
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
255
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
256
+ "TERMINATED_BY_USER",
257
+ "VOLUME_ERROR",
258
+ "GATEWAY_ERROR",
259
+ "SCALED_DOWN",
260
+ "DONE_BY_RUNNER",
261
+ "ABORTED_BY_USER",
262
+ "TERMINATED_BY_SERVER",
263
+ "INACTIVITY_DURATION_EXCEEDED",
264
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
265
+ "CONTAINER_EXITED_WITH_ERROR",
266
+ "PORTS_BINDING_FAILED",
267
+ "CREATING_CONTAINER_ERROR",
268
+ "EXECUTOR_ERROR",
269
+ "MAX_DURATION_EXCEEDED",
270
+ name="jobterminationreason",
271
+ ).create(op.get_bind())
272
+ sa.Enum("ADMIN", "MANAGER", "USER", name="projectrole").create(op.get_bind())
273
+ sa.Enum(
274
+ "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus"
275
+ ).create(op.get_bind())
276
+ sa.Enum(
277
+ "PENDING",
278
+ "PROVISIONING",
279
+ "IDLE",
280
+ "BUSY",
281
+ "TERMINATING",
282
+ "TERMINATED",
283
+ name="instancestatus",
284
+ ).create(op.get_bind())
285
+ sa.Enum(
286
+ "SUBMITTED",
287
+ "PROVISIONING",
288
+ "PULLING",
289
+ "RUNNING",
290
+ "TERMINATING",
291
+ "TERMINATED",
292
+ "ABORTED",
293
+ "FAILED",
294
+ "DONE",
295
+ name="jobstatus",
296
+ ).create(op.get_bind())
297
+ sa.Enum("REMOTE", "LOCAL", "VIRTUAL", name="repotype").create(op.get_bind())
298
+ sa.Enum(
299
+ "PENDING",
300
+ "SUBMITTED",
301
+ "PROVISIONING",
302
+ "RUNNING",
303
+ "TERMINATING",
304
+ "TERMINATED",
305
+ "FAILED",
306
+ "DONE",
307
+ name="runstatus",
308
+ ).create(op.get_bind())
309
+ sa.Enum("SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus").create(
310
+ op.get_bind()
311
+ )
312
+ sa.Enum("SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus").create(
313
+ op.get_bind()
314
+ )
315
+ sa.Enum(
316
+ "ALL_JOBS_DONE",
317
+ "JOB_FAILED",
318
+ "RETRY_LIMIT_EXCEEDED",
319
+ "STOPPED_BY_USER",
320
+ "ABORTED_BY_USER",
321
+ "SERVER_ERROR",
322
+ name="runterminationreason",
323
+ ).create(op.get_bind())
324
+ sa.Enum("ADMIN", "USER", name="globalrole").create(op.get_bind())
325
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
326
+ batch_op.alter_column(
327
+ "status",
328
+ existing_type=sa.String(length=100),
329
+ type_=postgresql.ENUM(
330
+ "SUBMITTED", "PROVISIONING", "ACTIVE", "FAILED", name="volumestatus"
331
+ ),
332
+ existing_nullable=False,
333
+ postgresql_using="status::VARCHAR::volumestatus",
334
+ )
335
+
336
+ with op.batch_alter_table("users", schema=None) as batch_op:
337
+ batch_op.alter_column(
338
+ "global_role",
339
+ existing_type=sa.String(length=100),
340
+ type_=postgresql.ENUM("ADMIN", "USER", name="globalrole"),
341
+ existing_nullable=False,
342
+ postgresql_using="global_role::VARCHAR::globalrole",
343
+ )
344
+
345
+ with op.batch_alter_table("runs", schema=None) as batch_op:
346
+ batch_op.alter_column(
347
+ "termination_reason",
348
+ existing_type=sa.String(length=100),
349
+ type_=postgresql.ENUM(
350
+ "ALL_JOBS_DONE",
351
+ "JOB_FAILED",
352
+ "RETRY_LIMIT_EXCEEDED",
353
+ "STOPPED_BY_USER",
354
+ "ABORTED_BY_USER",
355
+ "SERVER_ERROR",
356
+ name="runterminationreason",
357
+ ),
358
+ existing_nullable=True,
359
+ postgresql_using="termination_reason::VARCHAR::runterminationreason",
360
+ )
361
+ batch_op.alter_column(
362
+ "status",
363
+ existing_type=sa.String(length=100),
364
+ type_=postgresql.ENUM(
365
+ "PENDING",
366
+ "SUBMITTED",
367
+ "PROVISIONING",
368
+ "RUNNING",
369
+ "TERMINATING",
370
+ "TERMINATED",
371
+ "FAILED",
372
+ "DONE",
373
+ name="runstatus",
374
+ ),
375
+ existing_nullable=False,
376
+ postgresql_using="status::VARCHAR::runstatus",
377
+ )
378
+
379
+ with op.batch_alter_table("repos", schema=None) as batch_op:
380
+ batch_op.alter_column(
381
+ "type",
382
+ existing_type=sa.String(length=100),
383
+ type_=postgresql.ENUM("REMOTE", "LOCAL", "VIRTUAL", name="repotype"),
384
+ existing_nullable=False,
385
+ postgresql_using="type::VARCHAR::repotype",
386
+ )
387
+
388
+ with op.batch_alter_table("members", schema=None) as batch_op:
389
+ batch_op.alter_column(
390
+ "project_role",
391
+ existing_type=sa.String(length=100),
392
+ type_=postgresql.ENUM("ADMIN", "MANAGER", "USER", name="projectrole"),
393
+ existing_nullable=False,
394
+ postgresql_using="project_role::VARCHAR::projectrole",
395
+ )
396
+
397
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
398
+ batch_op.alter_column(
399
+ "termination_reason",
400
+ existing_type=sa.String(length=100),
401
+ type_=postgresql.ENUM(
402
+ "FAILED_TO_START_DUE_TO_NO_CAPACITY",
403
+ "INTERRUPTED_BY_NO_CAPACITY",
404
+ "INSTANCE_UNREACHABLE",
405
+ "WAITING_INSTANCE_LIMIT_EXCEEDED",
406
+ "WAITING_RUNNER_LIMIT_EXCEEDED",
407
+ "TERMINATED_BY_USER",
408
+ "VOLUME_ERROR",
409
+ "GATEWAY_ERROR",
410
+ "SCALED_DOWN",
411
+ "DONE_BY_RUNNER",
412
+ "ABORTED_BY_USER",
413
+ "TERMINATED_BY_SERVER",
414
+ "INACTIVITY_DURATION_EXCEEDED",
415
+ "TERMINATED_DUE_TO_UTILIZATION_POLICY",
416
+ "CONTAINER_EXITED_WITH_ERROR",
417
+ "PORTS_BINDING_FAILED",
418
+ "CREATING_CONTAINER_ERROR",
419
+ "EXECUTOR_ERROR",
420
+ "MAX_DURATION_EXCEEDED",
421
+ name="jobterminationreason",
422
+ ),
423
+ existing_nullable=True,
424
+ postgresql_using="termination_reason::VARCHAR::jobterminationreason",
425
+ )
426
+ batch_op.alter_column(
427
+ "status",
428
+ existing_type=sa.String(length=100),
429
+ type_=postgresql.ENUM(
430
+ "SUBMITTED",
431
+ "PROVISIONING",
432
+ "PULLING",
433
+ "RUNNING",
434
+ "TERMINATING",
435
+ "TERMINATED",
436
+ "ABORTED",
437
+ "FAILED",
438
+ "DONE",
439
+ name="jobstatus",
440
+ ),
441
+ existing_nullable=False,
442
+ postgresql_using="status::VARCHAR::jobstatus",
443
+ )
444
+
445
+ with op.batch_alter_table("instances", schema=None) as batch_op:
446
+ batch_op.alter_column(
447
+ "status",
448
+ existing_type=sa.String(length=100),
449
+ type_=postgresql.ENUM(
450
+ "PENDING",
451
+ "PROVISIONING",
452
+ "IDLE",
453
+ "BUSY",
454
+ "TERMINATING",
455
+ "TERMINATED",
456
+ name="instancestatus",
457
+ ),
458
+ existing_nullable=False,
459
+ postgresql_using="status::VARCHAR::instancestatus",
460
+ )
461
+
462
+ with op.batch_alter_table("gateways", schema=None) as batch_op:
463
+ batch_op.alter_column(
464
+ "status",
465
+ existing_type=sa.String(length=100),
466
+ type_=postgresql.ENUM(
467
+ "SUBMITTED", "PROVISIONING", "RUNNING", "FAILED", name="gatewaystatus"
468
+ ),
469
+ existing_nullable=False,
470
+ postgresql_using="status::VARCHAR::gatewaystatus",
471
+ )
472
+
473
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
474
+ batch_op.alter_column(
475
+ "status",
476
+ existing_type=sa.String(length=100),
477
+ type_=postgresql.ENUM(
478
+ "SUBMITTED", "ACTIVE", "TERMINATING", "TERMINATED", "FAILED", name="fleetstatus"
479
+ ),
480
+ existing_nullable=False,
481
+ postgresql_using="status::VARCHAR::fleetstatus",
482
+ )
483
+
484
+ # ### end Alembic commands ###
@@ -0,0 +1,41 @@
1
+ """Add JobModel.fleet
2
+
3
+ Revision ID: e2d08cd1b8d9
4
+ Revises: 3d7f6c2ec000
5
+ Create Date: 2025-08-15 11:26:05.670591
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ import sqlalchemy_utils
11
+ from alembic import op
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = "e2d08cd1b8d9"
15
+ down_revision = "3d7f6c2ec000"
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
+ def upgrade() -> None:
21
+ # ### commands auto generated by Alembic - please adjust! ###
22
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
23
+ batch_op.add_column(
24
+ sa.Column(
25
+ "fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True
26
+ )
27
+ )
28
+ batch_op.create_foreign_key(
29
+ batch_op.f("fk_jobs_fleet_id_fleets"), "fleets", ["fleet_id"], ["id"]
30
+ )
31
+
32
+ # ### end Alembic commands ###
33
+
34
+
35
+ def downgrade() -> None:
36
+ # ### commands auto generated by Alembic - please adjust! ###
37
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
38
+ batch_op.drop_constraint(batch_op.f("fk_jobs_fleet_id_fleets"), type_="foreignkey")
39
+ batch_op.drop_column("fleet_id")
40
+
41
+ # ### end Alembic commands ###
@@ -7,7 +7,6 @@ from sqlalchemy import (
7
7
  BigInteger,
8
8
  Boolean,
9
9
  DateTime,
10
- Enum,
11
10
  Float,
12
11
  ForeignKey,
13
12
  Index,
@@ -85,7 +84,7 @@ class DecryptedString(CoreModel):
85
84
  decrypted: bool = True
86
85
  exc: Optional[Exception] = None
87
86
 
88
- class Config:
87
+ class Config(CoreModel.Config):
89
88
  arbitrary_types_allowed = True
90
89
 
91
90
  def get_plaintext_or_error(self) -> str:
@@ -186,7 +185,7 @@ class UserModel(BaseModel):
186
185
  token: Mapped[DecryptedString] = mapped_column(EncryptedString(200), unique=True)
187
186
  # token_hash is needed for fast search by token when stored token is encrypted
188
187
  token_hash: Mapped[str] = mapped_column(String(2000), unique=True)
189
- global_role: Mapped[GlobalRole] = mapped_column(Enum(GlobalRole))
188
+ global_role: Mapped[GlobalRole] = mapped_column(EnumAsString(GlobalRole, 100))
190
189
  # deactivated users cannot access API
191
190
  active: Mapped[bool] = mapped_column(Boolean, default=True)
192
191
 
@@ -247,7 +246,7 @@ class MemberModel(BaseModel):
247
246
  project: Mapped["ProjectModel"] = relationship()
248
247
  user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"))
249
248
  user: Mapped[UserModel] = relationship(lazy="joined")
250
- project_role: Mapped[ProjectRole] = mapped_column(Enum(ProjectRole))
249
+ project_role: Mapped[ProjectRole] = mapped_column(EnumAsString(ProjectRole, 100))
251
250
  # member_num defines members ordering
252
251
  member_num: Mapped[Optional[int]] = mapped_column(Integer)
253
252
 
@@ -279,7 +278,7 @@ class RepoModel(BaseModel):
279
278
  project: Mapped["ProjectModel"] = relationship()
280
279
  # RepoModel.name stores repo_id
281
280
  name: Mapped[str] = mapped_column(String(100))
282
- type: Mapped[RepoType] = mapped_column(Enum(RepoType))
281
+ type: Mapped[RepoType] = mapped_column(EnumAsString(RepoType, 100))
283
282
 
284
283
  info: Mapped[str] = mapped_column(Text)
285
284
 
@@ -360,9 +359,9 @@ class RunModel(BaseModel):
360
359
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
361
360
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
362
361
  next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
363
- status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True)
362
+ status: Mapped[RunStatus] = mapped_column(EnumAsString(RunStatus, 100), index=True)
364
363
  termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
365
- Enum(RunTerminationReason)
364
+ EnumAsString(RunTerminationReason, 100)
366
365
  )
367
366
  # resubmission_attempt counts consecutive transitions to pending without provisioning.
368
367
  # Can be used to choose retry delay depending on the attempt number.
@@ -391,19 +390,27 @@ class JobModel(BaseModel):
391
390
  id: Mapped[uuid.UUID] = mapped_column(
392
391
  UUIDType(binary=False), primary_key=True, default=uuid.uuid4
393
392
  )
393
+
394
394
  project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE"))
395
395
  project: Mapped["ProjectModel"] = relationship()
396
+
396
397
  run_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("runs.id", ondelete="CASCADE"))
397
398
  run: Mapped["RunModel"] = relationship()
399
+
400
+ # Jobs need to reference fleets because we may choose an optimal fleet for a master job
401
+ # but not yet create an instance for it.
402
+ fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
403
+ fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="jobs")
404
+
398
405
  run_name: Mapped[str] = mapped_column(String(100))
399
406
  job_num: Mapped[int] = mapped_column(Integer)
400
407
  job_name: Mapped[str] = mapped_column(String(100))
401
408
  submission_num: Mapped[int] = mapped_column(Integer)
402
409
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
403
410
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
404
- status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True)
411
+ status: Mapped[JobStatus] = mapped_column(EnumAsString(JobStatus, 100), index=True)
405
412
  termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column(
406
- Enum(JobTerminationReason)
413
+ EnumAsString(JobTerminationReason, 100)
407
414
  )
408
415
  termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
409
416
  # `disconnected_at` stores the first time of connectivity issues with the instance.
@@ -431,6 +438,9 @@ class JobModel(BaseModel):
431
438
  probes: Mapped[list["ProbeModel"]] = relationship(
432
439
  back_populates="job", order_by="ProbeModel.probe_num"
433
440
  )
441
+ # Whether the replica is registered to receive service requests.
442
+ # Always `False` for non-service runs.
443
+ registered: Mapped[bool] = mapped_column(Boolean, server_default=false())
434
444
 
435
445
 
436
446
  class GatewayModel(BaseModel):
@@ -446,7 +456,7 @@ class GatewayModel(BaseModel):
446
456
  # Use `get_gateway_configuration` to construct `configuration` for old gateways.
447
457
  configuration: Mapped[Optional[str]] = mapped_column(Text)
448
458
  created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
449
- status: Mapped[GatewayStatus] = mapped_column(Enum(GatewayStatus))
459
+ status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100))
450
460
  status_message: Mapped[Optional[str]] = mapped_column(Text)
451
461
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
452
462
 
@@ -532,12 +542,13 @@ class FleetModel(BaseModel):
532
542
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
533
543
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
534
544
 
535
- status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True)
545
+ status: Mapped[FleetStatus] = mapped_column(EnumAsString(FleetStatus, 100), index=True)
536
546
  status_message: Mapped[Optional[str]] = mapped_column(Text)
537
547
 
538
548
  spec: Mapped[str] = mapped_column(Text)
539
549
 
540
550
  runs: Mapped[List["RunModel"]] = relationship(back_populates="fleet")
551
+ jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet")
541
552
  instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet")
542
553
 
543
554
 
@@ -571,7 +582,7 @@ class InstanceModel(BaseModel):
571
582
  fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
572
583
  fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")
573
584
 
574
- status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True)
585
+ status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True)
575
586
  unreachable: Mapped[bool] = mapped_column(Boolean)
576
587
 
577
588
  # VM
@@ -672,7 +683,7 @@ class VolumeModel(BaseModel):
672
683
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
673
684
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
674
685
 
675
- status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True)
686
+ status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True)
676
687
  status_message: Mapped[Optional[str]] = mapped_column(Text)
677
688
 
678
689
  configuration: Mapped[str] = mapped_column(Text)
@@ -0,0 +1,29 @@
1
+ from typing import Tuple
2
+
3
+ from fastapi import APIRouter, Depends
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+
6
+ from dstack._internal.server.db import get_session
7
+ from dstack._internal.server.models import ProjectModel, UserModel
8
+ from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse
9
+ from dstack._internal.server.security.permissions import ProjectMember
10
+ from dstack._internal.server.services.gpus import list_gpus_grouped
11
+ from dstack._internal.server.utils.routers import get_base_api_additional_responses
12
+
13
+ project_router = APIRouter(
14
+ prefix="/api/project/{project_name}/gpus",
15
+ tags=["gpus"],
16
+ responses=get_base_api_additional_responses(),
17
+ )
18
+
19
+
20
+ @project_router.post("/list", response_model=ListGpusResponse, response_model_exclude_none=True)
21
+ async def list_gpus(
22
+ body: ListGpusRequest,
23
+ session: AsyncSession = Depends(get_session),
24
+ user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
25
+ ) -> ListGpusResponse:
26
+ _, project = user_project
27
+ return await list_gpus_grouped(
28
+ session=session, project=project, run_spec=body.run_spec, group_by=body.group_by
29
+ )