dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (71)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/configurators.py +9 -0
  11. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  12. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  13. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  14. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  15. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  16. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  17. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  18. dstack/_internal/core/backends/models.py +8 -0
  19. dstack/_internal/core/backends/nebius/compute.py +8 -2
  20. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  21. dstack/_internal/core/backends/nebius/resources.py +9 -0
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/runs.py +21 -1
  29. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  30. dstack/_internal/server/app.py +4 -0
  31. dstack/_internal/server/background/__init__.py +4 -0
  32. dstack/_internal/server/background/tasks/process_instances.py +107 -56
  33. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  34. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
  35. dstack/_internal/server/background/tasks/process_runs.py +21 -14
  36. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  37. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  38. dstack/_internal/server/models.py +41 -0
  39. dstack/_internal/server/routers/instances.py +33 -5
  40. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  41. dstack/_internal/server/schemas/instances.py +32 -0
  42. dstack/_internal/server/schemas/runner.py +5 -0
  43. dstack/_internal/server/services/instances.py +103 -1
  44. dstack/_internal/server/services/jobs/__init__.py +8 -1
  45. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  46. dstack/_internal/server/services/logging.py +4 -2
  47. dstack/_internal/server/services/logs/aws.py +13 -1
  48. dstack/_internal/server/services/logs/gcp.py +16 -1
  49. dstack/_internal/server/services/probes.py +6 -0
  50. dstack/_internal/server/services/projects.py +16 -4
  51. dstack/_internal/server/services/runner/client.py +52 -20
  52. dstack/_internal/server/services/runner/ssh.py +4 -4
  53. dstack/_internal/server/services/runs.py +49 -13
  54. dstack/_internal/server/services/ssh.py +66 -0
  55. dstack/_internal/server/settings.py +13 -0
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  58. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  59. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  60. dstack/_internal/server/testing/common.py +44 -0
  61. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  62. dstack/_internal/settings.py +3 -0
  63. dstack/_internal/utils/common.py +15 -0
  64. dstack/api/server/__init__.py +1 -1
  65. dstack/version.py +1 -1
  66. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
  67. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
  68. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  69. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
  70. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
  71. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0
@@ -35,6 +35,7 @@ from dstack._internal.core.models.fleets import (
35
35
  SSHParams,
36
36
  )
37
37
  from dstack._internal.core.models.gateways import GatewayComputeConfiguration, GatewayStatus
38
+ from dstack._internal.core.models.health import HealthStatus
38
39
  from dstack._internal.core.models.instances import (
39
40
  Disk,
40
41
  Gpu,
@@ -85,11 +86,13 @@ from dstack._internal.server.models import (
85
86
  FleetModel,
86
87
  GatewayComputeModel,
87
88
  GatewayModel,
89
+ InstanceHealthCheckModel,
88
90
  InstanceModel,
89
91
  JobMetricsPoint,
90
92
  JobModel,
91
93
  JobPrometheusMetrics,
92
94
  PlacementGroupModel,
95
+ ProbeModel,
93
96
  ProjectModel,
94
97
  RepoCredsModel,
95
98
  RepoModel,
@@ -368,6 +371,7 @@ async def create_job(
368
371
  instance_assigned=instance_assigned,
369
372
  used_instance_id=instance.id if instance is not None else None,
370
373
  disconnected_at=disconnected_at,
374
+ probes=[],
371
375
  )
372
376
  session.add(job)
373
377
  await session.commit()
@@ -439,6 +443,26 @@ def get_job_runtime_data(
439
443
  )
440
444
 
441
445
 
446
+ async def create_probe(
447
+ session: AsyncSession,
448
+ job: JobModel,
449
+ probe_num: int = 0,
450
+ due: datetime = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc),
451
+ success_streak: int = 0,
452
+ ) -> ProbeModel:
453
+ probe = ProbeModel(
454
+ name=f"{job.job_name}-{probe_num}",
455
+ job=job,
456
+ probe_num=probe_num,
457
+ due=due,
458
+ success_streak=success_streak,
459
+ active=True,
460
+ )
461
+ session.add(probe)
462
+ await session.commit()
463
+ return probe
464
+
465
+
442
466
  async def create_gateway(
443
467
  session: AsyncSession,
444
468
  project_id: UUID,
@@ -592,6 +616,7 @@ async def create_instance(
592
616
  fleet: Optional[FleetModel] = None,
593
617
  status: InstanceStatus = InstanceStatus.IDLE,
594
618
  unreachable: bool = False,
619
+ health_status: HealthStatus = HealthStatus.HEALTHY,
595
620
  created_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc),
596
621
  finished_at: Optional[datetime] = None,
597
622
  spot: bool = False,
@@ -654,6 +679,7 @@ async def create_instance(
654
679
  status=status,
655
680
  last_processed_at=last_processed_at,
656
681
  unreachable=unreachable,
682
+ health=health_status,
657
683
  created_at=created_at,
658
684
  started_at=created_at,
659
685
  finished_at=finished_at,
@@ -774,6 +800,24 @@ def get_ssh_key() -> SSHKey:
774
800
  )
775
801
 
776
802
 
803
+ async def create_instance_health_check(
804
+ session: AsyncSession,
805
+ instance: InstanceModel,
806
+ collected_at: datetime = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc),
807
+ status: HealthStatus = HealthStatus.HEALTHY,
808
+ response: str = "{}",
809
+ ) -> InstanceHealthCheckModel:
810
+ health_check = InstanceHealthCheckModel(
811
+ instance_id=instance.id,
812
+ collected_at=collected_at,
813
+ status=status,
814
+ response=response,
815
+ )
816
+ session.add(health_check)
817
+ await session.commit()
818
+ return health_check
819
+
820
+
777
821
  async def create_volume(
778
822
  session: AsyncSession,
779
823
  project: ProjectModel,
@@ -224,23 +224,28 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
224
224
  retries = 20
225
225
  iter_delay = 3
226
226
  for _ in range(retries):
227
- try:
228
- _, stdout, stderr = client.exec_command(
229
- f"curl -s http://localhost:{DSTACK_SHIM_HTTP_PORT}/api/healthcheck", timeout=15
230
- )
231
- out = stdout.read().strip().decode()
232
- err = stderr.read().strip().decode()
233
- if err:
234
- raise ProvisioningError(
235
- f"The command 'get_shim_healthcheck' didn't work. stdout: {out}, stderr: {err}"
236
- )
237
- if not out:
238
- logger.debug("healthcheck is empty. retry")
239
- time.sleep(iter_delay)
240
- continue
241
- return out
242
- except (paramiko.SSHException, OSError) as e:
243
- raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e
227
+ healthcheck = _get_shim_healthcheck(client)
228
+ if healthcheck is not None:
229
+ return healthcheck
230
+ logger.debug("healthcheck is empty. retry")
231
+ time.sleep(iter_delay)
232
+ raise ProvisioningError("Cannot get HealthcheckResponse")
233
+
234
+
235
+ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]:
236
+ try:
237
+ _, stdout, stderr = client.exec_command(
238
+ f"curl -s http://localhost:{DSTACK_SHIM_HTTP_PORT}/api/healthcheck", timeout=15
239
+ )
240
+ out = stdout.read().strip().decode()
241
+ err = stderr.read().strip().decode()
242
+ except (paramiko.SSHException, OSError) as e:
243
+ raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e
244
+ if err:
245
+ raise ProvisioningError(f"get_shim_healthcheck didn't work. stdout: {out}, stderr: {err}")
246
+ if not out:
247
+ return None
248
+ return out
244
249
 
245
250
 
246
251
  def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
@@ -19,6 +19,9 @@ DSTACK_BASE_IMAGE_UBUNTU_VERSION = os.getenv(
19
19
  )
20
20
  DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind")
21
21
 
22
+ CLI_LOG_LEVEL = os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper()
23
+ CLI_FILE_LOG_LEVEL = os.getenv("DSTACK_CLI_FILE_LOG_LEVEL", "DEBUG").upper()
24
+
22
25
  # Development settings
23
26
 
24
27
  LOCAL_BACKEND_ENABLED = os.getenv("DSTACK_LOCAL_BACKEND_ENABLED") is not None
@@ -222,6 +222,21 @@ def remove_prefix(text: str, prefix: str) -> str:
222
222
  return text
223
223
 
224
224
 
225
+ def has_duplicates(iterable: Iterable[Any]) -> bool:
226
+ """
227
+ Checks if there are any duplicate items in the given iterable.
228
+
229
+ O(n^2) implementation, but works with iterables with unhashable items.
230
+ For iterables with hashable items, prefer len(set(iterable)) != len(iterable).
231
+ """
232
+ seen = []
233
+ for item in iterable:
234
+ if item in seen:
235
+ return True
236
+ seen.append(item)
237
+ return False
238
+
239
+
225
240
  T = TypeVar("T")
226
241
 
227
242
 
@@ -173,7 +173,7 @@ class APIClient:
173
173
  raise ClientError(
174
174
  f"Unexpected error: status code {resp.status_code}"
175
175
  f" when requesting {resp.request.url}."
176
- " Check server logs or run with DSTACK_CLI_LOG_LEVEL=DEBUG to see more details"
176
+ " Check the server logs for backend issues, and the CLI logs at (~/.dstack/logs/cli/latest.log) local CLI output"
177
177
  )
178
178
  return resp
179
179
 
dstack/version.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.19.21"
1
+ __version__ = "0.19.23rc1"
2
2
  __is_release__ = True
3
3
  base_image = "0.10"
4
4
  base_image_ubuntu_version = "22.04"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dstack
3
- Version: 0.19.21
3
+ Version: 0.19.23rc1
4
4
  Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
5
5
  Project-URL: Homepage, https://dstack.ai
6
6
  Project-URL: Source, https://github.com/dstackai/dstack
@@ -22,7 +22,7 @@ Requires-Dist: cryptography
22
22
  Requires-Dist: cursor
23
23
  Requires-Dist: filelock
24
24
  Requires-Dist: gitpython
25
- Requires-Dist: gpuhunt==0.1.6
25
+ Requires-Dist: gpuhunt==0.1.7
26
26
  Requires-Dist: ignore-python>=0.2.0
27
27
  Requires-Dist: jsonschema
28
28
  Requires-Dist: orjson
@@ -73,13 +73,13 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
73
73
  Requires-Dist: httpx; extra == 'all'
74
74
  Requires-Dist: jinja2; extra == 'all'
75
75
  Requires-Dist: kubernetes; extra == 'all'
76
- Requires-Dist: nebius<0.3,>=0.2.19; (python_version >= '3.10') and extra == 'all'
76
+ Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'all'
77
77
  Requires-Dist: oci>=2.150.0; extra == 'all'
78
78
  Requires-Dist: prometheus-client; extra == 'all'
79
79
  Requires-Dist: pyopenssl>=23.2.0; extra == 'all'
80
80
  Requires-Dist: python-dxf==12.1.0; extra == 'all'
81
81
  Requires-Dist: python-json-logger>=3.1.0; extra == 'all'
82
- Requires-Dist: sentry-sdk[fastapi]; extra == 'all'
82
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'all'
83
83
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'all'
84
84
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'all'
85
85
  Requires-Dist: starlette>=0.26.0; extra == 'all'
@@ -104,7 +104,7 @@ Requires-Dist: jinja2; extra == 'aws'
104
104
  Requires-Dist: prometheus-client; extra == 'aws'
105
105
  Requires-Dist: python-dxf==12.1.0; extra == 'aws'
106
106
  Requires-Dist: python-json-logger>=3.1.0; extra == 'aws'
107
- Requires-Dist: sentry-sdk[fastapi]; extra == 'aws'
107
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'aws'
108
108
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'aws'
109
109
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'aws'
110
110
  Requires-Dist: starlette>=0.26.0; extra == 'aws'
@@ -133,7 +133,7 @@ Requires-Dist: jinja2; extra == 'azure'
133
133
  Requires-Dist: prometheus-client; extra == 'azure'
134
134
  Requires-Dist: python-dxf==12.1.0; extra == 'azure'
135
135
  Requires-Dist: python-json-logger>=3.1.0; extra == 'azure'
136
- Requires-Dist: sentry-sdk[fastapi]; extra == 'azure'
136
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'azure'
137
137
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'azure'
138
138
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'azure'
139
139
  Requires-Dist: starlette>=0.26.0; extra == 'azure'
@@ -156,7 +156,7 @@ Requires-Dist: jinja2; extra == 'datacrunch'
156
156
  Requires-Dist: prometheus-client; extra == 'datacrunch'
157
157
  Requires-Dist: python-dxf==12.1.0; extra == 'datacrunch'
158
158
  Requires-Dist: python-json-logger>=3.1.0; extra == 'datacrunch'
159
- Requires-Dist: sentry-sdk[fastapi]; extra == 'datacrunch'
159
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'datacrunch'
160
160
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'datacrunch'
161
161
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'datacrunch'
162
162
  Requires-Dist: starlette>=0.26.0; extra == 'datacrunch'
@@ -193,7 +193,7 @@ Requires-Dist: jinja2; extra == 'gcp'
193
193
  Requires-Dist: prometheus-client; extra == 'gcp'
194
194
  Requires-Dist: python-dxf==12.1.0; extra == 'gcp'
195
195
  Requires-Dist: python-json-logger>=3.1.0; extra == 'gcp'
196
- Requires-Dist: sentry-sdk[fastapi]; extra == 'gcp'
196
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'gcp'
197
197
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'gcp'
198
198
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'gcp'
199
199
  Requires-Dist: starlette>=0.26.0; extra == 'gcp'
@@ -216,7 +216,7 @@ Requires-Dist: kubernetes; extra == 'kubernetes'
216
216
  Requires-Dist: prometheus-client; extra == 'kubernetes'
217
217
  Requires-Dist: python-dxf==12.1.0; extra == 'kubernetes'
218
218
  Requires-Dist: python-json-logger>=3.1.0; extra == 'kubernetes'
219
- Requires-Dist: sentry-sdk[fastapi]; extra == 'kubernetes'
219
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'kubernetes'
220
220
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'kubernetes'
221
221
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'kubernetes'
222
222
  Requires-Dist: starlette>=0.26.0; extra == 'kubernetes'
@@ -240,7 +240,7 @@ Requires-Dist: jinja2; extra == 'lambda'
240
240
  Requires-Dist: prometheus-client; extra == 'lambda'
241
241
  Requires-Dist: python-dxf==12.1.0; extra == 'lambda'
242
242
  Requires-Dist: python-json-logger>=3.1.0; extra == 'lambda'
243
- Requires-Dist: sentry-sdk[fastapi]; extra == 'lambda'
243
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'lambda'
244
244
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'lambda'
245
245
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'lambda'
246
246
  Requires-Dist: starlette>=0.26.0; extra == 'lambda'
@@ -259,11 +259,11 @@ Requires-Dist: fastapi; extra == 'nebius'
259
259
  Requires-Dist: grpcio>=1.50; extra == 'nebius'
260
260
  Requires-Dist: httpx; extra == 'nebius'
261
261
  Requires-Dist: jinja2; extra == 'nebius'
262
- Requires-Dist: nebius<0.3,>=0.2.19; (python_version >= '3.10') and extra == 'nebius'
262
+ Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
263
263
  Requires-Dist: prometheus-client; extra == 'nebius'
264
264
  Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
265
265
  Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'
266
- Requires-Dist: sentry-sdk[fastapi]; extra == 'nebius'
266
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'nebius'
267
267
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'nebius'
268
268
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'nebius'
269
269
  Requires-Dist: starlette>=0.26.0; extra == 'nebius'
@@ -288,7 +288,7 @@ Requires-Dist: prometheus-client; extra == 'oci'
288
288
  Requires-Dist: pyopenssl>=23.2.0; extra == 'oci'
289
289
  Requires-Dist: python-dxf==12.1.0; extra == 'oci'
290
290
  Requires-Dist: python-json-logger>=3.1.0; extra == 'oci'
291
- Requires-Dist: sentry-sdk[fastapi]; extra == 'oci'
291
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'oci'
292
292
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'oci'
293
293
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'oci'
294
294
  Requires-Dist: starlette>=0.26.0; extra == 'oci'
@@ -310,7 +310,7 @@ Requires-Dist: jinja2; extra == 'server'
310
310
  Requires-Dist: prometheus-client; extra == 'server'
311
311
  Requires-Dist: python-dxf==12.1.0; extra == 'server'
312
312
  Requires-Dist: python-json-logger>=3.1.0; extra == 'server'
313
- Requires-Dist: sentry-sdk[fastapi]; extra == 'server'
313
+ Requires-Dist: sentry-sdk[fastapi]>=2.27.0; extra == 'server'
314
314
  Requires-Dist: sqlalchemy-utils>=0.40.0; extra == 'server'
315
315
  Requires-Dist: sqlalchemy[asyncio]>=2.0.0; extra == 'server'
316
316
  Requires-Dist: starlette>=0.26.0; extra == 'server'