dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (115)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +11 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/run.py +11 -0
  17. dstack/_internal/core/backends/aws/compute.py +23 -10
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +15 -9
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +26 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +32 -7
  33. dstack/_internal/core/backends/runpod/config.py +8 -0
  34. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  35. dstack/_internal/core/backends/vastai/compute.py +12 -2
  36. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  37. dstack/_internal/core/backends/vultr/compute.py +9 -3
  38. dstack/_internal/core/models/backends/aws.py +2 -0
  39. dstack/_internal/core/models/backends/base.py +1 -0
  40. dstack/_internal/core/models/backends/runpod.py +2 -0
  41. dstack/_internal/core/models/configurations.py +2 -2
  42. dstack/_internal/core/models/profiles.py +46 -1
  43. dstack/_internal/core/models/runs.py +4 -0
  44. dstack/_internal/core/services/__init__.py +5 -1
  45. dstack/_internal/core/services/configs/__init__.py +3 -0
  46. dstack/_internal/server/app.py +11 -1
  47. dstack/_internal/server/background/__init__.py +10 -0
  48. dstack/_internal/server/background/tasks/common.py +22 -0
  49. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  50. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
  51. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  52. dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
  53. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  54. dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
  55. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  56. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  57. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  58. dstack/_internal/server/models.py +11 -0
  59. dstack/_internal/server/routers/logs.py +3 -0
  60. dstack/_internal/server/routers/metrics.py +21 -2
  61. dstack/_internal/server/routers/prometheus.py +36 -0
  62. dstack/_internal/server/security/permissions.py +1 -1
  63. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  64. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  65. dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
  66. dstack/_internal/server/services/config.py +24 -4
  67. dstack/_internal/server/services/fleets.py +1 -0
  68. dstack/_internal/server/services/gateways/__init__.py +1 -0
  69. dstack/_internal/server/services/jobs/__init__.py +12 -9
  70. dstack/_internal/server/services/jobs/configurators/base.py +9 -1
  71. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  72. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  73. dstack/_internal/server/services/logs/__init__.py +78 -0
  74. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  75. dstack/_internal/server/services/logs/base.py +47 -0
  76. dstack/_internal/server/services/logs/filelog.py +110 -0
  77. dstack/_internal/server/services/logs/gcp.py +165 -0
  78. dstack/_internal/server/services/metrics.py +103 -70
  79. dstack/_internal/server/services/pools.py +16 -17
  80. dstack/_internal/server/services/prometheus.py +87 -0
  81. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  82. dstack/_internal/server/services/runner/client.py +14 -3
  83. dstack/_internal/server/services/runs.py +43 -15
  84. dstack/_internal/server/services/volumes.py +1 -0
  85. dstack/_internal/server/settings.py +6 -0
  86. dstack/_internal/server/statics/index.html +1 -1
  87. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
  88. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
  89. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
  90. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  91. dstack/_internal/server/testing/common.py +50 -8
  92. dstack/api/_public/runs.py +4 -1
  93. dstack/api/server/_fleets.py +2 -0
  94. dstack/api/server/_runs.py +4 -0
  95. dstack/api/utils.py +3 -0
  96. dstack/version.py +2 -2
  97. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
  98. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
  99. tests/_internal/core/backends/base/__init__.py +0 -0
  100. tests/_internal/core/backends/base/test_compute.py +56 -0
  101. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
  102. tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
  103. tests/_internal/server/conftest.py +4 -5
  104. tests/_internal/server/routers/test_backends.py +1 -0
  105. tests/_internal/server/routers/test_fleets.py +2 -0
  106. tests/_internal/server/routers/test_logs.py +1 -1
  107. tests/_internal/server/routers/test_metrics.py +15 -0
  108. tests/_internal/server/routers/test_prometheus.py +244 -0
  109. tests/_internal/server/routers/test_runs.py +81 -58
  110. tests/_internal/server/services/test_logs.py +3 -3
  111. tests/_internal/server/services/test_metrics.py +163 -0
  112. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
  113. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
  114. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
  115. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ from typing import List, Optional
6
6
  import dstack.version as version
7
7
  from dstack._internal import settings
8
8
  from dstack._internal.core.backends.base import Compute
9
- from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data
9
+ from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
10
10
  from dstack._internal.core.backends.base.offers import get_catalog_offers
11
11
  from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
12
12
  from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
130
130
  ) -> JobProvisioningData:
131
131
  instance_config = InstanceConfiguration(
132
132
  project_name=run.project_name,
133
- instance_name=get_instance_name(run, job), # TODO: generate name
133
+ instance_name=get_job_instance_name(run, job), # TODO: generate name
134
134
  ssh_keys=[
135
135
  SSHKey(public=project_ssh_public_key.strip()),
136
136
  ],
@@ -4,7 +4,12 @@ from typing import List, Optional
4
4
 
5
5
  import oci
6
6
 
7
- from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data
7
+ from dstack._internal.core.backends.base.compute import (
8
+ Compute,
9
+ generate_unique_instance_name,
10
+ get_job_instance_name,
11
+ get_user_data,
12
+ )
8
13
  from dstack._internal.core.backends.base.offers import get_catalog_offers
9
14
  from dstack._internal.core.backends.oci import resources
10
15
  from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
98
103
  ) -> JobProvisioningData:
99
104
  instance_config = InstanceConfiguration(
100
105
  project_name=run.project_name,
101
- instance_name=get_instance_name(run, job),
106
+ instance_name=get_job_instance_name(run, job),
102
107
  ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
103
108
  user=run.user,
104
109
  )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
148
153
  ]
149
154
  cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
150
155
 
156
+ display_name = generate_unique_instance_name(instance_config)
151
157
  try:
152
158
  instance = resources.launch_instance(
153
159
  region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
155
161
  compartment_id=self.config.compartment_id,
156
162
  subnet_id=subnet.id,
157
163
  security_group_id=security_group.id,
158
- display_name=instance_config.instance_name,
164
+ display_name=display_name,
159
165
  cloud_init_user_data=cloud_init_user_data,
160
166
  shape=instance_offer.instance.name,
161
167
  is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
163
169
  image_id=package.image_id,
164
170
  )
165
171
  except oci.exceptions.ServiceError as e:
166
- if e.code in ("LimitExceeded", "QuotaExceeded"):
172
+ if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
167
173
  raise NoCapacityError(e.message)
168
174
  raise
169
175
 
@@ -5,8 +5,10 @@ from typing import List, Optional
5
5
 
6
6
  from dstack._internal.core.backends.base import Compute
7
7
  from dstack._internal.core.backends.base.compute import (
8
+ generate_unique_instance_name,
9
+ generate_unique_volume_name,
8
10
  get_docker_commands,
9
- get_instance_name,
11
+ get_job_instance_name,
10
12
  )
11
13
  from dstack._internal.core.backends.base.offers import get_catalog_offers
12
14
  from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
31
33
 
32
34
  logger = get_logger(__name__)
33
35
 
36
+ # Undocumented but names of len 60 work
37
+ MAX_RESOURCE_NAME_LEN = 60
38
+
34
39
  CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour
35
40
 
36
41
 
@@ -47,8 +52,9 @@ class RunpodCompute(Compute):
47
52
  ) -> List[InstanceOfferWithAvailability]:
48
53
  offers = get_catalog_offers(
49
54
  backend=BackendType.RUNPOD,
50
- locations=self.config.regions,
55
+ locations=self.config.regions or None,
51
56
  requirements=requirements,
57
+ extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
52
58
  )
53
59
  offers = [
54
60
  InstanceOfferWithAvailability(
@@ -69,7 +75,7 @@ class RunpodCompute(Compute):
69
75
  ) -> JobProvisioningData:
70
76
  instance_config = InstanceConfiguration(
71
77
  project_name=run.project_name,
72
- instance_name=get_instance_name(run, job),
78
+ instance_name=get_job_instance_name(run, job),
73
79
  ssh_keys=[
74
80
  SSHKey(public=run.run_spec.ssh_key_pub.strip()),
75
81
  SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +83,7 @@ class RunpodCompute(Compute):
77
83
  user=run.user,
78
84
  )
79
85
 
86
+ pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
80
87
  authorized_keys = instance_config.get_public_keys()
81
88
  memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
82
89
  disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -96,13 +103,22 @@ class RunpodCompute(Compute):
96
103
  bid_per_gpu = None
97
104
  if instance_offer.instance.resources.spot and gpu_count:
98
105
  bid_per_gpu = instance_offer.price / gpu_count
106
+ if _is_secure_cloud(instance_offer.region):
107
+ cloud_type = "SECURE"
108
+ data_center_id = instance_offer.region
109
+ country_code = None
110
+ else:
111
+ cloud_type = "COMMUNITY"
112
+ data_center_id = None
113
+ country_code = instance_offer.region
99
114
 
100
115
  resp = self.api_client.create_pod(
101
- name=instance_config.instance_name,
116
+ name=pod_name,
102
117
  image_name=job.job_spec.image_name,
103
118
  gpu_type_id=instance_offer.instance.name,
104
- cloud_type="SECURE", # ["ALL", "COMMUNITY", "SECURE"]:
105
- data_center_id=instance_offer.region,
119
+ cloud_type=cloud_type,
120
+ data_center_id=data_center_id,
121
+ country_code=country_code,
106
122
  gpu_count=gpu_count,
107
123
  container_disk_in_gb=disk_size,
108
124
  min_vcpu_count=instance_offer.instance.resources.cpus,
@@ -197,9 +213,10 @@ class RunpodCompute(Compute):
197
213
  )
198
214
 
199
215
  def create_volume(self, volume: Volume) -> VolumeProvisioningData:
216
+ volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
200
217
  size_gb = volume.configuration.size_gb
201
218
  volume_id = self.api_client.create_network_volume(
202
- name=volume.name,
219
+ name=volume_name,
203
220
  region=volume.configuration.region,
204
221
  size=size_gb,
205
222
  )
@@ -250,3 +267,11 @@ def _get_volume_price(size: int) -> float:
250
267
  if size < 1000:
251
268
  return 0.07 * size
252
269
  return 0.05 * size
270
+
271
+
272
+ def _is_secure_cloud(region: str) -> str:
273
+ """
274
+ Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
275
+ Community cloud regions are country codes: CA, NL, etc.
276
+ """
277
+ return "-" in region
@@ -4,6 +4,14 @@ from dstack._internal.core.models.backends.runpod import (
4
4
  RunpodStoredConfig,
5
5
  )
6
6
 
7
+ RUNPOD_COMMUNITY_CLOUD_DEFAULT = True
8
+
7
9
 
8
10
  class RunpodConfig(RunpodStoredConfig, BackendConfig):
9
11
  creds: AnyRunpodCreds
12
+
13
+ @property
14
+ def allow_community_cloud(self) -> bool:
15
+ if self.community_cloud is not None:
16
+ return self.community_cloud
17
+ return RUNPOD_COMMUNITY_CLOUD_DEFAULT
@@ -4,7 +4,11 @@ from typing import List, Optional
4
4
  import requests
5
5
 
6
6
  from dstack._internal.core.backends.base import Compute
7
- from dstack._internal.core.backends.base.compute import get_instance_name, get_shim_commands
7
+ from dstack._internal.core.backends.base.compute import (
8
+ generate_unique_instance_name,
9
+ get_job_instance_name,
10
+ get_shim_commands,
11
+ )
8
12
  from dstack._internal.core.backends.base.offers import get_catalog_offers
9
13
  from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
10
14
  from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
23
27
  logger = get_logger(__name__)
24
28
 
25
29
 
30
+ # Undocumented but names of len 60 work
31
+ MAX_INSTANCE_NAME_LEN = 60
32
+
33
+
26
34
  class TensorDockCompute(Compute):
27
35
  def __init__(self, config: TensorDockConfig):
28
36
  super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
49
57
  instance_offer: InstanceOfferWithAvailability,
50
58
  instance_config: InstanceConfiguration,
51
59
  ) -> JobProvisioningData:
60
+ instance_name = generate_unique_instance_name(
61
+ instance_config, max_length=MAX_INSTANCE_NAME_LEN
62
+ )
52
63
  commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
53
64
  try:
54
65
  resp = self.api_client.deploy_single(
55
- instance_name=instance_config.instance_name,
66
+ instance_name=instance_name,
56
67
  instance=instance_offer.instance,
57
68
  cloudinit={
58
69
  "ssh_pwauth": False, # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
113
124
  ) -> JobProvisioningData:
114
125
  instance_config = InstanceConfiguration(
115
126
  project_name=run.project_name,
116
- instance_name=get_instance_name(run, job), # TODO: generate name
127
+ instance_name=get_job_instance_name(run, job), # TODO: generate name
117
128
  ssh_keys=[
118
129
  SSHKey(public=run.run_spec.ssh_key_pub.strip()),
119
130
  SSHKey(public=project_ssh_public_key.strip()),
@@ -4,7 +4,10 @@ import gpuhunt
4
4
  from gpuhunt.providers.vastai import VastAIProvider
5
5
 
6
6
  from dstack._internal.core.backends.base import Compute
7
- from dstack._internal.core.backends.base.compute import get_docker_commands, get_instance_name
7
+ from dstack._internal.core.backends.base.compute import (
8
+ generate_unique_instance_name_for_job,
9
+ get_docker_commands,
10
+ )
8
11
  from dstack._internal.core.backends.base.offers import get_catalog_offers
9
12
  from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
10
13
  from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
23
26
  logger = get_logger(__name__)
24
27
 
25
28
 
29
+ # Undocumented but names of len 60 work
30
+ MAX_INSTANCE_NAME_LEN = 60
31
+
32
+
26
33
  class VastAICompute(Compute):
27
34
  def __init__(self, config: VastAIConfig):
28
35
  super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
70
77
  project_ssh_private_key: str,
71
78
  volumes: List[Volume],
72
79
  ) -> JobProvisioningData:
80
+ instance_name = generate_unique_instance_name_for_job(
81
+ run, job, max_length=MAX_INSTANCE_NAME_LEN
82
+ )
73
83
  commands = get_docker_commands(
74
84
  [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
75
85
  )
76
86
  resp = self.api_client.create_instance(
77
- instance_name=get_instance_name(run, job),
87
+ instance_name=instance_name,
78
88
  bundle_id=instance_offer.instance.name,
79
89
  image_name=job.job_spec.image_name,
80
90
  onstart=" && ".join(commands),
@@ -20,7 +20,7 @@ class VultrApiClient:
20
20
  return False
21
21
  return True
22
22
 
23
- def get_instance(self, instance_id: str, plan_type: str):
23
+ def get_instance(self, instance_id: str, plan_type: str) -> dict:
24
24
  if plan_type == "bare-metal":
25
25
  response = self._make_request("GET", f"/bare-metals/{instance_id}")
26
26
  return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
28
28
  response = self._make_request("GET", f"/instances/{instance_id}")
29
29
  return response.json()["instance"]
30
30
 
31
- def get_vpc_for_region(self, region: str) -> Optional[str]:
31
+ def get_vpc_for_region(self, region: str) -> Optional[dict]:
32
32
  response = self._make_request("GET", "/vpcs?per_page=500")
33
33
  vpcs = response.json().get("vpcs", [])
34
34
  if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
37
37
  return vpc
38
38
  return None
39
39
 
40
- def create_vpc(self, region: str):
40
+ def create_vpc(self, region: str) -> dict:
41
41
  data = {"region": region, "description": f"dstack-vpc-{region}"}
42
42
  response = self._make_request("POST", "/vpcs", data=data)
43
43
  return response.json()["vpc"]
@@ -6,7 +6,8 @@ import requests
6
6
 
7
7
  from dstack._internal.core.backends.base import Compute
8
8
  from dstack._internal.core.backends.base.compute import (
9
- get_instance_name,
9
+ generate_unique_instance_name,
10
+ get_job_instance_name,
10
11
  get_user_data,
11
12
  )
12
13
  from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
27
28
 
28
29
  logger = get_logger(__name__)
29
30
 
31
+ MAX_INSTANCE_NAME_LEN = 64
32
+
30
33
 
31
34
  class VultrCompute(Compute):
32
35
  def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
62
65
  ) -> JobProvisioningData:
63
66
  instance_config = InstanceConfiguration(
64
67
  project_name=run.project_name,
65
- instance_name=get_instance_name(run, job),
68
+ instance_name=get_job_instance_name(run, job),
66
69
  ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
67
70
  user=run.user,
68
71
  )
@@ -71,6 +74,9 @@ class VultrCompute(Compute):
71
74
  def create_instance(
72
75
  self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
73
76
  ) -> JobProvisioningData:
77
+ instance_name = generate_unique_instance_name(
78
+ instance_config, max_length=MAX_INSTANCE_NAME_LEN
79
+ )
74
80
  # create vpc
75
81
  vpc = self.api_client.get_vpc_for_region(instance_offer.region)
76
82
  if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
85
91
  ]
86
92
  instance_id = self.api_client.launch_instance(
87
93
  region=instance_offer.region,
88
- label=instance_config.instance_name,
94
+ label=instance_name,
89
95
  plan=instance_offer.instance.name,
90
96
  user_data=get_user_data(
91
97
  authorized_keys=instance_config.get_public_keys(),
@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
32
32
  vpc_ids: Optional[Dict[str, str]] = None
33
33
  default_vpcs: Optional[bool] = None
34
34
  public_ips: Optional[bool] = None
35
+ iam_instance_profile: Optional[str] = None
35
36
  tags: Optional[Dict[str, str]] = None
36
37
  os_images: Optional[AWSOSImageConfig] = None
37
38
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
70
71
  vpc_ids: Optional[Dict[str, str]]
71
72
  default_vpcs: Optional[bool]
72
73
  public_ips: Optional[bool]
74
+ iam_instance_profile: Optional[str]
73
75
  tags: Optional[Dict[str, str]]
74
76
  os_images: Optional["AWSOSImageConfig"]
75
77
 
@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
15
15
  DATACRUNCH (BackendType): DataCrunch
16
16
  KUBERNETES (BackendType): Kubernetes
17
17
  LAMBDA (BackendType): Lambda Cloud
18
+ OCI (BackendType): Oracle Cloud Infrastructure
18
19
  RUNPOD (BackendType): Runpod Cloud
19
20
  TENSORDOCK (BackendType): TensorDock Marketplace
20
21
  VASTAI (BackendType): Vast.ai Marketplace
@@ -10,6 +10,7 @@ from dstack._internal.core.models.common import CoreModel
10
10
  class RunpodConfigInfo(CoreModel):
11
11
  type: Literal["runpod"] = "runpod"
12
12
  regions: Optional[List[str]] = None
13
+ community_cloud: Optional[bool] = None
13
14
 
14
15
 
15
16
  class RunpodStoredConfig(RunpodConfigInfo):
@@ -33,6 +34,7 @@ class RunpodConfigInfoWithCredsPartial(CoreModel):
33
34
  type: Literal["runpod"] = "runpod"
34
35
  creds: Optional[AnyRunpodCreds]
35
36
  regions: Optional[List[str]]
37
+ community_cloud: Optional[bool]
36
38
 
37
39
 
38
40
  class RunpodConfigValues(CoreModel):
@@ -31,7 +31,6 @@ class RunConfigurationType(str, Enum):
31
31
 
32
32
 
33
33
  class PythonVersion(str, Enum):
34
- PY38 = "3.8" # TODO(0.19 or earlier): drop 3.8, stop building Docker images with 3.8
35
34
  PY39 = "3.9"
36
35
  PY310 = "3.10"
37
36
  PY311 = "3.11"
@@ -222,7 +221,8 @@ class DevEnvironmentConfigurationParams(CoreModel):
222
221
  " Inactivity is defined as the absence of SSH connections to the"
223
222
  " dev environment, including VS Code connections, `ssh <run name>`"
224
223
  " shells, and attached `dstack apply` or `dstack attach` commands."
225
- " Use `off` for unlimited duration. Defaults to `off`"
224
+ " Use `off` for unlimited duration. Can be updated in-place."
225
+ " Defaults to `off`"
226
226
  )
227
227
  ),
228
228
  ]
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import List, Optional, Union
2
+ from typing import List, Optional, Union, overload
3
3
 
4
4
  from pydantic import Field, root_validator, validator
5
5
  from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
34
34
  DESTROY_AFTER_IDLE = "destroy-after-idle"
35
35
 
36
36
 
37
+ @overload
38
+ def parse_duration(v: None) -> None: ...
39
+
40
+
41
+ @overload
42
+ def parse_duration(v: Union[int, str]) -> int: ...
43
+
44
+
37
45
  def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
38
46
  if v is None:
39
47
  return None
@@ -112,6 +120,39 @@ class ProfileRetry(CoreModel):
112
120
  return values
113
121
 
114
122
 
123
class UtilizationPolicy(CoreModel):
    """Policy for terminating a run whose GPU utilization stays too low.

    NOTE(review): the pre-validator normalizes `time_window` via
    `parse_duration`, so stored values are ints (presumably seconds —
    confirm against `parse_duration`), although the declared field type
    is `Union[int, str]`.
    """

    # Lower bound for time_window; also interpolated into the field
    # description below. Leading underscore keeps it out of the model fields.
    _min_time_window = "5m"

    min_gpu_utilization: Annotated[
        int,
        Field(
            description=(
                "Minimum required GPU utilization, percent."
                " If any GPU has utilization below specified value during the whole time window,"
                " the run is terminated"
            ),
            ge=0,
            le=100,
        ),
    ]
    time_window: Annotated[
        Union[int, str],
        Field(
            description=(
                "The time window of metric samples taking into account to measure utilization"
                f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
            )
        ),
    ]

    @validator("time_window", pre=True)
    def validate_time_window(cls, v: Union[int, str]) -> int:
        # Normalize duration strings like "30m" to an int before comparison.
        v = parse_duration(v)
        if v < parse_duration(cls._min_time_window):
            raise ValueError(f"Minimum time_window is {cls._min_time_window}")
        return v
154
+
155
+
115
156
  class ProfileParams(CoreModel):
116
157
  backends: Annotated[
117
158
  Optional[List[BackendType]],
@@ -194,6 +235,10 @@ class ProfileParams(CoreModel):
194
235
  )
195
236
  ),
196
237
  ]
238
+ utilization_policy: Annotated[
239
+ Optional[UtilizationPolicy],
240
+ Field(description="Run termination policy based on utilization"),
241
+ ]
197
242
  # Deprecated:
198
243
  termination_policy: Annotated[
199
244
  Optional[TerminationPolicy],
@@ -23,6 +23,7 @@ from dstack._internal.core.models.profiles import (
23
23
  ProfileRetryPolicy,
24
24
  RetryEvent,
25
25
  SpotPolicy,
26
+ UtilizationPolicy,
26
27
  )
27
28
  from dstack._internal.core.models.repos import AnyRunRepoData
28
29
  from dstack._internal.core.models.resources import Memory, ResourcesSpec
@@ -114,6 +115,7 @@ class JobTerminationReason(str, Enum):
114
115
  ABORTED_BY_USER = "aborted_by_user"
115
116
  TERMINATED_BY_SERVER = "terminated_by_server"
116
117
  INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
118
+ TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
117
119
  # Set by the runner
118
120
  CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
119
121
  PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -135,6 +137,7 @@ class JobTerminationReason(str, Enum):
135
137
  self.ABORTED_BY_USER: JobStatus.ABORTED,
136
138
  self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
137
139
  self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
140
+ self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
138
141
  self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
139
142
  self.PORTS_BINDING_FAILED: JobStatus.FAILED,
140
143
  self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
190
193
  single_branch: Optional[bool] = None
191
194
  max_duration: Optional[int]
192
195
  stop_duration: Optional[int] = None
196
+ utilization_policy: Optional[UtilizationPolicy] = None
193
197
  registry_auth: Optional[RegistryAuth]
194
198
  requirements: Requirements
195
199
  retry: Optional[Retry]
@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
4
4
 
5
5
 
6
6
def validate_dstack_resource_name(resource_name: str) -> None:
    """Raise ServerClientError when *resource_name* violates the dstack naming rule."""
    if is_valid_dstack_resource_name(resource_name):
        return
    raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
9
+
10
+
11
def is_valid_dstack_resource_name(resource_name: str) -> bool:
    """Return True iff *resource_name* is a valid dstack resource name.

    A valid name is a lowercase letter followed by 1-40 lowercase letters,
    digits, or hyphens. Uses re.fullmatch instead of re.match: with
    re.match, the trailing `$` also matches just before a final newline,
    so a name like "abc\n" would incorrectly pass validation.
    """
    return re.fullmatch(r"^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
@@ -65,6 +65,9 @@ class ConfigManager:
65
65
  if len(self.config.projects) == 1:
66
66
  self.config.projects[0].default = True
67
67
 
68
def list_projects(self):
    """Return the names of all configured projects, in config order."""
    return [p.name for p in self.config.projects]
70
+
68
71
  def delete_project(self, name: str):
69
72
  self.config.projects = [p for p in self.config.projects if p.name != name]
70
73
 
@@ -29,6 +29,7 @@ from dstack._internal.server.routers import (
29
29
  metrics,
30
30
  pools,
31
31
  projects,
32
+ prometheus,
32
33
  repos,
33
34
  runs,
34
35
  secrets,
@@ -185,6 +186,7 @@ def register_routes(app: FastAPI, ui: bool = True):
185
186
  app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
186
187
  app.include_router(pools.root_router)
187
188
  app.include_router(pools.router)
189
+ app.include_router(prometheus.router)
188
190
 
189
191
  @app.exception_handler(ForbiddenError)
190
192
  async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +254,11 @@ def register_routes(app: FastAPI, ui: bool = True):
252
254
 
253
255
  @app.exception_handler(404)
254
256
  async def custom_http_exception_handler(request, exc):
255
- if request.url.path.startswith("/api") or _is_proxy_request(request):
257
+ if (
258
+ request.url.path.startswith("/api")
259
+ or _is_proxy_request(request)
260
+ or _is_prometheus_request(request)
261
+ ):
256
262
  return JSONResponse(
257
263
  {"detail": exc.detail},
258
264
  status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +289,10 @@ def _is_proxy_request(request: Request) -> bool:
283
289
  ) and referrer.path.startswith("/proxy")
284
290
 
285
291
 
292
def _is_prometheus_request(request: Request) -> bool:
    """True when the request path targets the Prometheus scrape endpoint."""
    prefix = "/metrics"
    return request.url.path.startswith(prefix)
294
+
295
+
286
296
  def _print_dstack_logo():
287
297
  console.print(
288
298
  """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
@@ -1,6 +1,7 @@
1
1
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
2
2
  from apscheduler.triggers.interval import IntervalTrigger
3
3
 
4
+ from dstack._internal.server import settings
4
5
  from dstack._internal.server.background.tasks.process_fleets import process_fleets
5
6
  from dstack._internal.server.background.tasks.process_gateways import (
6
7
  process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
16
17
  from dstack._internal.server.background.tasks.process_placement_groups import (
17
18
  process_placement_groups,
18
19
  )
20
+ from dstack._internal.server.background.tasks.process_prometheus_metrics import (
21
+ collect_prometheus_metrics,
22
+ delete_prometheus_metrics,
23
+ )
19
24
  from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
20
25
  from dstack._internal.server.background.tasks.process_runs import process_runs
21
26
  from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
43
48
  # * 150 active instances with up to 2 minutes processing latency
44
49
  _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
45
50
  _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
51
+ if settings.ENABLE_PROMETHEUS_METRICS:
52
+ _scheduler.add_job(
53
+ collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
54
+ )
55
+ _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
46
56
  # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
47
57
  _scheduler.add_job(
48
58
  process_submitted_jobs,
@@ -0,0 +1,22 @@
1
+ from datetime import timedelta
2
+
3
+ from dstack._internal.core.models.backends.base import BackendType
4
+
5
+
6
def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
    """Max expected time between requesting instance creation and the
    instance becoming ready to accept jobs.

    This timeout is used in a few places. For container-based backends it
    also includes the image pulling time; bare-metal flavors (OCI ``BM.*``,
    Vultr ``vbm*``) get longer timeouts than regular VMs.
    """
    # Backends with a flat, instance-type-independent timeout.
    fixed_timeouts = {
        BackendType.LAMBDA: timedelta(minutes=30),
        BackendType.RUNPOD: timedelta(minutes=20),
        BackendType.KUBERNETES: timedelta(minutes=20),
    }
    fixed = fixed_timeouts.get(backend_type)
    if fixed is not None:
        return fixed
    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
        return timedelta(minutes=20)
    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
        return timedelta(minutes=55)
    return timedelta(minutes=10)