dstack 0.19.10__py3-none-any.whl → 0.19.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dstack/_internal/cli/commands/metrics.py +25 -10
- dstack/_internal/cli/commands/offer.py +2 -0
- dstack/_internal/cli/services/configurators/run.py +1 -1
- dstack/_internal/cli/utils/updates.py +13 -1
- dstack/_internal/core/backends/aws/compute.py +21 -9
- dstack/_internal/core/backends/azure/compute.py +7 -5
- dstack/_internal/core/backends/base/compute.py +9 -4
- dstack/_internal/core/backends/gcp/compute.py +43 -20
- dstack/_internal/core/backends/gcp/resources.py +18 -2
- dstack/_internal/core/backends/local/compute.py +4 -2
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/runs.py +2 -1
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -1
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +12 -6
- dstack/_internal/proxy/gateway/services/stats.py +17 -3
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -3
- dstack/_internal/server/routers/repos.py +9 -4
- dstack/_internal/server/services/fleets.py +2 -2
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/__init__.py +4 -4
- dstack/_internal/server/services/jobs/configurators/base.py +15 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -1
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -1
- dstack/_internal/server/services/plugins.py +64 -32
- dstack/_internal/server/services/runs.py +2 -2
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4803049eac16aea9a49.js → main-5b9786c955b42bf93581.js} +5 -5
- dstack/_internal/server/statics/{main-b4803049eac16aea9a49.js.map → main-5b9786c955b42bf93581.js.map} +1 -1
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +127 -0
- dstack/version.py +2 -2
- {dstack-0.19.10.dist-info → dstack-0.19.11.dist-info}/METADATA +1 -2
- {dstack-0.19.10.dist-info → dstack-0.19.11.dist-info}/RECORD +39 -35
- {dstack-0.19.10.dist-info → dstack-0.19.11.dist-info}/WHEEL +0 -0
- {dstack-0.19.10.dist-info → dstack-0.19.11.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.10.dist-info → dstack-0.19.11.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/cli/commands/metrics.py

@@ -39,8 +39,6 @@ class MetricsCommand(APIBaseCommand):
  run = self.api.runs.get(run_name=args.run_name)
  if run is None:
  raise CLIError(f"Run {args.run_name} not found")
- if run.status.is_finished():
- raise CLIError(f"Run {args.run_name} is finished")
  metrics = _get_run_jobs_metrics(api=self.api, run=run)

  if not args.watch:
@@ -55,8 +53,6 @@ class MetricsCommand(APIBaseCommand):
  run = self.api.runs.get(run_name=args.run_name)
  if run is None:
  raise CLIError(f"Run {args.run_name} not found")
- if run.status.is_finished():
- raise CLIError(f"Run {args.run_name} is finished")
  metrics = _get_run_jobs_metrics(api=self.api, run=run)
  except KeyboardInterrupt:
  pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
  def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
  table = Table(box=None)
  table.add_column("NAME", style="bold", no_wrap=True)
+ table.add_column("STATUS")
  table.add_column("CPU")
  table.add_column("MEMORY")
  table.add_column("GPU")

- run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
+ run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
  if len(run._run.jobs) != 1:
  add_row_from_dict(table, run_row)

@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
  cpu_usage = f"{cpu_usage:.0f}%"
  memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
  if memory_usage is not None:
- memory_usage =
+ memory_usage = _format_memory(memory_usage, 2)
  if resources is not None:
- memory_usage += f"/{resources.memory_mib}
+ memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}"
  gpu_metrics = ""
  gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
  if gpus_detected_num is not None:
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
  if gpu_memory_usage is not None:
  if i != 0:
  gpu_metrics += "\n"
- gpu_metrics += f"
+ gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}"
  if resources is not None:
- gpu_metrics +=
-
+ gpu_metrics += (
+ f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}"
+ )
+ gpu_metrics += f" util={gpu_util_percent}%"

  job_row: Dict[Union[str, int], Any] = {
  "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+ "STATUS": job.job_submissions[-1].status.value,
  "CPU": cpu_usage or "-",
  "MEMORY": memory_usage or "-",
  "GPU": gpu_metrics or "-",
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
  if metric.name == name:
  return metric.values[-1]
  return None
+
+
+ def _format_memory(memory_bytes: int, decimal_places: int) -> str:
+ """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples."""
+ memory_mb = memory_bytes / 1024 / 1024
+ if memory_mb >= 1024:
+ value = memory_mb / 1024
+ unit = "GB"
+ else:
+ value = memory_mb
+ unit = "MB"
+
+ if decimal_places == 0:
+ return f"{round(value)}{unit}"
+ return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit
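
The new _format_memory helper drives the MEMORY and GPU columns. The following standalone sketch mirrors the function added above (the local format_memory name and the sample byte counts are illustrative, not part of dstack) and shows the values the table would render:

def format_memory(memory_bytes: int, decimal_places: int) -> str:
    # Mirrors dstack's new _format_memory: MB below 1 GiB, GB above, trailing zeros trimmed.
    memory_mb = memory_bytes / 1024 / 1024
    if memory_mb >= 1024:
        value, unit = memory_mb / 1024, "GB"
    else:
        value, unit = memory_mb, "MB"
    if decimal_places == 0:
        return f"{round(value)}{unit}"
    return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit

print(format_memory(512 * 1024 * 1024, 2))   # 512MB
print(format_memory(1536 * 1024 * 1024, 2))  # 1.5GB
print(format_memory(8 * 1024**3, 0))         # 8GB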

dstack/_internal/cli/commands/offer.py

@@ -84,6 +84,8 @@ class OfferCommand(APIBaseCommand):
  job_plan = run_plan.job_plans[0]

  if args.format == "json":
+ # FIXME: Should use effective_run_spec from run_plan,
+ # since the spec can be changed by the server and plugins
  output = {
  "project": run_plan.project_name,
  "user": run_plan.user,

dstack/_internal/cli/services/configurators/run.py

@@ -105,7 +105,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
  changed_fields = []
  if run_plan.action == ApplyAction.UPDATE:
  diff = diff_models(
- run_plan.
+ run_plan.get_effective_run_spec().configuration,
  run_plan.current_resource.run_spec.configuration,
  )
  changed_fields = list(diff.keys())

dstack/_internal/cli/utils/updates.py

@@ -57,10 +57,22 @@ def _is_last_check_time_outdated() -> bool:
  )


+ def is_update_available(current_version: str, latest_version: str) -> bool:
+ """
+ Return True if latest_version is newer than current_version.
+ Pre-releases are only considered if the current version is also a pre-release.
+ """
+ _current_version = pkg_version.parse(str(current_version))
+ _latest_version = pkg_version.parse(str(latest_version))
+ return _current_version < _latest_version and (
+ not _latest_version.is_prerelease or _current_version.is_prerelease
+ )
+
+
  def _check_version():
  latest_version = get_latest_version()
  if latest_version is not None:
- if
+ if is_update_available(version.__version__, latest_version):
  console.print(f"A new version of dstack is available: [code]{latest_version}[/]\n")
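
A quick way to see the pre-release rule in action is to replicate the helper with the packaging library (assuming pkg_version in the hunk above is packaging.version, which provides parse() and is_prerelease):

from packaging import version as pkg_version

def is_update_available(current_version: str, latest_version: str) -> bool:
    current = pkg_version.parse(str(current_version))
    latest = pkg_version.parse(str(latest_version))
    # A newer pre-release only counts if the installed version is itself a pre-release.
    return current < latest and (not latest.is_prerelease or current.is_prerelease)

print(is_update_available("0.19.10", "0.19.11"))     # True
print(is_update_available("0.19.10", "0.19.11rc1"))  # False - stable installs ignore pre-releases
print(is_update_available("0.19.11rc1", "0.19.11"))  # True
print(is_update_available("0.19.11", "0.19.11"))     # False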

dstack/_internal/core/backends/aws/compute.py

@@ -611,9 +611,12 @@ class AWSCompute(
  raise e
  logger.debug("Deleted EBS volume %s", volume.configuration.name)

- def attach_volume(
+ def attach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData
+ ) -> VolumeAttachmentData:
  ec2_client = self.session.client("ec2", region_name=volume.configuration.region)

+ instance_id = provisioning_data.instance_id
  device_names = aws_resources.list_available_device_names(
  ec2_client=ec2_client, instance_id=instance_id
  )
@@ -646,9 +649,12 @@ class AWSCompute(
  logger.debug("Attached EBS volume %s to instance %s", volume.volume_id, instance_id)
  return VolumeAttachmentData(device_name=device_name)

- def detach_volume(
+ def detach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
+ ):
  ec2_client = self.session.client("ec2", region_name=volume.configuration.region)

+ instance_id = provisioning_data.instance_id
  logger.debug("Detaching EBS volume %s from instance %s", volume.volume_id, instance_id)
  attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id))
  try:
@@ -667,9 +673,10 @@ class AWSCompute(
  raise e
  logger.debug("Detached EBS volume %s from instance %s", volume.volume_id, instance_id)

- def is_volume_detached(self, volume: Volume,
+ def is_volume_detached(self, volume: Volume, provisioning_data: JobProvisioningData) -> bool:
  ec2_client = self.session.client("ec2", region_name=volume.configuration.region)

+ instance_id = provisioning_data.instance_id
  logger.debug("Getting EBS volume %s status", volume.volume_id)
  response = ec2_client.describe_volumes(VolumeIds=[volume.volume_id])
  volumes_infos = response.get("Volumes")
@@ -819,18 +826,23 @@ def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[st

  def _supported_instances(offer: InstanceOffer) -> bool:
  for family in [
+ "m7i.",
+ "c7i.",
+ "r7i.",
+ "t3.",
  "t2.small",
  "c5.",
  "m5.",
- "
- "
+ "p5.",
+ "p5e.",
+ "p4d.",
+ "p4de.",
+ "p3.",
  "g6.",
  "g6e.",
  "gr6.",
- "
- "
- "p4de.",
- "p5.",
+ "g5.",
+ "g4dn.",
  ]:
  if offer.instance.name.startswith(family):
  return True
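
The allow-list is matched purely by name prefix. A small hypothetical check (using only the family prefixes visible in the hunk above; a few removed entries are truncated in this rendering) illustrates the behaviour:

FAMILIES = [
    "m7i.", "c7i.", "r7i.", "t3.", "t2.small", "c5.", "m5.",
    "p5.", "p5e.", "p4d.", "p4de.", "p3.", "g6.", "g6e.", "gr6.", "g5.", "g4dn.",
]

def is_supported(instance_name: str) -> bool:
    # Same prefix check as _supported_instances above.
    return any(instance_name.startswith(family) for family in FAMILIES)

print(is_supported("g4dn.xlarge"))   # True
print(is_supported("p5e.48xlarge"))  # True
print(is_supported("t2.micro"))      # False - only t2.small is allowed from the t2 family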

dstack/_internal/core/backends/azure/compute.py

@@ -391,11 +391,8 @@ class VMImageVariant(enum.Enum):


  _SUPPORTED_VM_SERIES_PATTERNS = [
- #
- #
- r"D(\d+)s_v3", # Dsv3-series (general purpose)
- r"E(\d+)i?s_v4", # Esv4-series (memory optimized)
- r"E(\d+)-(\d+)s_v4", # Esv4-series (constrained vCPU)
+ r"D(\d+)s_v6", # Dsv6-series (general purpose)
+ r"E(\d+)i?s_v6", # Esv6-series (memory optimized)
  r"F(\d+)s_v2", # Fsv2-series (compute optimized)
  r"NC(\d+)s_v3", # NCv3-series [V100 16GB]
  r"NC(\d+)as_T4_v3", # NCasT4_v3-series [T4]
@@ -404,6 +401,11 @@ _SUPPORTED_VM_SERIES_PATTERNS = [
  r"NC(\d+)ads_A100_v4", # NC A100 v4-series [A100 80GB]
  r"ND(\d+)asr_v4", # ND A100 v4-series [8xA100 40GB]
  r"ND(\d+)amsr_A100_v4", # NDm A100 v4-series [8xA100 80GB]
+ # Deprecated series
+ # TODO: Remove after several releases
+ r"D(\d+)s_v3", # Dsv3-series (general purpose)
+ r"E(\d+)i?s_v4", # Esv4-series (memory optimized)
+ r"E(\d+)-(\d+)s_v4", # Esv4-series (constrained vCPU)
  ]
  _SUPPORTED_VM_SERIES_PATTERN = (
  "^Standard_(" + "|".join(f"({s})" for s in _SUPPORTED_VM_SERIES_PATTERNS) + ")$"
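
The series list is compiled into a single anchored regular expression, so both the new v6 patterns and the deprecated v3/v4 ones keep matching. A reduced sketch with a subset of the patterns shown above:

import re

patterns = [
    r"D(\d+)s_v6",        # new Dsv6-series
    r"E(\d+)i?s_v6",      # new Esv6-series
    r"D(\d+)s_v3",        # deprecated, still accepted
    r"E(\d+)i?s_v4",      # deprecated, still accepted
    r"E(\d+)-(\d+)s_v4",  # deprecated, still accepted
]
supported = re.compile("^Standard_(" + "|".join(f"({s})" for s in patterns) + ")$")

print(bool(supported.match("Standard_D8s_v6")))  # True
print(bool(supported.match("Standard_D8s_v3")))  # True
print(bool(supported.match("Standard_D8s_v5")))  # False - never in the list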

dstack/_internal/core/backends/base/compute.py

@@ -19,6 +19,7 @@ from dstack._internal.core.consts import (
  DSTACK_RUNNER_SSH_PORT,
  DSTACK_SHIM_HTTP_PORT,
  )
+ from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
  from dstack._internal.core.models.gateways import (
  GatewayComputeConfiguration,
  GatewayProvisioningData,
@@ -335,7 +336,9 @@ class ComputeWithVolumeSupport(ABC):
  """
  raise NotImplementedError()

- def attach_volume(
+ def attach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData
+ ) -> VolumeAttachmentData:
  """
  Attaches a volume to the instance.
  If the volume is not found, it should raise `ComputeError()`.
@@ -344,7 +347,9 @@ class ComputeWithVolumeSupport(ABC):
  """
  raise NotImplementedError()

- def detach_volume(
+ def detach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
+ ):
  """
  Detaches a volume from the instance.
  Implement only if compute may return `VolumeProvisioningData.detachable`.
@@ -352,7 +357,7 @@ class ComputeWithVolumeSupport(ABC):
  """
  raise NotImplementedError()

- def is_volume_detached(self, volume: Volume,
+ def is_volume_detached(self, volume: Volume, provisioning_data: JobProvisioningData) -> bool:
  """
  Checks if a volume was detached from the instance.
  If `detach_volume()` may fail to detach volume,
@@ -754,7 +759,7 @@ def get_docker_commands(
  f" --ssh-port {DSTACK_RUNNER_SSH_PORT}"
  " --temp-dir /tmp/runner"
  " --home-dir /root"
- " --working-dir
+ f" --working-dir {DEFAULT_REPO_DIR}"
  ),
  ]
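
For backend authors, the signature change means the volume methods now receive the whole JobProvisioningData instead of a bare instance id. A minimal sketch with stand-in dataclasses (the real Volume, JobProvisioningData, and VolumeAttachmentData models carry many more fields) shows the shape of an implementation:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Volume:                # stand-in for the dstack model
    volume_id: str

@dataclass
class JobProvisioningData:   # stand-in for the dstack model
    instance_id: str

@dataclass
class VolumeAttachmentData:  # stand-in for the dstack model
    device_name: Optional[str] = None

class ExampleCompute:
    def attach_volume(
        self, volume: Volume, provisioning_data: JobProvisioningData
    ) -> VolumeAttachmentData:
        # The instance id now comes from the provisioning data.
        print(f"attach {volume.volume_id} to {provisioning_data.instance_id}")
        return VolumeAttachmentData(device_name="/dev/sdf")

    def detach_volume(
        self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
    ):
        print(f"detach {volume.volume_id} (force={force})")

    def is_volume_detached(self, volume: Volume, provisioning_data: JobProvisioningData) -> bool:
        return True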

dstack/_internal/core/backends/gcp/compute.py

@@ -649,13 +649,24 @@ class GCPCompute(
  pass
  logger.debug("Deleted persistent disk for volume %s", volume.name)

- def attach_volume(
+ def attach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData
+ ) -> VolumeAttachmentData:
+ instance_id = provisioning_data.instance_id
  logger.debug(
  "Attaching persistent disk for volume %s to instance %s",
  volume.volume_id,
  instance_id,
  )
+ if not gcp_resources.instance_type_supports_persistent_disk(
+ provisioning_data.instance_type.name
+ ):
+ raise ComputeError(
+ f"Instance type {provisioning_data.instance_type.name} does not support Persistent disk volumes"
+ )
+
  zone = get_or_error(volume.provisioning_data).availability_zone
+ is_tpu = _is_tpu_provisioning_data(provisioning_data)
  try:
  disk = self.disk_client.get(
  project=self.config.project_id,
@@ -663,18 +674,16 @@ class GCPCompute(
  disk=volume.volume_id,
  )
  disk_url = disk.self_link
+ except google.api_core.exceptions.NotFound:
+ raise ComputeError("Persistent disk found")

-
-
- try:
+ try:
+ if is_tpu:
  get_node_request = tpu_v2.GetNodeRequest(
  name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}",
  )
  tpu_node = self.tpu_client.get_node(get_node_request)
- except google.api_core.exceptions.NotFound:
- tpu_node = None

- if tpu_node is not None:
  # Python API to attach a disk to a TPU is not documented,
  # so we follow the code from the gcloud CLI:
  # https://github.com/twistedpair/google-cloud-sdk/blob/26ab5a281d56b384cc25750f3279a27afe5b499f/google-cloud-sdk/lib/googlecloudsdk/command_lib/compute/tpus/tpu_vm/util.py#L113
@@ -711,7 +720,6 @@ class GCPCompute(
  attached_disk.auto_delete = False
  attached_disk.device_name = f"pd-{volume.volume_id}"
  device_name = attached_disk.device_name
-
  operation = self.instances_client.attach_disk(
  project=self.config.project_id,
  zone=zone,
@@ -720,13 +728,16 @@ class GCPCompute(
  )
  gcp_resources.wait_for_extended_operation(operation, "persistent disk attachment")
  except google.api_core.exceptions.NotFound:
- raise ComputeError("
+ raise ComputeError("Disk or instance not found")
  logger.debug(
  "Attached persistent disk for volume %s to instance %s", volume.volume_id, instance_id
  )
  return VolumeAttachmentData(device_name=device_name)

- def detach_volume(
+ def detach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
+ ):
+ instance_id = provisioning_data.instance_id
  logger.debug(
  "Detaching persistent disk for volume %s from instance %s",
  volume.volume_id,
@@ -734,17 +745,16 @@ class GCPCompute(
  )
  zone = get_or_error(volume.provisioning_data).availability_zone
  attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id))
- … (removed lines truncated in the source rendering)
+ is_tpu = _is_tpu_provisioning_data(provisioning_data)
+ if is_tpu:
+ try:
+ get_node_request = tpu_v2.GetNodeRequest(
+ name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}",
+ )
+ tpu_node = self.tpu_client.get_node(get_node_request)
+ except google.api_core.exceptions.NotFound:
+ raise ComputeError("Instance not found")

- if tpu_node is not None:
  source_disk = (
  f"projects/{self.config.project_id}/zones/{zone}/disks/{volume.volume_id}"
  )
@@ -815,6 +825,11 @@ def _supported_instances_and_zones(
  if _is_tpu(offer.instance.name) and not _is_single_host_tpu(offer.instance.name):
  return False
  for family in [
+ "m4-",
+ "c4-",
+ "n4-",
+ "h3-",
+ "n2-",
  "e2-medium",
  "e2-standard-",
  "e2-highmem-",
@@ -1001,3 +1016,11 @@ def _get_tpu_data_disk_for_volume(project_id: str, volume: Volume) -> tpu_v2.Att
  mode=tpu_v2.AttachedDisk.DiskMode.READ_WRITE,
  )
  return attached_disk
+
+
+ def _is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool:
+ is_tpu = False
+ if provisioning_data.backend_data:
+ backend_data_dict = json.loads(provisioning_data.backend_data)
+ is_tpu = backend_data_dict.get("is_tpu", False)
+ return is_tpu
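
The TPU branch is selected by a flag stored in the provisioning data's backend_data JSON. A standalone sketch of that check, with a stand-in JobProvisioningData holding only the backend_data field:

import json
from dataclasses import dataclass
from typing import Optional

@dataclass
class JobProvisioningData:  # stand-in; the real model has many more fields
    backend_data: Optional[str] = None

def is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool:
    # Same logic as _is_tpu_provisioning_data above.
    is_tpu = False
    if provisioning_data.backend_data:
        is_tpu = json.loads(provisioning_data.backend_data).get("is_tpu", False)
    return is_tpu

print(is_tpu_provisioning_data(JobProvisioningData()))                                 # False
print(is_tpu_provisioning_data(JobProvisioningData(backend_data='{"is_tpu": true}')))  # True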

dstack/_internal/core/backends/gcp/resources.py

@@ -140,7 +140,10 @@ def create_instance_struct(
  initialize_params = compute_v1.AttachedDiskInitializeParams()
  initialize_params.source_image = image_id
  initialize_params.disk_size_gb = disk_size
-
+ if instance_type_supports_persistent_disk(machine_type):
+ initialize_params.disk_type = f"zones/{zone}/diskTypes/pd-balanced"
+ else:
+ initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
  disk.initialize_params = initialize_params
  instance.disks = [disk]

@@ -421,7 +424,7 @@ def wait_for_extended_operation(

  if operation.error_code:
  # Write only debug logs here.
- # The unexpected errors will be propagated and logged
+ # The unexpected errors will be propagated and logged appropriately by the caller.
  logger.debug(
  "Error during %s: [Code: %s]: %s",
  verbose_name,
@@ -462,3 +465,16 @@ def get_placement_policy_resource_name(
  placement_policy: str,
  ) -> str:
  return f"projects/{project_id}/regions/{region}/resourcePolicies/{placement_policy}"
+
+
+ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
+ return not any(
+ instance_type_name.startswith(series)
+ for series in [
+ "m4-",
+ "c4-",
+ "n4-",
+ "h3-",
+ "v6e",
+ ]
+ )
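
instance_type_supports_persistent_disk gates both volume attachment and the boot-disk type chosen in create_instance_struct. The sketch below mirrors that helper and the disk-type branch (the zone-qualified prefix of the real disk-type URL is omitted here):

def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
    # Newer machine series (and v6e TPUs) only take Hyperdisk, not Persistent Disk.
    return not any(
        instance_type_name.startswith(series)
        for series in ["m4-", "c4-", "n4-", "h3-", "v6e"]
    )

for machine_type in ["n2-standard-8", "c4-standard-8", "v6e-1"]:
    disk_type = (
        "pd-balanced"
        if instance_type_supports_persistent_disk(machine_type)
        else "hyperdisk-balanced"
    )
    print(machine_type, "->", disk_type)
# n2-standard-8 -> pd-balanced
# c4-standard-8 -> hyperdisk-balanced
# v6e-1 -> hyperdisk-balanced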

dstack/_internal/core/backends/local/compute.py

@@ -110,8 +110,10 @@ class LocalCompute(
  def delete_volume(self, volume: Volume):
  pass

- def attach_volume(self, volume: Volume,
+ def attach_volume(self, volume: Volume, provisioning_data: JobProvisioningData):
  pass

- def detach_volume(
+ def detach_volume(
+ self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
+ ):
  pass

dstack/_internal/core/models/configurations.py

@@ -26,6 +26,7 @@ STRIP_PREFIX_DEFAULT = True
  RUN_PRIOTIRY_MIN = 0
  RUN_PRIOTIRY_MAX = 100
  RUN_PRIORITY_DEFAULT = 0
+ DEFAULT_REPO_DIR = "/workflow"


  class RunConfigurationType(str, Enum):
@@ -181,7 +182,7 @@ class BaseRunConfiguration(CoreModel):
  Field(
  description=(
  "The path to the working directory inside the container."
- " It's specified relative to the repository directory (
+ f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
  ' Defaults to `"."` '
  )
  ),

dstack/_internal/core/models/runs.py

@@ -8,6 +8,7 @@ from typing_extensions import Annotated
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
  from dstack._internal.core.models.configurations import (
+ DEFAULT_REPO_DIR,
  AnyRunConfiguration,
  RunConfiguration,
  )
@@ -338,7 +339,7 @@ class RunSpec(CoreModel):
  Field(
  description=(
  "The path to the working directory inside the container."
- " It's specified relative to the repository directory (
+ f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
  ' Defaults to `"."`.'
  )
  ),

dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf

@@ -1 +1,11 @@
- log_format dstack_stat '$time_iso8601 $host $status $request_time';
+ log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit';
+
+
+ # A hack to avoid this Nginx reload error when no services are registered:
+ # nginx: [emerg] unknown "dstack_replica_hit" variable
+ server {
+ listen unix:/tmp/dstack-dummy-nginx.sock;
+ server_name placeholder.local;
+ deny all;
+ set $dstack_replica_hit 0;
+ }

dstack/_internal/proxy/gateway/resources/nginx/service.jinja2

@@ -14,6 +14,7 @@ upstream {{ domain }}.upstream {
  server {
  server_name {{ domain }};
  limit_req_status 429;
+ set $dstack_replica_hit 0;
  access_log {{ access_log_path }} dstack_stat;
  client_max_body_size {{ client_max_body_size }};

@@ -23,11 +24,7 @@ server {
  auth_request /_dstack_auth;
  {% endif %}

- {% if replicas %}
  try_files /nonexistent @$http_upgrade;
- {% else %}
- return 503;
- {% endif %}

  {% if location.limit_req %}
  limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %};
@@ -35,8 +32,9 @@ server {
  }
  {% endfor %}

- {% if replicas %}
  location @websocket {
+ set $dstack_replica_hit 1;
+ {% if replicas %}
  proxy_pass http://{{ domain }}.upstream;
  proxy_set_header X-Real-IP $remote_addr;
  proxy_set_header Host $host;
@@ -44,19 +42,27 @@ server {
  proxy_set_header Upgrade $http_upgrade;
  proxy_set_header Connection "Upgrade";
  proxy_read_timeout 300s;
+ {% else %}
+ return 503;
+ {% endif %}
  }
  location @ {
+ set $dstack_replica_hit 1;
+ {% if replicas %}
  proxy_pass http://{{ domain }}.upstream;
  proxy_set_header X-Real-IP $remote_addr;
  proxy_set_header Host $host;
  proxy_read_timeout 300s;
+ {% else %}
+ return 503;
+ {% endif %}
  }
- {% endif %}

  {% if auth %}
  location = /_dstack_auth {
  internal;
  if ($remote_addr = 127.0.0.1) {
+ # for requests from the gateway app, e.g. from the OpenAI-compatible API
  return 200;
  }
  proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }};

dstack/_internal/proxy/gateway/services/stats.py

@@ -11,10 +11,10 @@ from pydantic import BaseModel

  from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo
  from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat
+ from dstack._internal.proxy.lib.errors import UnexpectedProxyError
  from dstack._internal.utils.common import run_async

  logger = logging.getLogger(__name__)
- IGNORE_STATUSES = {403, 404}
  WINDOWS = (30, 60, 300)
  TTL = WINDOWS[-1]
  EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in WINDOWS}
@@ -35,6 +35,7 @@ class LogEntry(BaseModel):
  host: str
  status: int
  request_time: float
+ is_replica_hit: bool


  class StatsCollector:
@@ -87,7 +88,8 @@ class StatsCollector:
  now = datetime.datetime.now(tz=datetime.timezone.utc)

  for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)):
-
+ # only include requests that hit or should hit a service replica
+ if not entry.is_replica_hit:
  continue

  frame_timestamp = int(entry.timestamp.timestamp())
@@ -119,7 +121,10 @@ class StatsCollector:
  line = self._file.readline()
  if not line:
  break
-
+ cells = line.split()
+ if len(cells) == 4:  # compatibility with pre-0.19.11 logs
+ cells.append("0" if cells[2] in ["403", "404"] else "1")
+ timestamp_str, host, status, request_time, dstack_replica_hit = cells
  timestamp = datetime.datetime.fromisoformat(timestamp_str)
  if timestamp < after:
  continue
@@ -128,6 +133,7 @@ class StatsCollector:
  host=host,
  status=int(status),
  request_time=float(request_time),
+ is_replica_hit=_parse_nginx_bool(dstack_replica_hit),
  )
  if os.fstat(self._file.fileno()).st_ino != st_ino:
  # file was rotated
@@ -154,3 +160,11 @@ async def get_service_stats(
  )
  for service in services
  ]
+
+
+ def _parse_nginx_bool(v: str) -> bool:
+ if v == "0":
+ return False
+ if v == "1":
+ return True
+ raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}")
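
With the extra $dstack_replica_hit field, an access-log line now has five space-separated cells. A small standalone sketch of the parsing shown above, including the fallback for four-field logs written before 0.19.11 (the sample lines are illustrative):

def parse_access_log_line(line: str) -> dict:
    cells = line.split()
    if len(cells) == 4:  # compatibility with pre-0.19.11 logs
        cells.append("0" if cells[2] in ["403", "404"] else "1")
    timestamp_str, host, status, request_time, replica_hit = cells
    return {
        "timestamp": timestamp_str,
        "host": host,
        "status": int(status),
        "request_time": float(request_time),
        "is_replica_hit": replica_hit == "1",
    }

print(parse_access_log_line("2025-06-02T10:15:30+00:00 svc.example.com 200 0.042 1"))
print(parse_access_log_line("2025-06-02T10:15:31+00:00 svc.example.com 404 0.001"))  # old format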

dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -659,7 +659,7 @@ async def _attach_volumes(
  backend=backend,
  volume_model=volume_model,
  instance=instance,
-
+ jpd=job_provisioning_data,
  )
  job_runtime_data.volume_names.append(volume.name)
  break # attach next mount point
@@ -685,7 +685,7 @@ async def _attach_volume(
  backend: Backend,
  volume_model: VolumeModel,
  instance: InstanceModel,
-
+ jpd: JobProvisioningData,
  ):
  compute = backend.compute()
  assert isinstance(compute, ComputeWithVolumeSupport)
@@ -697,7 +697,7 @@ async def _attach_volume(
  attachment_data = await common_utils.run_async(
  compute.attach_volume,
  volume=volume,
-
+ provisioning_data=jpd,
  )
  volume_attachment_model = VolumeAttachmentModel(
  volume=volume_model,