truss-0.10.9rc535-py3-none-any.whl → truss-0.10.10rc0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- truss/cli/logs/base_watcher.py +1 -1
- truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +30 -22
- truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py +8 -2
- truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py +14 -7
- truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py +63 -0
- truss/cli/train/deploy_from_checkpoint_config_whisper.yml +17 -0
- truss/cli/train/metrics_watcher.py +170 -59
- truss/cli/train_commands.py +11 -3
- truss/contexts/image_builder/serving_image_builder.py +22 -39
- truss/remote/baseten/api.py +11 -0
- truss/remote/baseten/core.py +209 -1
- truss/remote/baseten/utils/time.py +15 -0
- truss/templates/base.Dockerfile.jinja +6 -23
- truss/templates/cache.Dockerfile.jinja +5 -5
- truss/templates/copy_cache_files.Dockerfile.jinja +1 -1
- truss/templates/docker_server/supervisord.conf.jinja +0 -1
- truss/templates/server/requirements.txt +1 -1
- truss/templates/server.Dockerfile.jinja +16 -33
- truss/tests/cli/train/test_deploy_checkpoints.py +446 -2
- truss/tests/cli/train/test_train_cli_core.py +96 -0
- truss/tests/remote/baseten/conftest.py +18 -0
- truss/tests/remote/baseten/test_api.py +49 -14
- truss/tests/remote/baseten/test_core.py +517 -1
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/METADATA +2 -2
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/RECORD +31 -29
- truss_train/definitions.py +6 -0
- truss_train/deployment.py +15 -2
- truss_train/loader.py +7 -20
- truss/tests/util/test_basetenpointer.py +0 -227
- truss/util/basetenpointer.py +0 -160
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/WHEEL +0 -0
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/entry_points.txt +0 -0
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/licenses/LICENSE +0 -0
truss/cli/logs/base_watcher.py
CHANGED
(+1 -1; the one-line change was not captured in this view)

truss/cli/train/deploy_checkpoints/deploy_checkpoints.py
CHANGED
@@ -33,6 +33,10 @@ from .deploy_lora_checkpoints import (
     hydrate_lora_checkpoint,
     render_vllm_lora_truss_config,
 )
+from .deploy_whisper_checkpoints import (
+    hydrate_whisper_checkpoint,
+    render_vllm_whisper_truss_config,
+)

 HF_TOKEN_ENVVAR_NAME = "HF_TOKEN"
 # If we change this, make sure to update the logic in backend codebase

@@ -178,6 +182,8 @@ def hydrate_checkpoint(
         return hydrate_lora_checkpoint(job_id, checkpoint_id, checkpoint)
     elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value:
         return hydrate_full_checkpoint(job_id, checkpoint_id, checkpoint)
+    elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value:
+        return hydrate_whisper_checkpoint(job_id, checkpoint_id, checkpoint)
     else:
         raise ValueError(
             f"Unsupported checkpoint type: {checkpoint_type}. Contact Baseten for support with other checkpoint types."

@@ -196,6 +202,8 @@ def _render_truss_config_for_checkpoint_deployment(
         return render_vllm_lora_truss_config(checkpoint_deploy)
     elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.FULL:
         return render_vllm_full_truss_config(checkpoint_deploy)
+    elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER:
+        return render_vllm_whisper_truss_config(checkpoint_deploy)
     else:
         raise ValueError(
             f"Unsupported model weight format: {checkpoint_deploy.model_weight_format}. Please upgrade to the latest Truss version to access the latest supported formats. Contact Baseten if you would like us to support additional formats."

@@ -288,18 +296,6 @@ def _get_checkpoint_ids_to_deploy(
     return checkpoint_ids


-def _select_single_checkpoint(checkpoint_id_options: List[str]) -> List[str]:
-    """Select a single checkpoint using interactive prompt."""
-    checkpoint_id = inquirer.select(
-        message="Select the checkpoint to deploy:", choices=checkpoint_id_options
-    ).execute()
-
-    if not checkpoint_id:
-        raise click.UsageError("A checkpoint must be selected.")
-
-    return [checkpoint_id]
-
-
 def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
     """Select multiple checkpoints using interactive checkbox."""
     checkpoint_ids = inquirer.checkbox(

@@ -351,6 +347,8 @@ def _get_base_model_id(user_input: Optional[str], checkpoint: dict) -> Optional[
         )
     elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.FULL.value.lower():
         return None
+    elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.WHISPER.value.lower():
+        return None
     else:
         base_model_id = inquirer.text(message="Enter the base model id.").execute()
         if not base_model_id:

@@ -416,18 +414,28 @@ def _validate_selected_checkpoints(
             "Unable to infer model weight format. Reach out to Baseten for support."
         )

-  … (10 deleted lines not captured in this view)
+    validation_rules = {
+        ModelWeightsFormat.FULL.value: {
+            "error_message": "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is full model weights.",
+        },
+        ModelWeightsFormat.WHISPER.value: {
+            "error_message": "Whisper checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is whisper model weights.",
+        },
+    }
+
+    # Check each checkpoint type that has restrictions
+    for checkpoint_type, rule in validation_rules.items():
+        has_restricted_checkpoint = any(
+            response_checkpoints[checkpoint_id].get("checkpoint_type")
+            == checkpoint_type
+            for checkpoint_id in checkpoint_ids
         )

+        if has_restricted_checkpoint and len(checkpoint_ids) > 1:
+            raise ValueError(rule["error_message"])
+

 def get_hf_secret_name(user_input: Union[str, SecretReference, None]) -> str:
     """Get HuggingFace secret name from user input or prompt for it."""
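
The whisper branches slot into the existing format dispatch without touching the other cases. A minimal standalone sketch of that dispatch shape (the enum values and return payloads here are stand-ins, not the actual truss_train definitions):

    from enum import Enum

    class ModelWeightsFormat(Enum):
        # Stand-in enum; the real values live in truss_train.definitions and are assumed here.
        LORA = "LoRA"
        FULL = "Full"
        WHISPER = "Whisper"

    def hydrate_checkpoint(job_id: str, checkpoint_id: str, checkpoint: dict):
        checkpoint_type = checkpoint.get("checkpoint_type", "")
        if checkpoint_type.lower() == ModelWeightsFormat.LORA.value.lower():
            return ("lora", job_id, checkpoint_id)
        elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value.lower():
            return ("full", job_id, checkpoint_id)
        elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value.lower():
            # New branch in this release: whisper checkpoints get their own hydrator.
            return ("whisper", job_id, checkpoint_id)
        raise ValueError(f"Unsupported checkpoint type: {checkpoint_type}.")

    print(hydrate_checkpoint("job-123", "checkpoint-1", {"checkpoint_type": "whisper"}))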
truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py
CHANGED

@@ -3,7 +3,7 @@ from pathlib import Path

 from truss.base import truss_config
 from truss.cli.train.types import DeployCheckpointsConfigComplete
-from truss_train.definitions import SecretReference
+from truss_train.definitions import ModelWeightsFormat, SecretReference

 START_COMMAND_ENVVAR_NAME = "BT_DOCKER_SERVER_START_CMD"


@@ -12,8 +12,14 @@ def setup_base_truss_config(
     checkpoint_deploy: DeployCheckpointsConfigComplete,
 ) -> truss_config.TrussConfig:
     """Set up the base truss config with common properties."""
+    truss_deploy_config = None
+    truss_base_file = (
+        "deploy_from_checkpoint_config_whisper.yml"
+        if checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER
+        else "deploy_from_checkpoint_config.yml"
+    )
     truss_deploy_config = truss_config.TrussConfig.from_yaml(
-        Path(os.path.dirname(__file__), "..",
+        Path(os.path.dirname(__file__), "..", truss_base_file)
     )
     if not truss_deploy_config.docker_server:
         raise ValueError(
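
setup_base_truss_config now picks the bundled base YAML by weight format and resolves it one directory above the helpers module. A rough standalone equivalent of just that selection step (a hypothetical is_whisper flag stands in for the real ModelWeightsFormat check):

    import os
    from pathlib import Path

    def pick_base_config(is_whisper: bool) -> Path:
        # Mirror of the conditional in setup_base_truss_config: whisper deployments
        # get their own base YAML; everything else keeps the original file.
        truss_base_file = (
            "deploy_from_checkpoint_config_whisper.yml"
            if is_whisper
            else "deploy_from_checkpoint_config.yml"
        )
        # The diff resolves the file one directory above the helpers module.
        return Path(os.path.dirname(__file__), "..", truss_base_file)

    print(pick_base_config(True).name)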
truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
CHANGED

@@ -14,12 +14,19 @@ from .deploy_checkpoints_helpers import (
     setup_environment_variables_and_secrets,
 )

+# NB(aghilan): Transformers was recently changed to save a chat_template.jinja file instead of inside the tokenizer_config.json file.
+# Old Models will not have this file, so we check for it and use it if it exists.
+# vLLM will not automatically resolve the chat_template.jinja file, so we need to pass it to the start command.
+# This logic is needed for any models trained using Transformers v4.51.3 or later
 VLLM_FULL_START_COMMAND = Template(
-  … (5 deleted lines not captured in this view)
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "if [ -f {{ model_path }}/chat_template.jinja ]; then "
+    "  vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
+    "  --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "else "
+    "  vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "fi'"
 )


@@ -33,7 +40,7 @@ def render_vllm_full_truss_config(
         truss_deploy_config, checkpoint_deploy
     )

-    checkpoint_str =
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)

     accelerator = checkpoint_deploy.compute.accelerator


@@ -64,7 +71,7 @@ def hydrate_full_checkpoint(
     return FullCheckpoint(training_job_id=job_id, paths=paths)


-def
+def build_full_checkpoint_string(truss_deploy_config) -> str:
     """Build checkpoint string from artifact references for full checkpoints.

     Args:
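
Rendering the template makes the chat-template fallback concrete. The sketch below reuses the template string exactly as it appears in the diff and fills it with made-up values; real values come from the checkpoint deploy config:

    from jinja2 import Template

    VLLM_FULL_START_COMMAND = Template(
        "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
        "if [ -f {{ model_path }}/chat_template.jinja ]; then "
        "  vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
        "  --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "else "
        "  vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "fi'"
    )

    # Example values only; prints the shell command that would land in supervisord.
    print(
        VLLM_FULL_START_COMMAND.render(
            model_path="/tmp/training_checkpoints/job123/rank-0/checkpoint-1",
            envvars="VLLM_LOGGING_LEVEL=WARNING",
            specify_tensor_parallelism=2,
        )
    )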
truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py
ADDED

@@ -0,0 +1,63 @@
+from jinja2 import Template
+
+from truss.base import truss_config
+from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
+    START_COMMAND_ENVVAR_NAME,
+)
+from truss.cli.train.deploy_checkpoints.deploy_full_checkpoints import (
+    build_full_checkpoint_string,
+)
+from truss.cli.train.types import DeployCheckpointsConfigComplete
+from truss_train.definitions import WhisperCheckpoint
+
+from .deploy_checkpoints_helpers import (
+    setup_base_truss_config,
+    setup_environment_variables_and_secrets,
+)
+
+VLLM_WHISPER_START_COMMAND = Template(
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
+)
+
+
+def render_vllm_whisper_truss_config(
+    checkpoint_deploy: DeployCheckpointsConfigComplete,
+) -> truss_config.TrussConfig:
+    """Render truss config specifically for whisper checkpoints using vLLM."""
+    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
+
+    start_command_envvars = setup_environment_variables_and_secrets(
+        truss_deploy_config, checkpoint_deploy
+    )
+
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
+
+    accelerator = checkpoint_deploy.compute.accelerator
+
+    start_command_args = {
+        "model_path": checkpoint_str,
+        "envvars": start_command_envvars,
+        "specify_tensor_parallelism": accelerator.count if accelerator else 1,
+    }
+    # Note: we set the start command as an environment variable in supervisord config.
+    # This is so that we don't have to change the supervisord config when the start command changes.
+    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
+    start_command = VLLM_WHISPER_START_COMMAND.render(**start_command_args)
+    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
+    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
+    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
+        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
+    )
+
+    return truss_deploy_config
+
+
+def hydrate_whisper_checkpoint(
+    job_id: str, checkpoint_id: str, checkpoint: dict
+) -> WhisperCheckpoint:
+    """Create a Checkpoint object for whisper model weights."""
+    # NOTE: Slash at the end is important since it means the checkpoint is a directory
+    paths = [f"rank-0/{checkpoint_id}/"]
+    return WhisperCheckpoint(training_job_id=job_id, paths=paths)
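
The hydrator itself is tiny: it only fabricates a rank-0 directory path, where the trailing slash marks a directory. A toy reproduction with a plain dataclass standing in for truss_train's WhisperCheckpoint:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class WhisperCheckpoint:  # stand-in for truss_train.definitions.WhisperCheckpoint
        training_job_id: str
        paths: List[str] = field(default_factory=list)

    def hydrate_whisper_checkpoint(job_id: str, checkpoint_id: str, checkpoint: dict) -> WhisperCheckpoint:
        # Trailing slash marks the checkpoint as a directory, as noted in the diff.
        paths = [f"rank-0/{checkpoint_id}/"]
        return WhisperCheckpoint(training_job_id=job_id, paths=paths)

    print(hydrate_whisper_checkpoint("job-123", "checkpoint-5", {}))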
truss/cli/train/deploy_from_checkpoint_config_whisper.yml
ADDED

@@ -0,0 +1,17 @@
+base_image:
+  image: vllm/vllm-openai:latest
+
+docker_server:
+  start_command: sh -c ""  # replaced when deploying
+  readiness_endpoint: /health
+  liveness_endpoint: /health
+  predict_endpoint: /v1/audio/transcriptions
+  server_port: 8000
+runtime:
+  predict_concurrency: 256
+environment_variables:
+  VLLM_LOGGING_LEVEL: WARNING
+  VLLM_USE_V1: 0
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+requirements:
+  - vllm[audio]
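
Since the base image runs vLLM's OpenAI-compatible server, the predict endpoint should accept standard transcription requests. A hedged sketch of such a call (the host, file name, and model value are all assumptions; this diff shows no client code):

    import requests  # assumes the requests package is installed

    # Hypothetical host; a real deployment would use the Baseten model URL.
    url = "http://localhost:8000/v1/audio/transcriptions"

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            url,
            files={"file": ("sample.wav", f, "audio/wav")},
            # vLLM expects the served model name; the checkpoint path here is assumed.
            data={"model": "/tmp/training_checkpoints/job123/rank-0/checkpoint-5/"},
        )
    print(resp.json())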
truss/cli/train/metrics_watcher.py
CHANGED

@@ -4,6 +4,7 @@ import traceback
 from typing import Any, Dict, List, Optional, Tuple, cast

 from rich.columns import Columns
+from rich.layout import Layout
 from rich.live import Live
 from rich.table import Table
 from rich.text import Text

@@ -96,90 +97,200 @@ class MetricsWatcher(TrainingPollerMixin):
         )
         return True

-    def create_metrics_table(self, metrics_data: Dict) ->
+    def create_metrics_table(self, metrics_data: Dict) -> Layout:
         """Create a Rich table with the metrics"""
-  … (11 deleted lines not captured in this view)
+        tables = []
+
+        timestamp = self._get_timestamp_from_metrics(metrics_data)
+
+        node_tables = self._create_unified_node_metrics_tables(metrics_data)
+        tables.extend(node_tables)
+
+        storage_tables = self._create_storage_tables(metrics_data)
+        tables.extend(storage_tables)
+
+        columns = Columns(tables, title="Training Job Metrics")
+
+        layout = Layout()
+
+        if timestamp:
+            from rich.panel import Panel
+
+            layout.split_column(
+                Layout(
+                    Panel(
+                        f"🕐 Last Updated: {timestamp}\n💡 Press Ctrl+C to exit",
+                        style="bold cyan",
+                    ),
+                    size=4,
+                ),
+                Layout(columns),
+            )
+        else:
+            layout.split_column(Layout(columns))
+
+        return layout
+
+    def _get_timestamp_from_metrics(self, metrics_data: Dict) -> Optional[str]:
+        """Extract timestamp from metrics data for display"""
+        # Try to get timestamp from per_node_metrics first. Fall back to main metrics if not there.
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+        if per_node_metrics and len(per_node_metrics) > 0:
+            first_node_metrics = per_node_metrics[0].get("metrics", {})
+            cpu_usage_data = first_node_metrics.get("cpu_usage", [])
+            if cpu_usage_data and len(cpu_usage_data) > 0:
+                timestamp = cpu_usage_data[-1].get("timestamp")
+                if timestamp:
+                    return common.format_localized_time(timestamp)

-        # Add timestamp if available
         cpu_usage_data = metrics_data.get("cpu_usage", [])
         if cpu_usage_data and len(cpu_usage_data) > 0:
-  … (7 deleted lines not captured in this view)
+            timestamp = cpu_usage_data[-1].get("timestamp")
+            if timestamp:
+                return common.format_localized_time(timestamp)
+
+        return None
+
+    def _create_unified_node_metrics_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create tables for node metrics, handling both single and multi-node scenarios"""
+        tables = []
+
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+
+        if not per_node_metrics:
+            # Job is likely just starting up - it takes some time for
+            # the metrics to become available after the job starts running.
+            from rich.text import Text
+
+            waiting_table = Table(title="Training Job Status")
+            waiting_table.add_column("Status")
+            waiting_table.add_column("Message")
+
+            waiting_table.add_row(
+                "Status",
+                Text("⏳ Waiting for metrics to become available...", style="yellow"),
+            )
+            waiting_table.add_row(
+                "Note",
+                Text(
+                    "Metrics will appear once the training job starts running.",
+                    style="dim",
+                ),
+            )
+
+            tables.append(waiting_table)
+            return tables
+
+        for node_metrics in per_node_metrics:
+            node_id = node_metrics.get("node_id", "Unknown")
+            metrics = node_metrics.get("metrics", {})
+
+            if not metrics:
+                continue

-
-
+            table = self._create_node_table(node_id, metrics)
+            tables.append(table)
+
+        return tables
+
+    def _create_node_table(self, node_id: str, metrics: Dict) -> Table:
+        """Create a table for a single node's metrics"""
+        table = Table(title=f"Node: {node_id}")
+        table.add_column("Metric")
+        table.add_column("Value")
+
+        cpu_usage = self._get_latest_metric(metrics.get("cpu_usage", []))
         if cpu_usage is not None:
-            table.add_row("CPU
+            table.add_row("CPU usage", f"{cpu_usage:.2f} cores")

-        cpu_memory = self._get_latest_metric(
-            metrics_data.get("cpu_memory_usage_bytes", [])
-        )
+        cpu_memory = self._get_latest_metric(metrics.get("cpu_memory_usage_bytes", []))
         if cpu_memory is not None:
             formatted_value, color = self._format_bytes(cpu_memory)
-            table.add_row("CPU
+            table.add_row("CPU memory", Text(formatted_value, style=color))

-
-
+        if cpu_usage is not None or cpu_memory is not None:
+            table.add_section()

-
-
-        gpu_memory = metrics_data.get("gpu_memory_usage_bytes", {})
+        gpu_utilization = metrics.get("gpu_utilization", {})
+        gpu_memory = metrics.get("gpu_memory_usage_bytes", {})

-
-
-
+        # API should return same GPU IDs for utilization and memory
+        keys = gpu_utilization.keys()
+        for idx, gpu_id in enumerate(keys):
+            latest_util = self._get_latest_metric(gpu_utilization.get(gpu_id, []))
             if latest_util is not None:
-                table.add_row(f"GPU {gpu_id}
+                table.add_row(f"GPU {gpu_id} utilization", f"{latest_util * 100:.1f}%")

-            # Add GPU memory right after its utilization
             latest_memory = self._get_latest_metric(gpu_memory.get(gpu_id, []))
             if latest_memory is not None:
                 formatted_value, color = self._format_bytes(latest_memory)
                 table.add_row(
-                    f"GPU {gpu_id}
+                    f"GPU {gpu_id} memory", Text(formatted_value, style=color)
                 )

-
-            if gpu_id != max(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
+            if idx != len(keys) - 1:
                 table.add_section()

-
-        if
-
-
+        ephemeral_storage = metrics.get("ephemeral_storage")
+        if ephemeral_storage:
+            if gpu_utilization or gpu_memory:
+                table.add_section()

-
-
-        cache_storage_metrics = metrics_data.get("cache")
-        if ephemeral_storage_metrics or cache_storage_metrics:
-            storage_table = Table(title="Storage Metrics")
-            storage_table.add_column("Storage Type")
-            storage_table.add_column("Usage")
-            storage_table.add_column("Utilization")
-            did_add_ephemeral = self._maybe_format_storage_table_row(
-                storage_table, "Ephemeral Storage", ephemeral_storage_metrics
+            usage_bytes = self._get_latest_metric(
+                ephemeral_storage.get("usage_bytes", [])
             )
-
-
+            utilization = self._get_latest_metric(
+                ephemeral_storage.get("utilization", [])
             )
-
-
-
+
+            if usage_bytes is not None:
+                formatted_value, color = self._format_bytes(usage_bytes)
+                table.add_row("Eph. storage usage", Text(formatted_value, style=color))
+
+            if utilization is not None:
+                utilization_percent = utilization * 100
+                if utilization_percent > 90:
+                    color = "red"
+                elif utilization_percent > 70:
+                    color = "yellow"
+                else:
+                    color = "green"
+                table.add_row(
+                    "Eph. storage utilization",
+                    Text(f"{utilization_percent:.1f}%", style=color),
+                )
+
+        return table
+
+    def _create_storage_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create storage tables - only cache per job (ephemeral is now in node tables)"""
+        tables = []
+
+        # Create cache storage table (job-level, shown once)
+        cache_storage = metrics_data.get("cache")
+        if cache_storage:
+            table = self._create_cache_storage_table(cache_storage)
+            if table:
+                tables.append(table)
+
+        return tables
+
+    def _create_cache_storage_table(self, cache_storage: Dict) -> Optional[Table]:
+        """Create table for cache storage metrics (job-level)"""
+        usage_bytes = self._get_latest_metric(cache_storage.get("usage_bytes", []))
+        utilization = self._get_latest_metric(cache_storage.get("utilization", []))
+
+        if usage_bytes is None and utilization is None:
+            return None
+
+        table = Table(title="Cache storage")
+        table.add_column("Storage Type")
+        table.add_column("Usage")
+        table.add_column("Utilization")
+
+        self._maybe_format_storage_table_row(table, "Cache storage", cache_storage)
+
+        return table

     def watch(self, refresh_rate: int = METRICS_POLL_INTERVAL_SEC):
         """Display continuously updating metrics"""
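
The move from a bare table to a Layout is what allows a pinned header panel above the metric columns while Live refreshes the whole thing. A self-contained Rich sketch of the same structure, using static sample data rather than the watcher's polled metrics:

    from rich.columns import Columns
    from rich.console import Console
    from rich.layout import Layout
    from rich.panel import Panel
    from rich.table import Table

    # Sample node table, standing in for _create_node_table output.
    table = Table(title="Node: node-0")
    table.add_column("Metric")
    table.add_column("Value")
    table.add_row("CPU usage", "3.20 cores")
    table.add_row("GPU 0 utilization", "87.5%")

    columns = Columns([table], title="Training Job Metrics")

    layout = Layout()
    # Same split as the diff: a fixed-height header panel above the metric columns.
    layout.split_column(
        Layout(
            Panel("🕐 Last Updated: 12:00:00\n💡 Press Ctrl+C to exit", style="bold cyan"),
            size=4,
        ),
        Layout(columns),
    )

    Console().print(layout)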
truss/cli/train_commands.py
CHANGED

@@ -13,6 +13,7 @@ from truss.cli.train import common as train_common
 from truss.cli.train import core
 from truss.cli.utils import common
 from truss.cli.utils.output import console, error_console
+from truss.remote.baseten.core import get_training_job_logs_with_pagination
 from truss.remote.baseten.remote import BasetenRemote
 from truss.remote.remote_factory import RemoteFactory


@@ -72,8 +73,11 @@ def _prepare_click_context(f: click.Command, params: dict) -> click.Context:
 @click.argument("config", type=Path, required=True)
 @click.option("--remote", type=str, required=False, help="Remote to use")
 @click.option("--tail", is_flag=True, help="Tail for status + logs after push.")
+@click.option("--job-name", type=str, required=False, help="Name of the training job.")
 @common.common_options()
-def push_training_job(
+def push_training_job(
+    config: Path, remote: Optional[str], tail: bool, job_name: Optional[str]
+):
     """Run a training job"""
     from truss_train import deployment


@@ -84,7 +88,9 @@ def push_training_job(config: Path, remote: Optional[str], tail: bool):
     remote_provider: BasetenRemote = cast(
         BasetenRemote, RemoteFactory.create(remote=remote)
     )
-    job_resp = deployment.create_training_job_from_file(
+    job_resp = deployment.create_training_job_from_file(
+        remote_provider, config, job_name
+    )

     # Note: This post create logic needs to happen outside the context
     # of the above context manager, as only one console session can be active

@@ -138,7 +144,9 @@ def get_job_logs(
     )

     if not tail:
-        logs =
+        logs = get_training_job_logs_with_pagination(
+            remote_provider.api, project_id, job_id
+        )
         for log in cli_log_utils.parse_logs(logs):
             cli_log_utils.output_log(log)
     else:
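
The implementation of get_training_job_logs_with_pagination lands in truss/remote/baseten/core.py (part of the +209 lines there, not shown in this view). As an illustration only, a cursor-driven loop of the kind such helpers typically use; fetch_page and its response fields are hypothetical, not the Baseten API:

    from typing import Callable, Dict, List, Optional

    def fetch_all_logs(fetch_page: Callable[[Optional[str]], Dict]) -> List[dict]:
        """Drain a paginated log endpoint; fetch_page(cursor) -> {"logs": [...], "next_cursor": ...}."""
        logs: List[dict] = []
        cursor: Optional[str] = None
        while True:
            page = fetch_page(cursor)
            logs.extend(page["logs"])
            cursor = page.get("next_cursor")
            if not cursor:  # no more pages
                return logs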
truss/contexts/image_builder/serving_image_builder.py
CHANGED

@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 import json
 import logging
 import re

@@ -74,7 +73,6 @@ from truss.contexts.image_builder.util import (
 )
 from truss.contexts.truss_context import TrussContext
 from truss.truss_handle.patch.hash import directory_content_hash
-from truss.util.basetenpointer import model_cache_hf_to_b10ptr
 from truss.util.jinja import read_template_from_fs
 from truss.util.path import (
     build_truss_target_directory,

@@ -327,36 +325,27 @@ def get_files_to_model_cache_v1(config: TrussConfig, truss_dir: Path, build_dir:
 def build_model_cache_v2_and_copy_bptr_manifest(config: TrussConfig, build_dir: Path):
     assert config.model_cache.is_v2
     assert all(model.volume_folder is not None for model in config.model_cache.models)
-  … (21 deleted lines not captured in this view)
-            f.write(basetenpointer_json)
-    except Exception as e:
-        logging.warning(f"debug: failed to create BasetenPointer: {e}")
-    # TODO: remove below section + remove logging lines above.
-    # builds BasetenManifest for caching
-    basetenpointers = model_cache_hf_to_b10ptr(config.model_cache)
-    # write json of bastenpointers into build dir
-    with open(build_dir / "bptr-manifest", "w") as f:
-        f.write(basetenpointers.model_dump_json())
+    from truss_transfer import PyModelRepo, create_basetenpointer_from_models
+
+    py_models = [
+        PyModelRepo(
+            repo_id=model.repo_id,
+            revision=model.revision,
+            runtime_secret_name=model.runtime_secret_name,
+            allow_patterns=model.allow_patterns,
+            ignore_patterns=model.ignore_patterns,
+            volume_folder=model.volume_folder,
+            kind=model.kind.value,
+        )
+        for model in config.model_cache.models
+    ]
+    # create BasetenPointer from models
+    basetenpointer_json = create_basetenpointer_from_models(models=py_models)
+    bptr_py = json.loads(basetenpointer_json)["pointers"]
+    logging.info(f"created ({len(bptr_py)}) Basetenpointer")
+    logging.info(f"pointers json: {basetenpointer_json}")
+    with open(build_dir / "bptr-manifest", "w") as f:
+        f.write(basetenpointer_json)


 def generate_docker_server_nginx_config(build_dir, config):

@@ -794,7 +783,6 @@ class ServingImageBuilder(ImageBuilder):
             config
         )

-        non_root_user = os.getenv("BT_USE_NON_ROOT_USER", False)
        dockerfile_contents = dockerfile_template.render(
            should_install_server_requirements=should_install_server_requirements,
            base_image_name_and_tag=base_image_name_and_tag,

@@ -828,12 +816,7 @@ class ServingImageBuilder(ImageBuilder):
            build_commands=build_commands,
            use_local_src=config.use_local_src,
            passthrough_environment_variables=passthrough_environment_variables,
-
-            app_username="app",
-            app_user_uid=60000,
-            control_server_dir="/control",
-            default_owner="0:0",  # root user
-            **FILENAME_CONSTANTS_MAP,  # Add this line
+            **FILENAME_CONSTANTS_MAP,
        )
        # Consolidate repeated empty lines to single empty lines.
        dockerfile_contents = re.sub(