truss 0.10.9rc535__py3-none-any.whl → 0.10.10rc0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of truss might be problematic.

Files changed (33):
  1. truss/cli/logs/base_watcher.py +1 -1
  2. truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +30 -22
  3. truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py +8 -2
  4. truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py +14 -7
  5. truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py +63 -0
  6. truss/cli/train/deploy_from_checkpoint_config_whisper.yml +17 -0
  7. truss/cli/train/metrics_watcher.py +170 -59
  8. truss/cli/train_commands.py +11 -3
  9. truss/contexts/image_builder/serving_image_builder.py +22 -39
  10. truss/remote/baseten/api.py +11 -0
  11. truss/remote/baseten/core.py +209 -1
  12. truss/remote/baseten/utils/time.py +15 -0
  13. truss/templates/base.Dockerfile.jinja +6 -23
  14. truss/templates/cache.Dockerfile.jinja +5 -5
  15. truss/templates/copy_cache_files.Dockerfile.jinja +1 -1
  16. truss/templates/docker_server/supervisord.conf.jinja +0 -1
  17. truss/templates/server/requirements.txt +1 -1
  18. truss/templates/server.Dockerfile.jinja +16 -33
  19. truss/tests/cli/train/test_deploy_checkpoints.py +446 -2
  20. truss/tests/cli/train/test_train_cli_core.py +96 -0
  21. truss/tests/remote/baseten/conftest.py +18 -0
  22. truss/tests/remote/baseten/test_api.py +49 -14
  23. truss/tests/remote/baseten/test_core.py +517 -1
  24. {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/METADATA +2 -2
  25. {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/RECORD +31 -29
  26. truss_train/definitions.py +6 -0
  27. truss_train/deployment.py +15 -2
  28. truss_train/loader.py +7 -20
  29. truss/tests/util/test_basetenpointer.py +0 -227
  30. truss/util/basetenpointer.py +0 -160
  31. {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/WHEEL +0 -0
  32. {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/entry_points.txt +0 -0
  33. {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/licenses/LICENSE +0 -0
truss/cli/logs/base_watcher.py

@@ -9,7 +9,7 @@ from truss.remote.baseten.api import BasetenApi
 
 POLL_INTERVAL_SEC = 2
 # NB(nikhil): This helps account for (1) log processing delays (2) clock skews
-CLOCK_SKEW_BUFFER_MS = 10000
+CLOCK_SKEW_BUFFER_MS = 60000
 
 
 class LogWatcher(ABC):
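The only change here is the clock-skew buffer growing from 10 s to 60 s. A minimal sketch of how such a buffer is typically applied when computing the next poll window (a hypothetical helper, not the actual `LogWatcher` internals):

```python
POLL_INTERVAL_SEC = 2
CLOCK_SKEW_BUFFER_MS = 60000  # was 10000 before this release

def next_query_window(last_seen_ms: int, now_ms: int) -> tuple:
    # Re-scan a buffer's worth of history so log lines that were indexed late,
    # or stamped slightly in the past by a skewed clock, still land in the next
    # query. Callers are assumed to de-duplicate the overlapping range.
    return (last_seen_ms - CLOCK_SKEW_BUFFER_MS, now_ms)
```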
truss/cli/train/deploy_checkpoints/deploy_checkpoints.py

@@ -33,6 +33,10 @@ from .deploy_lora_checkpoints import (
     hydrate_lora_checkpoint,
     render_vllm_lora_truss_config,
 )
+from .deploy_whisper_checkpoints import (
+    hydrate_whisper_checkpoint,
+    render_vllm_whisper_truss_config,
+)
 
 HF_TOKEN_ENVVAR_NAME = "HF_TOKEN"
 # If we change this, make sure to update the logic in backend codebase
@@ -178,6 +182,8 @@ def hydrate_checkpoint(
         return hydrate_lora_checkpoint(job_id, checkpoint_id, checkpoint)
     elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value:
         return hydrate_full_checkpoint(job_id, checkpoint_id, checkpoint)
+    elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value:
+        return hydrate_whisper_checkpoint(job_id, checkpoint_id, checkpoint)
     else:
         raise ValueError(
             f"Unsupported checkpoint type: {checkpoint_type}. Contact Baseten for support with other checkpoint types."
@@ -196,6 +202,8 @@ def _render_truss_config_for_checkpoint_deployment(
         return render_vllm_lora_truss_config(checkpoint_deploy)
     elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.FULL:
         return render_vllm_full_truss_config(checkpoint_deploy)
+    elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER:
+        return render_vllm_whisper_truss_config(checkpoint_deploy)
     else:
         raise ValueError(
             f"Unsupported model weight format: {checkpoint_deploy.model_weight_format}. Please upgrade to the latest Truss version to access the latest supported formats. Contact Baseten if you would like us to support additional formats."
@@ -288,18 +296,6 @@ def _get_checkpoint_ids_to_deploy(
     return checkpoint_ids
 
 
-def _select_single_checkpoint(checkpoint_id_options: List[str]) -> List[str]:
-    """Select a single checkpoint using interactive prompt."""
-    checkpoint_id = inquirer.select(
-        message="Select the checkpoint to deploy:", choices=checkpoint_id_options
-    ).execute()
-
-    if not checkpoint_id:
-        raise click.UsageError("A checkpoint must be selected.")
-
-    return [checkpoint_id]
-
-
 def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
     """Select multiple checkpoints using interactive checkbox."""
     checkpoint_ids = inquirer.checkbox(
@@ -351,6 +347,8 @@ def _get_base_model_id(user_input: Optional[str], checkpoint: dict) -> Optional[
         )
     elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.FULL.value.lower():
         return None
+    elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.WHISPER.value.lower():
+        return None
     else:
         base_model_id = inquirer.text(message="Enter the base model id.").execute()
         if not base_model_id:
@@ -416,18 +414,28 @@ def _validate_selected_checkpoints(
             "Unable to infer model weight format. Reach out to Baseten for support."
         )
 
-    has_full_checkpoint = any(
-        response_checkpoints[checkpoint_id].get("checkpoint_type")
-        == ModelWeightsFormat.FULL.value
-        for checkpoint_id in checkpoint_ids
-    )
-
-    if has_full_checkpoint and len(checkpoint_ids) > 1:
-        # vLLM does not support multiple checkpoints when any checkpoint is full model weights.
-        raise ValueError(
-            "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint."
+    validation_rules = {
+        ModelWeightsFormat.FULL.value: {
+            "error_message": "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is full model weights.",
+        },
+        ModelWeightsFormat.WHISPER.value: {
+            "error_message": "Whisper checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is whisper model weights.",
+        },
+    }
+
+    # Check each checkpoint type that has restrictions
+    for checkpoint_type, rule in validation_rules.items():
+        has_restricted_checkpoint = any(
+            response_checkpoints[checkpoint_id].get("checkpoint_type")
+            == checkpoint_type
+            for checkpoint_id in checkpoint_ids
        )
 
+        if has_restricted_checkpoint and len(checkpoint_ids) > 1:
+            raise ValueError(rule["error_message"])
+
 
 def get_hf_secret_name(user_input: Union[str, SecretReference, None]) -> str:
     """Get HuggingFace secret name from user input or prompt for it."""
truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py

@@ -3,7 +3,7 @@ from pathlib import Path
 
 from truss.base import truss_config
 from truss.cli.train.types import DeployCheckpointsConfigComplete
-from truss_train.definitions import SecretReference
+from truss_train.definitions import ModelWeightsFormat, SecretReference
 
 START_COMMAND_ENVVAR_NAME = "BT_DOCKER_SERVER_START_CMD"
 
@@ -12,8 +12,14 @@ def setup_base_truss_config(
     checkpoint_deploy: DeployCheckpointsConfigComplete,
 ) -> truss_config.TrussConfig:
     """Set up the base truss config with common properties."""
+    truss_deploy_config = None
+    truss_base_file = (
+        "deploy_from_checkpoint_config_whisper.yml"
+        if checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER
+        else "deploy_from_checkpoint_config.yml"
+    )
     truss_deploy_config = truss_config.TrussConfig.from_yaml(
-        Path(os.path.dirname(__file__), "..", "deploy_from_checkpoint_config.yml")
+        Path(os.path.dirname(__file__), "..", truss_base_file)
     )
     if not truss_deploy_config.docker_server:
         raise ValueError(
truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py

@@ -14,12 +14,19 @@ from .deploy_checkpoints_helpers import (
     setup_environment_variables_and_secrets,
 )
 
+# NB(aghilan): Transformers was recently changed to save a chat_template.jinja file instead of inside the tokenizer_config.json file.
+# Old Models will not have this file, so we check for it and use it if it exists.
+# vLLM will not automatically resolve the chat_template.jinja file, so we need to pass it to the start command.
+# This logic is needed for any models trained using Transformers v4.51.3 or later
 VLLM_FULL_START_COMMAND = Template(
-    'sh -c "{%if envvars %}{{ envvars }} {% endif %}vllm serve {{ model_path }}'
-    + " --port 8000"
-    + " --tensor-parallel-size {{ specify_tensor_parallelism }}"
-    + " --dtype bfloat16"
-    + '"'
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "if [ -f {{ model_path }}/chat_template.jinja ]; then "
+    " vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
+    " --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "else "
+    " vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "fi'"
 )
 
 
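The rebuilt start command now sources `HF_TOKEN` from the mounted secret and branches on the presence of `chat_template.jinja`; the doubled `$$` presumably escapes the `$` so the subshell runs at container start rather than at render time. A rendering sketch with placeholder values, abridged to the else-branch, to make the generated shell visible:

```python
from jinja2 import Template

# Abridged template mirroring the structure above; model_path is a placeholder.
cmd = Template(
    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
    "vllm serve {{ model_path }} --port 8000 "
    "--tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16'"
).render(model_path="/ckpts/checkpoint-500", envvars="", specify_tensor_parallelism=2)

print(cmd)
# sh -c 'vllm serve /ckpts/checkpoint-500 --port 8000 --tensor-parallel-size 2 --dtype bfloat16'
```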
@@ -33,7 +40,7 @@ def render_vllm_full_truss_config(
         truss_deploy_config, checkpoint_deploy
     )
 
-    checkpoint_str = _build_full_checkpoint_string(truss_deploy_config)
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
 
     accelerator = checkpoint_deploy.compute.accelerator
 
@@ -64,7 +71,7 @@ def hydrate_full_checkpoint(
     return FullCheckpoint(training_job_id=job_id, paths=paths)
 
 
-def _build_full_checkpoint_string(truss_deploy_config) -> str:
+def build_full_checkpoint_string(truss_deploy_config) -> str:
     """Build checkpoint string from artifact references for full checkpoints.
 
     Args:
truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py (new file)

@@ -0,0 +1,63 @@
+from jinja2 import Template
+
+from truss.base import truss_config
+from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
+    START_COMMAND_ENVVAR_NAME,
+)
+from truss.cli.train.deploy_checkpoints.deploy_full_checkpoints import (
+    build_full_checkpoint_string,
+)
+from truss.cli.train.types import DeployCheckpointsConfigComplete
+from truss_train.definitions import WhisperCheckpoint
+
+from .deploy_checkpoints_helpers import (
+    setup_base_truss_config,
+    setup_environment_variables_and_secrets,
+)
+
+VLLM_WHISPER_START_COMMAND = Template(
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
+)
+
+
+def render_vllm_whisper_truss_config(
+    checkpoint_deploy: DeployCheckpointsConfigComplete,
+) -> truss_config.TrussConfig:
+    """Render truss config specifically for whisper checkpoints using vLLM."""
+    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
+
+    start_command_envvars = setup_environment_variables_and_secrets(
+        truss_deploy_config, checkpoint_deploy
+    )
+
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
+
+    accelerator = checkpoint_deploy.compute.accelerator
+
+    start_command_args = {
+        "model_path": checkpoint_str,
+        "envvars": start_command_envvars,
+        "specify_tensor_parallelism": accelerator.count if accelerator else 1,
+    }
+    # Note: we set the start command as an environment variable in supervisord config.
+    # This is so that we don't have to change the supervisord config when the start command changes.
+    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
+    start_command = VLLM_WHISPER_START_COMMAND.render(**start_command_args)
+    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
+    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
+    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
+        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
+    )
+
+    return truss_deploy_config
+
+
+def hydrate_whisper_checkpoint(
+    job_id: str, checkpoint_id: str, checkpoint: dict
+) -> WhisperCheckpoint:
+    """Create a Checkpoint object for whisper model weights."""
+    # NOTE: Slash at the end is important since it means the checkpoint is a directory
+    paths = [f"rank-0/{checkpoint_id}/"]
+    return WhisperCheckpoint(training_job_id=job_id, paths=paths)
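The new module reuses `build_full_checkpoint_string` and the same supervisord indirection as the full-checkpoint path. The `%(ENV_...)s` convention in miniature: supervisord performs Python-style `%`-interpolation over the process environment, which is why only the variable's value has to change between deploys:

```python
START_COMMAND_ENVVAR_NAME = "BT_DOCKER_SERVER_START_CMD"

placeholder = f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
assert placeholder == "%(ENV_BT_DOCKER_SERVER_START_CMD)s"

# Emulating supervisord's expansion with a plain %-format over a mapping:
env = {"ENV_" + START_COMMAND_ENVVAR_NAME: "sh -c 'vllm serve ...'"}
assert placeholder % env == "sh -c 'vllm serve ...'"
```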
truss/cli/train/deploy_from_checkpoint_config_whisper.yml (new file)

@@ -0,0 +1,17 @@
+base_image:
+  image: vllm/vllm-openai:latest
+
+docker_server:
+  start_command: sh -c "" # replaced when deploying
+  readiness_endpoint: /health
+  liveness_endpoint: /health
+  predict_endpoint: /v1/audio/transcriptions
+  server_port: 8000
+runtime:
+  predict_concurrency : 256
+environment_variables:
+  VLLM_LOGGING_LEVEL: WARNING
+  VLLM_USE_V1: 0
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+requirements:
+  - vllm[audio]
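The whisper base config swaps the predict endpoint to vLLM's OpenAI-compatible transcription route. A hypothetical smoke test against a server booted from this config; the URL, file name, and model value are placeholders:

```python
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"model": "whisper-checkpoint"},  # placeholder model name
    )
print(resp.status_code, resp.json())
```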
truss/cli/train/metrics_watcher.py

@@ -4,6 +4,7 @@ import traceback
 from typing import Any, Dict, List, Optional, Tuple, cast
 
 from rich.columns import Columns
+from rich.layout import Layout
 from rich.live import Live
 from rich.table import Table
 from rich.text import Text
@@ -96,90 +97,200 @@ class MetricsWatcher(TrainingPollerMixin):
         )
         return True
 
-    def create_metrics_table(self, metrics_data: Dict) -> Columns:
+    def create_metrics_table(self, metrics_data: Dict) -> Layout:
         """Create a Rich table with the metrics"""
-        compute_table = self._create_compute_table(metrics_data)
-        storage_table = self._maybe_create_storage_table(metrics_data)
-        tables = [compute_table]
-        if storage_table:
-            tables.append(storage_table)
-        return Columns(tables, title="Training Job Metrics")
-
-    def _create_compute_table(self, metrics_data: Dict) -> Table:
-        table = Table(title="Compute Metrics")
-        table.add_column("Metric")
-        table.add_column("Value")
+        tables = []
+
+        timestamp = self._get_timestamp_from_metrics(metrics_data)
+
+        node_tables = self._create_unified_node_metrics_tables(metrics_data)
+        tables.extend(node_tables)
+
+        storage_tables = self._create_storage_tables(metrics_data)
+        tables.extend(storage_tables)
+
+        columns = Columns(tables, title="Training Job Metrics")
+
+        layout = Layout()
+
+        if timestamp:
+            from rich.panel import Panel
+
+            layout.split_column(
+                Layout(
+                    Panel(
+                        f"🕐 Last Updated: {timestamp}\n💡 Press Ctrl+C to exit",
+                        style="bold cyan",
+                    ),
+                    size=4,
+                ),
+                Layout(columns),
+            )
+        else:
+            layout.split_column(Layout(columns))
+
+        return layout
+
+    def _get_timestamp_from_metrics(self, metrics_data: Dict) -> Optional[str]:
+        """Extract timestamp from metrics data for display"""
+        # Try to get timestamp from per_node_metrics first. Fall back to main metrics if not there.
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+        if per_node_metrics and len(per_node_metrics) > 0:
+            first_node_metrics = per_node_metrics[0].get("metrics", {})
+            cpu_usage_data = first_node_metrics.get("cpu_usage", [])
+            if cpu_usage_data and len(cpu_usage_data) > 0:
+                timestamp = cpu_usage_data[-1].get("timestamp")
+                if timestamp:
+                    return common.format_localized_time(timestamp)
 
-        # Add timestamp if available
         cpu_usage_data = metrics_data.get("cpu_usage", [])
         if cpu_usage_data and len(cpu_usage_data) > 0:
-            latest_timestamp = cpu_usage_data[-1].get("timestamp")
-            # TODO: API result has missing timezone info.
-            if latest_timestamp:
-                table.add_row(
-                    "Timestamp", common.format_localized_time(latest_timestamp)
-                )
-                table.add_section()
+            timestamp = cpu_usage_data[-1].get("timestamp")
+            if timestamp:
+                return common.format_localized_time(timestamp)
+
+        return None
+
+    def _create_unified_node_metrics_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create tables for node metrics, handling both single and multi-node scenarios"""
+        tables = []
+
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+
+        if not per_node_metrics:
+            # Job is likely just starting up - it takes some type for the
+            # the metrics to become available after the job starts running.
+            from rich.text import Text
+
+            waiting_table = Table(title="Training Job Status")
+            waiting_table.add_column("Status")
+            waiting_table.add_column("Message")
+
+            waiting_table.add_row(
+                "Status",
+                Text("⏳ Waiting for metrics to become available...", style="yellow"),
+            )
+            waiting_table.add_row(
+                "Note",
+                Text(
+                    "Metrics will appear once the training job starts running.",
+                    style="dim",
+                ),
+            )
+
+            tables.append(waiting_table)
+            return tables
+
+        for node_metrics in per_node_metrics:
+            node_id = node_metrics.get("node_id", "Unknown")
+            metrics = node_metrics.get("metrics", {})
+
+            if not metrics:
+                continue
 
-        # CPU metrics
-        cpu_usage = self._get_latest_metric(metrics_data.get("cpu_usage", []))
+            table = self._create_node_table(node_id, metrics)
+            tables.append(table)
+
+        return tables
+
+    def _create_node_table(self, node_id: str, metrics: Dict) -> Table:
+        """Create a table for a single node's metrics"""
+        table = Table(title=f"Node: {node_id}")
+        table.add_column("Metric")
+        table.add_column("Value")
+
+        cpu_usage = self._get_latest_metric(metrics.get("cpu_usage", []))
         if cpu_usage is not None:
-            table.add_row("CPU Usage", f"{cpu_usage:.2f} cores")
+            table.add_row("CPU usage", f"{cpu_usage:.2f} cores")
 
-        cpu_memory = self._get_latest_metric(
-            metrics_data.get("cpu_memory_usage_bytes", [])
-        )
+        cpu_memory = self._get_latest_metric(metrics.get("cpu_memory_usage_bytes", []))
         if cpu_memory is not None:
             formatted_value, color = self._format_bytes(cpu_memory)
-            table.add_row("CPU Memory", Text(formatted_value, style=color))
+            table.add_row("CPU memory", Text(formatted_value, style=color))
 
-        # Add separator after CPU metrics
-        table.add_section()
+        if cpu_usage is not None or cpu_memory is not None:
+            table.add_section()
 
-        # GPU metrics - grouped by GPU ID
-        gpu_metrics = metrics_data.get("gpu_utilization", {})
-        gpu_memory = metrics_data.get("gpu_memory_usage_bytes", {})
+        gpu_utilization = metrics.get("gpu_utilization", {})
+        gpu_memory = metrics.get("gpu_memory_usage_bytes", {})
 
-        for gpu_id in sorted(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
-            # Add GPU utilization
-            latest_util = self._get_latest_metric(gpu_metrics.get(gpu_id, []))
+        # API should return same GPU IDs for utilization and memory
+        keys = gpu_utilization.keys()
+        for idx, gpu_id in enumerate(keys):
+            latest_util = self._get_latest_metric(gpu_utilization.get(gpu_id, []))
             if latest_util is not None:
-                table.add_row(f"GPU {gpu_id} Usage", f"{latest_util * 100:.1f}%")
+                table.add_row(f"GPU {gpu_id} utilization", f"{latest_util * 100:.1f}%")
 
-            # Add GPU memory right after its utilization
             latest_memory = self._get_latest_metric(gpu_memory.get(gpu_id, []))
             if latest_memory is not None:
                 formatted_value, color = self._format_bytes(latest_memory)
                 table.add_row(
-                    f"GPU {gpu_id} Memory", Text(formatted_value, style=color)
+                    f"GPU {gpu_id} memory", Text(formatted_value, style=color)
                 )
 
-            # Add separator after each GPU's metrics (except for the last one)
-            if gpu_id != max(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
+            if idx != len(keys) - 1:
                 table.add_section()
 
-        # Add separator before storage metrics
-        if gpu_metrics or gpu_memory:
-            table.add_section()
-        return table
+        ephemeral_storage = metrics.get("ephemeral_storage")
+        if ephemeral_storage:
+            if gpu_utilization or gpu_memory:
+                table.add_section()
 
-    def _maybe_create_storage_table(self, metrics_data: Dict) -> Optional[Table]:
-        ephemeral_storage_metrics = metrics_data.get("ephemeral_storage")
-        cache_storage_metrics = metrics_data.get("cache")
-        if ephemeral_storage_metrics or cache_storage_metrics:
-            storage_table = Table(title="Storage Metrics")
-            storage_table.add_column("Storage Type")
-            storage_table.add_column("Usage")
-            storage_table.add_column("Utilization")
-            did_add_ephemeral = self._maybe_format_storage_table_row(
-                storage_table, "Ephemeral Storage", ephemeral_storage_metrics
+            usage_bytes = self._get_latest_metric(
+                ephemeral_storage.get("usage_bytes", [])
             )
-            did_add_cache = self._maybe_format_storage_table_row(
-                storage_table, "Cache Storage", cache_storage_metrics
+            utilization = self._get_latest_metric(
+                ephemeral_storage.get("utilization", [])
             )
-            if did_add_ephemeral or did_add_cache:
-                return storage_table
-        return None
+
+            if usage_bytes is not None:
+                formatted_value, color = self._format_bytes(usage_bytes)
+                table.add_row("Eph. storage usage", Text(formatted_value, style=color))
+
+            if utilization is not None:
+                utilization_percent = utilization * 100
+                if utilization_percent > 90:
+                    color = "red"
+                elif utilization_percent > 70:
+                    color = "yellow"
+                else:
+                    color = "green"
+                table.add_row(
+                    "Eph. storage utilization",
+                    Text(f"{utilization_percent:.1f}%", style=color),
+                )
+
+        return table
+
+    def _create_storage_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create storage tables - only cache per job (ephemeral is now in node tables)"""
+        tables = []
+
+        # Create cache storage table (job-level, shown once)
+        cache_storage = metrics_data.get("cache")
+        if cache_storage:
+            table = self._create_cache_storage_table(cache_storage)
+            if table:
+                tables.append(table)
+
+        return tables
+
+    def _create_cache_storage_table(self, cache_storage: Dict) -> Optional[Table]:
+        """Create table for cache storage metrics (job-level)"""
+        usage_bytes = self._get_latest_metric(cache_storage.get("usage_bytes", []))
+        utilization = self._get_latest_metric(cache_storage.get("utilization", []))
+
+        if usage_bytes is None and utilization is None:
+            return None
+
+        table = Table(title="Cache storage")
+        table.add_column("Storage Type")
+        table.add_column("Usage")
+        table.add_column("Utilization")
+
+        self._maybe_format_storage_table_row(table, "Cache storage", cache_storage)
+
+        return table
 
     def watch(self, refresh_rate: int = METRICS_POLL_INTERVAL_SEC):
         """Display continuously updating metrics"""
truss/cli/train_commands.py

@@ -13,6 +13,7 @@ from truss.cli.train import common as train_common
 from truss.cli.train import core
 from truss.cli.utils import common
 from truss.cli.utils.output import console, error_console
+from truss.remote.baseten.core import get_training_job_logs_with_pagination
 from truss.remote.baseten.remote import BasetenRemote
 from truss.remote.remote_factory import RemoteFactory
 
@@ -72,8 +73,11 @@ def _prepare_click_context(f: click.Command, params: dict) -> click.Context:
 @click.argument("config", type=Path, required=True)
 @click.option("--remote", type=str, required=False, help="Remote to use")
 @click.option("--tail", is_flag=True, help="Tail for status + logs after push.")
+@click.option("--job-name", type=str, required=False, help="Name of the training job.")
 @common.common_options()
-def push_training_job(config: Path, remote: Optional[str], tail: bool):
+def push_training_job(
+    config: Path, remote: Optional[str], tail: bool, job_name: Optional[str]
+):
     """Run a training job"""
     from truss_train import deployment
 
@@ -84,7 +88,9 @@ def push_training_job(config: Path, remote: Optional[str], tail: bool):
     remote_provider: BasetenRemote = cast(
         BasetenRemote, RemoteFactory.create(remote=remote)
     )
-    job_resp = deployment.create_training_job_from_file(remote_provider, config)
+    job_resp = deployment.create_training_job_from_file(
+        remote_provider, config, job_name
+    )
 
     # Note: This post create logic needs to happen outside the context
     # of the above context manager, as only one console session can be active
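The new `--job-name` flag threads through to job creation. A hedged sketch of the widened signature this call site implies (`truss_train/deployment.py`'s own +15/−2 hunk is not shown in this section):

```python
from pathlib import Path
from typing import Optional

def create_training_job_from_file(
    remote_provider, config: Path, job_name: Optional[str] = None
):
    """Assumed shape: when job_name is given it presumably overrides the name
    from the training config; when omitted, prior behavior is kept."""

# e.g. from the CLI:  truss train push config.py --job-name my-experiment-v2
```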
@@ -138,7 +144,9 @@ def get_job_logs(
     )
 
     if not tail:
-        logs = remote_provider.api.get_training_job_logs(project_id, job_id)
+        logs = get_training_job_logs_with_pagination(
+            remote_provider.api, project_id, job_id
+        )
         for log in cli_log_utils.parse_logs(logs):
             cli_log_utils.output_log(log)
     else:
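Non-tail log fetches now go through `get_training_job_logs_with_pagination` (part of the +209 lines in `core.py`, not shown here). The general cursor-pagination loop such a helper typically wraps, with hypothetical parameter names and cursor mechanics:

```python
from typing import Callable, List, Optional, Tuple

def fetch_all_pages(
    fetch_page: Callable[[Optional[str]], Tuple[List[dict], Optional[str]]],
) -> List[dict]:
    # fetch_page takes a cursor (None for the first page) and returns
    # (page_of_logs, next_cursor); a falsy cursor means no further pages.
    logs: List[dict] = []
    cursor: Optional[str] = None
    while True:
        page, next_cursor = fetch_page(cursor)
        logs.extend(page)
        if not next_cursor:
            return logs
        cursor = next_cursor
```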
truss/contexts/image_builder/serving_image_builder.py

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 import json
 import logging
 import re
@@ -74,7 +73,6 @@ from truss.contexts.image_builder.util import (
 )
 from truss.contexts.truss_context import TrussContext
 from truss.truss_handle.patch.hash import directory_content_hash
-from truss.util.basetenpointer import model_cache_hf_to_b10ptr
 from truss.util.jinja import read_template_from_fs
 from truss.util.path import (
     build_truss_target_directory,
@@ -327,36 +325,27 @@ def get_files_to_model_cache_v1(config: TrussConfig, truss_dir: Path, build_dir:
 def build_model_cache_v2_and_copy_bptr_manifest(config: TrussConfig, build_dir: Path):
     assert config.model_cache.is_v2
     assert all(model.volume_folder is not None for model in config.model_cache.models)
-    try:
-        from truss_transfer import PyModelRepo, create_basetenpointer_from_models
-
-        py_models = [
-            PyModelRepo(
-                repo_id=model.repo_id,
-                revision=model.revision,
-                runtime_secret_name=model.runtime_secret_name,
-                allow_patterns=model.allow_patterns,
-                ignore_patterns=model.ignore_patterns,
-                volume_folder=model.volume_folder,
-                kind=model.kind.value,
-            )
-            for model in config.model_cache.models
-        ]
-        # create BasetenPointer from models
-        basetenpointer_json = create_basetenpointer_from_models(models=py_models)
-        bptr_py = json.loads(basetenpointer_json)["pointers"]
-        logging.info(f"created ({len(bptr_py)}) Basetenpointer")
-        logging.info(f"pointers json: {basetenpointer_json}")
-        with open(build_dir / "bptr-manifest", "w") as f:
-            f.write(basetenpointer_json)
-    except Exception as e:
-        logging.warning(f"debug: failed to create BasetenPointer: {e}")
-        # TODO: remove below section + remove logging lines above.
-        # builds BasetenManifest for caching
-        basetenpointers = model_cache_hf_to_b10ptr(config.model_cache)
-        # write json of bastenpointers into build dir
-        with open(build_dir / "bptr-manifest", "w") as f:
-            f.write(basetenpointers.model_dump_json())
+    from truss_transfer import PyModelRepo, create_basetenpointer_from_models
+
+    py_models = [
+        PyModelRepo(
+            repo_id=model.repo_id,
+            revision=model.revision,
+            runtime_secret_name=model.runtime_secret_name,
+            allow_patterns=model.allow_patterns,
+            ignore_patterns=model.ignore_patterns,
+            volume_folder=model.volume_folder,
+            kind=model.kind.value,
+        )
+        for model in config.model_cache.models
+    ]
+    # create BasetenPointer from models
+    basetenpointer_json = create_basetenpointer_from_models(models=py_models)
+    bptr_py = json.loads(basetenpointer_json)["pointers"]
+    logging.info(f"created ({len(bptr_py)}) Basetenpointer")
+    logging.info(f"pointers json: {basetenpointer_json}")
+    with open(build_dir / "bptr-manifest", "w") as f:
+        f.write(basetenpointer_json)
 
 
 def generate_docker_server_nginx_config(build_dir, config):
@@ -794,7 +783,6 @@ class ServingImageBuilder(ImageBuilder):
             config
         )
 
-        non_root_user = os.getenv("BT_USE_NON_ROOT_USER", False)
         dockerfile_contents = dockerfile_template.render(
             should_install_server_requirements=should_install_server_requirements,
             base_image_name_and_tag=base_image_name_and_tag,
@@ -828,12 +816,7 @@ class ServingImageBuilder(ImageBuilder):
             build_commands=build_commands,
             use_local_src=config.use_local_src,
             passthrough_environment_variables=passthrough_environment_variables,
-            non_root_user=non_root_user,
-            app_username="app",
-            app_user_uid=60000,
-            control_server_dir="/control",
-            default_owner="0:0",  # root user
-            **FILENAME_CONSTANTS_MAP,  # Add this line
+            **FILENAME_CONSTANTS_MAP,
         )
         # Consolidate repeated empty lines to single empty lines.
         dockerfile_contents = re.sub(