truss-0.10.9rc535-py3-none-any.whl → truss-0.10.10rc0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- truss/cli/logs/base_watcher.py +1 -1
- truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +30 -22
- truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py +8 -2
- truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py +14 -7
- truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py +63 -0
- truss/cli/train/deploy_from_checkpoint_config_whisper.yml +17 -0
- truss/cli/train/metrics_watcher.py +170 -59
- truss/cli/train_commands.py +11 -3
- truss/contexts/image_builder/serving_image_builder.py +22 -39
- truss/remote/baseten/api.py +11 -0
- truss/remote/baseten/core.py +209 -1
- truss/remote/baseten/utils/time.py +15 -0
- truss/templates/base.Dockerfile.jinja +6 -23
- truss/templates/cache.Dockerfile.jinja +5 -5
- truss/templates/copy_cache_files.Dockerfile.jinja +1 -1
- truss/templates/docker_server/supervisord.conf.jinja +0 -1
- truss/templates/server/requirements.txt +1 -1
- truss/templates/server.Dockerfile.jinja +16 -33
- truss/tests/cli/train/test_deploy_checkpoints.py +446 -2
- truss/tests/cli/train/test_train_cli_core.py +96 -0
- truss/tests/remote/baseten/conftest.py +18 -0
- truss/tests/remote/baseten/test_api.py +49 -14
- truss/tests/remote/baseten/test_core.py +517 -1
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/METADATA +2 -2
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/RECORD +31 -29
- truss_train/definitions.py +6 -0
- truss_train/deployment.py +15 -2
- truss_train/loader.py +7 -20
- truss/tests/util/test_basetenpointer.py +0 -227
- truss/util/basetenpointer.py +0 -160
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/WHEEL +0 -0
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/entry_points.txt +0 -0
- {truss-0.10.9rc535.dist-info → truss-0.10.10rc0.dist-info}/licenses/LICENSE +0 -0
truss/cli/logs/base_watcher.py
CHANGED
(+1 -1; the one-line change was not captured in this view)

truss/cli/train/deploy_checkpoints/deploy_checkpoints.py
CHANGED
@@ -33,6 +33,10 @@ from .deploy_lora_checkpoints import (
     hydrate_lora_checkpoint,
     render_vllm_lora_truss_config,
 )
+from .deploy_whisper_checkpoints import (
+    hydrate_whisper_checkpoint,
+    render_vllm_whisper_truss_config,
+)

 HF_TOKEN_ENVVAR_NAME = "HF_TOKEN"
 # If we change this, make sure to update the logic in backend codebase

@@ -178,6 +182,8 @@ def hydrate_checkpoint(
         return hydrate_lora_checkpoint(job_id, checkpoint_id, checkpoint)
     elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value:
         return hydrate_full_checkpoint(job_id, checkpoint_id, checkpoint)
+    elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value:
+        return hydrate_whisper_checkpoint(job_id, checkpoint_id, checkpoint)
     else:
         raise ValueError(
             f"Unsupported checkpoint type: {checkpoint_type}. Contact Baseten for support with other checkpoint types."

@@ -196,6 +202,8 @@ def _render_truss_config_for_checkpoint_deployment(
         return render_vllm_lora_truss_config(checkpoint_deploy)
     elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.FULL:
         return render_vllm_full_truss_config(checkpoint_deploy)
+    elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER:
+        return render_vllm_whisper_truss_config(checkpoint_deploy)
     else:
         raise ValueError(
             f"Unsupported model weight format: {checkpoint_deploy.model_weight_format}. Please upgrade to the latest Truss version to access the latest supported formats. Contact Baseten if you would like us to support additional formats."

@@ -288,18 +296,6 @@ def _get_checkpoint_ids_to_deploy(
     return checkpoint_ids


-def _select_single_checkpoint(checkpoint_id_options: List[str]) -> List[str]:
-    """Select a single checkpoint using interactive prompt."""
-    checkpoint_id = inquirer.select(
-        message="Select the checkpoint to deploy:", choices=checkpoint_id_options
-    ).execute()
-
-    if not checkpoint_id:
-        raise click.UsageError("A checkpoint must be selected.")
-
-    return [checkpoint_id]
-
-
 def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
     """Select multiple checkpoints using interactive checkbox."""
     checkpoint_ids = inquirer.checkbox(

@@ -351,6 +347,8 @@ def _get_base_model_id(user_input: Optional[str], checkpoint: dict) -> Optional[
         )
     elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.FULL.value.lower():
         return None
+    elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.WHISPER.value.lower():
+        return None
     else:
         base_model_id = inquirer.text(message="Enter the base model id.").execute()
         if not base_model_id:

@@ -416,18 +414,28 @@ def _validate_selected_checkpoints(
             "Unable to infer model weight format. Reach out to Baseten for support."
         )

-  … (10 deleted lines not captured in this view)
+    validation_rules = {
+        ModelWeightsFormat.FULL.value: {
+            "error_message": "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is full model weights.",
+        },
+        ModelWeightsFormat.WHISPER.value: {
+            "error_message": "Whisper checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is whisper model weights.",
+        },
+    }
+
+    # Check each checkpoint type that has restrictions
+    for checkpoint_type, rule in validation_rules.items():
+        has_restricted_checkpoint = any(
+            response_checkpoints[checkpoint_id].get("checkpoint_type")
+            == checkpoint_type
+            for checkpoint_id in checkpoint_ids
         )

+        if has_restricted_checkpoint and len(checkpoint_ids) > 1:
+            raise ValueError(rule["error_message"])
+

 def get_hf_secret_name(user_input: Union[str, SecretReference, None]) -> str:
     """Get HuggingFace secret name from user input or prompt for it."""
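
The whisper branches slot into the existing format dispatch without touching the other cases. A minimal standalone sketch of that dispatch shape (the enum values and return payloads here are stand-ins, not the actual truss_train definitions):

    from enum import Enum

    class ModelWeightsFormat(Enum):
        # Stand-in enum; the real values live in truss_train.definitions and are assumed here.
        LORA = "LoRA"
        FULL = "Full"
        WHISPER = "Whisper"

    def hydrate_checkpoint(job_id: str, checkpoint_id: str, checkpoint: dict):
        checkpoint_type = checkpoint.get("checkpoint_type", "")
        if checkpoint_type.lower() == ModelWeightsFormat.LORA.value.lower():
            return ("lora", job_id, checkpoint_id)
        elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value.lower():
            return ("full", job_id, checkpoint_id)
        elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value.lower():
            # New branch in this release: whisper checkpoints get their own hydrator.
            return ("whisper", job_id, checkpoint_id)
        raise ValueError(f"Unsupported checkpoint type: {checkpoint_type}.")

    print(hydrate_checkpoint("job-123", "checkpoint-1", {"checkpoint_type": "whisper"}))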
truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py
CHANGED

@@ -3,7 +3,7 @@ from pathlib import Path

 from truss.base import truss_config
 from truss.cli.train.types import DeployCheckpointsConfigComplete
-from truss_train.definitions import SecretReference
+from truss_train.definitions import ModelWeightsFormat, SecretReference

 START_COMMAND_ENVVAR_NAME = "BT_DOCKER_SERVER_START_CMD"


@@ -12,8 +12,14 @@ def setup_base_truss_config(
     checkpoint_deploy: DeployCheckpointsConfigComplete,
 ) -> truss_config.TrussConfig:
     """Set up the base truss config with common properties."""
+    truss_deploy_config = None
+    truss_base_file = (
+        "deploy_from_checkpoint_config_whisper.yml"
+        if checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER
+        else "deploy_from_checkpoint_config.yml"
+    )
     truss_deploy_config = truss_config.TrussConfig.from_yaml(
-        Path(os.path.dirname(__file__), "..",
+        Path(os.path.dirname(__file__), "..", truss_base_file)
     )
     if not truss_deploy_config.docker_server:
         raise ValueError(
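
setup_base_truss_config now picks the bundled base YAML by weight format and resolves it one directory above the helpers module. A rough standalone equivalent of just that selection step (a hypothetical is_whisper flag stands in for the real ModelWeightsFormat check):

    import os
    from pathlib import Path

    def pick_base_config(is_whisper: bool) -> Path:
        # Mirror of the conditional in setup_base_truss_config: whisper deployments
        # get their own base YAML; everything else keeps the original file.
        truss_base_file = (
            "deploy_from_checkpoint_config_whisper.yml"
            if is_whisper
            else "deploy_from_checkpoint_config.yml"
        )
        # The diff resolves the file one directory above the helpers module.
        return Path(os.path.dirname(__file__), "..", truss_base_file)

    print(pick_base_config(True).name)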
truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
CHANGED

@@ -14,12 +14,19 @@ from .deploy_checkpoints_helpers import (
     setup_environment_variables_and_secrets,
 )

+# NB(aghilan): Transformers was recently changed to save a chat_template.jinja file instead of inside the tokenizer_config.json file.
+# Old Models will not have this file, so we check for it and use it if it exists.
+# vLLM will not automatically resolve the chat_template.jinja file, so we need to pass it to the start command.
+# This logic is needed for any models trained using Transformers v4.51.3 or later
 VLLM_FULL_START_COMMAND = Template(
-  … (5 deleted lines not captured in this view)
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "if [ -f {{ model_path }}/chat_template.jinja ]; then "
+    "  vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
+    "  --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "else "
+    "  vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "fi'"
 )


@@ -33,7 +40,7 @@ def render_vllm_full_truss_config(
         truss_deploy_config, checkpoint_deploy
     )

-    checkpoint_str =
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)

     accelerator = checkpoint_deploy.compute.accelerator


@@ -64,7 +71,7 @@ def hydrate_full_checkpoint(
     return FullCheckpoint(training_job_id=job_id, paths=paths)


-def
+def build_full_checkpoint_string(truss_deploy_config) -> str:
     """Build checkpoint string from artifact references for full checkpoints.

     Args:
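
Rendering the template makes the chat-template fallback concrete. The sketch below reuses the template string exactly as it appears in the diff and fills it with made-up values; real values come from the checkpoint deploy config:

    from jinja2 import Template

    VLLM_FULL_START_COMMAND = Template(
        "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
        "if [ -f {{ model_path }}/chat_template.jinja ]; then "
        "  vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
        "  --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "else "
        "  vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "fi'"
    )

    # Example values only; prints the shell command that would land in supervisord.
    print(
        VLLM_FULL_START_COMMAND.render(
            model_path="/tmp/training_checkpoints/job123/rank-0/checkpoint-1",
            envvars="VLLM_LOGGING_LEVEL=WARNING",
            specify_tensor_parallelism=2,
        )
    )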
truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py
ADDED

@@ -0,0 +1,63 @@
+from jinja2 import Template
+
+from truss.base import truss_config
+from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
+    START_COMMAND_ENVVAR_NAME,
+)
+from truss.cli.train.deploy_checkpoints.deploy_full_checkpoints import (
+    build_full_checkpoint_string,
+)
+from truss.cli.train.types import DeployCheckpointsConfigComplete
+from truss_train.definitions import WhisperCheckpoint
+
+from .deploy_checkpoints_helpers import (
+    setup_base_truss_config,
+    setup_environment_variables_and_secrets,
+)
+
+VLLM_WHISPER_START_COMMAND = Template(
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
+)
+
+
+def render_vllm_whisper_truss_config(
+    checkpoint_deploy: DeployCheckpointsConfigComplete,
+) -> truss_config.TrussConfig:
+    """Render truss config specifically for whisper checkpoints using vLLM."""
+    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
+
+    start_command_envvars = setup_environment_variables_and_secrets(
+        truss_deploy_config, checkpoint_deploy
+    )
+
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
+
+    accelerator = checkpoint_deploy.compute.accelerator
+
+    start_command_args = {
+        "model_path": checkpoint_str,
+        "envvars": start_command_envvars,
+        "specify_tensor_parallelism": accelerator.count if accelerator else 1,
+    }
+    # Note: we set the start command as an environment variable in supervisord config.
+    # This is so that we don't have to change the supervisord config when the start command changes.
+    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
+    start_command = VLLM_WHISPER_START_COMMAND.render(**start_command_args)
+    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
+    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
+    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
+        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
+    )
+
+    return truss_deploy_config
+
+
+def hydrate_whisper_checkpoint(
+    job_id: str, checkpoint_id: str, checkpoint: dict
+) -> WhisperCheckpoint:
+    """Create a Checkpoint object for whisper model weights."""
+    # NOTE: Slash at the end is important since it means the checkpoint is a directory
+    paths = [f"rank-0/{checkpoint_id}/"]
+    return WhisperCheckpoint(training_job_id=job_id, paths=paths)
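
The hydrator itself is tiny: it only fabricates a rank-0 directory path, where the trailing slash marks a directory. A toy reproduction with a plain dataclass standing in for truss_train's WhisperCheckpoint:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class WhisperCheckpoint:  # stand-in for truss_train.definitions.WhisperCheckpoint
        training_job_id: str
        paths: List[str] = field(default_factory=list)

    def hydrate_whisper_checkpoint(job_id: str, checkpoint_id: str, checkpoint: dict) -> WhisperCheckpoint:
        # Trailing slash marks the checkpoint as a directory, as noted in the diff.
        paths = [f"rank-0/{checkpoint_id}/"]
        return WhisperCheckpoint(training_job_id=job_id, paths=paths)

    print(hydrate_whisper_checkpoint("job-123", "checkpoint-5", {}))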
truss/cli/train/deploy_from_checkpoint_config_whisper.yml
ADDED

@@ -0,0 +1,17 @@
+base_image:
+  image: vllm/vllm-openai:latest
+
+docker_server:
+  start_command: sh -c ""  # replaced when deploying
+  readiness_endpoint: /health
+  liveness_endpoint: /health
+  predict_endpoint: /v1/audio/transcriptions
+  server_port: 8000
+runtime:
+  predict_concurrency: 256
+environment_variables:
+  VLLM_LOGGING_LEVEL: WARNING
+  VLLM_USE_V1: 0
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+requirements:
+  - vllm[audio]
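
Since the base image runs vLLM's OpenAI-compatible server, the predict endpoint should accept standard transcription requests. A hedged sketch of such a call (the host, file name, and model value are all assumptions; this diff shows no client code):

    import requests  # assumes the requests package is installed

    # Hypothetical host; a real deployment would use the Baseten model URL.
    url = "http://localhost:8000/v1/audio/transcriptions"

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            url,
            files={"file": ("sample.wav", f, "audio/wav")},
            # vLLM expects the served model name; the checkpoint path here is assumed.
            data={"model": "/tmp/training_checkpoints/job123/rank-0/checkpoint-5/"},
        )
    print(resp.json())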
truss/cli/train/metrics_watcher.py
CHANGED

@@ -4,6 +4,7 @@ import traceback
 from typing import Any, Dict, List, Optional, Tuple, cast

 from rich.columns import Columns
+from rich.layout import Layout
 from rich.live import Live
 from rich.table import Table
 from rich.text import Text

@@ -96,90 +97,200 @@ class MetricsWatcher(TrainingPollerMixin):
         )
         return True

-    def create_metrics_table(self, metrics_data: Dict) ->
+    def create_metrics_table(self, metrics_data: Dict) -> Layout:
         """Create a Rich table with the metrics"""
-  … (11 deleted lines not captured in this view)
+        tables = []
+
+        timestamp = self._get_timestamp_from_metrics(metrics_data)
+
+        node_tables = self._create_unified_node_metrics_tables(metrics_data)
+        tables.extend(node_tables)
+
+        storage_tables = self._create_storage_tables(metrics_data)
+        tables.extend(storage_tables)
+
+        columns = Columns(tables, title="Training Job Metrics")
+
+        layout = Layout()
+
+        if timestamp:
+            from rich.panel import Panel
+
+            layout.split_column(
+                Layout(
+                    Panel(
+                        f"🕐 Last Updated: {timestamp}\n💡 Press Ctrl+C to exit",
+                        style="bold cyan",
+                    ),
+                    size=4,
+                ),
+                Layout(columns),
+            )
+        else:
+            layout.split_column(Layout(columns))
+
+        return layout
+
+    def _get_timestamp_from_metrics(self, metrics_data: Dict) -> Optional[str]:
+        """Extract timestamp from metrics data for display"""
+        # Try to get timestamp from per_node_metrics first. Fall back to main metrics if not there.
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+        if per_node_metrics and len(per_node_metrics) > 0:
+            first_node_metrics = per_node_metrics[0].get("metrics", {})
+            cpu_usage_data = first_node_metrics.get("cpu_usage", [])
+            if cpu_usage_data and len(cpu_usage_data) > 0:
+                timestamp = cpu_usage_data[-1].get("timestamp")
+                if timestamp:
+                    return common.format_localized_time(timestamp)

-        # Add timestamp if available
         cpu_usage_data = metrics_data.get("cpu_usage", [])
         if cpu_usage_data and len(cpu_usage_data) > 0:
-  … (7 deleted lines not captured in this view)
+            timestamp = cpu_usage_data[-1].get("timestamp")
+            if timestamp:
+                return common.format_localized_time(timestamp)
+
+        return None
+
+    def _create_unified_node_metrics_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create tables for node metrics, handling both single and multi-node scenarios"""
+        tables = []
+
+        per_node_metrics = metrics_data.get("per_node_metrics", [])
+
+        if not per_node_metrics:
+            # Job is likely just starting up - it takes some time for
+            # the metrics to become available after the job starts running.
+            from rich.text import Text
+
+            waiting_table = Table(title="Training Job Status")
+            waiting_table.add_column("Status")
+            waiting_table.add_column("Message")
+
+            waiting_table.add_row(
+                "Status",
+                Text("⏳ Waiting for metrics to become available...", style="yellow"),
+            )
+            waiting_table.add_row(
+                "Note",
+                Text(
+                    "Metrics will appear once the training job starts running.",
+                    style="dim",
+                ),
+            )
+
+            tables.append(waiting_table)
+            return tables
+
+        for node_metrics in per_node_metrics:
+            node_id = node_metrics.get("node_id", "Unknown")
+            metrics = node_metrics.get("metrics", {})
+
+            if not metrics:
+                continue

-
-
+            table = self._create_node_table(node_id, metrics)
+            tables.append(table)
+
+        return tables
+
+    def _create_node_table(self, node_id: str, metrics: Dict) -> Table:
+        """Create a table for a single node's metrics"""
+        table = Table(title=f"Node: {node_id}")
+        table.add_column("Metric")
+        table.add_column("Value")
+
+        cpu_usage = self._get_latest_metric(metrics.get("cpu_usage", []))
         if cpu_usage is not None:
-            table.add_row("CPU
+            table.add_row("CPU usage", f"{cpu_usage:.2f} cores")

-        cpu_memory = self._get_latest_metric(
-            metrics_data.get("cpu_memory_usage_bytes", [])
-        )
+        cpu_memory = self._get_latest_metric(metrics.get("cpu_memory_usage_bytes", []))
         if cpu_memory is not None:
             formatted_value, color = self._format_bytes(cpu_memory)
-            table.add_row("CPU
+            table.add_row("CPU memory", Text(formatted_value, style=color))

-
-
+        if cpu_usage is not None or cpu_memory is not None:
+            table.add_section()

-
-
-        gpu_memory = metrics_data.get("gpu_memory_usage_bytes", {})
+        gpu_utilization = metrics.get("gpu_utilization", {})
+        gpu_memory = metrics.get("gpu_memory_usage_bytes", {})

-
-
-
+        # API should return same GPU IDs for utilization and memory
+        keys = gpu_utilization.keys()
+        for idx, gpu_id in enumerate(keys):
+            latest_util = self._get_latest_metric(gpu_utilization.get(gpu_id, []))
             if latest_util is not None:
-                table.add_row(f"GPU {gpu_id}
+                table.add_row(f"GPU {gpu_id} utilization", f"{latest_util * 100:.1f}%")

-            # Add GPU memory right after its utilization
             latest_memory = self._get_latest_metric(gpu_memory.get(gpu_id, []))
             if latest_memory is not None:
                 formatted_value, color = self._format_bytes(latest_memory)
                 table.add_row(
-                    f"GPU {gpu_id}
+                    f"GPU {gpu_id} memory", Text(formatted_value, style=color)
                 )

-
-            if gpu_id != max(set(gpu_metrics.keys()) | set(gpu_memory.keys())):
+            if idx != len(keys) - 1:
                 table.add_section()

-
-        if
-
-
+        ephemeral_storage = metrics.get("ephemeral_storage")
+        if ephemeral_storage:
+            if gpu_utilization or gpu_memory:
+                table.add_section()

-
-
-        cache_storage_metrics = metrics_data.get("cache")
-        if ephemeral_storage_metrics or cache_storage_metrics:
-            storage_table = Table(title="Storage Metrics")
-            storage_table.add_column("Storage Type")
-            storage_table.add_column("Usage")
-            storage_table.add_column("Utilization")
-            did_add_ephemeral = self._maybe_format_storage_table_row(
-                storage_table, "Ephemeral Storage", ephemeral_storage_metrics
+            usage_bytes = self._get_latest_metric(
+                ephemeral_storage.get("usage_bytes", [])
             )
-
-
+            utilization = self._get_latest_metric(
+                ephemeral_storage.get("utilization", [])
             )
-
-
-
+
+            if usage_bytes is not None:
+                formatted_value, color = self._format_bytes(usage_bytes)
+                table.add_row("Eph. storage usage", Text(formatted_value, style=color))
+
+            if utilization is not None:
+                utilization_percent = utilization * 100
+                if utilization_percent > 90:
+                    color = "red"
+                elif utilization_percent > 70:
+                    color = "yellow"
+                else:
+                    color = "green"
+                table.add_row(
+                    "Eph. storage utilization",
+                    Text(f"{utilization_percent:.1f}%", style=color),
+                )
+
+        return table
+
+    def _create_storage_tables(self, metrics_data: Dict) -> List[Table]:
+        """Create storage tables - only cache per job (ephemeral is now in node tables)"""
+        tables = []
+
+        # Create cache storage table (job-level, shown once)
+        cache_storage = metrics_data.get("cache")
+        if cache_storage:
+            table = self._create_cache_storage_table(cache_storage)
+            if table:
+                tables.append(table)
+
+        return tables
+
+    def _create_cache_storage_table(self, cache_storage: Dict) -> Optional[Table]:
+        """Create table for cache storage metrics (job-level)"""
+        usage_bytes = self._get_latest_metric(cache_storage.get("usage_bytes", []))
+        utilization = self._get_latest_metric(cache_storage.get("utilization", []))
+
+        if usage_bytes is None and utilization is None:
+            return None
+
+        table = Table(title="Cache storage")
+        table.add_column("Storage Type")
+        table.add_column("Usage")
+        table.add_column("Utilization")
+
+        self._maybe_format_storage_table_row(table, "Cache storage", cache_storage)
+
+        return table

     def watch(self, refresh_rate: int = METRICS_POLL_INTERVAL_SEC):
         """Display continuously updating metrics"""
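
The move from a bare table to a Layout is what allows a pinned header panel above the metric columns while Live refreshes the whole thing. A self-contained Rich sketch of the same structure, using static sample data rather than the watcher's polled metrics:

    from rich.columns import Columns
    from rich.console import Console
    from rich.layout import Layout
    from rich.panel import Panel
    from rich.table import Table

    # Sample node table, standing in for _create_node_table output.
    table = Table(title="Node: node-0")
    table.add_column("Metric")
    table.add_column("Value")
    table.add_row("CPU usage", "3.20 cores")
    table.add_row("GPU 0 utilization", "87.5%")

    columns = Columns([table], title="Training Job Metrics")

    layout = Layout()
    # Same split as the diff: a fixed-height header panel above the metric columns.
    layout.split_column(
        Layout(
            Panel("🕐 Last Updated: 12:00:00\n💡 Press Ctrl+C to exit", style="bold cyan"),
            size=4,
        ),
        Layout(columns),
    )

    Console().print(layout)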
truss/cli/train_commands.py
CHANGED

@@ -13,6 +13,7 @@ from truss.cli.train import common as train_common
 from truss.cli.train import core
 from truss.cli.utils import common
 from truss.cli.utils.output import console, error_console
+from truss.remote.baseten.core import get_training_job_logs_with_pagination
 from truss.remote.baseten.remote import BasetenRemote
 from truss.remote.remote_factory import RemoteFactory


@@ -72,8 +73,11 @@ def _prepare_click_context(f: click.Command, params: dict) -> click.Context:
 @click.argument("config", type=Path, required=True)
 @click.option("--remote", type=str, required=False, help="Remote to use")
 @click.option("--tail", is_flag=True, help="Tail for status + logs after push.")
+@click.option("--job-name", type=str, required=False, help="Name of the training job.")
 @common.common_options()
-def push_training_job(
+def push_training_job(
+    config: Path, remote: Optional[str], tail: bool, job_name: Optional[str]
+):
     """Run a training job"""
     from truss_train import deployment


@@ -84,7 +88,9 @@ def push_training_job(config: Path, remote: Optional[str], tail: bool):
     remote_provider: BasetenRemote = cast(
         BasetenRemote, RemoteFactory.create(remote=remote)
     )
-    job_resp = deployment.create_training_job_from_file(
+    job_resp = deployment.create_training_job_from_file(
+        remote_provider, config, job_name
+    )

     # Note: This post create logic needs to happen outside the context
     # of the above context manager, as only one console session can be active

@@ -138,7 +144,9 @@ def get_job_logs(
     )

     if not tail:
-        logs =
+        logs = get_training_job_logs_with_pagination(
+            remote_provider.api, project_id, job_id
+        )
         for log in cli_log_utils.parse_logs(logs):
             cli_log_utils.output_log(log)
     else:
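
The implementation of get_training_job_logs_with_pagination lands in truss/remote/baseten/core.py (part of the +209 lines there, not shown in this view). As an illustration only, a cursor-driven loop of the kind such helpers typically use; fetch_page and its response fields are hypothetical, not the Baseten API:

    from typing import Callable, Dict, List, Optional

    def fetch_all_logs(fetch_page: Callable[[Optional[str]], Dict]) -> List[dict]:
        """Drain a paginated log endpoint; fetch_page(cursor) -> {"logs": [...], "next_cursor": ...}."""
        logs: List[dict] = []
        cursor: Optional[str] = None
        while True:
            page = fetch_page(cursor)
            logs.extend(page["logs"])
            cursor = page.get("next_cursor")
            if not cursor:  # no more pages
                return logs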
truss/contexts/image_builder/serving_image_builder.py
CHANGED

@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 import json
 import logging
 import re

@@ -74,7 +73,6 @@ from truss.contexts.image_builder.util import (
 )
 from truss.contexts.truss_context import TrussContext
 from truss.truss_handle.patch.hash import directory_content_hash
-from truss.util.basetenpointer import model_cache_hf_to_b10ptr
 from truss.util.jinja import read_template_from_fs
 from truss.util.path import (
     build_truss_target_directory,

@@ -327,36 +325,27 @@ def get_files_to_model_cache_v1(config: TrussConfig, truss_dir: Path, build_dir:
 def build_model_cache_v2_and_copy_bptr_manifest(config: TrussConfig, build_dir: Path):
     assert config.model_cache.is_v2
     assert all(model.volume_folder is not None for model in config.model_cache.models)
-  … (21 deleted lines not captured in this view)
-            f.write(basetenpointer_json)
-    except Exception as e:
-        logging.warning(f"debug: failed to create BasetenPointer: {e}")
-    # TODO: remove below section + remove logging lines above.
-    # builds BasetenManifest for caching
-    basetenpointers = model_cache_hf_to_b10ptr(config.model_cache)
-    # write json of bastenpointers into build dir
-    with open(build_dir / "bptr-manifest", "w") as f:
-        f.write(basetenpointers.model_dump_json())
+    from truss_transfer import PyModelRepo, create_basetenpointer_from_models
+
+    py_models = [
+        PyModelRepo(
+            repo_id=model.repo_id,
+            revision=model.revision,
+            runtime_secret_name=model.runtime_secret_name,
+            allow_patterns=model.allow_patterns,
+            ignore_patterns=model.ignore_patterns,
+            volume_folder=model.volume_folder,
+            kind=model.kind.value,
+        )
+        for model in config.model_cache.models
+    ]
+    # create BasetenPointer from models
+    basetenpointer_json = create_basetenpointer_from_models(models=py_models)
+    bptr_py = json.loads(basetenpointer_json)["pointers"]
+    logging.info(f"created ({len(bptr_py)}) Basetenpointer")
+    logging.info(f"pointers json: {basetenpointer_json}")
+    with open(build_dir / "bptr-manifest", "w") as f:
+        f.write(basetenpointer_json)


 def generate_docker_server_nginx_config(build_dir, config):

@@ -794,7 +783,6 @@ class ServingImageBuilder(ImageBuilder):
             config
         )

-        non_root_user = os.getenv("BT_USE_NON_ROOT_USER", False)
        dockerfile_contents = dockerfile_template.render(
            should_install_server_requirements=should_install_server_requirements,
            base_image_name_and_tag=base_image_name_and_tag,

@@ -828,12 +816,7 @@ class ServingImageBuilder(ImageBuilder):
            build_commands=build_commands,
            use_local_src=config.use_local_src,
            passthrough_environment_variables=passthrough_environment_variables,
-
-            app_username="app",
-            app_user_uid=60000,
-            control_server_dir="/control",
-            default_owner="0:0",  # root user
-            **FILENAME_CONSTANTS_MAP,  # Add this line
+            **FILENAME_CONSTANTS_MAP,
        )
        # Consolidate repeated empty lines to single empty lines.
        dockerfile_contents = re.sub(