truss 0.11.9rc1-py3-none-any.whl → 0.11.9rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of truss might be problematic.

@@ -1,12 +1,3 @@
-from pathlib import Path
-
-from jinja2 import Template
-
-from truss.base import truss_config
-from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
-    START_COMMAND_ENVVAR_NAME,
-)
-from truss.cli.train.types import DeployCheckpointsConfigComplete
 from truss_train.definitions import (
     ALLOWED_LORA_RANKS,
     DEFAULT_LORA_RANK,
@@ -14,21 +5,6 @@ from truss_train.definitions import (
     LoRADetails,
 )

-from .deploy_checkpoints_helpers import (
-    setup_base_truss_config,
-    setup_environment_variables_and_secrets,
-)
-
-VLLM_LORA_START_COMMAND = Template(
-    'sh -c "{%if envvars %}{{ envvars }} {% endif %}vllm serve {{ base_model_id }}'
-    + " --port 8000"
-    + "{{ specify_tensor_parallelism }}"
-    + " --enable-lora"
-    + " --max-lora-rank {{ max_lora_rank }}"
-    + " --dtype bfloat16"
-    + ' --lora-modules {{ lora_modules }}"'
-)
-

 def hydrate_lora_checkpoint(
     job_id: str, checkpoint_id: str, checkpoint: dict
@@ -43,49 +19,6 @@ def hydrate_lora_checkpoint(
     )


-def render_vllm_lora_truss_config(
-    checkpoint_deploy: DeployCheckpointsConfigComplete,
-) -> truss_config.TrussConfig:
-    """Render truss config specifically for LoRA checkpoints using vLLM."""
-    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
-    start_command_envvars = setup_environment_variables_and_secrets(
-        truss_deploy_config, checkpoint_deploy
-    )
-
-    checkpoint_str = _build_lora_checkpoint_string(truss_deploy_config)
-
-    max_lora_rank = max(
-        [
-            checkpoint.lora_details.rank or DEFAULT_LORA_RANK
-            for checkpoint in checkpoint_deploy.checkpoint_details.checkpoints
-            if hasattr(checkpoint, "lora_details") and checkpoint.lora_details
-        ]
-    )
-    accelerator = checkpoint_deploy.compute.accelerator
-    if accelerator:
-        specify_tensor_parallelism = f" --tensor-parallel-size {accelerator.count}"
-    else:
-        specify_tensor_parallelism = ""
-
-    start_command_args = {
-        "base_model_id": checkpoint_deploy.checkpoint_details.base_model_id,
-        "lora_modules": checkpoint_str,
-        "envvars": start_command_envvars,
-        "max_lora_rank": max_lora_rank,
-        "specify_tensor_parallelism": specify_tensor_parallelism,
-    }
-    start_command = VLLM_LORA_START_COMMAND.render(**start_command_args)
-    # Note: we set the start command as an environment variable in supervisord config.
-    # This is so that we don't have to change the supervisord config when the start command changes.
-    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
-    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
-    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
-    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
-        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
-    )
-    return truss_deploy_config
-
-
 def _get_lora_rank(checkpoint_resp: dict) -> int:
     """Extract and validate LoRA rank from checkpoint response."""
     lora_adapter_config = checkpoint_resp.get("lora_adapter_config") or {}
@@ -99,19 +32,3 @@ def _get_lora_rank(checkpoint_resp: dict) -> int:
     )

     return lora_rank
-
-
-def _build_lora_checkpoint_string(truss_deploy_config) -> str:
-    """Build the checkpoint string for LoRA modules from truss deploy config."""
-    checkpoint_parts = []
-    for (
-        truss_checkpoint
-    ) in truss_deploy_config.training_checkpoints.artifact_references:  # type: ignore
-        ckpt_path = Path(
-            truss_deploy_config.training_checkpoints.download_folder,  # type: ignore
-            truss_checkpoint.training_job_id,
-            truss_checkpoint.paths[0],
-        )
-        checkpoint_parts.append(f"{truss_checkpoint.training_job_id}={ckpt_path}")
-
-    return " ".join(checkpoint_parts)
@@ -1,58 +1,5 @@
-from jinja2 import Template
-
-from truss.base import truss_config
-from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
-    START_COMMAND_ENVVAR_NAME,
-)
-from truss.cli.train.deploy_checkpoints.deploy_full_checkpoints import (
-    build_full_checkpoint_string,
-)
-from truss.cli.train.types import DeployCheckpointsConfigComplete
 from truss_train.definitions import WhisperCheckpoint

-from .deploy_checkpoints_helpers import (
-    setup_base_truss_config,
-    setup_environment_variables_and_secrets,
-)
-
-VLLM_WHISPER_START_COMMAND = Template(
-    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
-    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
-    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
-)
-
-
-def render_vllm_whisper_truss_config(
-    checkpoint_deploy: DeployCheckpointsConfigComplete,
-) -> truss_config.TrussConfig:
-    """Render truss config specifically for whisper checkpoints using vLLM."""
-    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
-
-    start_command_envvars = setup_environment_variables_and_secrets(
-        truss_deploy_config, checkpoint_deploy
-    )
-
-    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
-
-    accelerator = checkpoint_deploy.compute.accelerator
-
-    start_command_args = {
-        "model_path": checkpoint_str,
-        "envvars": start_command_envvars,
-        "specify_tensor_parallelism": accelerator.count if accelerator else 1,
-    }
-    # Note: we set the start command as an environment variable in supervisord config.
-    # This is so that we don't have to change the supervisord config when the start command changes.
-    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
-    start_command = VLLM_WHISPER_START_COMMAND.render(**start_command_args)
-    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
-    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
-    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
-        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
-    )
-
-    return truss_deploy_config
-

 def hydrate_whisper_checkpoint(
     job_id: str, checkpoint_id: str, checkpoint: dict
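
For reference, rendering the removed whisper template produces a shell command like the one below; a quick sketch using jinja2 directly (the model path and GPU count are invented, and "$$" is supervisord's escape for a literal "$"):

from jinja2 import Template

# Verbatim copy of the removed template, kept here only to show its output.
VLLM_WHISPER_START_COMMAND = Template(
    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
)

print(
    VLLM_WHISPER_START_COMMAND.render(
        envvars="", model_path="/tmp/checkpoints/job123", specify_tensor_parallelism=2
    )
)
# sh -c 'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && vllm serve /tmp/checkpoints/job123 --port 8000 --tensor-parallel-size 2'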
truss/cli/train/types.py CHANGED
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from pathlib import Path
 from typing import Optional

 from truss_train.definitions import (
@@ -7,12 +6,11 @@ from truss_train.definitions import (
     Compute,
     DeployCheckpointsConfig,
     DeployCheckpointsRuntime,
-    ModelWeightsFormat,
 )


 @dataclass
-class PrepareCheckpointArgs:
+class DeployCheckpointArgs:
     project_id: Optional[str]
     job_id: Optional[str]
     deploy_config_path: Optional[str]
@@ -26,13 +24,5 @@ class DeployCheckpointsConfigComplete(DeployCheckpointsConfig):

     checkpoint_details: CheckpointList
     model_name: str
-    deployment_name: str
     runtime: DeployCheckpointsRuntime
     compute: Compute
-    model_weight_format: ModelWeightsFormat
-
-
-@dataclass
-class PrepareCheckpointResult:
-    truss_directory: Path
-    checkpoint_deploy_config: DeployCheckpointsConfigComplete
@@ -8,7 +8,7 @@ import rich_click as click
 import truss.cli.train.core as train_cli
 from truss.base.constants import TRAINING_TEMPLATE_DIR
 from truss.cli import remote_cli
-from truss.cli.cli import push, truss_cli
+from truss.cli.cli import truss_cli
 from truss.cli.logs import utils as cli_log_utils
 from truss.cli.logs.training_log_watcher import TrainingLogWatcher
 from truss.cli.train import common as train_common
@@ -329,26 +329,16 @@ def deploy_checkpoints(
     project_id = _maybe_resolve_project_id_from_id_or_name(
         remote_provider, project_id=project_id, project=project
     )
-    prepare_checkpoint_result = train_cli.prepare_checkpoint_deploy(
+    result = train_cli.create_model_version_from_inference_template(
         remote_provider,
-        train_cli.PrepareCheckpointArgs(
+        train_cli.DeployCheckpointArgs(
             project_id=project_id, job_id=job_id, deploy_config_path=config
         ),
     )

-    params = {
-        "target_directory": prepare_checkpoint_result.truss_directory,
-        "remote": remote,
-        "model_name": prepare_checkpoint_result.checkpoint_deploy_config.model_name,
-        "publish": True,
-        "deployment_name": prepare_checkpoint_result.checkpoint_deploy_config.deployment_name,
-    }
-    ctx = _prepare_click_context(push, params)
     if dry_run:
-        console.print("--dry-run flag provided, not deploying", style="yellow")
-    else:
-        push.invoke(ctx)
-    train_cli.print_deploy_checkpoints_success_message(prepare_checkpoint_result)
+        console.print("--dry-run flag provided, did not deploy", style="yellow")
+    train_cli.print_deploy_checkpoints_success_message(result)


 @train.command(name="download")
@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Dict, List, Mapping, Optional

 import requests
+from pydantic import BaseModel, Field

 from truss.remote.baseten import custom_types as b10_types
 from truss.remote.baseten.auth import ApiKey, AuthService
@@ -14,6 +15,39 @@ from truss.remote.baseten.utils.transfer import base64_encoded_json_str
 logger = logging.getLogger(__name__)


+class InstanceTypeV1(BaseModel):
+    """An instance type."""
+
+    id: str = Field(description="Identifier string for the instance type")
+    name: str = Field(description="Name of the instance type")
+    display_name: str = Field(
+        alias="displayName", description="Display name of the instance type"
+    )
+    gpu_count: int = Field(
+        alias="gpuCount", description="Number of GPUs on the instance type"
+    )
+    default: bool = Field(description="Whether this is the default instance type")
+    gpu_memory: Optional[int] = Field(alias="gpuMemory", description="GPU memory in MB")
+    node_count: int = Field(alias="nodeCount", description="Number of nodes")
+    gpu_type: Optional[str] = Field(
+        alias="gpuType", description="Type of GPU on the instance type"
+    )
+    millicpu_limit: int = Field(
+        alias="millicpuLimit", description="CPU limit of the instance type in millicpu"
+    )
+    memory_limit: int = Field(
+        alias="memoryLimit", description="Memory limit of the instance type in MB"
+    )
+    price: Optional[float] = Field(description="Price of the instance type")
+    limited_capacity: Optional[bool] = Field(
+        alias="limitedCapacity",
+        description="Whether this instance type has limited capacity",
+    )
+
+    class Config:
+        populate_by_name = True
+
+
 API_URL_MAPPING = {
     "https://app.baseten.co": "https://api.baseten.co",
     "https://app.staging.baseten.co": "https://api.staging.baseten.co",
@@ -750,3 +784,56 @@ class BasetenApi:

         # NB(nikhil): reverse order so latest logs are at the end
         return resp_json["logs"][::-1]
+
+    def create_model_version_from_inference_template(self, request_data: dict):
+        """
+        Create a model version from an inference template using GraphQL mutation.
+
+        Args:
+            request_data: Dictionary containing the request structure with metadata,
+                weights_sources, inference_stack, and instance_type_id
+        """
+        query_string = """
+            mutation ($request: CreateModelVersionFromInferenceTemplateRequest!) {
+                create_model_version_from_inference_template(request: $request) {
+                    model_version {
+                        id
+                        name
+                    }
+                }
+            }
+        """
+
+        resp = self._post_graphql_query(
+            query_string, variables={"request": request_data}
+        )
+        return resp["data"]["create_model_version_from_inference_template"]
+
+    def get_instance_types(self) -> List[InstanceTypeV1]:
+        """
+        Get all available instance types via GraphQL API.
+        """
+        query_string = """
+            query Instances {
+                listedInstances: listed_instances {
+                    id
+                    name
+                    millicpuLimit: millicpu_limit
+                    memoryLimit: memory_limit
+                    gpuCount: gpu_count
+                    gpuType: gpu_type
+                    gpuMemory: gpu_memory
+                    default
+                    displayName: display_name
+                    nodeCount: node_count
+                    price
+                    limitedCapacity: limited_capacity
+                }
+            }
+        """
+
+        resp = self._post_graphql_query(query_string)
+        instance_types_data = resp["data"]["listedInstances"]
+        return [
+            InstanceTypeV1(**instance_type) for instance_type in instance_types_data
+        ]
@@ -1,13 +1,15 @@
 import asyncio
+import http
 import logging
 import logging.config
 import re
+import traceback
 from pathlib import Path
-from typing import Dict
+from typing import Awaitable, Callable, Dict

 import httpx
 from endpoints import control_app
-from fastapi import FastAPI
+from fastapi import FastAPI, Request, Response
 from fastapi.responses import JSONResponse
 from helpers.errors import ModelLoadFailed, PatchApplicatonError
 from helpers.inference_server_controller import InferenceServerController
@@ -16,22 +18,47 @@ from helpers.inference_server_starter import async_inference_server_startup_flow
 from helpers.truss_patch.model_container_patch_applier import ModelContainerPatchApplier
 from shared import log_config
 from starlette.datastructures import State
-
-
-async def handle_patch_error(_, exc):
-    error_type = _camel_to_snake_case(type(exc).__name__)
-    return JSONResponse(content={"error": {"type": error_type, "msg": str(exc)}})
-
-
-async def generic_error_handler(_, exc):
-    return JSONResponse(
-        content={"error": {"type": "unknown", "msg": f"{type(exc)}: {exc}"}}
-    )
-
-
-async def handle_model_load_failed(_, error):
-    # Model load failures should result in 503 status
-    return JSONResponse({"error": str(error)}, 503)
+from starlette.middleware.base import BaseHTTPMiddleware
+
+SANITIZED_EXCEPTION_FRAMES = 2
+
+
+# NB(nikhil): SanitizedExceptionMiddleware will reduce the noise of control server stack frames, since
+# users often complain about the verbosity. Now, if any exceptions are explicitly raised during a proxied
+# request, we'll log the last two stack frames which should be sufficient for debugging while significantly
+# cutting down the volume.
+class SanitizedExceptionMiddleware(BaseHTTPMiddleware):
+    def __init__(self, app, num_frames: int = SANITIZED_EXCEPTION_FRAMES):
+        super().__init__(app)
+        self.num_frames = num_frames
+
+    async def dispatch(
+        self, request: Request, call_next: Callable[[Request], Awaitable[Response]]
+    ) -> Response:
+        try:
+            return await call_next(request)
+        except Exception as exc:
+            sanitized_traceback = self._create_sanitized_traceback(exc)
+            request.app.state.logger.error(sanitized_traceback)
+
+            if isinstance(exc, ModelLoadFailed):
+                return JSONResponse(
+                    {"error": str(exc)}, status_code=http.HTTPStatus.BAD_GATEWAY.value
+                )
+            elif isinstance(exc, PatchApplicatonError):
+                error_type = _camel_to_snake_case(type(exc).__name__)
+                return JSONResponse({"error": {"type": error_type, "msg": str(exc)}})
+            else:
+                return JSONResponse(
+                    {"error": {"type": "unknown", "msg": str(exc)}},
+                    status_code=http.HTTPStatus.INTERNAL_SERVER_ERROR.value,
+                )
+
+    def _create_sanitized_traceback(self, error: Exception) -> str:
+        tb_lines = traceback.format_tb(error.__traceback__)
+        if tb_lines and self.num_frames > 0:
+            return "".join(tb_lines[-self.num_frames :])
+        return f"{type(error).__name__}: {error}"


 def create_app(base_config: Dict):
@@ -82,14 +109,10 @@ def create_app(base_config: Dict):
     app = FastAPI(
         title="Truss Live Reload Server",
         on_startup=[start_background_inference_startup],
-        exception_handlers={
-            PatchApplicatonError: handle_patch_error,
-            ModelLoadFailed: handle_model_load_failed,
-            Exception: generic_error_handler,
-        },
     )
     app.state = app_state
     app.include_router(control_app)
+    app.add_middleware(SanitizedExceptionMiddleware)

     @app.on_event("shutdown")
     def on_shutdown():
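
The frame-trimming behavior is easy to see in isolation; a self-contained sketch mirroring _create_sanitized_traceback above:

import traceback

def sanitize(error: Exception, num_frames: int = 2) -> str:
    # Same logic as SanitizedExceptionMiddleware._create_sanitized_traceback:
    # keep only the last `num_frames` frames of the traceback.
    tb_lines = traceback.format_tb(error.__traceback__)
    if tb_lines and num_frames > 0:
        return "".join(tb_lines[-num_frames:])
    return f"{type(error).__name__}: {error}"

def inner():
    raise ValueError("boom")

def outer():
    inner()

try:
    outer()
except ValueError as exc:
    # Prints only the outer() and inner() frames, not the caller's frame.
    print(sanitize(exc))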
@@ -5,6 +5,7 @@ from typing import Any, Callable, Dict, Optional, Protocol
 import httpx
 from fastapi import APIRouter, WebSocket
 from fastapi.responses import JSONResponse, StreamingResponse
+from helpers.errors import ModelLoadFailed, ModelNotReady
 from httpx_ws import AsyncWebSocketSession, WebSocketDisconnect, aconnect_ws
 from httpx_ws import _exceptions as httpx_ws_exceptions
 from starlette.requests import ClientDisconnect, Request
@@ -13,11 +14,6 @@ from starlette.websockets import WebSocketDisconnect as StartletteWebSocketDisco
 from tenacity import RetryCallState, Retrying, retry_if_exception_type, wait_fixed
 from wsproto.events import BytesMessage, TextMessage

-from truss.templates.control.control.helpers.errors import (
-    ModelLoadFailed,
-    ModelNotReady,
-)
-
 INFERENCE_SERVER_START_WAIT_SECS = 60
 BASE_RETRY_EXCEPTIONS = (
     retry_if_exception_type(httpx.ConnectError)
@@ -45,6 +45,19 @@ server {
         proxy_pass http://127.0.0.1:{{server_port}};
     }

+    location ~ ^/v1/websocket$ {
+        proxy_redirect off;
+        proxy_read_timeout 18030s;
+        proxy_http_version 1.1;
+
+        proxy_set_header Upgrade $upgrade_header;
+        proxy_set_header Connection $connection_header;
+
+        rewrite ^/v1/websocket$ {{server_endpoint}} break;
+
+        proxy_pass http://127.0.0.1:{{server_port}};
+    }
+
     # Forward all other paths
     location / {
         proxy_redirect off;