truss 0.10.9rc601__py3-none-any.whl → 0.10.10rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of truss might be problematic.

Files changed (32)
  1. truss/base/constants.py +0 -1
  2. truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +30 -22
  3. truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py +8 -2
  4. truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py +2 -2
  5. truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py +63 -0
  6. truss/cli/train/deploy_from_checkpoint_config_whisper.yml +17 -0
  7. truss/cli/train_commands.py +11 -3
  8. truss/contexts/image_builder/cache_warmer.py +1 -3
  9. truss/contexts/image_builder/serving_image_builder.py +24 -32
  10. truss/remote/baseten/api.py +11 -0
  11. truss/remote/baseten/core.py +209 -1
  12. truss/remote/baseten/utils/time.py +15 -0
  13. truss/templates/server/model_wrapper.py +0 -12
  14. truss/templates/server/requirements.txt +1 -1
  15. truss/templates/server/truss_server.py +0 -13
  16. truss/templates/server.Dockerfile.jinja +1 -1
  17. truss/tests/cli/train/test_deploy_checkpoints.py +436 -0
  18. truss/tests/contexts/image_builder/test_serving_image_builder.py +1 -1
  19. truss/tests/remote/baseten/conftest.py +18 -0
  20. truss/tests/remote/baseten/test_api.py +49 -14
  21. truss/tests/remote/baseten/test_core.py +517 -1
  22. truss/tests/test_data/test_openai/model/model.py +0 -3
  23. truss/truss_handle/truss_handle.py +0 -1
  24. {truss-0.10.9rc601.dist-info → truss-0.10.10rc1.dist-info}/METADATA +2 -2
  25. {truss-0.10.9rc601.dist-info → truss-0.10.10rc1.dist-info}/RECORD +30 -28
  26. truss_train/definitions.py +6 -0
  27. truss_train/deployment.py +15 -2
  28. truss/tests/util/test_basetenpointer.py +0 -227
  29. truss/util/basetenpointer.py +0 -160
  30. {truss-0.10.9rc601.dist-info → truss-0.10.10rc1.dist-info}/WHEEL +0 -0
  31. {truss-0.10.9rc601.dist-info → truss-0.10.10rc1.dist-info}/entry_points.txt +0 -0
  32. {truss-0.10.9rc601.dist-info → truss-0.10.10rc1.dist-info}/licenses/LICENSE +0 -0
truss/base/constants.py CHANGED
@@ -29,7 +29,6 @@ BEI_REQUIRED_MAX_NUM_TOKENS = 16384
 TRTLLM_MIN_MEMORY_REQUEST_GI = 10
 HF_MODELS_API_URL = "https://huggingface.co/api/models"
 HF_ACCESS_TOKEN_KEY = "hf_access_token"
-HF_ACCESS_TOKEN_FILE_NAME = "hf_access_token"
 TRUSSLESS_MAX_PAYLOAD_SIZE = "64M"
 # Alias for TEMPLATES_DIR
 SERVING_DIR: pathlib.Path = TEMPLATES_DIR
truss/cli/train/deploy_checkpoints/deploy_checkpoints.py CHANGED
@@ -33,6 +33,10 @@ from .deploy_lora_checkpoints import (
     hydrate_lora_checkpoint,
     render_vllm_lora_truss_config,
 )
+from .deploy_whisper_checkpoints import (
+    hydrate_whisper_checkpoint,
+    render_vllm_whisper_truss_config,
+)

 HF_TOKEN_ENVVAR_NAME = "HF_TOKEN"
 # If we change this, make sure to update the logic in backend codebase
@@ -178,6 +182,8 @@ def hydrate_checkpoint(
         return hydrate_lora_checkpoint(job_id, checkpoint_id, checkpoint)
     elif checkpoint_type.lower() == ModelWeightsFormat.FULL.value:
         return hydrate_full_checkpoint(job_id, checkpoint_id, checkpoint)
+    elif checkpoint_type.lower() == ModelWeightsFormat.WHISPER.value:
+        return hydrate_whisper_checkpoint(job_id, checkpoint_id, checkpoint)
     else:
         raise ValueError(
             f"Unsupported checkpoint type: {checkpoint_type}. Contact Baseten for support with other checkpoint types."
@@ -196,6 +202,8 @@ def _render_truss_config_for_checkpoint_deployment(
         return render_vllm_lora_truss_config(checkpoint_deploy)
     elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.FULL:
         return render_vllm_full_truss_config(checkpoint_deploy)
+    elif checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER:
+        return render_vllm_whisper_truss_config(checkpoint_deploy)
     else:
         raise ValueError(
             f"Unsupported model weight format: {checkpoint_deploy.model_weight_format}. Please upgrade to the latest Truss version to access the latest supported formats. Contact Baseten if you would like us to support additional formats."
@@ -288,18 +296,6 @@ def _get_checkpoint_ids_to_deploy(
     return checkpoint_ids


-def _select_single_checkpoint(checkpoint_id_options: List[str]) -> List[str]:
-    """Select a single checkpoint using interactive prompt."""
-    checkpoint_id = inquirer.select(
-        message="Select the checkpoint to deploy:", choices=checkpoint_id_options
-    ).execute()
-
-    if not checkpoint_id:
-        raise click.UsageError("A checkpoint must be selected.")
-
-    return [checkpoint_id]
-
-
 def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
     """Select multiple checkpoints using interactive checkbox."""
     checkpoint_ids = inquirer.checkbox(
@@ -351,6 +347,8 @@ def _get_base_model_id(user_input: Optional[str], checkpoint: dict) -> Optional[
         )
     elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.FULL.value.lower():
         return None
+    elif checkpoint.get("checkpoint_type") == ModelWeightsFormat.WHISPER.value.lower():
+        return None
     else:
         base_model_id = inquirer.text(message="Enter the base model id.").execute()
         if not base_model_id:
@@ -416,18 +414,28 @@ def _validate_selected_checkpoints(
             "Unable to infer model weight format. Reach out to Baseten for support."
         )

-    has_full_checkpoint = any(
-        response_checkpoints[checkpoint_id].get("checkpoint_type")
-        == ModelWeightsFormat.FULL.value
-        for checkpoint_id in checkpoint_ids
-    )
-
-    if has_full_checkpoint and len(checkpoint_ids) > 1:
-        # vLLM does not support multiple checkpoints when any checkpoint is full model weights.
-        raise ValueError(
-            "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint."
+    validation_rules = {
+        ModelWeightsFormat.FULL.value: {
+            "error_message": "Full checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is full model weights.",
+        },
+        ModelWeightsFormat.WHISPER.value: {
+            "error_message": "Whisper checkpoints are not supported for multiple checkpoints. Please select a single checkpoint.",
+            "reason": "vLLM does not support multiple checkpoints when any checkpoint is whisper model weights.",
+        },
+    }
+
+    # Check each checkpoint type that has restrictions
+    for checkpoint_type, rule in validation_rules.items():
+        has_restricted_checkpoint = any(
+            response_checkpoints[checkpoint_id].get("checkpoint_type")
+            == checkpoint_type
+            for checkpoint_id in checkpoint_ids
         )

+        if has_restricted_checkpoint and len(checkpoint_ids) > 1:
+            raise ValueError(rule["error_message"])
+

 def get_hf_secret_name(user_input: Union[str, SecretReference, None]) -> str:
     """Get HuggingFace secret name from user input or prompt for it."""
truss/cli/train/deploy_checkpoints/deploy_checkpoints_helpers.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path

 from truss.base import truss_config
 from truss.cli.train.types import DeployCheckpointsConfigComplete
-from truss_train.definitions import SecretReference
+from truss_train.definitions import ModelWeightsFormat, SecretReference

 START_COMMAND_ENVVAR_NAME = "BT_DOCKER_SERVER_START_CMD"

@@ -12,8 +12,14 @@ def setup_base_truss_config(
     checkpoint_deploy: DeployCheckpointsConfigComplete,
 ) -> truss_config.TrussConfig:
     """Set up the base truss config with common properties."""
+    truss_deploy_config = None
+    truss_base_file = (
+        "deploy_from_checkpoint_config_whisper.yml"
+        if checkpoint_deploy.model_weight_format == ModelWeightsFormat.WHISPER
+        else "deploy_from_checkpoint_config.yml"
+    )
     truss_deploy_config = truss_config.TrussConfig.from_yaml(
-        Path(os.path.dirname(__file__), "..", "deploy_from_checkpoint_config.yml")
+        Path(os.path.dirname(__file__), "..", truss_base_file)
     )
     if not truss_deploy_config.docker_server:
         raise ValueError(
truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py CHANGED
@@ -40,7 +40,7 @@ def render_vllm_full_truss_config(
         truss_deploy_config, checkpoint_deploy
     )

-    checkpoint_str = _build_full_checkpoint_string(truss_deploy_config)
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)

     accelerator = checkpoint_deploy.compute.accelerator

@@ -71,7 +71,7 @@ def hydrate_full_checkpoint(
     return FullCheckpoint(training_job_id=job_id, paths=paths)


-def _build_full_checkpoint_string(truss_deploy_config) -> str:
+def build_full_checkpoint_string(truss_deploy_config) -> str:
     """Build checkpoint string from artifact references for full checkpoints.

     Args:
truss/cli/train/deploy_checkpoints/deploy_whisper_checkpoints.py ADDED
@@ -0,0 +1,63 @@
+from jinja2 import Template
+
+from truss.base import truss_config
+from truss.cli.train.deploy_checkpoints.deploy_checkpoints_helpers import (
+    START_COMMAND_ENVVAR_NAME,
+)
+from truss.cli.train.deploy_checkpoints.deploy_full_checkpoints import (
+    build_full_checkpoint_string,
+)
+from truss.cli.train.types import DeployCheckpointsConfigComplete
+from truss_train.definitions import WhisperCheckpoint
+
+from .deploy_checkpoints_helpers import (
+    setup_base_truss_config,
+    setup_environment_variables_and_secrets,
+)
+
+VLLM_WHISPER_START_COMMAND = Template(
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }}'"
+)
+
+
+def render_vllm_whisper_truss_config(
+    checkpoint_deploy: DeployCheckpointsConfigComplete,
+) -> truss_config.TrussConfig:
+    """Render truss config specifically for whisper checkpoints using vLLM."""
+    truss_deploy_config = setup_base_truss_config(checkpoint_deploy)
+
+    start_command_envvars = setup_environment_variables_and_secrets(
+        truss_deploy_config, checkpoint_deploy
+    )
+
+    checkpoint_str = build_full_checkpoint_string(truss_deploy_config)
+
+    accelerator = checkpoint_deploy.compute.accelerator
+
+    start_command_args = {
+        "model_path": checkpoint_str,
+        "envvars": start_command_envvars,
+        "specify_tensor_parallelism": accelerator.count if accelerator else 1,
+    }
+    # Note: we set the start command as an environment variable in supervisord config.
+    # This is so that we don't have to change the supervisord config when the start command changes.
+    # Our goal is to reduce the number of times we need to rebuild the image, and allow us to deploy faster.
+    start_command = VLLM_WHISPER_START_COMMAND.render(**start_command_args)
+    truss_deploy_config.environment_variables[START_COMMAND_ENVVAR_NAME] = start_command
+    # Note: supervisord uses the convention %(ENV_VAR_NAME)s to access environment variable VAR_NAME
+    truss_deploy_config.docker_server.start_command = (  # type: ignore[union-attr]
+        f"%(ENV_{START_COMMAND_ENVVAR_NAME})s"
+    )
+
+    return truss_deploy_config
+
+
+def hydrate_whisper_checkpoint(
+    job_id: str, checkpoint_id: str, checkpoint: dict
+) -> WhisperCheckpoint:
+    """Create a Checkpoint object for whisper model weights."""
+    # NOTE: Slash at the end is important since it means the checkpoint is a directory
+    paths = [f"rank-0/{checkpoint_id}/"]
+    return WhisperCheckpoint(training_job_id=job_id, paths=paths)
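
For reference, a sketch of the command this template renders, with the checkpoint path and GPU count as placeholders (the rendered envvars depend on the deployment, and the doubled $$ presumably escapes $ for the process supervisor):

    sh -c 'VLLM_LOGGING_LEVEL=WARNING HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && vllm serve <checkpoint-path> --port 8000 --tensor-parallel-size 2'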
truss/cli/train/deploy_from_checkpoint_config_whisper.yml ADDED
@@ -0,0 +1,17 @@
+base_image:
+  image: vllm/vllm-openai:latest
+
+docker_server:
+  start_command: sh -c "" # replaced when deploying
+  readiness_endpoint: /health
+  liveness_endpoint: /health
+  predict_endpoint: /v1/audio/transcriptions
+  server_port: 8000
+runtime:
+  predict_concurrency : 256
+environment_variables:
+  VLLM_LOGGING_LEVEL: WARNING
+  VLLM_USE_V1: 0
+  HF_HUB_ENABLE_HF_TRANSFER: 1
+requirements:
+  - vllm[audio]
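
Since predict_endpoint maps to vLLM's OpenAI-compatible transcription route, a deployed server should accept OpenAI-style multipart requests. A hypothetical local smoke test (file and served model names are illustrative):

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": open("sample.wav", "rb")},
        data={"model": "whisper-checkpoint"},  # served model name depends on the deploy
    )
    print(resp.json())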
truss/cli/train_commands.py CHANGED
@@ -13,6 +13,7 @@ from truss.cli.train import common as train_common
 from truss.cli.train import core
 from truss.cli.utils import common
 from truss.cli.utils.output import console, error_console
+from truss.remote.baseten.core import get_training_job_logs_with_pagination
 from truss.remote.baseten.remote import BasetenRemote
 from truss.remote.remote_factory import RemoteFactory

@@ -72,8 +73,11 @@ def _prepare_click_context(f: click.Command, params: dict) -> click.Context:
 @click.argument("config", type=Path, required=True)
 @click.option("--remote", type=str, required=False, help="Remote to use")
 @click.option("--tail", is_flag=True, help="Tail for status + logs after push.")
+@click.option("--job-name", type=str, required=False, help="Name of the training job.")
 @common.common_options()
-def push_training_job(config: Path, remote: Optional[str], tail: bool):
+def push_training_job(
+    config: Path, remote: Optional[str], tail: bool, job_name: Optional[str]
+):
     """Run a training job"""
     from truss_train import deployment

@@ -84,7 +88,9 @@ def push_training_job(config: Path, remote: Optional[str], tail: bool):
     remote_provider: BasetenRemote = cast(
         BasetenRemote, RemoteFactory.create(remote=remote)
     )
-    job_resp = deployment.create_training_job_from_file(remote_provider, config)
+    job_resp = deployment.create_training_job_from_file(
+        remote_provider, config, job_name
+    )

     # Note: This post create logic needs to happen outside the context
     # of the above context manager, as only one console session can be active
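
With the new option, a job can presumably be named at push time, e.g. truss train push config.py --job-name my-whisper-finetune --tail (config path and name illustrative).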
@@ -138,7 +144,9 @@ def get_job_logs(
     )

     if not tail:
-        logs = remote_provider.api.get_training_job_logs(project_id, job_id)
+        logs = get_training_job_logs_with_pagination(
+            remote_provider.api, project_id, job_id
+        )
         for log in cli_log_utils.parse_logs(logs):
             cli_log_utils.output_log(log)
     else:
truss/contexts/image_builder/cache_warmer.py CHANGED
@@ -15,8 +15,6 @@ from botocore.exceptions import ClientError, NoCredentialsError
 from google.cloud import storage
 from huggingface_hub import hf_hub_download

-from truss.base import constants
-
 B10CP_PATH_TRUSS_ENV_VAR_NAME = "B10CP_PATH_TRUSS"

 GCS_CREDENTIALS = "/app/data/service_account.json"
@@ -110,7 +108,7 @@ class RepositoryFile(ABC):

 class HuggingFaceFile(RepositoryFile):
     def download_to_cache(self):
-        secret_path = Path(f"/etc/secrets/{constants.HF_ACCESS_TOKEN_FILE_NAME}")
+        secret_path = Path("/etc/secrets/hf-access-token")
         secret = secret_path.read_text().strip() if secret_path.exists() else None
         try:
             hf_hub_download(
truss/contexts/image_builder/serving_image_builder.py CHANGED
@@ -73,7 +73,6 @@ from truss.contexts.image_builder.util import (
 )
 from truss.contexts.truss_context import TrussContext
 from truss.truss_handle.patch.hash import directory_content_hash
-from truss.util.basetenpointer import model_cache_hf_to_b10ptr
 from truss.util.jinja import read_template_from_fs
 from truss.util.path import (
     build_truss_target_directory,
@@ -93,6 +92,8 @@ USER_TRUSS_IGNORE_FILE = ".truss_ignore"
 GCS_CREDENTIALS = "service_account.json"
 S3_CREDENTIALS = "s3_credentials.json"

+HF_ACCESS_TOKEN_FILE_NAME = "hf-access-token"
+
 CLOUD_BUCKET_CACHE = MODEL_CACHE_PATH

 HF_SOURCE_DIR = Path("./root/.cache/huggingface/hub/")
@@ -324,36 +325,27 @@ def get_files_to_model_cache_v1(config: TrussConfig, truss_dir: Path, build_dir:
 def build_model_cache_v2_and_copy_bptr_manifest(config: TrussConfig, build_dir: Path):
     assert config.model_cache.is_v2
     assert all(model.volume_folder is not None for model in config.model_cache.models)
-    try:
-        from truss_transfer import PyModelRepo, create_basetenpointer_from_models
-
-        py_models = [
-            PyModelRepo(
-                repo_id=model.repo_id,
-                revision=model.revision,
-                runtime_secret_name=model.runtime_secret_name,
-                allow_patterns=model.allow_patterns,
-                ignore_patterns=model.ignore_patterns,
-                volume_folder=model.volume_folder,
-                kind=model.kind.value,
-            )
-            for model in config.model_cache.models
-        ]
-        # create BasetenPointer from models
-        basetenpointer_json = create_basetenpointer_from_models(models=py_models)
-        bptr_py = json.loads(basetenpointer_json)["pointers"]
-        logging.info(f"created ({len(bptr_py)}) Basetenpointer")
-        logging.info(f"pointers json: {basetenpointer_json}")
-        with open(build_dir / "bptr-manifest", "w") as f:
-            f.write(basetenpointer_json)
-    except Exception as e:
-        logging.warning(f"debug: failed to create BasetenPointer: {e}")
-        # TODO: remove below section + remove logging lines above.
-        # builds BasetenManifest for caching
-        basetenpointers = model_cache_hf_to_b10ptr(config.model_cache)
-        # write json of bastenpointers into build dir
-        with open(build_dir / "bptr-manifest", "w") as f:
-            f.write(basetenpointers.model_dump_json())
+    from truss_transfer import PyModelRepo, create_basetenpointer_from_models
+
+    py_models = [
+        PyModelRepo(
+            repo_id=model.repo_id,
+            revision=model.revision,
+            runtime_secret_name=model.runtime_secret_name,
+            allow_patterns=model.allow_patterns,
+            ignore_patterns=model.ignore_patterns,
+            volume_folder=model.volume_folder,
+            kind=model.kind.value,
+        )
+        for model in config.model_cache.models
+    ]
+    # create BasetenPointer from models
+    basetenpointer_json = create_basetenpointer_from_models(models=py_models)
+    bptr_py = json.loads(basetenpointer_json)["pointers"]
+    logging.info(f"created ({len(bptr_py)}) Basetenpointer")
+    logging.info(f"pointers json: {basetenpointer_json}")
+    with open(build_dir / "bptr-manifest", "w") as f:
+        f.write(basetenpointer_json)


 def generate_docker_server_nginx_config(build_dir, config):
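
Per the json.loads(basetenpointer_json)["pointers"] access and logging above, the bptr-manifest written here is a JSON object with a top-level pointers array; schematically (entry fields are defined by truss_transfer and omitted here):

    manifest = {"pointers": [...]}  # one entry per file resolved from the model repos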
@@ -819,7 +811,7 @@ class ServingImageBuilder(ImageBuilder):
             model_cache_v1=config.model_cache.is_v1,
             model_cache_v2=config.model_cache.is_v2,
             hf_access_token=hf_access_token,
-            hf_access_token_file_name=constants.HF_ACCESS_TOKEN_FILE_NAME,
+            hf_access_token_file_name=HF_ACCESS_TOKEN_FILE_NAME,
             external_data_files=external_data_files,
             build_commands=build_commands,
             use_local_src=config.use_local_src,
truss/remote/baseten/api.py CHANGED
@@ -669,6 +669,17 @@ class BasetenApi:
         # NB(nikhil): reverse order so latest logs are at the end
         return resp_json["logs"][::-1]

+    def _fetch_log_batch(
+        self, project_id: str, job_id: str, query_params: Dict[str, Any]
+    ) -> List[Any]:
+        """
+        Fetch a single batch of logs from the API.
+        """
+        resp_json = self._rest_api_client.post(
+            f"v1/training_projects/{project_id}/jobs/{job_id}/logs", body=query_params
+        )
+        return resp_json["logs"]
+
     def get_training_job_checkpoint_presigned_url(
         self, project_id: str, job_id: str, page_size: int = 100
     ) -> List[Dict[str, str]]:
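
The query_params body this endpoint receives is built by _build_log_query_params in core.py (below); a representative payload, with illustrative timestamps spanning the 2-hour window:

    query_params = {
        "start_epoch_millis": 1_700_000_000_000,
        "end_epoch_millis": 1_700_007_200_000,  # start + 2 * MILLISECONDS_PER_HOUR
        "limit": 1000,                          # MAX_BATCH_SIZE
        "direction": "asc",
    }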
truss/remote/baseten/core.py CHANGED
@@ -3,7 +3,9 @@ import json
 import logging
 import pathlib
 import textwrap
-from typing import IO, TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Type
+from typing import IO, TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Type
+
+import requests

 from truss.base.errors import ValidationError

@@ -15,6 +17,7 @@ from truss.remote.baseten import custom_types as b10_types
 from truss.remote.baseten.api import BasetenApi
 from truss.remote.baseten.error import ApiError
 from truss.remote.baseten.utils.tar import create_tar_with_progress_bar
+from truss.remote.baseten.utils.time import iso_to_millis
 from truss.remote.baseten.utils.transfer import multipart_upload_boto3
 from truss.util.path import load_trussignore_patterns_from_truss_dir

@@ -27,6 +30,16 @@ NO_ENVIRONMENTS_EXIST_ERROR_MESSAGING = (
     "Model hasn't been deployed yet. No environments exist."
 )

+# Maximum number of iterations to prevent infinite loops when paginating logs
+MAX_ITERATIONS = 10_000
+MIN_BATCH_SIZE = 100
+
+# LIMIT for the number of logs to fetch per request defined by the server
+MAX_BATCH_SIZE = 1000
+
+NANOSECONDS_PER_MILLISECOND = 1_000_000
+MILLISECONDS_PER_HOUR = 60 * 60 * 1000
+

 class ModelIdentifier:
     value: str
@@ -465,3 +478,198 @@ def validate_truss_config_against_backend(api: BasetenApi, config: str):
         raise ValidationError(
             f"Validation failed with the following errors:\n{error_messages}"
         )
+
+
+def _build_log_query_params(
+    start_time: Optional[int], end_time: Optional[int], batch_size: int
+) -> Dict[str, Any]:
+    """
+    Build query parameters for log fetching request.
+
+    Args:
+        start_time: Start time in milliseconds since epoch
+        end_time: End time in milliseconds since epoch
+        batch_size: Number of logs to fetch per request
+
+    Returns:
+        Dictionary of query parameters with None values removed
+    """
+    query_body = {
+        "start_epoch_millis": start_time,
+        "end_epoch_millis": end_time,
+        "limit": batch_size,
+        "direction": "asc",
+    }
+
+    return {k: v for k, v in query_body.items() if v is not None}
+
+
+def _handle_server_error_backoff(
+    error: requests.HTTPError, job_id: str, iteration: int, batch_size: int
+) -> int:
+    """
+    Slash the batch size in half and return the new batch size
+    """
+    old_batch_size = batch_size
+    new_batch_size = max(batch_size // 2, MIN_BATCH_SIZE)
+
+    logging.warning(
+        f"Server error (HTTP {error.response.status_code}) for job {job_id} at iteration {iteration}. "
+        f"Reducing batch size from {old_batch_size} to {new_batch_size}. Retrying..."
+    )
+
+    return new_batch_size
+
+
+def _process_batch_logs(
+    batch_logs: List[Any], job_id: str, iteration: int, batch_size: int
+) -> Tuple[bool, Optional[int], Optional[int]]:
+    """
+    Process a batch of logs and determine if pagination should continue.
+
+    Args:
+        batch_logs: List of logs from the current batch
+        job_id: The job ID for logging
+        iteration: Current iteration number for logging
+        batch_size: Expected batch size
+
+    Returns:
+        Tuple of (should_continue, next_start_time, next_end_time)
+    """
+
+    # If no logs returned, we're done
+    if not batch_logs:
+        logging.info(f"No logs returned for job {job_id} at iteration {iteration}")
+        return False, None, None
+
+    # If we got fewer logs than the batch size, we've reached the end
+    if len(batch_logs) == 0:
+        logging.info(f"Reached end of logs for job {job_id} at iteration {iteration}")
+        return False, None, None
+
+    # Timestamp returned in nanoseconds for the last log in this batch converted
+    # to milliseconds to use as start for next iteration
+    last_log_timestamp = int(batch_logs[-1]["timestamp"]) // NANOSECONDS_PER_MILLISECOND
+
+    # Update start time for next iteration (add 1ms to avoid overlap)
+    next_start_time_ms = last_log_timestamp + 1
+
+    # Set end time to 2 hours from next start time, maximum time delta allowed by the API
+    next_end_time_ms = next_start_time_ms + 2 * MILLISECONDS_PER_HOUR
+
+    return True, next_start_time_ms, next_end_time_ms
+
+
+class BatchedTrainingLogsFetcher:
+    """
+    Iterator for fetching training job logs in batches using time-based pagination.
+
+    This iterator handles the complexity of paginating through training job logs,
+    including error handling, batch size adjustment, and time window management.
+    """
+
+    def __init__(
+        self,
+        api: BasetenApi,
+        project_id: str,
+        job_id: str,
+        batch_size: int = MAX_BATCH_SIZE,
+    ):
+        self.api = api
+        self.project_id = project_id
+        self.job_id = job_id
+        self.batch_size = batch_size
+        self.iteration = 0
+        self.current_start_time = None
+        self.current_end_time = None
+        self._initialize_time_window()
+
+    def _initialize_time_window(self):
+        training_job = self.api.get_training_job(self.project_id, self.job_id)
+        self.current_start_time = iso_to_millis(
+            training_job["training_job"]["created_at"]
+        )
+        self.current_end_time = self.current_start_time + 2 * MILLISECONDS_PER_HOUR
+
+    def __iter__(self):
+        return self
+
+    def __next__(self) -> List[Any]:
+        if self.iteration >= MAX_ITERATIONS:
+            logging.warning(
+                f"Reached maximum iteration limit ({MAX_ITERATIONS}) while paginating "
+                f"training job logs for project_id={self.project_id}, job_id={self.job_id}."
+            )
+            raise StopIteration
+
+        query_params = _build_log_query_params(
+            self.current_start_time, self.current_end_time, self.batch_size
+        )
+
+        try:
+            batch_logs = self.api._fetch_log_batch(
+                self.project_id, self.job_id, query_params
+            )
+
+            should_continue, next_start_time, next_end_time = _process_batch_logs(
+                batch_logs, self.job_id, self.iteration, self.batch_size
+            )
+
+            if not should_continue:
+                logging.info(
+                    f"Completed pagination for job {self.job_id}. Total iterations: {self.iteration + 1}"
+                )
+                raise StopIteration
+
+            self.current_start_time = next_start_time  # type: ignore[assignment]
+            self.current_end_time = next_end_time  # type: ignore[assignment]
+            self.iteration += 1
+
+            return batch_logs
+
+        except requests.HTTPError as e:
+            if 500 <= e.response.status_code < 600:
+                if self.batch_size == MIN_BATCH_SIZE:
+                    logging.error(
+                        "Failed to fetch all training job logs due to persistent server errors. "
+                        "Please try again later or contact support if the issue persists."
+                    )
+                    raise StopIteration
+                self.batch_size = _handle_server_error_backoff(
+                    e, self.job_id, self.iteration, self.batch_size
+                )
+                # Retry the same iteration with reduced batch size
+                return self.__next__()
+            else:
+                logging.error(
+                    f"HTTP error fetching logs for job {self.job_id} at iteration {self.iteration}: {e}"
+                )
+                raise StopIteration
+        except Exception as e:
+            logging.error(
+                f"Error fetching logs for job {self.job_id} at iteration {self.iteration}: {e}"
+            )
+            raise StopIteration
+
+
+def get_training_job_logs_with_pagination(
+    api: BasetenApi, project_id: str, job_id: str, batch_size: int = MAX_BATCH_SIZE
+) -> List[Any]:
+    """
+    This method implements forward time-based pagination by starting from the earliest
+    available log and working forward in time. It uses the timestamp of the newest log in
+    each batch as the start time for the next request.
+
+    Returns:
+        List of all logs in chronological order (oldest first)
+    """
+    all_logs = []
+
+    logs_iterator = BatchedTrainingLogsFetcher(api, project_id, job_id, batch_size)
+
+    for batch_logs in logs_iterator:
+        all_logs.extend(batch_logs)
+
+    logging.info(f"Completed pagination for job {job_id}. Total logs: {len(all_logs)}")
+
+    return all_logs
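
A minimal sketch of calling the new pagination entry point directly, mirroring get_job_logs in train_commands.py (remote name and IDs are hypothetical):

    from truss.remote.baseten.core import get_training_job_logs_with_pagination
    from truss.remote.remote_factory import RemoteFactory

    remote_provider = RemoteFactory.create(remote="baseten")
    logs = get_training_job_logs_with_pagination(
        remote_provider.api, "proj_abc123", "job_def456"
    )
    print(f"fetched {len(logs)} log lines")

On repeated 5xx responses the fetcher halves the limit each retry (1000 → 500 → 250 → 125 → 100, since MIN_BATCH_SIZE floors it at 100) and gives up only once MIN_BATCH_SIZE also fails.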
truss/remote/baseten/utils/time.py ADDED
@@ -0,0 +1,15 @@
+from dateutil import parser
+
+
+def iso_to_millis(ts: str) -> int:
+    """
+    Convert ISO 8601 timestamp string to milliseconds since epoch.
+
+    Args:
+        ts: ISO 8601 timestamp string (handles Zulu/UTC (Z) automatically)
+
+    Returns:
+        Milliseconds since epoch as integer
+    """
+    dt = parser.isoparse(ts)  # handles Zulu/UTC (Z) automatically
+    return int(dt.timestamp() * 1000)
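
A quick sanity check for the new helper (UTC example; dateutil handles the trailing Z):

    from truss.remote.baseten.utils.time import iso_to_millis

    assert iso_to_millis("2024-01-01T00:00:00Z") == 1_704_067_200_000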