guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/extras/vision.py
ADDED
@@ -0,0 +1,242 @@
+from __future__ import annotations
+
+import base64
+import io
+from pathlib import Path
+from typing import Any, Literal
+
+import httpx
+import numpy as np
+
+try:
+    from PIL import Image as PILImage
+except ImportError as e:
+    raise ImportError(
+        "Please install guidellm[vision] to use image/video features"
+    ) from e
+
+__all__ = [
+    "encode_image",
+    "encode_video",
+    "get_file_format",
+    "is_url",
+    "resize_image",
+]
+
+
+def is_url(text: Any) -> bool:
+    return isinstance(text, str) and text.startswith(("http://", "https://"))
+
+
+def encode_image(
+    image: bytes | str | Path | np.ndarray | PILImage.Image,
+    width: int | None = None,
+    height: int | None = None,
+    max_size: int | None = None,
+    max_width: int | None = None,
+    max_height: int | None = None,
+    encode_type: Literal["base64", "url"] | None = "base64",
+) -> dict[Literal["type", "image", "image_pixels", "image_bytes"], str | int | None]:
+    """
+    Input image types:
+    - bytes: raw image bytes, decoded with Pillow
+    - str: file path on disk, url, or already base64 encoded image string
+    - pathlib.Path: file path on disk
+    - np.ndarray: image array, decoded with Pillow
+    - PIL.Image.Image: Pillow image
+    - datasets.Image: HuggingFace datasets Image object
+
+    max_size: maximum size of the longest edge of the image
+    max_width: maximum width of the image
+    max_height: maximum height of the image
+
+    encode_type: None to return the supported format
+        (url for url, base64 string for others)
+        "base64" to return base64 encoded string (or download URL and encode)
+        "url" to return url (only if input is url, otherwise fails)
+
+    Returns a str of either:
+    - image url
+    - "data:image/{type};base64, {data}" string
+    """
+    if isinstance(image, str) and is_url(image):
+        if encode_type == "base64":
+            response = httpx.get(image)
+            response.raise_for_status()
+            return encode_image(
+                image=response.content,
+                max_size=max_size,
+                max_width=max_width,
+                max_height=max_height,
+                encode_type="base64",
+            )
+
+        if any([width, height, max_size, max_width, max_height]):
+            raise ValueError(f"Cannot resize image {image} when encode_type is 'url'")
+
+        return {
+            "type": "image_url",
+            "image": image,
+            "image_pixels": None,
+            "image_bytes": None,
+        }
+
+    decoded_image: PILImage.Image
+
+    if isinstance(image, bytes):
+        decoded_image = PILImage.open(io.BytesIO(image))
+    elif isinstance(image, str) and image.startswith("data:image/"):
+        _, encoded = image.split(",", 1)
+        image_data = base64.b64decode(encoded)
+        decoded_image = PILImage.open(io.BytesIO(image_data))
+    elif isinstance(image, str | Path):
+        decoded_image = PILImage.open(image)
+    elif isinstance(image, np.ndarray):
+        decoded_image = PILImage.fromarray(image)
+    elif isinstance(image, PILImage.Image):
+        decoded_image = image
+    else:
+        raise ValueError(f"Unsupported image type: {type(image)} for {image}")
+
+    output_image = resize_image(
+        decoded_image,
+        width=width,
+        height=height,
+        max_width=max_width,
+        max_height=max_height,
+        max_size=max_size,
+    )
+    if output_image.mode != "RGB":
+        output_image = output_image.convert("RGB")
+
+    buffer = io.BytesIO()
+    output_image.save(buffer, format="JPEG")
+    image_bytes = buffer.getvalue()
+    image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+    return {
+        "type": "image_base64",
+        "image": f"data:image/jpeg;base64,{image_base64}",
+        "image_pixels": output_image.width * output_image.height,
+        "image_bytes": len(image_bytes),
+    }
+
+
+def resize_image(
+    image: PILImage.Image,
+    width: int | None = None,
+    height: int | None = None,
+    max_width: int | None = None,
+    max_height: int | None = None,
+    max_size: int | None = None,
+) -> PILImage.Image:
+    if not isinstance(image, PILImage.Image):
+        raise ValueError(f"Unsupported image type: {type(image)}")
+
+    if width is not None and height is not None:
+        return image.resize((width, height), PILImage.Resampling.BILINEAR)
+
+    orig_w, orig_h = image.size
+    aspect = orig_w / orig_h
+
+    if width is not None:
+        target_w = width
+        target_h = round(width / aspect)
+    elif height is not None:
+        target_h = height
+        target_w = round(height * aspect)
+    else:
+        target_w, target_h = orig_w, orig_h
+
+    # Normalize max_size → max_width/max_height
+    if max_size is not None:
+        max_width = max_width or max_size
+        max_height = max_height or max_size
+
+    # Apply max constraints (preserve aspect ratio)
+    if max_width or max_height:
+        scale_w = max_width / target_w if max_width else 1.0
+        scale_h = max_height / target_h if max_height else 1.0
+        scale = min(scale_w, scale_h, 1.0)  # never upscale
+        target_w = round(target_w * scale)
+        target_h = round(target_h * scale)
+
+    if (target_w, target_h) != (orig_w, orig_h):
+        image = image.resize((target_w, target_h), PILImage.Resampling.BILINEAR)
+
+    return image
+
+
+def encode_video(
+    video: bytes | str | Path,
+    encode_type: Literal["base64", "url"] | None = "base64",
+) -> dict[
+    Literal["type", "video", "video_frames", "video_seconds", "video_bytes"],
+    str | int | float | None,
+]:
+    """
+    Input video types:
+    - bytes: raw video bytes
+    - str: file path on disk, url, or already base64 encoded video string
+    - pathlib.Path: file path on disk
+    - datasets.Video: HuggingFace datasets Video object
+
+    encode_type: None to return the supported format
+        (url for url, base64 string for others)
+        "base64" to return base64 encoded string (or download URL and encode)
+        "url" to return url (only if input is url, otherwise fails)
+
+    Returns a str of either:
+    - video url
+    - "data:video/{type};base64, {data}" string
+    """
+    if isinstance(video, str) and is_url(video):
+        if encode_type == "base64":
+            response = httpx.get(video)
+            response.raise_for_status()
+            return encode_video(video=response.content, encode_type="base64")
+
+        return {
+            "type": "video_url",
+            "video": video,
+            "video_frames": None,
+            "video_seconds": None,
+            "video_bytes": None,
+        }
+
+    if isinstance(video, str) and video.startswith("data:video/"):
+        data_str = video.split(",", 1)[1]
+
+        return {
+            "type": "video_base64",
+            "video": video,
+            "video_frames": None,
+            "video_seconds": None,
+            "video_bytes": len(data_str) * 3 // 4,  # base64 to bytes
+        }
+
+    if isinstance(video, str | Path):
+        path = Path(video)
+        video_bytes = path.read_bytes()
+        video_format = get_file_format(path)
+    elif isinstance(video, bytes):
+        video_bytes = video
+        video_format = "unknown"
+    else:
+        raise ValueError(f"Unsupported video type: {type(video)} for {video}")
+
+    video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+
+    return {
+        "type": "video_base64",
+        "video": f"data:video/{video_format};base64,{video_base64}",
+        "video_frames": None,
+        "video_seconds": None,
+        "video_bytes": len(video_bytes),
+    }
+
+
+def get_file_format(path: Path | str) -> str:
+    """Get file format from path extension."""
+    suffix = Path(path).suffix.lower()
+    return suffix[1:] if suffix.startswith(".") else "unknown"
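Usage sketch (not part of the diff): a minimal example of the helpers added in guidellm/extras/vision.py above. The local file names and the URL are placeholders, and the guidellm[vision] extra must be installed.

# Illustrative only - exercises encode_image / encode_video from the diff above.
from pathlib import Path

from guidellm.extras.vision import encode_image, encode_video, get_file_format

# Downscale a local image so its longest edge is at most 512 px and embed it
# as a base64 JPEG data URI; the payload also reports pixel and byte counts.
image_payload = encode_image(Path("sample.jpg"), max_size=512)
print(image_payload["type"], image_payload["image_pixels"], image_payload["image_bytes"])

# Pass a URL through untouched instead of downloading and re-encoding it.
url_payload = encode_image("https://example.com/cat.png", encode_type="url")
print(url_payload["type"])  # "image_url"

# Videos are read from disk and embedded as data:video/<ext>;base64 URIs.
video_payload = encode_video(Path("clip.mp4"))
print(get_file_format("clip.mp4"), video_payload["video_bytes"])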
guidellm/logger.py
CHANGED
@@ -41,7 +41,7 @@ import sys
 
 from loguru import logger
 
-from guidellm.config import LoggingSettings, settings
+from guidellm.settings import LoggingSettings, settings
 
 __all__ = ["configure_logger", "logger"]
 
@@ -72,7 +72,7 @@ def configure_logger(config: LoggingSettings = settings.logging):
         sys.stdout,
         level=config.console_log_level.upper(),
         format="<green>{time:YY-MM-DD HH:mm:ss}</green>|<level>{level: <8}</level> \
-|<cyan>{name}:{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+|<cyan>{name}:{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
     )
 
     if config.log_file or config.log_file_level:
guidellm/mock_server/config.py
ADDED
@@ -0,0 +1,84 @@
+"""
+Configuration settings for the mock server component.
+
+Provides centralized configuration management for mock server behavior including
+network binding, model identification, response timing characteristics, and token
+generation parameters. Supports environment variable configuration for deployment
+flexibility with automatic validation through Pydantic settings.
+"""
+
+from __future__ import annotations
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+__all__ = ["MockServerConfig"]
+
+
+class MockServerConfig(BaseSettings):
+    """
+    Configuration settings for mock server behavior and deployment.
+
+    Centralizes all configurable parameters for mock server operation including
+    network settings, model identification, response timing characteristics, and
+    token generation behavior. Environment variables with GUIDELLM_MOCK_SERVER_
+    prefix override default values for deployment flexibility.
+
+    Example:
+    ::
+        config = MockServerConfig(host="0.0.0.0", port=8080, model="custom-model")
+        # Use with environment variables:
+        # GUIDELLM_MOCK_SERVER_HOST=127.0.0.1 GUIDELLM_MOCK_SERVER_PORT=9000
+    """
+
+    host: str = Field(
+        default="127.0.0.1", description="Host address to bind the server to"
+    )
+    port: int = Field(default=8000, description="Port number to bind the server to")
+    workers: int = Field(default=1, description="Number of worker processes to spawn")
+    model: str = Field(
+        default="llama-3.1-8b-instruct",
+        description="Model name to present in API responses",
+    )
+    processor: str | None = Field(
+        default=None,
+        description=(
+            "Processor type to use for token stats, tokenize, and detokenize. "
+            "If None, a mock one is created."
+        ),
+    )
+    request_latency: float = Field(
+        default=3.0,
+        description="Base request latency in seconds for non-streaming responses",
+    )
+    request_latency_std: float = Field(
+        default=0.0,
+        description="Standard deviation for request latency variation",
+    )
+    ttft_ms: float = Field(
+        default=150.0,
+        description="Time to first token in milliseconds for streaming responses",
+    )
+    ttft_ms_std: float = Field(
+        default=0.0,
+        description="Standard deviation for time to first token variation",
+    )
+    itl_ms: float = Field(
+        default=10.0,
+        description="Inter-token latency in milliseconds for streaming responses",
+    )
+    itl_ms_std: float = Field(
+        default=0.0,
+        description="Standard deviation for inter-token latency variation",
+    )
+    output_tokens: int = Field(
+        default=128, description="Number of output tokens to generate in responses"
+    )
+    output_tokens_std: float = Field(
+        default=0.0,
+        description="Standard deviation for output token count variation",
+    )
+
+    class Config:
+        env_prefix = "GUIDELLM_MOCK_SERVER_"
+        case_sensitive = False
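Usage sketch (not part of the diff): because MockServerConfig is a pydantic-settings BaseSettings with the GUIDELLM_MOCK_SERVER_ env prefix shown above, environment variables override the defaults, with case-insensitive field matching.

# Illustrative only - environment overrides picked up by MockServerConfig above.
import os

from guidellm.mock_server.config import MockServerConfig

os.environ["GUIDELLM_MOCK_SERVER_PORT"] = "9000"
os.environ["GUIDELLM_MOCK_SERVER_TTFT_MS"] = "250"

config = MockServerConfig()  # reads the env vars via the GUIDELLM_MOCK_SERVER_ prefix
print(config.port, config.ttft_ms)  # 9000 250.0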
guidellm/mock_server/handlers/__init__.py
ADDED
@@ -0,0 +1,17 @@
+"""
+HTTP request handlers for the GuideLLM mock server.
+
+This module exposes request handlers that implement OpenAI-compatible API endpoints
+for the mock server. The handlers provide realistic LLM simulation capabilities
+including chat completions, legacy completions, and tokenization services with
+configurable timing characteristics, token counting, and proper error handling to
+support comprehensive benchmarking and testing scenarios.
+"""
+
+from __future__ import annotations
+
+from .chat_completions import ChatCompletionsHandler
+from .completions import CompletionsHandler
+from .tokenizer import TokenizerHandler
+
+__all__ = ["ChatCompletionsHandler", "CompletionsHandler", "TokenizerHandler"]
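Wiring sketch (not part of the diff): one way the handlers exported above could be mounted on a Sanic app. The actual wiring lives in guidellm/mock_server/server.py and may differ; in particular, the /v1/completions route and the CompletionsHandler constructor signature are assumptions made by analogy with ChatCompletionsHandler.

# Illustrative only - mounting the exported handlers on a Sanic app.
from sanic import Sanic

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.handlers import ChatCompletionsHandler, CompletionsHandler

config = MockServerConfig()
app = Sanic("guidellm-mock-server")

# Each handler exposes an async handle(request) coroutine that returns the response.
app.add_route(ChatCompletionsHandler(config).handle, "/v1/chat/completions", methods=["POST"])
app.add_route(CompletionsHandler(config).handle, "/v1/completions", methods=["POST"])

if __name__ == "__main__":
    app.run(host=config.host, port=config.port, workers=config.workers)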
guidellm/mock_server/handlers/chat_completions.py
ADDED
@@ -0,0 +1,280 @@
+"""
+OpenAI Chat Completions API endpoint handler for the mock server.
+
+Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+realistic LLM behavior with configurable timing characteristics. Supports both streaming
+and non-streaming responses with proper token counting, latency simulation including
+TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+handling for comprehensive benchmarking scenarios.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    ChatCompletionChoice,
+    ChatCompletionsRequest,
+    ChatCompletionsResponse,
+    ChatMessage,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["ChatCompletionsHandler"]
+
+
+class ChatCompletionsHandler:
+    """
+    Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+    Implements the /v1/chat/completions endpoint behavior including request validation,
+    response generation, and timing simulation. Supports both streaming and
+    non-streaming modes with configurable latency characteristics for comprehensive
+    benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+    counting and realistic text generation.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = ChatCompletionsHandler(config)
+        response = await handler.handle(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the Chat Completions handler with server configuration.
+
+        :param config: Mock server configuration containing timing and behavior settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process incoming chat completion requests with validation and routing.
+
+        Validates the request payload, handles errors gracefully, and routes to
+        appropriate streaming or non-streaming response handlers based on the
+        request configuration.
+
+        :param request: Sanic HTTP request containing chat completion parameters
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request payload fails validation
+        :raises JSONDecodeError: When request contains invalid JSON
+        """
+        try:
+            # Parse and validate request
+            req_data = ChatCompletionsRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate complete non-streaming chat completion response.
+
+        Simulates realistic LLM behavior with TTFT and ITL delays, generates
+        appropriate token counts, and returns a complete response with usage
+        statistics and generated content.
+
+        :param req: Validated chat completion request parameters
+        :return: Complete HTTP response with generated completion data
+        """
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_text = self.tokenizer.apply_chat_template(req.messages)
+        prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+        max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+        completion_tokens_count = min(
+            sample_number(self.config.output_tokens, self.config.output_tokens_std),
+            max_tokens,
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        chat_response = ChatCompletionsResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content=create_fake_text(
+                            int(completion_tokens_count), self.tokenizer
+                        ),
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=int(completion_tokens_count),
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(chat_response.model_dump())
+
+    async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate streaming chat completion response with real-time token delivery.
+
+        Creates a streaming response that delivers tokens incrementally with
+        realistic timing delays. Supports optional usage statistics in the final
+        stream chunk when requested via stream_options.
+
+        :param req: Validated chat completion request with streaming enabled
+        :return: Streaming HTTP response delivering tokens with proper timing
+        """
+
+        async def generate_stream(stream_response):
+            completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_text = self.tokenizer.apply_chat_template(req.messages)
+            prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+            max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": token},
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
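Client sketch (not part of the diff): a minimal consumer of the SSE stream produced by the _handle_stream path above, assuming the mock server is running on the default 127.0.0.1:8000 from MockServerConfig and serves the /v1/chat/completions route named in the module docstring.

# Illustrative only - reads the server-sent-event stream emitted by the handler above.
import json

import httpx

payload = {
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "Say hello"}],
    "stream": True,
    "stream_options": {"include_usage": True},
}

with httpx.stream(
    "POST", "http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=30.0
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip blank separator lines between events
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        if chunk["choices"]:
            print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
        elif "usage" in chunk:
            print("\n", chunk["usage"])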