guidellm-0.4.0a21-py3-none-any.whl → guidellm-0.4.0a169-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
guidellm/mock_server/handlers/chat_completions.py
@@ -0,0 +1,280 @@
+"""
+OpenAI Chat Completions API endpoint handler for the mock server.
+
+Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+realistic LLM behavior with configurable timing characteristics. Supports both streaming
+and non-streaming responses with proper token counting, latency simulation including
+TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+handling for comprehensive benchmarking scenarios.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    ChatCompletionChoice,
+    ChatCompletionsRequest,
+    ChatCompletionsResponse,
+    ChatMessage,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["ChatCompletionsHandler"]
+
+
+class ChatCompletionsHandler:
+    """
+    Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+    Implements the /v1/chat/completions endpoint behavior including request validation,
+    response generation, and timing simulation. Supports both streaming and
+    non-streaming modes with configurable latency characteristics for comprehensive
+    benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+    counting and realistic text generation.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = ChatCompletionsHandler(config)
+        response = await handler.handle(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the Chat Completions handler with server configuration.
+
+        :param config: Mock server configuration containing timing and behavior settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process incoming chat completion requests with validation and routing.
+
+        Validates the request payload, handles errors gracefully, and routes to
+        appropriate streaming or non-streaming response handlers based on the
+        request configuration.
+
+        :param request: Sanic HTTP request containing chat completion parameters
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request payload fails validation
+        :raises JSONDecodeError: When request contains invalid JSON
+        """
+        try:
+            # Parse and validate request
+            req_data = ChatCompletionsRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate complete non-streaming chat completion response.
+
+        Simulates realistic LLM behavior with TTFT and ITL delays, generates
+        appropriate token counts, and returns a complete response with usage
+        statistics and generated content.
+
+        :param req: Validated chat completion request parameters
+        :return: Complete HTTP response with generated completion data
+        """
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_text = self.tokenizer.apply_chat_template(req.messages)
+        prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+        max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+        completion_tokens_count = min(
+            sample_number(self.config.output_tokens, self.config.output_tokens_std),
+            max_tokens,
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        chat_response = ChatCompletionsResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content=create_fake_text(
+                            int(completion_tokens_count), self.tokenizer
+                        ),
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=int(completion_tokens_count),
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(chat_response.model_dump())
+
+    async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate streaming chat completion response with real-time token delivery.
+
+        Creates a streaming response that delivers tokens incrementally with
+        realistic timing delays. Supports optional usage statistics in the final
+        stream chunk when requested via stream_options.
+
+        :param req: Validated chat completion request with streaming enabled
+        :return: Streaming HTTP response delivering tokens with proper timing
+        """
+
+        async def generate_stream(stream_response):
+            completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_text = self.tokenizer.apply_chat_template(req.messages)
+            prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+            max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": token},
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
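For orientation, the handler above can be exercised like any OpenAI-compatible chat endpoint once the mock server is running. The snippet below is an illustrative client sketch, not part of the diff: the base URL http://localhost:8000, the model name "mock", and the use of httpx are assumptions, while the /v1/chat/completions path, the "data: ..." SSE framing, and the "[DONE]" terminator come from the handler code above.

import json

import httpx  # assumption: any HTTP client that can iterate a streamed body works

BASE_URL = "http://localhost:8000"  # assumption: wherever the mock server is listening

payload = {
    "model": "mock",  # placeholder model name; the mock server echoes it back
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
    "stream_options": {"include_usage": True},  # ask for the trailing usage chunk
}

with httpx.stream(
    "POST", f"{BASE_URL}/v1/chat/completions", json=payload, timeout=60.0
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip blank separator lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":  # sentinel written by generate_stream() above
            break
        chunk = json.loads(data)
        for choice in chunk.get("choices", []):
            print(choice.get("delta", {}).get("content", ""), end="", flush=True)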
guidellm/mock_server/handlers/completions.py
@@ -0,0 +1,280 @@
+"""
+Legacy OpenAI Completions API handler for the mock server.
+
+This module provides the CompletionsHandler class that implements the /v1/completions
+endpoint for the guidellm mock server. It supports both streaming and non-streaming
+completions with configurable timing parameters (TTFT, ITL) and token generation to
+simulate realistic LLM behavior for benchmarking and testing purposes.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    CompletionChoice,
+    CompletionsRequest,
+    CompletionsResponse,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["CompletionsHandler"]
+
+
+class CompletionsHandler:
+    """
+    Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+    This handler simulates the legacy OpenAI completions API by processing incoming
+    requests and generating responses with configurable timing and token generation
+    patterns. It supports both streaming and non-streaming modes, applying realistic
+    timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = CompletionsHandler(config)
+        response = await handler.handle(sanic_request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the completions handler with configuration settings.
+
+        :param config: Mock server configuration containing timing parameters
+            and tokenizer settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process a completions request and return the appropriate response.
+
+        Validates the incoming request, determines whether to use streaming or
+        non-streaming mode, and delegates to the appropriate handler method.
+
+        :param request: Sanic request object containing the completions request data
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request validation fails
+        :raises json.JSONDecodeError: When request JSON is malformed
+        """
+        try:
+            # Parse and validate request
+            req_data = CompletionsRequest(**request.json)
+        except ValidationError as e:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(e)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a non-streaming completion response.
+
+        Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+        a complete response with the generated text and usage statistics.
+
+        :param req: Validated completions request containing prompt and parameters
+        :return: JSON HTTP response with completion text and usage data
+        :raises NotImplementedError: When batch processing is requested
+        """
+        if isinstance(req.prompt, list):
+            raise NotImplementedError("Batch processing is not supported.")
+
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_tokens = len(self.tokenizer(req.prompt))
+        max_tokens = req.max_tokens or math.inf
+        completion_tokens_count = int(
+            min(
+                sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                max_tokens,
+            )
+            if req.stop
+            else max_tokens
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        completion_response = CompletionsResponse(
+            id=f"cmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                CompletionChoice(
+                    text=create_fake_text(completion_tokens_count, self.tokenizer),
+                    index=0,
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens_count,
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(completion_response.model_dump())
+
+    async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a streaming completion response.
+
+        Creates a server-sent events stream that delivers tokens incrementally with
+        realistic timing delays between each token. Includes usage statistics if
+        requested and properly terminates the stream.
+
+        :param req: Validated completions request containing prompt and streaming
+            options
+        :return: ResponseStream object that generates server-sent events
+        """
+
+        async def generate_stream(stream_response):
+            completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_tokens = len(self.tokenizer(req.prompt))
+            max_tokens = req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+                if req.stop
+                else max_tokens
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "text": token,
+                            "index": index,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": index,
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
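The legacy completions handler follows the same pattern on the /v1/completions path. A minimal non-streaming call, again an illustrative sketch under the same assumptions (placeholder base URL, model name, and httpx client); max_tokens bounds completion_tokens_count in _handle_non_stream above.

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/completions",  # assumption: local mock server address
    json={
        "model": "mock",                # placeholder model name
        "prompt": "Once upon a time",
        "max_tokens": 64,               # caps the sampled completion length
    },
    timeout=60.0,
)
resp.raise_for_status()
body = resp.json()
print(body["choices"][0]["text"])   # fake text produced by create_fake_text()
print(body["usage"])                # prompt/completion token counts from the handler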
guidellm/mock_server/handlers/tokenizer.py
@@ -0,0 +1,142 @@
+"""
+HTTP request handler for vLLM tokenization API endpoints in the mock server.
+
+This module provides the TokenizerHandler class that implements vLLM-compatible
+tokenization and detokenization endpoints for testing and development purposes.
+It handles text-to-token conversion, token-to-text reconstruction, request
+validation, and error responses with proper HTTP status codes and JSON formatting.
+"""
+
+from __future__ import annotations
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorDetail,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from guidellm.mock_server.utils import MockTokenizer
+
+__all__ = ["TokenizerHandler"]
+
+
+class TokenizerHandler:
+    """
+    HTTP request handler for vLLM tokenization and detokenization endpoints.
+
+    Provides mock implementations of vLLM's tokenization API endpoints including
+    /tokenize for converting text to tokens and /detokenize for reconstructing
+    text from token sequences. Handles request validation, error responses, and
+    JSON serialization with proper HTTP status codes.
+
+    Example:
+    ::
+        handler = TokenizerHandler(config)
+        response = await handler.tokenize(request)
+        response = await handler.detokenize(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the tokenizer handler with configuration.
+
+        :param config: Server configuration object containing tokenizer settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def tokenize(self, request: Request) -> HTTPResponse:
+        """
+        Convert input text to token IDs via the /tokenize endpoint.
+
+        Validates the request payload, extracts text content, and returns a JSON
+        response containing the token sequence and count. Handles validation errors
+        and malformed JSON with appropriate HTTP error responses.
+
+        :param request: Sanic HTTP request containing JSON payload with text field
+        :return: JSON response with tokens list and count, or error response
+        """
+        try:
+            req_data = TokenizeRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (ValueError, TypeError, KeyError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        tokens = self.tokenizer.tokenize(req_data.text)
+        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+
+        return response.json(
+            TokenizeResponse(tokens=token_ids, count=len(token_ids)).model_dump()
+        )
+
+    async def detokenize(self, request: Request) -> HTTPResponse:
+        """
+        Convert token IDs back to text via the /detokenize endpoint.
+
+        Validates the request payload, extracts token sequences, and returns a JSON
+        response containing the reconstructed text. Handles validation errors and
+        malformed JSON with appropriate HTTP error responses.
+
+        :param request: Sanic HTTP request containing JSON payload with tokens field
+        :return: JSON response with reconstructed text, or error response
+        """
+        try:
+            req_data = DetokenizeRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (ValueError, TypeError, KeyError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        text = self.tokenizer.decode(req_data.tokens, skip_special_tokens=False)
+
+        return response.json(DetokenizeResponse(text=text).model_dump())
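Since these endpoints mirror vLLM's tokenizer API, a round trip is just two POSTs. The sketch below is illustrative only, under the same local-server and httpx assumptions; the text, tokens, and count field names are the ones used by TokenizeRequest, TokenizeResponse, and DetokenizeRequest in the handler above.

import httpx

BASE_URL = "http://localhost:8000"  # assumption: local mock server address

# Text -> token IDs via /tokenize
tok = httpx.post(f"{BASE_URL}/tokenize", json={"text": "guidellm mock server"}).json()
print(tok["count"], tok["tokens"])

# Token IDs -> text via /detokenize
detok = httpx.post(f"{BASE_URL}/detokenize", json={"tokens": tok["tokens"]}).json()
print(detok["text"])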