guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/mock_server/handlers/completions.py (new file)
@@ -0,0 +1,280 @@
+ """
+ Legacy OpenAI Completions API handler for the mock server.
+
+ This module provides the CompletionsHandler class that implements the /v1/completions
+ endpoint for the guidellm mock server. It supports both streaming and non-streaming
+ completions with configurable timing parameters (TTFT, ITL) and token generation to
+ simulate realistic LLM behavior for benchmarking and testing purposes.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import math
+ import time
+ import uuid
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse, ResponseStream
+ from transformers import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     CompletionChoice,
+     CompletionsRequest,
+     CompletionsResponse,
+     ErrorDetail,
+     ErrorResponse,
+     Usage,
+ )
+ from guidellm.mock_server.utils import (
+     MockTokenizer,
+     create_fake_text,
+     create_fake_tokens_str,
+     sample_number,
+     times_generator,
+ )
+
+ __all__ = ["CompletionsHandler"]
+
+
+ class CompletionsHandler:
+     """
+     Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+     This handler simulates the legacy OpenAI completions API by processing incoming
+     requests and generating responses with configurable timing and token generation
+     patterns. It supports both streaming and non-streaming modes, applying realistic
+     timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+     Example:
+     ::
+         config = MockServerConfig(ttft_ms=100, itl_ms=50)
+         handler = CompletionsHandler(config)
+         response = await handler.handle(sanic_request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the completions handler with configuration settings.
+
+         :param config: Mock server configuration containing timing parameters
+             and tokenizer settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def handle(self, request: Request) -> HTTPResponse:
+         """
+         Process a completions request and return the appropriate response.
+
+         Validates the incoming request, determines whether to use streaming or
+         non-streaming mode, and delegates to the appropriate handler method.
+
+         :param request: Sanic request object containing the completions request data
+         :return: HTTP response with completion data or error information
+         :raises ValidationError: When request validation fails
+         :raises json.JSONDecodeError: When request JSON is malformed
+         """
+         try:
+             # Parse and validate request
+             req_data = CompletionsRequest(**request.json)
+         except ValidationError as e:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(e)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (json.JSONDecodeError, TypeError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         # Handle streaming vs non-streaming
+         if req_data.stream:
+             return await self._handle_stream(req_data)
+         else:
+             return await self._handle_non_stream(req_data)
+
+     async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a non-streaming completion response.
+
+         Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+         a complete response with the generated text and usage statistics.
+
+         :param req: Validated completions request containing prompt and parameters
+         :return: JSON HTTP response with completion text and usage data
+         :raises NotImplementedError: When batch processing is requested
+         """
+         if isinstance(req.prompt, list):
+             raise NotImplementedError("Batch processing is not supported.")
+
+         # TTFT delay
+         await asyncio.sleep(
+             sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+         )
+
+         # Token counts
+         prompt_tokens = len(self.tokenizer(req.prompt))
+         max_tokens = req.max_tokens or math.inf
+         completion_tokens_count = int(
+             min(
+                 sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                 max_tokens,
+             )
+             if req.stop
+             else max_tokens
+         )
+
+         # ITL delay
+         itl_delay = 0.0
+         delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+         for _ in range(int(completion_tokens_count) - 1):
+             itl_delay += next(delays_iter)
+         await asyncio.sleep(itl_delay / 1000.0)
+
+         # Response
+         completion_response = CompletionsResponse(
+             id=f"cmpl-{uuid.uuid4().hex[:29]}",
+             model=req.model,
+             choices=[
+                 CompletionChoice(
+                     text=create_fake_text(completion_tokens_count, self.tokenizer),
+                     index=0,
+                     finish_reason="stop",
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens_count,
+             ),
+             system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+         )
+
+         return response.json(completion_response.model_dump())
+
+     async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a streaming completion response.
+
+         Creates a server-sent events stream that delivers tokens incrementally with
+         realistic timing delays between each token. Includes usage statistics if
+         requested and properly terminates the stream.
+
+         :param req: Validated completions request containing prompt and streaming
+             options
+         :return: ResponseStream object that generates server-sent events
+         """
+
+         async def generate_stream(stream_response):
+             completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+             # TTFT delay
+             await asyncio.sleep(
+                 sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+             )
+
+             # Token counts
+             prompt_tokens = len(self.tokenizer(req.prompt))
+             max_tokens = req.max_tokens or math.inf
+             completion_tokens_count = int(
+                 min(
+                     sample_number(
+                         self.config.output_tokens, self.config.output_tokens_std
+                     ),
+                     max_tokens,
+                 )
+                 if req.stop
+                 else max_tokens
+             )
+
+             # Send tokens
+             tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+             delays_iter = iter(
+                 times_generator(self.config.itl_ms, self.config.itl_ms_std)
+             )
+
+             for index, token in enumerate(tokens):
+                 if index > 0:
+                     itl_delay = next(delays_iter)
+                     await asyncio.sleep(itl_delay / 1000.0)
+
+                 chunk_data = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [
+                         {
+                             "text": token,
+                             "index": index,
+                             "finish_reason": None,
+                         }
+                     ],
+                 }
+                 await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+             # Send final chunk with finish reason
+             final_chunk = {
+                 "id": completion_id,
+                 "object": "text_completion",
+                 "created": int(time.time()),
+                 "model": req.model,
+                 "choices": [
+                     {
+                         "text": "",
+                         "index": index,
+                         "finish_reason": "stop",
+                     }
+                 ],
+             }
+             await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+             # Send usage if requested
+             if req.stream_options and req.stream_options.include_usage:
+                 usage_chunk = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [],
+                     "usage": {
+                         "prompt_tokens": prompt_tokens,
+                         "completion_tokens": completion_tokens_count,
+                         "total_tokens": prompt_tokens + completion_tokens_count,
+                     },
+                 }
+                 await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+             # End stream
+             await stream_response.write("data: [DONE]\n\n")
+
+         return ResponseStream(  # type: ignore[return-value]
+             generate_stream,
+             content_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no",
+             },
+         )
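Note: the handler above defines the wire format end to end. Requests carry "model", "prompt", "stream", "max_tokens", and an optional "stream_options.include_usage" flag, and streaming responses are server-sent events of the form "data: {...}" terminated by "data: [DONE]". A minimal client sketch follows; the base URL and port are illustrative assumptions and are not part of this diff.

# Hedged sketch: consume the mock /v1/completions SSE stream.
# Assumes the mock server is reachable at http://localhost:8000 (placeholder, not from this diff).
import json
import requests

payload = {
    "model": "mock-model",            # echoed back by the handler in each chunk
    "prompt": "Hello, world",
    "stream": True,
    "max_tokens": 64,
    "stream_options": {"include_usage": True},
}

with requests.post(
    "http://localhost:8000/v1/completions", json=payload, stream=True
) as resp:
    text_parts = []
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":          # stream terminator written by the handler
            break
        chunk = json.loads(data)
        if chunk.get("usage"):        # final usage chunk has an empty choices list
            print("usage:", chunk["usage"])
        elif chunk["choices"]:
            text_parts.append(chunk["choices"][0]["text"])
    print("".join(text_parts))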
guidellm/mock_server/handlers/tokenizer.py (new file)
@@ -0,0 +1,142 @@
+ """
+ HTTP request handler for vLLM tokenization API endpoints in the mock server.
+
+ This module provides the TokenizerHandler class that implements vLLM-compatible
+ tokenization and detokenization endpoints for testing and development purposes.
+ It handles text-to-token conversion, token-to-text reconstruction, request
+ validation, and error responses with proper HTTP status codes and JSON formatting.
+ """
+
+ from __future__ import annotations
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse
+ from transformers.tokenization_utils import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     DetokenizeRequest,
+     DetokenizeResponse,
+     ErrorDetail,
+     ErrorResponse,
+     TokenizeRequest,
+     TokenizeResponse,
+ )
+ from guidellm.mock_server.utils import MockTokenizer
+
+ __all__ = ["TokenizerHandler"]
+
+
+ class TokenizerHandler:
+     """
+     HTTP request handler for vLLM tokenization and detokenization endpoints.
+
+     Provides mock implementations of vLLM's tokenization API endpoints including
+     /tokenize for converting text to tokens and /detokenize for reconstructing
+     text from token sequences. Handles request validation, error responses, and
+     JSON serialization with proper HTTP status codes.
+
+     Example:
+     ::
+         handler = TokenizerHandler(config)
+         response = await handler.tokenize(request)
+         response = await handler.detokenize(request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the tokenizer handler with configuration.
+
+         :param config: Server configuration object containing tokenizer settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def tokenize(self, request: Request) -> HTTPResponse:
+         """
+         Convert input text to token IDs via the /tokenize endpoint.
+
+         Validates the request payload, extracts text content, and returns a JSON
+         response containing the token sequence and count. Handles validation errors
+         and malformed JSON with appropriate HTTP error responses.
+
+         :param request: Sanic HTTP request containing JSON payload with text field
+         :return: JSON response with tokens list and count, or error response
+         """
+         try:
+             req_data = TokenizeRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (ValueError, TypeError, KeyError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         tokens = self.tokenizer.tokenize(req_data.text)
+         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+
+         return response.json(
+             TokenizeResponse(tokens=token_ids, count=len(token_ids)).model_dump()
+         )
+
+     async def detokenize(self, request: Request) -> HTTPResponse:
+         """
+         Convert token IDs back to text via the /detokenize endpoint.
+
+         Validates the request payload, extracts token sequences, and returns a JSON
+         response containing the reconstructed text. Handles validation errors and
+         malformed JSON with appropriate HTTP error responses.
+
+         :param request: Sanic HTTP request containing JSON payload with tokens field
+         :return: JSON response with reconstructed text, or error response
+         """
+         try:
+             req_data = DetokenizeRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (ValueError, TypeError, KeyError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         text = self.tokenizer.decode(req_data.tokens, skip_special_tokens=False)
+
+         return response.json(DetokenizeResponse(text=text).model_dump())
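Note: the two endpoints above exchange simple JSON bodies: /tokenize accepts {"text": ...} and returns {"tokens": [...], "count": N}, while /detokenize accepts {"tokens": [...]} and returns {"text": ...}, mirroring the TokenizeRequest/TokenizeResponse and DetokenizeRequest/DetokenizeResponse models used by the handler. A minimal round-trip sketch, again assuming an illustrative base URL:

# Hedged sketch: tokenize a string with the mock server, then reconstruct it.
# Assumes the mock server is reachable at http://localhost:8000 (placeholder, not from this diff).
import requests

BASE = "http://localhost:8000"

tok = requests.post(f"{BASE}/tokenize", json={"text": "guidellm mock server"}).json()
print(tok["count"], tok["tokens"])    # token count and token IDs from TokenizeResponse

detok = requests.post(f"{BASE}/detokenize", json={"tokens": tok["tokens"]}).json()
print(detok["text"])                  # reconstructed text from DetokenizeResponse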