guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/extras/vision.py ADDED
@@ -0,0 +1,242 @@
+ from __future__ import annotations
+
+ import base64
+ import io
+ from pathlib import Path
+ from typing import Any, Literal
+
+ import httpx
+ import numpy as np
+
+ try:
+     from PIL import Image as PILImage
+ except ImportError as e:
+     raise ImportError(
+         "Please install guidellm[vision] to use image/video features"
+     ) from e
+
+ __all__ = [
+     "encode_image",
+     "encode_video",
+     "get_file_format",
+     "is_url",
+     "resize_image",
+ ]
+
+
+ def is_url(text: Any) -> bool:
+     return isinstance(text, str) and text.startswith(("http://", "https://"))
+
+
+ def encode_image(
+     image: bytes | str | Path | np.ndarray | PILImage.Image,
+     width: int | None = None,
+     height: int | None = None,
+     max_size: int | None = None,
+     max_width: int | None = None,
+     max_height: int | None = None,
+     encode_type: Literal["base64", "url"] | None = "base64",
+ ) -> dict[Literal["type", "image", "image_pixels", "image_bytes"], str | int | None]:
+     """
+     Input image types:
+     - bytes: raw image bytes, decoded with Pillow
+     - str: file path on disk, url, or already base64 encoded image string
+     - pathlib.Path: file path on disk
+     - np.ndarray: image array, decoded with Pillow
+     - PIL.Image.Image: Pillow image
+     - datasets.Image: HuggingFace datasets Image object
+
+     max_size: maximum size of the longest edge of the image
+     max_width: maximum width of the image
+     max_height: maximum height of the image
+
+     encode_type: None to return the supported format
+         (url for url, base64 string for others)
+         "base64" to return base64 encoded string (or download URL and encode)
+         "url" to return url (only if input is url, otherwise fails)
+
+     Returns a str of either:
+     - image url
+     - "data:image/{type};base64, {data}" string
+     """
+     if isinstance(image, str) and is_url(image):
+         if encode_type == "base64":
+             response = httpx.get(image)
+             response.raise_for_status()
+             return encode_image(
+                 image=response.content,
+                 max_size=max_size,
+                 max_width=max_width,
+                 max_height=max_height,
+                 encode_type="base64",
+             )
+
+         if any([width, height, max_size, max_width, max_height]):
+             raise ValueError(f"Cannot resize image {image} when encode_type is 'url'")
+
+         return {
+             "type": "image_url",
+             "image": image,
+             "image_pixels": None,
+             "image_bytes": None,
+         }
+
+     decoded_image: PILImage.Image
+
+     if isinstance(image, bytes):
+         decoded_image = PILImage.open(io.BytesIO(image))
+     elif isinstance(image, str) and image.startswith("data:image/"):
+         _, encoded = image.split(",", 1)
+         image_data = base64.b64decode(encoded)
+         decoded_image = PILImage.open(io.BytesIO(image_data))
+     elif isinstance(image, str | Path):
+         decoded_image = PILImage.open(image)
+     elif isinstance(image, np.ndarray):
+         decoded_image = PILImage.fromarray(image)
+     elif isinstance(image, PILImage.Image):
+         decoded_image = image
+     else:
+         raise ValueError(f"Unsupported image type: {type(image)} for {image}")
+
+     output_image = resize_image(
+         decoded_image,
+         width=width,
+         height=height,
+         max_width=max_width,
+         max_height=max_height,
+         max_size=max_size,
+     )
+     if output_image.mode != "RGB":
+         output_image = output_image.convert("RGB")
+
+     buffer = io.BytesIO()
+     output_image.save(buffer, format="JPEG")
+     image_bytes = buffer.getvalue()
+     image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+     return {
+         "type": "image_base64",
+         "image": f"data:image/jpeg;base64,{image_base64}",
+         "image_pixels": output_image.width * output_image.height,
+         "image_bytes": len(image_bytes),
+     }
+
+
+ def resize_image(
+     image: PILImage.Image,
+     width: int | None = None,
+     height: int | None = None,
+     max_width: int | None = None,
+     max_height: int | None = None,
+     max_size: int | None = None,
+ ) -> PILImage.Image:
+     if not isinstance(image, PILImage.Image):
+         raise ValueError(f"Unsupported image type: {type(image)}")
+
+     if width is not None and height is not None:
+         return image.resize((width, height), PILImage.Resampling.BILINEAR)
+
+     orig_w, orig_h = image.size
+     aspect = orig_w / orig_h
+
+     if width is not None:
+         target_w = width
+         target_h = round(width / aspect)
+     elif height is not None:
+         target_h = height
+         target_w = round(height * aspect)
+     else:
+         target_w, target_h = orig_w, orig_h
+
+     # Normalize max_size → max_width/max_height
+     if max_size is not None:
+         max_width = max_width or max_size
+         max_height = max_height or max_size
+
+     # Apply max constraints (preserve aspect ratio)
+     if max_width or max_height:
+         scale_w = max_width / target_w if max_width else 1.0
+         scale_h = max_height / target_h if max_height else 1.0
+         scale = min(scale_w, scale_h, 1.0)  # never upscale
+         target_w = round(target_w * scale)
+         target_h = round(target_h * scale)
+
+     if (target_w, target_h) != (orig_w, orig_h):
+         image = image.resize((target_w, target_h), PILImage.Resampling.BILINEAR)
+
+     return image
+
+
+ def encode_video(
+     video: bytes | str | Path,
+     encode_type: Literal["base64", "url"] | None = "base64",
+ ) -> dict[
+     Literal["type", "video", "video_frames", "video_seconds", "video_bytes"],
+     str | int | float | None,
+ ]:
+     """
+     Input video types:
+     - bytes: raw video bytes
+     - str: file path on disk, url, or already base64 encoded video string
+     - pathlib.Path: file path on disk
+     - datasets.Video: HuggingFace datasets Video object
+
+     encode_type: None to return the supported format
+         (url for url, base64 string for others)
+         "base64" to return base64 encoded string (or download URL and encode)
+         "url" to return url (only if input is url, otherwise fails)
+
+     Returns a str of either:
+     - video url
+     - "data:video/{type};base64, {data}" string
+     """
+     if isinstance(video, str) and is_url(video):
+         if encode_type == "base64":
+             response = httpx.get(video)
+             response.raise_for_status()
+             return encode_video(video=response.content, encode_type="base64")
+
+         return {
+             "type": "video_url",
+             "video": video,
+             "video_frames": None,
+             "video_seconds": None,
+             "video_bytes": None,
+         }
+
+     if isinstance(video, str) and video.startswith("data:video/"):
+         data_str = video.split(",", 1)[1]
+
+         return {
+             "type": "video_base64",
+             "video": video,
+             "video_frames": None,
+             "video_seconds": None,
+             "video_bytes": len(data_str) * 3 // 4,  # base64 to bytes
+         }
+
+     if isinstance(video, str | Path):
+         path = Path(video)
+         video_bytes = path.read_bytes()
+         video_format = get_file_format(path)
+     elif isinstance(video, bytes):
+         video_bytes = video
+         video_format = "unknown"
+     else:
+         raise ValueError(f"Unsupported video type: {type(video)} for {video}")
+
+     video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+
+     return {
+         "type": "video_base64",
+         "video": f"data:video/{video_format};base64,{video_base64}",
+         "video_frames": None,
+         "video_seconds": None,
+         "video_bytes": len(video_bytes),
+     }
+
+
+ def get_file_format(path: Path | str) -> str:
+     """Get file format from path extension."""
+     suffix = Path(path).suffix.lower()
+     return suffix[1:] if suffix.startswith(".") else "unknown"
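The docstrings above describe the accepted input types and encode_type modes for the new vision helpers. The sketch below shows how they might be called once the guidellm[vision] extra is installed; the module path guidellm.extras.vision is inferred from the file list, and the sample file names are purely illustrative.

```python
from pathlib import Path

from guidellm.extras.vision import encode_image, encode_video, is_url

# Local image: downscaled so the longest edge is <= 512 px, re-encoded as JPEG,
# and returned as a data: URI with pixel/byte counts.
img = encode_image(Path("sample.jpg"), max_size=512)
print(img["type"], img["image_pixels"], img["image_bytes"])

# Remote image passed through untouched when encode_type="url".
ref = encode_image("https://example.com/cat.png", encode_type="url")
assert ref["type"] == "image_url" and is_url(ref["image"])

# Video file read from disk and wrapped as a base64 data: URI.
vid = encode_video(Path("clip.mp4"))
print(vid["type"], vid["video_bytes"])
```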
guidellm/logger.py CHANGED
@@ -41,7 +41,7 @@ import sys
 
  from loguru import logger
 
- from guidellm.config import LoggingSettings, settings
+ from guidellm.settings import LoggingSettings, settings
 
  __all__ = ["configure_logger", "logger"]
 
@@ -72,7 +72,7 @@ def configure_logger(config: LoggingSettings = settings.logging):
          sys.stdout,
          level=config.console_log_level.upper(),
          format="<green>{time:YY-MM-DD HH:mm:ss}</green>|<level>{level: <8}</level> \
- |<cyan>{name}:{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+ |<cyan>{name}:{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
      )
 
      if config.log_file or config.log_file_level:
guidellm/mock_server/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ GuideLLM Mock Server for OpenAI and vLLM API compatibility.
+ """
+
+ from .config import MockServerConfig
+ from .server import MockServer
+
+ __all__ = ["MockServer", "MockServerConfig"]
guidellm/mock_server/config.py ADDED
@@ -0,0 +1,84 @@
+ """
+ Configuration settings for the mock server component.
+
+ Provides centralized configuration management for mock server behavior including
+ network binding, model identification, response timing characteristics, and token
+ generation parameters. Supports environment variable configuration for deployment
+ flexibility with automatic validation through Pydantic settings.
+ """
+
+ from __future__ import annotations
+
+ from pydantic import Field
+ from pydantic_settings import BaseSettings
+
+ __all__ = ["MockServerConfig"]
+
+
+ class MockServerConfig(BaseSettings):
+     """
+     Configuration settings for mock server behavior and deployment.
+
+     Centralizes all configurable parameters for mock server operation including
+     network settings, model identification, response timing characteristics, and
+     token generation behavior. Environment variables with GUIDELLM_MOCK_SERVER_
+     prefix override default values for deployment flexibility.
+
+     Example:
+     ::
+         config = MockServerConfig(host="0.0.0.0", port=8080, model="custom-model")
+         # Use with environment variables:
+         # GUIDELLM_MOCK_SERVER_HOST=127.0.0.1 GUIDELLM_MOCK_SERVER_PORT=9000
+     """
+
+     host: str = Field(
+         default="127.0.0.1", description="Host address to bind the server to"
+     )
+     port: int = Field(default=8000, description="Port number to bind the server to")
+     workers: int = Field(default=1, description="Number of worker processes to spawn")
+     model: str = Field(
+         default="llama-3.1-8b-instruct",
+         description="Model name to present in API responses",
+     )
+     processor: str | None = Field(
+         default=None,
+         description=(
+             "Processor type to use for token stats, tokenize, and detokenize. "
+             "If None, a mock one is created."
+         ),
+     )
+     request_latency: float = Field(
+         default=3.0,
+         description="Base request latency in seconds for non-streaming responses",
+     )
+     request_latency_std: float = Field(
+         default=0.0,
+         description="Standard deviation for request latency variation",
+     )
+     ttft_ms: float = Field(
+         default=150.0,
+         description="Time to first token in milliseconds for streaming responses",
+     )
+     ttft_ms_std: float = Field(
+         default=0.0,
+         description="Standard deviation for time to first token variation",
+     )
+     itl_ms: float = Field(
+         default=10.0,
+         description="Inter-token latency in milliseconds for streaming responses",
+     )
+     itl_ms_std: float = Field(
+         default=0.0,
+         description="Standard deviation for inter-token latency variation",
+     )
+     output_tokens: int = Field(
+         default=128, description="Number of output tokens to generate in responses"
+     )
+     output_tokens_std: float = Field(
+         default=0.0,
+         description="Standard deviation for output token count variation",
+     )
+
+     class Config:
+         env_prefix = "GUIDELLM_MOCK_SERVER_"
+         case_sensitive = False
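As the class docstring notes, every field can be set either directly or through a GUIDELLM_MOCK_SERVER_-prefixed environment variable. A minimal sketch of both styles, assuming the standard pydantic-settings behavior shown above (field values are illustrative):

```python
import os

from guidellm.mock_server import MockServerConfig

# Explicit construction overrides the Field defaults shown above.
cfg = MockServerConfig(host="0.0.0.0", port=8080, ttft_ms=100.0, itl_ms=5.0)

# Environment variables with the GUIDELLM_MOCK_SERVER_ prefix (case-insensitive)
# are picked up automatically by the BaseSettings machinery.
os.environ["GUIDELLM_MOCK_SERVER_MODEL"] = "custom-model"
os.environ["GUIDELLM_MOCK_SERVER_OUTPUT_TOKENS"] = "256"
env_cfg = MockServerConfig()
assert env_cfg.model == "custom-model" and env_cfg.output_tokens == 256
```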
guidellm/mock_server/handlers/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """
+ HTTP request handlers for the GuideLLM mock server.
+
+ This module exposes request handlers that implement OpenAI-compatible API endpoints
+ for the mock server. The handlers provide realistic LLM simulation capabilities
+ including chat completions, legacy completions, and tokenization services with
+ configurable timing characteristics, token counting, and proper error handling to
+ support comprehensive benchmarking and testing scenarios.
+ """
+
+ from __future__ import annotations
+
+ from .chat_completions import ChatCompletionsHandler
+ from .completions import CompletionsHandler
+ from .tokenizer import TokenizerHandler
+
+ __all__ = ["ChatCompletionsHandler", "CompletionsHandler", "TokenizerHandler"]
guidellm/mock_server/handlers/chat_completions.py ADDED
@@ -0,0 +1,280 @@
+ """
+ OpenAI Chat Completions API endpoint handler for the mock server.
+
+ Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+ realistic LLM behavior with configurable timing characteristics. Supports both streaming
+ and non-streaming responses with proper token counting, latency simulation including
+ TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+ handling for comprehensive benchmarking scenarios.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import math
+ import time
+ import uuid
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse, ResponseStream
+ from transformers import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     ChatCompletionChoice,
+     ChatCompletionsRequest,
+     ChatCompletionsResponse,
+     ChatMessage,
+     ErrorDetail,
+     ErrorResponse,
+     Usage,
+ )
+ from guidellm.mock_server.utils import (
+     MockTokenizer,
+     create_fake_text,
+     create_fake_tokens_str,
+     sample_number,
+     times_generator,
+ )
+
+ __all__ = ["ChatCompletionsHandler"]
+
+
+ class ChatCompletionsHandler:
+     """
+     Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+     Implements the /v1/chat/completions endpoint behavior including request validation,
+     response generation, and timing simulation. Supports both streaming and
+     non-streaming modes with configurable latency characteristics for comprehensive
+     benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+     counting and realistic text generation.
+
+     Example:
+     ::
+         config = MockServerConfig(ttft_ms=100, itl_ms=50)
+         handler = ChatCompletionsHandler(config)
+         response = await handler.handle(request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the Chat Completions handler with server configuration.
+
+         :param config: Mock server configuration containing timing and behavior settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def handle(self, request: Request) -> HTTPResponse:
+         """
+         Process incoming chat completion requests with validation and routing.
+
+         Validates the request payload, handles errors gracefully, and routes to
+         appropriate streaming or non-streaming response handlers based on the
+         request configuration.
+
+         :param request: Sanic HTTP request containing chat completion parameters
+         :return: HTTP response with completion data or error information
+         :raises ValidationError: When request payload fails validation
+         :raises JSONDecodeError: When request contains invalid JSON
+         """
+         try:
+             # Parse and validate request
+             req_data = ChatCompletionsRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (json.JSONDecodeError, TypeError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         # Handle streaming vs non-streaming
+         if req_data.stream:
+             return await self._handle_stream(req_data)
+         else:
+             return await self._handle_non_stream(req_data)
+
+     async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+         """
+         Generate complete non-streaming chat completion response.
+
+         Simulates realistic LLM behavior with TTFT and ITL delays, generates
+         appropriate token counts, and returns a complete response with usage
+         statistics and generated content.
+
+         :param req: Validated chat completion request parameters
+         :return: Complete HTTP response with generated completion data
+         """
+         # TTFT delay
+         await asyncio.sleep(
+             sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+         )
+
+         # Token counts
+         prompt_text = self.tokenizer.apply_chat_template(req.messages)
+         prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+         max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+         completion_tokens_count = min(
+             sample_number(self.config.output_tokens, self.config.output_tokens_std),
+             max_tokens,
+         )
+
+         # ITL delay
+         itl_delay = 0.0
+         delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+         for _ in range(int(completion_tokens_count) - 1):
+             itl_delay += next(delays_iter)
+         await asyncio.sleep(itl_delay / 1000.0)
+
+         # Response
+         chat_response = ChatCompletionsResponse(
+             id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+             model=req.model,
+             choices=[
+                 ChatCompletionChoice(
+                     index=0,
+                     message=ChatMessage(
+                         role="assistant",
+                         content=create_fake_text(
+                             int(completion_tokens_count), self.tokenizer
+                         ),
+                     ),
+                     finish_reason="stop",
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=int(completion_tokens_count),
+             ),
+             system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+         )
+
+         return response.json(chat_response.model_dump())
+
+     async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+         """
+         Generate streaming chat completion response with real-time token delivery.
+
+         Creates a streaming response that delivers tokens incrementally with
+         realistic timing delays. Supports optional usage statistics in the final
+         stream chunk when requested via stream_options.
+
+         :param req: Validated chat completion request with streaming enabled
+         :return: Streaming HTTP response delivering tokens with proper timing
+         """
+
+         async def generate_stream(stream_response):
+             completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+             # TTFT delay
+             await asyncio.sleep(
+                 sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+             )
+
+             # Token counts
+             prompt_text = self.tokenizer.apply_chat_template(req.messages)
+             prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+             max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+             completion_tokens_count = int(
+                 min(
+                     sample_number(
+                         self.config.output_tokens, self.config.output_tokens_std
+                     ),
+                     max_tokens,
+                 )
+             )
+
+             # Send tokens
+             tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+             delays_iter = iter(
+                 times_generator(self.config.itl_ms, self.config.itl_ms_std)
+             )
+
+             for index, token in enumerate(tokens):
+                 if index > 0:
+                     itl_delay = next(delays_iter)
+                     await asyncio.sleep(itl_delay / 1000.0)
+
+                 chunk_data = {
+                     "id": completion_id,
+                     "object": "chat.completion.chunk",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [
+                         {
+                             "index": 0,
+                             "delta": {"content": token},
+                             "finish_reason": None,
+                         }
+                     ],
+                 }
+                 await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+             # Send final chunk with finish reason
+             final_chunk = {
+                 "id": completion_id,
+                 "object": "chat.completion.chunk",
+                 "created": int(time.time()),
+                 "model": req.model,
+                 "choices": [
+                     {
+                         "index": 0,
+                         "delta": {},
+                         "finish_reason": "stop",
+                     }
+                 ],
+             }
+             await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+             # Send usage if requested
+             if req.stream_options and req.stream_options.include_usage:
+                 usage_chunk = {
+                     "id": completion_id,
+                     "object": "chat.completion.chunk",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [],
+                     "usage": {
+                         "prompt_tokens": prompt_tokens,
+                         "completion_tokens": completion_tokens_count,
+                         "total_tokens": prompt_tokens + completion_tokens_count,
+                     },
+                 }
+                 await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+             # End stream
+             await stream_response.write("data: [DONE]\n\n")
+
+         return ResponseStream(  # type: ignore[return-value]
+             generate_stream,
+             content_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no",
+             },
+         )
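The streaming branch above emits Server-Sent Events: one chat.completion.chunk JSON payload per token delta, an optional usage chunk when stream_options.include_usage is set, and a terminating data: [DONE]. A hedged client-side sketch for consuming that stream, assuming a mock server is listening on the default 127.0.0.1:8000:

```python
import json

import httpx

payload = {
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
    "stream_options": {"include_usage": True},
}

with httpx.stream(
    "POST", "http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=30.0
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip blank separator lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        for choice in chunk.get("choices", []):
            # Token deltas arrive one per chunk until finish_reason="stop"
            print(choice["delta"].get("content", ""), end="", flush=True)
        if "usage" in chunk:
            print("\n", chunk["usage"])
```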