guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a169__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of guidellm might be problematic.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +452 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +150 -317
- guidellm/benchmark/entrypoints.py +467 -128
- guidellm/benchmark/output.py +519 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2086 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +144 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +348 -0
- guidellm/data/loaders.py +149 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +404 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +31 -0
- guidellm/data/processor.py +31 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +226 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +71 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
- guidellm-0.4.0a169.dist-info/RECORD +95 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a21.dist-info/RECORD +0 -62
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
guidellm/mock_server/models.py (new file)
@@ -0,0 +1,510 @@

"""
Pydantic models for OpenAI API and vLLM API request/response validation.

This module defines comprehensive data models for validating and serializing API
requests and responses compatible with both OpenAI's API specification and vLLM's
extended parameters. It includes models for chat completions, legacy text completions,
tokenization operations, and error handling, supporting both streaming and non-streaming
responses with full type safety and validation.
"""

from __future__ import annotations

import time
from typing import Any, Literal

from pydantic import BaseModel, Field

__all__ = [
    "ChatCompletionChoice",
    "ChatCompletionChunk",
    "ChatCompletionsRequest",
    "ChatCompletionsResponse",
    "ChatMessage",
    "CompletionChoice",
    "CompletionsRequest",
    "CompletionsResponse",
    "DetokenizeRequest",
    "DetokenizeResponse",
    "ErrorDetail",
    "ErrorResponse",
    "StreamOptions",
    "TokenizeRequest",
    "TokenizeResponse",
    "Usage",
]


class Usage(BaseModel):
    """Token usage statistics for API requests and responses.

    Tracks the number of tokens consumed in prompts, completions, and total
    usage for billing and monitoring purposes.
    """

    prompt_tokens: int = Field(description="Number of tokens in the input prompt")
    completion_tokens: int = Field(
        description="Number of tokens in the generated completion"
    )
    total_tokens: int = Field(description="Total tokens used (prompt + completion)")

    def __init__(self, prompt_tokens: int = 0, completion_tokens: int = 0, **kwargs):
        """Initialize usage statistics.

        :param prompt_tokens: Number of tokens in the input prompt
        :param completion_tokens: Number of tokens in the generated completion
        :param kwargs: Additional keyword arguments passed to BaseModel
        """
        super().__init__(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            **kwargs,
        )


class StreamOptions(BaseModel):
    """Configuration options for streaming API responses.

    Controls the behavior and content of streamed responses including
    whether to include usage statistics in the final chunk.
    """

    include_usage: bool | None = Field(
        default=None,
        description="Whether to include usage statistics in streaming responses",
    )


class ChatMessage(BaseModel):
    """A single message in a chat conversation.

    Represents one exchange in a conversational interface with role-based
    content and optional metadata for advanced features.
    """

    role: Literal["system", "user", "assistant", "tool"] = Field(
        description="Role of the message sender in the conversation"
    )
    content: str = Field(description="Text content of the message")
    name: str | None = Field(
        default=None, description="Optional name identifier for the message sender"
    )


class ChatCompletionsRequest(BaseModel):
    """Request parameters for chat completion API endpoints.

    Comprehensive model supporting both OpenAI standard parameters and vLLM
    extensions for advanced generation control, guided decoding, and performance
    optimization.
    """

    model: str = Field(description="Model identifier to use for generation")
    messages: list[ChatMessage] = Field(
        description="List of messages in the conversation"
    )
    max_tokens: int | None = Field(
        default=None, description="Maximum number of tokens to generate"
    )
    max_completion_tokens: int | None = Field(
        default=None, description="Maximum tokens in completion (OpenAI naming)"
    )
    temperature: float | None = Field(
        default=1.0, description="Sampling temperature for randomness control"
    )
    top_p: float | None = Field(default=1.0, description="Nucleus sampling parameter")
    n: int | None = Field(
        default=1, description="Number of completion choices to generate"
    )
    stream: bool | None = Field(
        default=False, description="Whether to stream response chunks"
    )
    stream_options: StreamOptions | None = Field(
        default=None, description="Configuration for streaming responses"
    )
    stop: str | list[str] | None = Field(
        default=None, description="Stop sequences to end generation"
    )
    presence_penalty: float | None = Field(
        default=0.0, description="Penalty for token presence to encourage diversity"
    )
    frequency_penalty: float | None = Field(
        default=0.0, description="Penalty for token frequency to reduce repetition"
    )
    logit_bias: dict[str, float] | None = Field(
        default=None, description="Bias values for specific tokens"
    )
    seed: int | None = Field(
        default=None, description="Random seed for reproducible outputs"
    )
    user: str | None = Field(
        default=None, description="User identifier for tracking and abuse monitoring"
    )

    # vLLM extensions
    use_beam_search: bool | None = Field(
        default=False, description="Enable beam search for better quality"
    )
    top_k: int | None = Field(default=None, description="Top-k sampling parameter")
    min_p: float | None = Field(
        default=None, description="Minimum probability threshold for sampling"
    )
    repetition_penalty: float | None = Field(
        default=None, description="Penalty for repeated tokens"
    )
    length_penalty: float | None = Field(
        default=1.0, description="Length penalty for sequence scoring"
    )
    stop_token_ids: list[int] | None = Field(
        default=None, description="Token IDs that trigger generation stop"
    )
    include_stop_str_in_output: bool | None = Field(
        default=False, description="Include stop sequence in output"
    )
    ignore_eos: bool | None = Field(
        default=False, description="Ignore end-of-sequence tokens"
    )
    min_tokens: int | None = Field(
        default=0, description="Minimum number of tokens to generate"
    )
    skip_special_tokens: bool | None = Field(
        default=True, description="Skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Add spaces between special tokens"
    )
    truncate_prompt_tokens: int | None = Field(
        default=None, description="Maximum prompt tokens before truncation"
    )
    allowed_token_ids: list[int] | None = Field(
        default=None, description="Restrict generation to specific token IDs"
    )
    prompt_logprobs: int | None = Field(
        default=None, description="Number of logprobs to return for prompt tokens"
    )
    add_special_tokens: bool | None = Field(
        default=True, description="Add special tokens during processing"
    )
    guided_json: str | dict[str, Any] | None = Field(
        default=None, description="JSON schema for guided generation"
    )
    guided_regex: str | None = Field(
        default=None, description="Regex pattern for guided generation"
    )
    guided_choice: list[str] | None = Field(
        default=None, description="List of choices for guided generation"
    )
    guided_grammar: str | None = Field(
        default=None, description="Grammar specification for guided generation"
    )
    guided_decoding_backend: str | None = Field(
        default=None, description="Backend to use for guided decoding"
    )
    guided_whitespace_pattern: str | None = Field(
        default=None, description="Whitespace pattern for guided generation"
    )
    priority: int | None = Field(
        default=0, description="Request priority for scheduling"
    )


class ChatCompletionChoice(BaseModel):
    """A single completion choice from a chat completion response.

    Contains the generated message and metadata about why generation
    stopped and the choice's position in the response.
    """

    index: int = Field(description="Index of this choice in the response")
    message: ChatMessage = Field(description="Generated message content")
    finish_reason: Literal["stop", "length", "content_filter", "tool_calls"] | None = (
        Field(description="Reason why generation finished")
    )


class ChatCompletionsResponse(BaseModel):
    """Response from chat completion API endpoints.

    Contains generated choices, usage statistics, and metadata for
    non-streaming chat completion requests.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["chat.completion"] = Field(
        default="chat.completion", description="Object type identifier"
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[ChatCompletionChoice] = Field(
        description="Generated completion choices"
    )
    usage: Usage | None = Field(default=None, description="Token usage statistics")
    system_fingerprint: str | None = Field(
        default=None, description="System configuration fingerprint"
    )


class ChatCompletionChunk(BaseModel):
    """A single chunk in a streamed chat completion response.

    Represents one piece of a streaming response with delta content
    and optional usage statistics in the final chunk.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["chat.completion.chunk"] = Field(
        default="chat.completion.chunk",
        description="Object type identifier for streaming chunks",
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[dict[str, Any]] = Field(description="Delta choices for streaming")
    usage: Usage | None = Field(
        default=None, description="Token usage statistics (typically in final chunk)"
    )


class CompletionsRequest(BaseModel):
    """Request parameters for legacy text completion API endpoints.

    Supports the older text completion format with prompt-based input
    and the same extensive parameter set as chat completions for
    backward compatibility.
    """

    model: str = Field(description="Model identifier to use for generation")
    prompt: str | list[str] = Field(description="Input prompt(s) for completion")
    max_tokens: int | None = Field(
        default=16, description="Maximum number of tokens to generate"
    )
    temperature: float | None = Field(
        default=1.0, description="Sampling temperature for randomness control"
    )
    top_p: float | None = Field(default=1.0, description="Nucleus sampling parameter")
    n: int | None = Field(
        default=1, description="Number of completion choices to generate"
    )
    stream: bool | None = Field(
        default=False, description="Whether to stream response chunks"
    )
    stream_options: StreamOptions | None = Field(
        default=None, description="Configuration for streaming responses"
    )
    logprobs: int | None = Field(
        default=None, description="Number of logprobs to return"
    )
    echo: bool | None = Field(
        default=False, description="Whether to echo the prompt in output"
    )
    stop: str | list[str] | None = Field(
        default_factory=lambda: ["<|endoftext|>"],
        description="Stop sequences to end generation",
    )
    presence_penalty: float | None = Field(
        default=0.0, description="Penalty for token presence to encourage diversity"
    )
    frequency_penalty: float | None = Field(
        default=0.0, description="Penalty for token frequency to reduce repetition"
    )
    best_of: int | None = Field(
        default=1, description="Number of candidates to generate and return the best"
    )
    logit_bias: dict[str, float] | None = Field(
        default=None, description="Bias values for specific tokens"
    )
    seed: int | None = Field(
        default=None, description="Random seed for reproducible outputs"
    )
    suffix: str | None = Field(
        default=None, description="Suffix to append after completion"
    )
    user: str | None = Field(
        default=None, description="User identifier for tracking and abuse monitoring"
    )

    # vLLM extensions (same as chat completions)
    use_beam_search: bool | None = Field(
        default=False, description="Enable beam search for better quality"
    )
    top_k: int | None = Field(default=None, description="Top-k sampling parameter")
    min_p: float | None = Field(
        default=None, description="Minimum probability threshold for sampling"
    )
    repetition_penalty: float | None = Field(
        default=None, description="Penalty for repeated tokens"
    )
    length_penalty: float | None = Field(
        default=1.0, description="Length penalty for sequence scoring"
    )
    stop_token_ids: list[int] | None = Field(
        default=None, description="Token IDs that trigger generation stop"
    )
    include_stop_str_in_output: bool | None = Field(
        default=False, description="Include stop sequence in output"
    )
    ignore_eos: bool | None = Field(
        default=False, description="Ignore end-of-sequence tokens"
    )
    min_tokens: int | None = Field(
        default=0, description="Minimum number of tokens to generate"
    )
    skip_special_tokens: bool | None = Field(
        default=True, description="Skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Add spaces between special tokens"
    )
    truncate_prompt_tokens: int | None = Field(
        default=None, description="Maximum prompt tokens before truncation"
    )
    allowed_token_ids: list[int] | None = Field(
        default=None, description="Restrict generation to specific token IDs"
    )
    prompt_logprobs: int | None = Field(
        default=None, description="Number of logprobs to return for prompt tokens"
    )
    add_special_tokens: bool | None = Field(
        default=True, description="Add special tokens during processing"
    )
    guided_json: str | dict[str, Any] | None = Field(
        default=None, description="JSON schema for guided generation"
    )
    guided_regex: str | None = Field(
        default=None, description="Regex pattern for guided generation"
    )
    guided_choice: list[str] | None = Field(
        default=None, description="List of choices for guided generation"
    )
    guided_grammar: str | None = Field(
        default=None, description="Grammar specification for guided generation"
    )
    guided_decoding_backend: str | None = Field(
        default=None, description="Backend to use for guided decoding"
    )
    guided_whitespace_pattern: str | None = Field(
        default=None, description="Whitespace pattern for guided generation"
    )
    priority: int | None = Field(
        default=0, description="Request priority for scheduling"
    )


class CompletionChoice(BaseModel):
    """A single completion choice from a text completion response.

    Contains the generated text and metadata about completion
    quality and stopping conditions.
    """

    text: str = Field(description="Generated text content")
    index: int = Field(description="Index of this choice in the response")
    logprobs: dict[str, Any] | None = Field(
        default=None, description="Log probabilities for generated tokens"
    )
    finish_reason: Literal["stop", "length", "content_filter"] | None = Field(
        description="Reason why generation finished"
    )


class CompletionsResponse(BaseModel):
    """Response from legacy text completion API endpoints.

    Contains generated text choices, usage statistics, and metadata
    for non-streaming text completion requests.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["text_completion"] = Field(
        default="text_completion", description="Object type identifier"
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[CompletionChoice] = Field(description="Generated completion choices")
    usage: Usage | None = Field(default=None, description="Token usage statistics")
    system_fingerprint: str | None = Field(
        default=None, description="System configuration fingerprint"
    )


class TokenizeRequest(BaseModel):
    """Request for tokenizing text into token sequences.

    Converts input text into model-specific token representations
    with optional special token handling.
    """

    text: str = Field(description="Text to tokenize")
    add_special_tokens: bool | None = Field(
        default=True, description="Whether to add model-specific special tokens"
    )


class TokenizeResponse(BaseModel):
    """Response containing tokenized representation of input text.

    Provides both the token sequence and count for analysis
    and token budget planning.
    """

    tokens: list[int] = Field(description="List of token IDs")
    count: int = Field(description="Total number of tokens")


class DetokenizeRequest(BaseModel):
    """Request for converting token sequences back to text.

    Reconstructs human-readable text from model token representations
    with configurable special token handling.
    """

    tokens: list[int] = Field(description="List of token IDs to convert")
    skip_special_tokens: bool | None = Field(
        default=True, description="Whether to skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Whether to add spaces between special tokens"
    )


class DetokenizeResponse(BaseModel):
    """Response containing text reconstructed from tokens.

    Provides the human-readable text representation of the
    input token sequence.
    """

    text: str = Field(description="Reconstructed text from tokens")


class ErrorDetail(BaseModel):
    """Detailed error information for API failures.

    Provides structured error data including message, type classification,
    and optional error codes for debugging and error handling.
    """

    message: str = Field(description="Human-readable error description")
    type: str = Field(description="Error type classification")
    code: str | None = Field(
        default=None, description="Optional error code for programmatic handling"
    )


class ErrorResponse(BaseModel):
    """Standardized error response structure for API failures.

    Wraps error details in a consistent format compatible with
    OpenAI API error response conventions.
    """

    error: ErrorDetail = Field(description="Detailed error information")
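Since these are plain Pydantic schemas, validating a mock request and assembling a response is just model construction. The following is an editor's sketch (not part of the package) assuming Pydantic v2 method names (`model_validate`, `model_dump_json`); the payload values and the `chatcmpl-123` id are illustrative, only the field names come from the definitions above.

```python
from guidellm.mock_server.models import (
    ChatCompletionChoice,
    ChatCompletionsRequest,
    ChatCompletionsResponse,
    ChatMessage,
    Usage,
)

# Incoming request body as it would arrive over HTTP
payload = {
    "model": "test-model",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32,
    "stream": False,
}
request = ChatCompletionsRequest.model_validate(payload)  # raises ValidationError on bad input

completion = ChatCompletionsResponse(
    id="chatcmpl-123",  # hypothetical id for illustration
    model=request.model,
    choices=[
        ChatCompletionChoice(
            index=0,
            message=ChatMessage(role="assistant", content="Hi there!"),
            finish_reason="stop",
        )
    ],
    # Usage.__init__ derives total_tokens = prompt_tokens + completion_tokens (8 here)
    usage=Usage(prompt_tokens=5, completion_tokens=3),
)
print(completion.model_dump_json(indent=2))
```

Note that `object` and `created` fall back to their declared defaults, and `Usage` computes `total_tokens` in its custom `__init__`, so callers only supply the prompt and completion counts.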
guidellm/mock_server/server.py (new file)
@@ -0,0 +1,168 @@

"""
High-performance mock server for OpenAI and vLLM API compatibility testing.

This module provides a Sanic-based mock server that simulates OpenAI and vLLM APIs
with configurable latency, token generation patterns, and response characteristics.
The server supports both streaming and non-streaming endpoints, enabling realistic
performance testing and validation of GuideLLM benchmarking workflows without
requiring actual model deployments.
"""

from __future__ import annotations

import time

from sanic import Sanic, response
from sanic.exceptions import NotFound
from sanic.log import logger
from sanic.request import Request
from sanic.response import HTTPResponse

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.handlers import (
    ChatCompletionsHandler,
    CompletionsHandler,
    TokenizerHandler,
)

__all__ = ["MockServer"]


class MockServer:
    """
    High-performance mock server implementing OpenAI and vLLM API endpoints.

    Provides a Sanic-based web server that simulates API responses with configurable
    timing characteristics for testing and benchmarking purposes. Supports chat
    completions, text completions, tokenization endpoints, and model listing with
    realistic latency patterns to enable comprehensive performance validation.

    Example:
    ::
        config = ServerConfig(model="test-model", port=8080)
        server = MockServer(config)
        server.run()
    """

    def __init__(self, config: MockServerConfig) -> None:
        """
        Initialize the mock server with configuration.

        :param config: Server configuration containing network settings and response
            timing parameters
        """
        self.config = config
        self.app = Sanic("guidellm-mock-server")
        self.chat_handler = ChatCompletionsHandler(config)
        self.completions_handler = CompletionsHandler(config)
        self.tokenizer_handler = TokenizerHandler(config)

        self._setup_middleware()
        self._setup_routes()
        self._setup_error_handlers()

    def _setup_middleware(self):
        """Setup middleware for CORS, logging, etc."""

        @self.app.middleware("request")
        async def add_cors_headers(_request: Request):
            """Add CORS headers to all requests."""

        @self.app.middleware("response")
        async def add_response_headers(_request: Request, resp: HTTPResponse):
            """Add standard response headers."""
            resp.headers["Access-Control-Allow-Origin"] = "*"
            resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
            resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
            resp.headers["Server"] = "guidellm-mock-server"

    def _setup_routes(self):  # noqa: C901
        @self.app.get("/health")
        async def health_check(_request: Request):
            return response.json({"status": "healthy", "timestamp": time.time()})

        @self.app.get("/v1/models")
        async def list_models(_request: Request):
            return response.json(
                {
                    "object": "list",
                    "data": [
                        {
                            "id": self.config.model,
                            "object": "model",
                            "created": int(time.time()),
                            "owned_by": "guidellm-mock",
                        }
                    ],
                }
            )

        @self.app.route("/v1/chat/completions", methods=["POST", "OPTIONS"])
        async def chat_completions(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.chat_handler.handle(request)

        @self.app.route("/v1/completions", methods=["POST", "OPTIONS"])
        async def completions(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.completions_handler.handle(request)

        @self.app.route("/tokenize", methods=["POST", "OPTIONS"])
        async def tokenize(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.tokenizer_handler.tokenize(request)

        @self.app.route("/detokenize", methods=["POST", "OPTIONS"])
        async def detokenize(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.tokenizer_handler.detokenize(request)

    def _setup_error_handlers(self):
        """Setup error handlers."""

        @self.app.exception(Exception)
        async def generic_error_handler(_request: Request, exception: Exception):
            logger.error(f"Unhandled exception: {exception}")
            return response.json(
                {
                    "error": {
                        "message": "Internal server error",
                        "type": type(exception).__name__,
                        "error": str(exception),
                    }
                },
                status=500,
            )

        @self.app.exception(NotFound)
        async def not_found_handler(_request: Request, _exception):
            return response.json(
                {
                    "error": {
                        "message": "Not Found",
                        "type": "not_found_error",
                        "code": "not_found",
                    }
                },
                status=404,
            )

    def run(self) -> None:
        """
        Start the mock server with configured settings.

        Runs the Sanic application in single-process mode with access logging enabled
        for debugging and monitoring request patterns during testing.
        """
        self.app.run(
            host=self.config.host,
            port=self.config.port,
            debug=False,
            single_process=True,
            access_log=True,
            register_sys_signals=False,  # Disable signal handlers for threading
        )
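Putting the two files together, here is a rough sketch of starting the mock server in one process and probing its OpenAI-compatible endpoints from another. The `MockServerConfig(host=..., port=..., model=...)` keyword arguments are assumed to mirror the `config.host`, `config.port`, and `config.model` attributes that `MockServer` reads; `guidellm/mock_server/config.py` is not shown in this diff, and the `requests` library is used purely for illustration.

```python
import sys

import requests  # third-party HTTP client, used only for this sketch

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.server import MockServer

if len(sys.argv) > 1 and sys.argv[1] == "serve":
    # Terminal 1: start the mock server (blocks until interrupted).
    # Keyword arguments are an assumption; see lead-in above.
    config = MockServerConfig(host="127.0.0.1", port=8080, model="test-model")
    MockServer(config).run()
else:
    # Terminal 2: exercise the endpoints registered in _setup_routes().
    base = "http://127.0.0.1:8080"
    print(requests.get(f"{base}/health", timeout=5).json())
    print(requests.get(f"{base}/v1/models", timeout=5).json())
    chat = requests.post(
        f"{base}/v1/chat/completions",
        json={
            "model": "test-model",
            "messages": [{"role": "user", "content": "ping"}],
            "max_tokens": 8,
        },
        timeout=30,
    )
    print(chat.json())
```

The actual response bodies come from the handlers listed earlier in this diff (`chat_completions.py`, `completions.py`, `tokenizer.py`), which `MockServer` wires to `/v1/chat/completions`, `/v1/completions`, `/tokenize`, and `/detokenize`.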