guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a169__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
guidellm/mock_server/models.py
@@ -0,0 +1,510 @@
"""
Pydantic models for OpenAI API and vLLM API request/response validation.

This module defines comprehensive data models for validating and serializing API
requests and responses compatible with both OpenAI's API specification and vLLM's
extended parameters. It includes models for chat completions, legacy text completions,
tokenization operations, and error handling, supporting both streaming and non-streaming
responses with full type safety and validation.
"""

from __future__ import annotations

import time
from typing import Any, Literal

from pydantic import BaseModel, Field

__all__ = [
    "ChatCompletionChoice",
    "ChatCompletionChunk",
    "ChatCompletionsRequest",
    "ChatCompletionsResponse",
    "ChatMessage",
    "CompletionChoice",
    "CompletionsRequest",
    "CompletionsResponse",
    "DetokenizeRequest",
    "DetokenizeResponse",
    "ErrorDetail",
    "ErrorResponse",
    "StreamOptions",
    "TokenizeRequest",
    "TokenizeResponse",
    "Usage",
]


class Usage(BaseModel):
    """Token usage statistics for API requests and responses.

    Tracks the number of tokens consumed in prompts, completions, and total
    usage for billing and monitoring purposes.
    """

    prompt_tokens: int = Field(description="Number of tokens in the input prompt")
    completion_tokens: int = Field(
        description="Number of tokens in the generated completion"
    )
    total_tokens: int = Field(description="Total tokens used (prompt + completion)")

    def __init__(self, prompt_tokens: int = 0, completion_tokens: int = 0, **kwargs):
        """Initialize usage statistics.

        :param prompt_tokens: Number of tokens in the input prompt
        :param completion_tokens: Number of tokens in the generated completion
        :param kwargs: Additional keyword arguments passed to BaseModel
        """
        super().__init__(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            **kwargs,
        )


class StreamOptions(BaseModel):
    """Configuration options for streaming API responses.

    Controls the behavior and content of streamed responses including
    whether to include usage statistics in the final chunk.
    """

    include_usage: bool | None = Field(
        default=None,
        description="Whether to include usage statistics in streaming responses",
    )


class ChatMessage(BaseModel):
    """A single message in a chat conversation.

    Represents one exchange in a conversational interface with role-based
    content and optional metadata for advanced features.
    """

    role: Literal["system", "user", "assistant", "tool"] = Field(
        description="Role of the message sender in the conversation"
    )
    content: str = Field(description="Text content of the message")
    name: str | None = Field(
        default=None, description="Optional name identifier for the message sender"
    )


class ChatCompletionsRequest(BaseModel):
    """Request parameters for chat completion API endpoints.

    Comprehensive model supporting both OpenAI standard parameters and vLLM
    extensions for advanced generation control, guided decoding, and performance
    optimization.
    """

    model: str = Field(description="Model identifier to use for generation")
    messages: list[ChatMessage] = Field(
        description="List of messages in the conversation"
    )
    max_tokens: int | None = Field(
        default=None, description="Maximum number of tokens to generate"
    )
    max_completion_tokens: int | None = Field(
        default=None, description="Maximum tokens in completion (OpenAI naming)"
    )
    temperature: float | None = Field(
        default=1.0, description="Sampling temperature for randomness control"
    )
    top_p: float | None = Field(default=1.0, description="Nucleus sampling parameter")
    n: int | None = Field(
        default=1, description="Number of completion choices to generate"
    )
    stream: bool | None = Field(
        default=False, description="Whether to stream response chunks"
    )
    stream_options: StreamOptions | None = Field(
        default=None, description="Configuration for streaming responses"
    )
    stop: str | list[str] | None = Field(
        default=None, description="Stop sequences to end generation"
    )
    presence_penalty: float | None = Field(
        default=0.0, description="Penalty for token presence to encourage diversity"
    )
    frequency_penalty: float | None = Field(
        default=0.0, description="Penalty for token frequency to reduce repetition"
    )
    logit_bias: dict[str, float] | None = Field(
        default=None, description="Bias values for specific tokens"
    )
    seed: int | None = Field(
        default=None, description="Random seed for reproducible outputs"
    )
    user: str | None = Field(
        default=None, description="User identifier for tracking and abuse monitoring"
    )

    # vLLM extensions
    use_beam_search: bool | None = Field(
        default=False, description="Enable beam search for better quality"
    )
    top_k: int | None = Field(default=None, description="Top-k sampling parameter")
    min_p: float | None = Field(
        default=None, description="Minimum probability threshold for sampling"
    )
    repetition_penalty: float | None = Field(
        default=None, description="Penalty for repeated tokens"
    )
    length_penalty: float | None = Field(
        default=1.0, description="Length penalty for sequence scoring"
    )
    stop_token_ids: list[int] | None = Field(
        default=None, description="Token IDs that trigger generation stop"
    )
    include_stop_str_in_output: bool | None = Field(
        default=False, description="Include stop sequence in output"
    )
    ignore_eos: bool | None = Field(
        default=False, description="Ignore end-of-sequence tokens"
    )
    min_tokens: int | None = Field(
        default=0, description="Minimum number of tokens to generate"
    )
    skip_special_tokens: bool | None = Field(
        default=True, description="Skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Add spaces between special tokens"
    )
    truncate_prompt_tokens: int | None = Field(
        default=None, description="Maximum prompt tokens before truncation"
    )
    allowed_token_ids: list[int] | None = Field(
        default=None, description="Restrict generation to specific token IDs"
    )
    prompt_logprobs: int | None = Field(
        default=None, description="Number of logprobs to return for prompt tokens"
    )
    add_special_tokens: bool | None = Field(
        default=True, description="Add special tokens during processing"
    )
    guided_json: str | dict[str, Any] | None = Field(
        default=None, description="JSON schema for guided generation"
    )
    guided_regex: str | None = Field(
        default=None, description="Regex pattern for guided generation"
    )
    guided_choice: list[str] | None = Field(
        default=None, description="List of choices for guided generation"
    )
    guided_grammar: str | None = Field(
        default=None, description="Grammar specification for guided generation"
    )
    guided_decoding_backend: str | None = Field(
        default=None, description="Backend to use for guided decoding"
    )
    guided_whitespace_pattern: str | None = Field(
        default=None, description="Whitespace pattern for guided generation"
    )
    priority: int | None = Field(
        default=0, description="Request priority for scheduling"
    )


class ChatCompletionChoice(BaseModel):
    """A single completion choice from a chat completion response.

    Contains the generated message and metadata about why generation
    stopped and the choice's position in the response.
    """

    index: int = Field(description="Index of this choice in the response")
    message: ChatMessage = Field(description="Generated message content")
    finish_reason: Literal["stop", "length", "content_filter", "tool_calls"] | None = (
        Field(description="Reason why generation finished")
    )


class ChatCompletionsResponse(BaseModel):
    """Response from chat completion API endpoints.

    Contains generated choices, usage statistics, and metadata for
    non-streaming chat completion requests.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["chat.completion"] = Field(
        default="chat.completion", description="Object type identifier"
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[ChatCompletionChoice] = Field(
        description="Generated completion choices"
    )
    usage: Usage | None = Field(default=None, description="Token usage statistics")
    system_fingerprint: str | None = Field(
        default=None, description="System configuration fingerprint"
    )


class ChatCompletionChunk(BaseModel):
    """A single chunk in a streamed chat completion response.

    Represents one piece of a streaming response with delta content
    and optional usage statistics in the final chunk.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["chat.completion.chunk"] = Field(
        default="chat.completion.chunk",
        description="Object type identifier for streaming chunks",
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[dict[str, Any]] = Field(description="Delta choices for streaming")
    usage: Usage | None = Field(
        default=None, description="Token usage statistics (typically in final chunk)"
    )


class CompletionsRequest(BaseModel):
    """Request parameters for legacy text completion API endpoints.

    Supports the older text completion format with prompt-based input
    and the same extensive parameter set as chat completions for
    backward compatibility.
    """

    model: str = Field(description="Model identifier to use for generation")
    prompt: str | list[str] = Field(description="Input prompt(s) for completion")
    max_tokens: int | None = Field(
        default=16, description="Maximum number of tokens to generate"
    )
    temperature: float | None = Field(
        default=1.0, description="Sampling temperature for randomness control"
    )
    top_p: float | None = Field(default=1.0, description="Nucleus sampling parameter")
    n: int | None = Field(
        default=1, description="Number of completion choices to generate"
    )
    stream: bool | None = Field(
        default=False, description="Whether to stream response chunks"
    )
    stream_options: StreamOptions | None = Field(
        default=None, description="Configuration for streaming responses"
    )
    logprobs: int | None = Field(
        default=None, description="Number of logprobs to return"
    )
    echo: bool | None = Field(
        default=False, description="Whether to echo the prompt in output"
    )
    stop: str | list[str] | None = Field(
        default_factory=lambda: ["<|endoftext|>"],
        description="Stop sequences to end generation",
    )
    presence_penalty: float | None = Field(
        default=0.0, description="Penalty for token presence to encourage diversity"
    )
    frequency_penalty: float | None = Field(
        default=0.0, description="Penalty for token frequency to reduce repetition"
    )
    best_of: int | None = Field(
        default=1, description="Number of candidates to generate and return the best"
    )
    logit_bias: dict[str, float] | None = Field(
        default=None, description="Bias values for specific tokens"
    )
    seed: int | None = Field(
        default=None, description="Random seed for reproducible outputs"
    )
    suffix: str | None = Field(
        default=None, description="Suffix to append after completion"
    )
    user: str | None = Field(
        default=None, description="User identifier for tracking and abuse monitoring"
    )

    # vLLM extensions (same as chat completions)
    use_beam_search: bool | None = Field(
        default=False, description="Enable beam search for better quality"
    )
    top_k: int | None = Field(default=None, description="Top-k sampling parameter")
    min_p: float | None = Field(
        default=None, description="Minimum probability threshold for sampling"
    )
    repetition_penalty: float | None = Field(
        default=None, description="Penalty for repeated tokens"
    )
    length_penalty: float | None = Field(
        default=1.0, description="Length penalty for sequence scoring"
    )
    stop_token_ids: list[int] | None = Field(
        default=None, description="Token IDs that trigger generation stop"
    )
    include_stop_str_in_output: bool | None = Field(
        default=False, description="Include stop sequence in output"
    )
    ignore_eos: bool | None = Field(
        default=False, description="Ignore end-of-sequence tokens"
    )
    min_tokens: int | None = Field(
        default=0, description="Minimum number of tokens to generate"
    )
    skip_special_tokens: bool | None = Field(
        default=True, description="Skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Add spaces between special tokens"
    )
    truncate_prompt_tokens: int | None = Field(
        default=None, description="Maximum prompt tokens before truncation"
    )
    allowed_token_ids: list[int] | None = Field(
        default=None, description="Restrict generation to specific token IDs"
    )
    prompt_logprobs: int | None = Field(
        default=None, description="Number of logprobs to return for prompt tokens"
    )
    add_special_tokens: bool | None = Field(
        default=True, description="Add special tokens during processing"
    )
    guided_json: str | dict[str, Any] | None = Field(
        default=None, description="JSON schema for guided generation"
    )
    guided_regex: str | None = Field(
        default=None, description="Regex pattern for guided generation"
    )
    guided_choice: list[str] | None = Field(
        default=None, description="List of choices for guided generation"
    )
    guided_grammar: str | None = Field(
        default=None, description="Grammar specification for guided generation"
    )
    guided_decoding_backend: str | None = Field(
        default=None, description="Backend to use for guided decoding"
    )
    guided_whitespace_pattern: str | None = Field(
        default=None, description="Whitespace pattern for guided generation"
    )
    priority: int | None = Field(
        default=0, description="Request priority for scheduling"
    )


class CompletionChoice(BaseModel):
    """A single completion choice from a text completion response.

    Contains the generated text and metadata about completion
    quality and stopping conditions.
    """

    text: str = Field(description="Generated text content")
    index: int = Field(description="Index of this choice in the response")
    logprobs: dict[str, Any] | None = Field(
        default=None, description="Log probabilities for generated tokens"
    )
    finish_reason: Literal["stop", "length", "content_filter"] | None = Field(
        description="Reason why generation finished"
    )


class CompletionsResponse(BaseModel):
    """Response from legacy text completion API endpoints.

    Contains generated text choices, usage statistics, and metadata
    for non-streaming text completion requests.
    """

    id: str = Field(description="Unique identifier for this completion")
    object: Literal["text_completion"] = Field(
        default="text_completion", description="Object type identifier"
    )
    created: int = Field(
        default_factory=lambda: int(time.time()),
        description="Unix timestamp of creation",
    )
    model: str = Field(description="Model used for generation")
    choices: list[CompletionChoice] = Field(description="Generated completion choices")
    usage: Usage | None = Field(default=None, description="Token usage statistics")
    system_fingerprint: str | None = Field(
        default=None, description="System configuration fingerprint"
    )


class TokenizeRequest(BaseModel):
    """Request for tokenizing text into token sequences.

    Converts input text into model-specific token representations
    with optional special token handling.
    """

    text: str = Field(description="Text to tokenize")
    add_special_tokens: bool | None = Field(
        default=True, description="Whether to add model-specific special tokens"
    )


class TokenizeResponse(BaseModel):
    """Response containing tokenized representation of input text.

    Provides both the token sequence and count for analysis
    and token budget planning.
    """

    tokens: list[int] = Field(description="List of token IDs")
    count: int = Field(description="Total number of tokens")


class DetokenizeRequest(BaseModel):
    """Request for converting token sequences back to text.

    Reconstructs human-readable text from model token representations
    with configurable special token handling.
    """

    tokens: list[int] = Field(description="List of token IDs to convert")
    skip_special_tokens: bool | None = Field(
        default=True, description="Whether to skip special tokens in output"
    )
    spaces_between_special_tokens: bool | None = Field(
        default=True, description="Whether to add spaces between special tokens"
    )


class DetokenizeResponse(BaseModel):
    """Response containing text reconstructed from tokens.

    Provides the human-readable text representation of the
    input token sequence.
    """

    text: str = Field(description="Reconstructed text from tokens")


class ErrorDetail(BaseModel):
    """Detailed error information for API failures.

    Provides structured error data including message, type classification,
    and optional error codes for debugging and error handling.
    """

    message: str = Field(description="Human-readable error description")
    type: str = Field(description="Error type classification")
    code: str | None = Field(
        default=None, description="Optional error code for programmatic handling"
    )


class ErrorResponse(BaseModel):
    """Standardized error response structure for API failures.

    Wraps error details in a consistent format compatible with
    OpenAI API error response conventions.
    """

    error: ErrorDetail = Field(description="Detailed error information")
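
For orientation, a minimal usage sketch (editor-added, not part of the released package) of the models defined above: it validates a chat request, derives token usage, and assembles a matching non-streaming response. The import path follows guidellm/mock_server/models.py from the file list; serialization via model_dump_json assumes pydantic v2.

from guidellm.mock_server.models import (
    ChatCompletionChoice,
    ChatCompletionsRequest,
    ChatCompletionsResponse,
    ChatMessage,
    Usage,
)

# Validate an incoming chat request; unset sampling fields keep their declared defaults.
request = ChatCompletionsRequest(
    model="test-model",
    messages=[ChatMessage(role="user", content="Hello!")],
    max_tokens=32,
)

# Usage.__init__ derives total_tokens from the prompt and completion counts.
usage = Usage(prompt_tokens=5, completion_tokens=7)
assert usage.total_tokens == 12

# Assemble the corresponding non-streaming response envelope.
reply = ChatCompletionsResponse(
    id="chatcmpl-123",
    model=request.model,
    choices=[
        ChatCompletionChoice(
            index=0,
            message=ChatMessage(role="assistant", content="Hi there!"),
            finish_reason="stop",
        )
    ],
    usage=usage,
)
print(reply.model_dump_json(indent=2))  # pydantic v2 API assumed
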
guidellm/mock_server/server.py
@@ -0,0 +1,168 @@
"""
High-performance mock server for OpenAI and vLLM API compatibility testing.

This module provides a Sanic-based mock server that simulates OpenAI and vLLM APIs
with configurable latency, token generation patterns, and response characteristics.
The server supports both streaming and non-streaming endpoints, enabling realistic
performance testing and validation of GuideLLM benchmarking workflows without
requiring actual model deployments.
"""

from __future__ import annotations

import time

from sanic import Sanic, response
from sanic.exceptions import NotFound
from sanic.log import logger
from sanic.request import Request
from sanic.response import HTTPResponse

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.handlers import (
    ChatCompletionsHandler,
    CompletionsHandler,
    TokenizerHandler,
)

__all__ = ["MockServer"]


class MockServer:
    """
    High-performance mock server implementing OpenAI and vLLM API endpoints.

    Provides a Sanic-based web server that simulates API responses with configurable
    timing characteristics for testing and benchmarking purposes. Supports chat
    completions, text completions, tokenization endpoints, and model listing with
    realistic latency patterns to enable comprehensive performance validation.

    Example:
    ::
        config = MockServerConfig(model="test-model", port=8080)
        server = MockServer(config)
        server.run()
    """

    def __init__(self, config: MockServerConfig) -> None:
        """
        Initialize the mock server with configuration.

        :param config: Server configuration containing network settings and response
            timing parameters
        """
        self.config = config
        self.app = Sanic("guidellm-mock-server")
        self.chat_handler = ChatCompletionsHandler(config)
        self.completions_handler = CompletionsHandler(config)
        self.tokenizer_handler = TokenizerHandler(config)

        self._setup_middleware()
        self._setup_routes()
        self._setup_error_handlers()

    def _setup_middleware(self):
        """Set up middleware for CORS, logging, etc."""

        @self.app.middleware("request")
        async def add_cors_headers(_request: Request):
            """Add CORS headers to all requests."""

        @self.app.middleware("response")
        async def add_response_headers(_request: Request, resp: HTTPResponse):
            """Add standard response headers."""
            resp.headers["Access-Control-Allow-Origin"] = "*"
            resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
            resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
            resp.headers["Server"] = "guidellm-mock-server"

    def _setup_routes(self):  # noqa: C901
        @self.app.get("/health")
        async def health_check(_request: Request):
            return response.json({"status": "healthy", "timestamp": time.time()})

        @self.app.get("/v1/models")
        async def list_models(_request: Request):
            return response.json(
                {
                    "object": "list",
                    "data": [
                        {
                            "id": self.config.model,
                            "object": "model",
                            "created": int(time.time()),
                            "owned_by": "guidellm-mock",
                        }
                    ],
                }
            )

        @self.app.route("/v1/chat/completions", methods=["POST", "OPTIONS"])
        async def chat_completions(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.chat_handler.handle(request)

        @self.app.route("/v1/completions", methods=["POST", "OPTIONS"])
        async def completions(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.completions_handler.handle(request)

        @self.app.route("/tokenize", methods=["POST", "OPTIONS"])
        async def tokenize(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.tokenizer_handler.tokenize(request)

        @self.app.route("/detokenize", methods=["POST", "OPTIONS"])
        async def detokenize(request: Request):
            if request.method == "OPTIONS":
                return response.text("", status=204)
            return await self.tokenizer_handler.detokenize(request)

    def _setup_error_handlers(self):
        """Set up error handlers."""

        @self.app.exception(Exception)
        async def generic_error_handler(_request: Request, exception: Exception):
            logger.error(f"Unhandled exception: {exception}")
            return response.json(
                {
                    "error": {
                        "message": "Internal server error",
                        "type": type(exception).__name__,
                        "error": str(exception),
                    }
                },
                status=500,
            )

        @self.app.exception(NotFound)
        async def not_found_handler(_request: Request, _exception):
            return response.json(
                {
                    "error": {
                        "message": "Not Found",
                        "type": "not_found_error",
                        "code": "not_found",
                    }
                },
                status=404,
            )

    def run(self) -> None:
        """
        Start the mock server with configured settings.

        Runs the Sanic application in single-process mode with access logging enabled
        for debugging and monitoring request patterns during testing.
        """
        self.app.run(
            host=self.config.host,
            port=self.config.port,
            debug=False,
            single_process=True,
            access_log=True,
            register_sys_signals=False,  # Disable signal handlers for threading
        )
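
To close the loop, a hypothetical launch sketch (editor-added, not part of the released package) for the server above. It assumes MockServerConfig accepts host, port, and model keyword arguments, which matches how MockServer reads config.host, config.port, and config.model; the actual fields live in guidellm/mock_server/config.py (+84 in this release), so verify against that file before use.

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.server import MockServer

# Hypothetical field names; check MockServerConfig for the real constructor signature.
config = MockServerConfig(host="127.0.0.1", port=8080, model="test-model")

# Blocks the current thread and serves /health, /v1/models, /v1/chat/completions,
# /v1/completions, /tokenize, and /detokenize as registered in _setup_routes().
MockServer(config).run()

Once running, a benchmark client can point at http://127.0.0.1:8080/v1 to exercise the OpenAI-compatible endpoints without a real model deployment, which is the stated purpose of the mock server package added in this release.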