guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of guidellm has been flagged as possibly problematic in its registry.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a21.dist-info/RECORD +0 -62
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
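The list above amounts to a near-complete reorganization of the package: the old guidellm.backend, guidellm.dataset, guidellm.objects, and guidellm.request packages are removed, guidellm.config becomes guidellm.settings, and new backends, data, mock_server, scheduler, and schemas packages appear. A minimal import smoke-test sketch for the new layout; the module paths are taken directly from the file list, and nothing about their exported symbols is assumed:

import importlib

for module_name in (
    "guidellm.backends.openai",     # replaces the removed guidellm.backend.openai
    "guidellm.settings",            # renamed from guidellm.config
    "guidellm.utils.statistics",    # moved from guidellm.objects.statistics
    "guidellm.mock_server.server",  # new mock server package
    "guidellm.benchmark.schemas",   # new benchmark schemas module
):
    importlib.import_module(module_name)
    print(f"ok: {module_name}")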
guidellm/mock_server/handlers/__init__.py
@@ -0,0 +1,17 @@
+"""
+HTTP request handlers for the GuideLLM mock server.
+
+This module exposes request handlers that implement OpenAI-compatible API endpoints
+for the mock server. The handlers provide realistic LLM simulation capabilities
+including chat completions, legacy completions, and tokenization services with
+configurable timing characteristics, token counting, and proper error handling to
+support comprehensive benchmarking and testing scenarios.
+"""
+
+from __future__ import annotations
+
+from .chat_completions import ChatCompletionsHandler
+from .completions import CompletionsHandler
+from .tokenizer import TokenizerHandler
+
+__all__ = ["ChatCompletionsHandler", "CompletionsHandler", "TokenizerHandler"]
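The package only re-exports the three handler classes; guidellm.mock_server.server presumably does the actual wiring. Purely as an illustration, a sketch of how these handlers could be mounted on a Sanic app, using the MockServerConfig(ttft_ms=..., itl_ms=...) construction shown in the handler docstrings below and the endpoint paths named in their module docstrings; the app name, host, and port are arbitrary choices:

from sanic import Sanic

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.handlers import ChatCompletionsHandler, CompletionsHandler

app = Sanic("mock_openai")
config = MockServerConfig(ttft_ms=100, itl_ms=50)

# Each handler exposes an async handle(request) coroutine, so the bound method
# can be registered directly as a Sanic route handler.
app.add_route(
    ChatCompletionsHandler(config).handle, "/v1/chat/completions", methods=["POST"]
)
app.add_route(CompletionsHandler(config).handle, "/v1/completions", methods=["POST"])

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=8000)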
guidellm/mock_server/handlers/chat_completions.py
@@ -0,0 +1,280 @@
+"""
+OpenAI Chat Completions API endpoint handler for the mock server.
+
+Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+realistic LLM behavior with configurable timing characteristics. Supports both streaming
+and non-streaming responses with proper token counting, latency simulation including
+TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+handling for comprehensive benchmarking scenarios.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    ChatCompletionChoice,
+    ChatCompletionsRequest,
+    ChatCompletionsResponse,
+    ChatMessage,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["ChatCompletionsHandler"]
+
+
+class ChatCompletionsHandler:
+    """
+    Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+    Implements the /v1/chat/completions endpoint behavior including request validation,
+    response generation, and timing simulation. Supports both streaming and
+    non-streaming modes with configurable latency characteristics for comprehensive
+    benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+    counting and realistic text generation.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = ChatCompletionsHandler(config)
+        response = await handler.handle(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the Chat Completions handler with server configuration.
+
+        :param config: Mock server configuration containing timing and behavior settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process incoming chat completion requests with validation and routing.
+
+        Validates the request payload, handles errors gracefully, and routes to
+        appropriate streaming or non-streaming response handlers based on the
+        request configuration.
+
+        :param request: Sanic HTTP request containing chat completion parameters
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request payload fails validation
+        :raises JSONDecodeError: When request contains invalid JSON
+        """
+        try:
+            # Parse and validate request
+            req_data = ChatCompletionsRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate complete non-streaming chat completion response.
+
+        Simulates realistic LLM behavior with TTFT and ITL delays, generates
+        appropriate token counts, and returns a complete response with usage
+        statistics and generated content.
+
+        :param req: Validated chat completion request parameters
+        :return: Complete HTTP response with generated completion data
+        """
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_text = self.tokenizer.apply_chat_template(req.messages)
+        prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+        max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+        completion_tokens_count = min(
+            sample_number(self.config.output_tokens, self.config.output_tokens_std),
+            max_tokens,
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        chat_response = ChatCompletionsResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content=create_fake_text(
+                            int(completion_tokens_count), self.tokenizer
+                        ),
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=int(completion_tokens_count),
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(chat_response.model_dump())
+
+    async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate streaming chat completion response with real-time token delivery.
+
+        Creates a streaming response that delivers tokens incrementally with
+        realistic timing delays. Supports optional usage statistics in the final
+        stream chunk when requested via stream_options.
+
+        :param req: Validated chat completion request with streaming enabled
+        :return: Streaming HTTP response delivering tokens with proper timing
+        """

+        async def generate_stream(stream_response):
+            completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_text = self.tokenizer.apply_chat_template(req.messages)
+            prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+            max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": token},
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
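A rough client-side sketch of consuming the streaming branch above, assuming the mock server is reachable at http://127.0.0.1:8000. The request fields (model, messages, stream, stream_options.include_usage) mirror how the handler reads ChatCompletionsRequest, and the SSE framing ("data: ...", terminated by "data: [DONE]") matches the write() calls in generate_stream; the model name is a placeholder:

import json
import requests

payload = {
    "model": "mock-model",  # placeholder; the handler echoes back whatever model is sent
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
    "stream_options": {"include_usage": True},
}

with requests.post(
    "http://127.0.0.1:8000/v1/chat/completions", json=payload, stream=True
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separators between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        if chunk["choices"]:
            # Per-token chunks carry the text under choices[0].delta.content
            print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
        elif "usage" in chunk:
            # Final usage chunk (empty choices) sent when include_usage was requested
            print("\nusage:", chunk["usage"])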
guidellm/mock_server/handlers/completions.py
@@ -0,0 +1,280 @@
+"""
+Legacy OpenAI Completions API handler for the mock server.
+
+This module provides the CompletionsHandler class that implements the /v1/completions
+endpoint for the guidellm mock server. It supports both streaming and non-streaming
+completions with configurable timing parameters (TTFT, ITL) and token generation to
+simulate realistic LLM behavior for benchmarking and testing purposes.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    CompletionChoice,
+    CompletionsRequest,
+    CompletionsResponse,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["CompletionsHandler"]
+
+
+class CompletionsHandler:
+    """
+    Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+    This handler simulates the legacy OpenAI completions API by processing incoming
+    requests and generating responses with configurable timing and token generation
+    patterns. It supports both streaming and non-streaming modes, applying realistic
+    timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = CompletionsHandler(config)
+        response = await handler.handle(sanic_request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the completions handler with configuration settings.
+
+        :param config: Mock server configuration containing timing parameters
+            and tokenizer settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process a completions request and return the appropriate response.
+
+        Validates the incoming request, determines whether to use streaming or
+        non-streaming mode, and delegates to the appropriate handler method.
+
+        :param request: Sanic request object containing the completions request data
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request validation fails
+        :raises json.JSONDecodeError: When request JSON is malformed
+        """
+        try:
+            # Parse and validate request
+            req_data = CompletionsRequest(**request.json)
+        except ValidationError as e:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(e)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a non-streaming completion response.
+
+        Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+        a complete response with the generated text and usage statistics.
+
+        :param req: Validated completions request containing prompt and parameters
+        :return: JSON HTTP response with completion text and usage data
+        :raises NotImplementedError: When batch processing is requested
+        """
+        if isinstance(req.prompt, list):
+            raise NotImplementedError("Batch processing is not supported.")
+
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_tokens = len(self.tokenizer(req.prompt))
+        max_tokens = req.max_tokens or math.inf
+        completion_tokens_count = int(
+            min(
+                sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                max_tokens,
+            )
+            if req.stop
+            else max_tokens
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        completion_response = CompletionsResponse(
+            id=f"cmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                CompletionChoice(
+                    text=create_fake_text(completion_tokens_count, self.tokenizer),
+                    index=0,
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens_count,
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(completion_response.model_dump())
+
+    async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a streaming completion response.
+
+        Creates a server-sent events stream that delivers tokens incrementally with
+        realistic timing delays between each token. Includes usage statistics if
+        requested and properly terminates the stream.
+
+        :param req: Validated completions request containing prompt and streaming
+            options
+        :return: ResponseStream object that generates server-sent events
+        """

+        async def generate_stream(stream_response):
+            completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_tokens = len(self.tokenizer(req.prompt))
+            max_tokens = req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+                if req.stop
+                else max_tokens
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "text": token,
+                            "index": index,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": index,
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
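And a matching sketch for the non-streaming legacy endpoint, under the same assumption of a mock server listening locally on port 8000. A single string prompt is used because the handler raises NotImplementedError for list prompts, and max_tokens bounds the generated length as in the handler above; the model name is again a placeholder:

import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/completions",
    json={"model": "mock-model", "prompt": "Once upon a time", "max_tokens": 64},
    timeout=30,
)
resp.raise_for_status()
body = resp.json()
# choices[0].text holds the generated fake text; usage carries the token counts
print(body["choices"][0]["text"])
print(body["usage"])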