guidellm-0.4.0a21-py3-none-any.whl → guidellm-0.4.0a169-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (115)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +452 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +150 -317
  9. guidellm/benchmark/entrypoints.py +467 -128
  10. guidellm/benchmark/output.py +519 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2086 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +144 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +194 -0
  24. guidellm/data/deserializers/synthetic.py +348 -0
  25. guidellm/data/loaders.py +149 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +404 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +31 -0
  30. guidellm/data/processor.py +31 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +6 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/extras/__init__.py +4 -0
  35. guidellm/extras/audio.py +215 -0
  36. guidellm/extras/vision.py +242 -0
  37. guidellm/logger.py +2 -2
  38. guidellm/mock_server/__init__.py +8 -0
  39. guidellm/mock_server/config.py +84 -0
  40. guidellm/mock_server/handlers/__init__.py +17 -0
  41. guidellm/mock_server/handlers/chat_completions.py +280 -0
  42. guidellm/mock_server/handlers/completions.py +280 -0
  43. guidellm/mock_server/handlers/tokenizer.py +142 -0
  44. guidellm/mock_server/models.py +510 -0
  45. guidellm/mock_server/server.py +168 -0
  46. guidellm/mock_server/utils.py +302 -0
  47. guidellm/preprocess/dataset.py +23 -26
  48. guidellm/presentation/builder.py +2 -2
  49. guidellm/presentation/data_models.py +25 -21
  50. guidellm/presentation/injector.py +2 -3
  51. guidellm/scheduler/__init__.py +65 -26
  52. guidellm/scheduler/constraints.py +1035 -0
  53. guidellm/scheduler/environments.py +252 -0
  54. guidellm/scheduler/scheduler.py +140 -368
  55. guidellm/scheduler/schemas.py +272 -0
  56. guidellm/scheduler/strategies.py +519 -0
  57. guidellm/scheduler/worker.py +391 -420
  58. guidellm/scheduler/worker_group.py +707 -0
  59. guidellm/schemas/__init__.py +31 -0
  60. guidellm/schemas/info.py +159 -0
  61. guidellm/schemas/request.py +226 -0
  62. guidellm/schemas/response.py +119 -0
  63. guidellm/schemas/stats.py +228 -0
  64. guidellm/{config.py → settings.py} +32 -21
  65. guidellm/utils/__init__.py +95 -8
  66. guidellm/utils/auto_importer.py +98 -0
  67. guidellm/utils/cli.py +71 -2
  68. guidellm/utils/console.py +183 -0
  69. guidellm/utils/encoding.py +778 -0
  70. guidellm/utils/functions.py +134 -0
  71. guidellm/utils/hf_datasets.py +1 -2
  72. guidellm/utils/hf_transformers.py +4 -4
  73. guidellm/utils/imports.py +9 -0
  74. guidellm/utils/messaging.py +1118 -0
  75. guidellm/utils/mixins.py +115 -0
  76. guidellm/utils/pydantic_utils.py +411 -0
  77. guidellm/utils/random.py +3 -4
  78. guidellm/utils/registry.py +220 -0
  79. guidellm/utils/singleton.py +133 -0
  80. guidellm/{objects → utils}/statistics.py +341 -247
  81. guidellm/utils/synchronous.py +159 -0
  82. guidellm/utils/text.py +163 -50
  83. guidellm/utils/typing.py +41 -0
  84. guidellm/version.py +1 -1
  85. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/METADATA +33 -10
  86. guidellm-0.4.0a169.dist-info/RECORD +95 -0
  87. guidellm/backend/__init__.py +0 -23
  88. guidellm/backend/backend.py +0 -259
  89. guidellm/backend/openai.py +0 -705
  90. guidellm/backend/response.py +0 -136
  91. guidellm/benchmark/aggregator.py +0 -760
  92. guidellm/benchmark/benchmark.py +0 -837
  93. guidellm/benchmark/scenario.py +0 -104
  94. guidellm/data/prideandprejudice.txt.gz +0 -0
  95. guidellm/dataset/__init__.py +0 -22
  96. guidellm/dataset/creator.py +0 -213
  97. guidellm/dataset/entrypoints.py +0 -42
  98. guidellm/dataset/file.py +0 -92
  99. guidellm/dataset/hf_datasets.py +0 -62
  100. guidellm/dataset/in_memory.py +0 -132
  101. guidellm/dataset/synthetic.py +0 -287
  102. guidellm/objects/__init__.py +0 -18
  103. guidellm/objects/pydantic.py +0 -89
  104. guidellm/request/__init__.py +0 -18
  105. guidellm/request/loader.py +0 -284
  106. guidellm/request/request.py +0 -79
  107. guidellm/request/types.py +0 -10
  108. guidellm/scheduler/queues.py +0 -25
  109. guidellm/scheduler/result.py +0 -155
  110. guidellm/scheduler/strategy.py +0 -495
  111. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  112. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/WHEEL +0 -0
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/entry_points.txt +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/licenses/LICENSE +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a169.dist-info}/top_level.txt +0 -0
guidellm/mock_server/handlers/chat_completions.py
@@ -0,0 +1,280 @@
+"""
+OpenAI Chat Completions API endpoint handler for the mock server.
+
+Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+realistic LLM behavior with configurable timing characteristics. Supports both streaming
+and non-streaming responses with proper token counting, latency simulation including
+TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+handling for comprehensive benchmarking scenarios.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    ChatCompletionChoice,
+    ChatCompletionsRequest,
+    ChatCompletionsResponse,
+    ChatMessage,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["ChatCompletionsHandler"]
+
+
+class ChatCompletionsHandler:
+    """
+    Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+    Implements the /v1/chat/completions endpoint behavior including request validation,
+    response generation, and timing simulation. Supports both streaming and
+    non-streaming modes with configurable latency characteristics for comprehensive
+    benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+    counting and realistic text generation.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = ChatCompletionsHandler(config)
+        response = await handler.handle(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the Chat Completions handler with server configuration.
+
+        :param config: Mock server configuration containing timing and behavior settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process incoming chat completion requests with validation and routing.
+
+        Validates the request payload, handles errors gracefully, and routes to
+        appropriate streaming or non-streaming response handlers based on the
+        request configuration.
+
+        :param request: Sanic HTTP request containing chat completion parameters
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request payload fails validation
+        :raises JSONDecodeError: When request contains invalid JSON
+        """
+        try:
+            # Parse and validate request
+            req_data = ChatCompletionsRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate complete non-streaming chat completion response.
+
+        Simulates realistic LLM behavior with TTFT and ITL delays, generates
+        appropriate token counts, and returns a complete response with usage
+        statistics and generated content.
+
+        :param req: Validated chat completion request parameters
+        :return: Complete HTTP response with generated completion data
+        """
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_text = self.tokenizer.apply_chat_template(req.messages)
+        prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+        max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+        completion_tokens_count = min(
+            sample_number(self.config.output_tokens, self.config.output_tokens_std),
+            max_tokens,
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        chat_response = ChatCompletionsResponse(
+            id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message=ChatMessage(
+                        role="assistant",
+                        content=create_fake_text(
+                            int(completion_tokens_count), self.tokenizer
+                        ),
+                    ),
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=int(completion_tokens_count),
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(chat_response.model_dump())
+
+    async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+        """
+        Generate streaming chat completion response with real-time token delivery.
+
+        Creates a streaming response that delivers tokens incrementally with
+        realistic timing delays. Supports optional usage statistics in the final
+        stream chunk when requested via stream_options.
+
+        :param req: Validated chat completion request with streaming enabled
+        :return: Streaming HTTP response delivering tokens with proper timing
+        """
+
+        async def generate_stream(stream_response):
+            completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_text = self.tokenizer.apply_chat_template(req.messages)
+            prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+            max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": token},
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
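For orientation, the handler above can be exercised like any OpenAI-compatible chat endpoint once the mock server is running. The snippet below is an illustrative client sketch, not part of the diff: the base URL http://localhost:8000, the model name "mock", and the use of httpx are assumptions, while the /v1/chat/completions path, the "data: ..." SSE framing, and the "[DONE]" terminator come from the handler code above.

import json

import httpx  # assumption: any HTTP client that can iterate a streamed body works

BASE_URL = "http://localhost:8000"  # assumption: wherever the mock server is listening

payload = {
    "model": "mock",  # placeholder model name; the mock server echoes it back
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": True,
    "stream_options": {"include_usage": True},  # ask for the trailing usage chunk
}

with httpx.stream(
    "POST", f"{BASE_URL}/v1/chat/completions", json=payload, timeout=60.0
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip blank separator lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":  # sentinel written by generate_stream() above
            break
        chunk = json.loads(data)
        for choice in chunk.get("choices", []):
            print(choice.get("delta", {}).get("content", ""), end="", flush=True)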
guidellm/mock_server/handlers/completions.py
@@ -0,0 +1,280 @@
+"""
+Legacy OpenAI Completions API handler for the mock server.
+
+This module provides the CompletionsHandler class that implements the /v1/completions
+endpoint for the guidellm mock server. It supports both streaming and non-streaming
+completions with configurable timing parameters (TTFT, ITL) and token generation to
+simulate realistic LLM behavior for benchmarking and testing purposes.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import math
+import time
+import uuid
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse, ResponseStream
+from transformers import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    CompletionChoice,
+    CompletionsRequest,
+    CompletionsResponse,
+    ErrorDetail,
+    ErrorResponse,
+    Usage,
+)
+from guidellm.mock_server.utils import (
+    MockTokenizer,
+    create_fake_text,
+    create_fake_tokens_str,
+    sample_number,
+    times_generator,
+)
+
+__all__ = ["CompletionsHandler"]
+
+
+class CompletionsHandler:
+    """
+    Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+    This handler simulates the legacy OpenAI completions API by processing incoming
+    requests and generating responses with configurable timing and token generation
+    patterns. It supports both streaming and non-streaming modes, applying realistic
+    timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+    Example:
+    ::
+        config = MockServerConfig(ttft_ms=100, itl_ms=50)
+        handler = CompletionsHandler(config)
+        response = await handler.handle(sanic_request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the completions handler with configuration settings.
+
+        :param config: Mock server configuration containing timing parameters
+            and tokenizer settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def handle(self, request: Request) -> HTTPResponse:
+        """
+        Process a completions request and return the appropriate response.
+
+        Validates the incoming request, determines whether to use streaming or
+        non-streaming mode, and delegates to the appropriate handler method.
+
+        :param request: Sanic request object containing the completions request data
+        :return: HTTP response with completion data or error information
+        :raises ValidationError: When request validation fails
+        :raises json.JSONDecodeError: When request JSON is malformed
+        """
+        try:
+            # Parse and validate request
+            req_data = CompletionsRequest(**request.json)
+        except ValidationError as e:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(e)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (json.JSONDecodeError, TypeError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        # Handle streaming vs non-streaming
+        if req_data.stream:
+            return await self._handle_stream(req_data)
+        else:
+            return await self._handle_non_stream(req_data)
+
+    async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a non-streaming completion response.
+
+        Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+        a complete response with the generated text and usage statistics.
+
+        :param req: Validated completions request containing prompt and parameters
+        :return: JSON HTTP response with completion text and usage data
+        :raises NotImplementedError: When batch processing is requested
+        """
+        if isinstance(req.prompt, list):
+            raise NotImplementedError("Batch processing is not supported.")
+
+        # TTFT delay
+        await asyncio.sleep(
+            sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+        )
+
+        # Token counts
+        prompt_tokens = len(self.tokenizer(req.prompt))
+        max_tokens = req.max_tokens or math.inf
+        completion_tokens_count = int(
+            min(
+                sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                max_tokens,
+            )
+            if req.stop
+            else max_tokens
+        )
+
+        # ITL delay
+        itl_delay = 0.0
+        delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+        for _ in range(int(completion_tokens_count) - 1):
+            itl_delay += next(delays_iter)
+        await asyncio.sleep(itl_delay / 1000.0)
+
+        # Response
+        completion_response = CompletionsResponse(
+            id=f"cmpl-{uuid.uuid4().hex[:29]}",
+            model=req.model,
+            choices=[
+                CompletionChoice(
+                    text=create_fake_text(completion_tokens_count, self.tokenizer),
+                    index=0,
+                    finish_reason="stop",
+                )
+            ],
+            usage=Usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens_count,
+            ),
+            system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+        )
+
+        return response.json(completion_response.model_dump())
+
+    async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+        """
+        Generate a streaming completion response.
+
+        Creates a server-sent events stream that delivers tokens incrementally with
+        realistic timing delays between each token. Includes usage statistics if
+        requested and properly terminates the stream.
+
+        :param req: Validated completions request containing prompt and streaming
+            options
+        :return: ResponseStream object that generates server-sent events
+        """
+
+        async def generate_stream(stream_response):
+            completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+            # TTFT delay
+            await asyncio.sleep(
+                sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+            )
+
+            # Token counts
+            prompt_tokens = len(self.tokenizer(req.prompt))
+            max_tokens = req.max_tokens or math.inf
+            completion_tokens_count = int(
+                min(
+                    sample_number(
+                        self.config.output_tokens, self.config.output_tokens_std
+                    ),
+                    max_tokens,
+                )
+                if req.stop
+                else max_tokens
+            )
+
+            # Send tokens
+            tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+            delays_iter = iter(
+                times_generator(self.config.itl_ms, self.config.itl_ms_std)
+            )
+
+            for index, token in enumerate(tokens):
+                if index > 0:
+                    itl_delay = next(delays_iter)
+                    await asyncio.sleep(itl_delay / 1000.0)
+
+                chunk_data = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [
+                        {
+                            "text": token,
+                            "index": index,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+            # Send final chunk with finish reason
+            final_chunk = {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "model": req.model,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": index,
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+            # Send usage if requested
+            if req.stream_options and req.stream_options.include_usage:
+                usage_chunk = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": int(time.time()),
+                    "model": req.model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens_count,
+                        "total_tokens": prompt_tokens + completion_tokens_count,
+                    },
+                }
+                await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+            # End stream
+            await stream_response.write("data: [DONE]\n\n")
+
+        return ResponseStream(  # type: ignore[return-value]
+            generate_stream,
+            content_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+            },
+        )
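The legacy completions handler follows the same pattern on the /v1/completions path. A minimal non-streaming call, again an illustrative sketch under the same assumptions (placeholder base URL, model name, and httpx client); max_tokens bounds completion_tokens_count in _handle_non_stream above.

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/completions",  # assumption: local mock server address
    json={
        "model": "mock",                # placeholder model name
        "prompt": "Once upon a time",
        "max_tokens": 64,               # caps the sampled completion length
    },
    timeout=60.0,
)
resp.raise_for_status()
body = resp.json()
print(body["choices"][0]["text"])   # fake text produced by create_fake_text()
print(body["usage"])                # prompt/completion token counts from the handler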
guidellm/mock_server/handlers/tokenizer.py
@@ -0,0 +1,142 @@
+"""
+HTTP request handler for vLLM tokenization API endpoints in the mock server.
+
+This module provides the TokenizerHandler class that implements vLLM-compatible
+tokenization and detokenization endpoints for testing and development purposes.
+It handles text-to-token conversion, token-to-text reconstruction, request
+validation, and error responses with proper HTTP status codes and JSON formatting.
+"""
+
+from __future__ import annotations
+
+from pydantic import ValidationError
+from sanic import response
+from sanic.request import Request
+from sanic.response import HTTPResponse
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+from guidellm.mock_server.config import MockServerConfig
+from guidellm.mock_server.models import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorDetail,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from guidellm.mock_server.utils import MockTokenizer
+
+__all__ = ["TokenizerHandler"]
+
+
+class TokenizerHandler:
+    """
+    HTTP request handler for vLLM tokenization and detokenization endpoints.
+
+    Provides mock implementations of vLLM's tokenization API endpoints including
+    /tokenize for converting text to tokens and /detokenize for reconstructing
+    text from token sequences. Handles request validation, error responses, and
+    JSON serialization with proper HTTP status codes.
+
+    Example:
+    ::
+        handler = TokenizerHandler(config)
+        response = await handler.tokenize(request)
+        response = await handler.detokenize(request)
+    """
+
+    def __init__(self, config: MockServerConfig) -> None:
+        """
+        Initialize the tokenizer handler with configuration.
+
+        :param config: Server configuration object containing tokenizer settings
+        """
+        self.config = config
+        self.tokenizer = (
+            MockTokenizer()
+            if config.processor is None
+            else PreTrainedTokenizer.from_pretrained(config.processor)
+        )
+
+    async def tokenize(self, request: Request) -> HTTPResponse:
+        """
+        Convert input text to token IDs via the /tokenize endpoint.
+
+        Validates the request payload, extracts text content, and returns a JSON
+        response containing the token sequence and count. Handles validation errors
+        and malformed JSON with appropriate HTTP error responses.
+
+        :param request: Sanic HTTP request containing JSON payload with text field
+        :return: JSON response with tokens list and count, or error response
+        """
+        try:
+            req_data = TokenizeRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (ValueError, TypeError, KeyError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        tokens = self.tokenizer.tokenize(req_data.text)
+        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+
+        return response.json(
+            TokenizeResponse(tokens=token_ids, count=len(token_ids)).model_dump()
+        )
+
+    async def detokenize(self, request: Request) -> HTTPResponse:
+        """
+        Convert token IDs back to text via the /detokenize endpoint.
+
+        Validates the request payload, extracts token sequences, and returns a JSON
+        response containing the reconstructed text. Handles validation errors and
+        malformed JSON with appropriate HTTP error responses.
+
+        :param request: Sanic HTTP request containing JSON payload with tokens field
+        :return: JSON response with reconstructed text, or error response
+        """
+        try:
+            req_data = DetokenizeRequest(**request.json)
+        except ValidationError as exc:
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message=f"Invalid request: {str(exc)}",
+                        type="invalid_request_error",
+                        code="invalid_request",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+        except (ValueError, TypeError, KeyError):
+            return response.json(
+                ErrorResponse(
+                    error=ErrorDetail(
+                        message="Invalid JSON in request body",
+                        type="invalid_request_error",
+                        code="invalid_json",
+                    )
+                ).model_dump(),
+                status=400,
+            )
+
+        text = self.tokenizer.decode(req_data.tokens, skip_special_tokens=False)
+
+        return response.json(DetokenizeResponse(text=text).model_dump())
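Since these endpoints mirror vLLM's tokenizer API, a round trip is just two POSTs. The sketch below is illustrative only, under the same local-server and httpx assumptions; the text, tokens, and count field names are the ones used by TokenizeRequest, TokenizeResponse, and DetokenizeRequest in the handler above.

import httpx

BASE_URL = "http://localhost:8000"  # assumption: local mock server address

# Text -> token IDs via /tokenize
tok = httpx.post(f"{BASE_URL}/tokenize", json={"text": "guidellm mock server"}).json()
print(tok["count"], tok["tokens"])

# Token IDs -> text via /detokenize
detok = httpx.post(f"{BASE_URL}/detokenize", json={"tokens": tok["tokens"]}).json()
print(detok["text"])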