guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/mock_server/handlers/completions.py (new file)
@@ -0,0 +1,280 @@
+ """
+ Legacy OpenAI Completions API handler for the mock server.
+
+ This module provides the CompletionsHandler class that implements the /v1/completions
+ endpoint for the guidellm mock server. It supports both streaming and non-streaming
+ completions with configurable timing parameters (TTFT, ITL) and token generation to
+ simulate realistic LLM behavior for benchmarking and testing purposes.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import math
+ import time
+ import uuid
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse, ResponseStream
+ from transformers import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     CompletionChoice,
+     CompletionsRequest,
+     CompletionsResponse,
+     ErrorDetail,
+     ErrorResponse,
+     Usage,
+ )
+ from guidellm.mock_server.utils import (
+     MockTokenizer,
+     create_fake_text,
+     create_fake_tokens_str,
+     sample_number,
+     times_generator,
+ )
+
+ __all__ = ["CompletionsHandler"]
+
+
+ class CompletionsHandler:
+     """
+     Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+     This handler simulates the legacy OpenAI completions API by processing incoming
+     requests and generating responses with configurable timing and token generation
+     patterns. It supports both streaming and non-streaming modes, applying realistic
+     timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+     Example:
+     ::
+         config = MockServerConfig(ttft_ms=100, itl_ms=50)
+         handler = CompletionsHandler(config)
+         response = await handler.handle(sanic_request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the completions handler with configuration settings.
+
+         :param config: Mock server configuration containing timing parameters
+             and tokenizer settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def handle(self, request: Request) -> HTTPResponse:
+         """
+         Process a completions request and return the appropriate response.
+
+         Validates the incoming request, determines whether to use streaming or
+         non-streaming mode, and delegates to the appropriate handler method.
+
+         :param request: Sanic request object containing the completions request data
+         :return: HTTP response with completion data or error information
+         :raises ValidationError: When request validation fails
+         :raises json.JSONDecodeError: When request JSON is malformed
+         """
+         try:
+             # Parse and validate request
+             req_data = CompletionsRequest(**request.json)
+         except ValidationError as e:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(e)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (json.JSONDecodeError, TypeError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         # Handle streaming vs non-streaming
+         if req_data.stream:
+             return await self._handle_stream(req_data)
+         else:
+             return await self._handle_non_stream(req_data)
+
+     async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a non-streaming completion response.
+
+         Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+         a complete response with the generated text and usage statistics.
+
+         :param req: Validated completions request containing prompt and parameters
+         :return: JSON HTTP response with completion text and usage data
+         :raises NotImplementedError: When batch processing is requested
+         """
+         if isinstance(req.prompt, list):
+             raise NotImplementedError("Batch processing is not supported.")
+
+         # TTFT delay
+         await asyncio.sleep(
+             sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+         )
+
+         # Token counts
+         prompt_tokens = len(self.tokenizer(req.prompt))
+         max_tokens = req.max_tokens or math.inf
+         completion_tokens_count = int(
+             min(
+                 sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                 max_tokens,
+             )
+             if req.stop
+             else max_tokens
+         )
+
+         # ITL delay
+         itl_delay = 0.0
+         delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+         for _ in range(int(completion_tokens_count) - 1):
+             itl_delay += next(delays_iter)
+         await asyncio.sleep(itl_delay / 1000.0)
+
+         # Response
+         completion_response = CompletionsResponse(
+             id=f"cmpl-{uuid.uuid4().hex[:29]}",
+             model=req.model,
+             choices=[
+                 CompletionChoice(
+                     text=create_fake_text(completion_tokens_count, self.tokenizer),
+                     index=0,
+                     finish_reason="stop",
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens_count,
+             ),
+             system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+         )
+
+         return response.json(completion_response.model_dump())
+
+     async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a streaming completion response.
+
+         Creates a server-sent events stream that delivers tokens incrementally with
+         realistic timing delays between each token. Includes usage statistics if
+         requested and properly terminates the stream.
+
+         :param req: Validated completions request containing prompt and streaming
+             options
+         :return: ResponseStream object that generates server-sent events
+         """
+
+         async def generate_stream(stream_response):
+             completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+             # TTFT delay
+             await asyncio.sleep(
+                 sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+             )
+
+             # Token counts
+             prompt_tokens = len(self.tokenizer(req.prompt))
+             max_tokens = req.max_tokens or math.inf
+             completion_tokens_count = int(
+                 min(
+                     sample_number(
+                         self.config.output_tokens, self.config.output_tokens_std
+                     ),
+                     max_tokens,
+                 )
+                 if req.stop
+                 else max_tokens
+             )
+
+             # Send tokens
+             tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+             delays_iter = iter(
+                 times_generator(self.config.itl_ms, self.config.itl_ms_std)
+             )
+
+             for index, token in enumerate(tokens):
+                 if index > 0:
+                     itl_delay = next(delays_iter)
+                     await asyncio.sleep(itl_delay / 1000.0)
+
+                 chunk_data = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [
+                         {
+                             "text": token,
+                             "index": index,
+                             "finish_reason": None,
+                         }
+                     ],
+                 }
+                 await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+             # Send final chunk with finish reason
+             final_chunk = {
+                 "id": completion_id,
+                 "object": "text_completion",
+                 "created": int(time.time()),
+                 "model": req.model,
+                 "choices": [
+                     {
+                         "text": "",
+                         "index": index,
+                         "finish_reason": "stop",
+                     }
+                 ],
+             }
+             await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+             # Send usage if requested
+             if req.stream_options and req.stream_options.include_usage:
+                 usage_chunk = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [],
+                     "usage": {
+                         "prompt_tokens": prompt_tokens,
+                         "completion_tokens": completion_tokens_count,
+                         "total_tokens": prompt_tokens + completion_tokens_count,
+                     },
+                 }
+                 await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+             # End stream
+             await stream_response.write("data: [DONE]\n\n")
+
+         return ResponseStream(  # type: ignore[return-value]
+             generate_stream,
+             content_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no",
+             },
+         )
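Note: the handler above defines the wire format end to end. Requests carry "model", "prompt", "stream", "max_tokens", and an optional "stream_options.include_usage" flag, and streaming responses are server-sent events of the form "data: {...}" terminated by "data: [DONE]". A minimal client sketch follows; the base URL and port are illustrative assumptions and are not part of this diff.

# Hedged sketch: consume the mock /v1/completions SSE stream.
# Assumes the mock server is reachable at http://localhost:8000 (placeholder, not from this diff).
import json
import requests

payload = {
    "model": "mock-model",            # echoed back by the handler in each chunk
    "prompt": "Hello, world",
    "stream": True,
    "max_tokens": 64,
    "stream_options": {"include_usage": True},
}

with requests.post(
    "http://localhost:8000/v1/completions", json=payload, stream=True
) as resp:
    text_parts = []
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":          # stream terminator written by the handler
            break
        chunk = json.loads(data)
        if chunk.get("usage"):        # final usage chunk has an empty choices list
            print("usage:", chunk["usage"])
        elif chunk["choices"]:
            text_parts.append(chunk["choices"][0]["text"])
    print("".join(text_parts))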
guidellm/mock_server/handlers/tokenizer.py (new file)
@@ -0,0 +1,142 @@
+ """
+ HTTP request handler for vLLM tokenization API endpoints in the mock server.
+
+ This module provides the TokenizerHandler class that implements vLLM-compatible
+ tokenization and detokenization endpoints for testing and development purposes.
+ It handles text-to-token conversion, token-to-text reconstruction, request
+ validation, and error responses with proper HTTP status codes and JSON formatting.
+ """
+
+ from __future__ import annotations
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse
+ from transformers.tokenization_utils import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     DetokenizeRequest,
+     DetokenizeResponse,
+     ErrorDetail,
+     ErrorResponse,
+     TokenizeRequest,
+     TokenizeResponse,
+ )
+ from guidellm.mock_server.utils import MockTokenizer
+
+ __all__ = ["TokenizerHandler"]
+
+
+ class TokenizerHandler:
+     """
+     HTTP request handler for vLLM tokenization and detokenization endpoints.
+
+     Provides mock implementations of vLLM's tokenization API endpoints including
+     /tokenize for converting text to tokens and /detokenize for reconstructing
+     text from token sequences. Handles request validation, error responses, and
+     JSON serialization with proper HTTP status codes.
+
+     Example:
+     ::
+         handler = TokenizerHandler(config)
+         response = await handler.tokenize(request)
+         response = await handler.detokenize(request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the tokenizer handler with configuration.
+
+         :param config: Server configuration object containing tokenizer settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def tokenize(self, request: Request) -> HTTPResponse:
+         """
+         Convert input text to token IDs via the /tokenize endpoint.
+
+         Validates the request payload, extracts text content, and returns a JSON
+         response containing the token sequence and count. Handles validation errors
+         and malformed JSON with appropriate HTTP error responses.
+
+         :param request: Sanic HTTP request containing JSON payload with text field
+         :return: JSON response with tokens list and count, or error response
+         """
+         try:
+             req_data = TokenizeRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (ValueError, TypeError, KeyError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         tokens = self.tokenizer.tokenize(req_data.text)
+         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+
+         return response.json(
+             TokenizeResponse(tokens=token_ids, count=len(token_ids)).model_dump()
+         )
+
+     async def detokenize(self, request: Request) -> HTTPResponse:
+         """
+         Convert token IDs back to text via the /detokenize endpoint.
+
+         Validates the request payload, extracts token sequences, and returns a JSON
+         response containing the reconstructed text. Handles validation errors and
+         malformed JSON with appropriate HTTP error responses.
+
+         :param request: Sanic HTTP request containing JSON payload with tokens field
+         :return: JSON response with reconstructed text, or error response
+         """
+         try:
+             req_data = DetokenizeRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (ValueError, TypeError, KeyError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         text = self.tokenizer.decode(req_data.tokens, skip_special_tokens=False)
+
+         return response.json(DetokenizeResponse(text=text).model_dump())
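Note: the two endpoints above exchange simple JSON bodies: /tokenize accepts {"text": ...} and returns {"tokens": [...], "count": N}, while /detokenize accepts {"tokens": [...]} and returns {"text": ...}, mirroring the TokenizeRequest/TokenizeResponse and DetokenizeRequest/DetokenizeResponse models used by the handler. A minimal round-trip sketch, again assuming an illustrative base URL:

# Hedged sketch: tokenize a string with the mock server, then reconstruct it.
# Assumes the mock server is reachable at http://localhost:8000 (placeholder, not from this diff).
import requests

BASE = "http://localhost:8000"

tok = requests.post(f"{BASE}/tokenize", json={"text": "guidellm mock server"}).json()
print(tok["count"], tok["tokens"])    # token count and token IDs from TokenizeResponse

detok = requests.post(f"{BASE}/detokenize", json={"tokens": tok["tokens"]}).json()
print(detok["text"])                  # reconstructed text from DetokenizeResponse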