guidellm 0.4.0a18__py3-none-any.whl → 0.4.0a155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of guidellm might be problematic.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +451 -252
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +110 -0
- guidellm/backends/openai.py +355 -0
- guidellm/backends/response_handlers.py +455 -0
- guidellm/benchmark/__init__.py +53 -39
- guidellm/benchmark/benchmarker.py +148 -317
- guidellm/benchmark/entrypoints.py +466 -128
- guidellm/benchmark/output.py +517 -771
- guidellm/benchmark/profile.py +580 -280
- guidellm/benchmark/progress.py +568 -549
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas.py +2085 -0
- guidellm/data/__init__.py +28 -4
- guidellm/data/collators.py +16 -0
- guidellm/data/deserializers/__init__.py +53 -0
- guidellm/data/deserializers/deserializer.py +109 -0
- guidellm/data/deserializers/file.py +222 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +192 -0
- guidellm/data/deserializers/synthetic.py +346 -0
- guidellm/data/loaders.py +145 -0
- guidellm/data/preprocessors/__init__.py +25 -0
- guidellm/data/preprocessors/formatters.py +412 -0
- guidellm/data/preprocessors/mappers.py +198 -0
- guidellm/data/preprocessors/preprocessor.py +29 -0
- guidellm/data/processor.py +30 -0
- guidellm/data/schemas.py +13 -0
- guidellm/data/utils/__init__.py +10 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/data/utils/functions.py +18 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +215 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +168 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/preprocess/dataset.py +23 -26
- guidellm/presentation/builder.py +2 -2
- guidellm/presentation/data_models.py +25 -21
- guidellm/presentation/injector.py +2 -3
- guidellm/scheduler/__init__.py +65 -26
- guidellm/scheduler/constraints.py +1035 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +140 -368
- guidellm/scheduler/schemas.py +272 -0
- guidellm/scheduler/strategies.py +519 -0
- guidellm/scheduler/worker.py +391 -420
- guidellm/scheduler/worker_group.py +707 -0
- guidellm/schemas/__init__.py +31 -0
- guidellm/schemas/info.py +159 -0
- guidellm/schemas/request.py +216 -0
- guidellm/schemas/response.py +119 -0
- guidellm/schemas/stats.py +228 -0
- guidellm/{config.py → settings.py} +32 -21
- guidellm/utils/__init__.py +95 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +46 -2
- guidellm/utils/console.py +183 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +134 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/pydantic_utils.py +411 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/{objects → utils}/statistics.py +341 -247
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +1 -1
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
- guidellm-0.4.0a155.dist-info/RECORD +96 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -705
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.4.0a18.dist-info/RECORD +0 -62
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.4.0a18.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0
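Taken together, the listing is a package restructure: guidellm.backend becomes guidellm.backends, guidellm.config becomes guidellm.settings, and guidellm/objects/statistics.py moves under guidellm/utils/. Below is a minimal migration sketch for downstream imports, assuming the renamed modules keep comparable exports; the Backend name is inferred from the backend.py paths above and is not a verified export.

    # Hedged sketch: adapting imports across the rename. "Backend" is an
    # assumed class name based only on the file paths in the listing.
    try:
        from guidellm.backends.backend import Backend  # 0.4.0a155 layout
        from guidellm import settings                  # was guidellm.config
        from guidellm.utils import statistics          # was guidellm.objects.statistics
    except ImportError:
        from guidellm.backend.backend import Backend   # 0.4.0a18 layout
        from guidellm import config as settings
        from guidellm.objects import statistics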
guidellm/backends/response_handlers.py
NEW

@@ -0,0 +1,455 @@
+"""
+Response handlers for processing API responses from different generation backends.
+
+Provides a pluggable system for handling responses from language model backends,
+supporting both streaming and non-streaming responses. Each handler implements the
+GenerationResponseHandler protocol to parse API responses, extract usage metrics,
+and convert them into standardized GenerationResponse objects.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol
+
+from guidellm.schemas import GenerationRequest, GenerationResponse, UsageMetrics
+from guidellm.utils import RegistryMixin, json
+
+__all__ = [
+    "AudioResponseHandler",
+    "ChatCompletionsResponseHandler",
+    "GenerationResponseHandler",
+    "GenerationResponseHandlerFactory",
+    "TextCompletionsResponseHandler",
+]
+
+
+class GenerationResponseHandler(Protocol):
+    """
+    Protocol for handling generation API responses.
+
+    Defines the interface for processing both streaming and non-streaming responses
+    from backend APIs, converting them into standardized GenerationResponse objects
+    with consistent metrics extraction.
+    """
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: Any
+    ) -> GenerationResponse:
+        """
+        Process a complete non-streaming API response.
+
+        :param request: Original generation request
+        :param response: Raw API response data from the backend
+        :return: Standardized GenerationResponse with extracted metrics
+        """
+        ...
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a streaming response.
+
+        :param line: Raw line from the streaming response
+        :return: 1 if content was updated, 0 if line was ignored, None if done
+        """
+        ...
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming data into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with extracted metrics
+        """
+        ...
+
+
+class GenerationResponseHandlerFactory(RegistryMixin[type[GenerationResponseHandler]]):
+    """
+    Factory for registering and creating response handlers by backend type.
+
+    Registry-based system for associating handler classes with specific backend API
+    types, enabling automatic selection of the appropriate handler for processing
+    responses from different generation services.
+    """
+
+
+@GenerationResponseHandlerFactory.register("text_completions")
+class TextCompletionsResponseHandler(GenerationResponseHandler):
+    """
+    Response handler for OpenAI-style text completion endpoints.
+
+    Processes responses from text completion APIs that return generated text in the
+    'choices' array with 'text' fields. Handles both streaming and non-streaming
+    responses, extracting usage metrics for input and output tokens.
+
+    Example:
+    ::
+        handler = TextCompletionsResponseHandler()
+        response = handler.compile_non_streaming(request, api_response)
+    """
+
+    def __init__(self):
+        """
+        Initialize the text completions response handler.
+
+        Sets up internal state for accumulating streaming response data including
+        text chunks and usage metrics.
+        """
+        self.streaming_texts: list[str] = []
+        self.streaming_usage: dict[str, int | dict[str, int]] | None = None
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: dict
+    ) -> GenerationResponse:
+        """
+        Process a complete text completion response.
+
+        :param request: Original generation request
+        :param response: Complete API response containing choices and usage data
+        :return: Standardized GenerationResponse with extracted text and metrics
+        """
+        choices, usage = self.extract_choices_and_usage(response)
+        input_metrics, output_metrics = self.extract_metrics(usage)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text=choices[0].get("text", "") if choices else "",
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a text completion streaming response.
+
+        Parses Server-Sent Events (SSE) formatted lines, extracting text content
+        and usage metrics. Accumulates text chunks for final response compilation.
+
+        :param line: Raw SSE line from the streaming response
+        :return: 1 if text content was extracted, 0 if line ignored, None if done
+        """
+        if not (data := self.extract_line_data(line)):
+            return None if data is None else 0
+
+        updated = False
+        choices, usage = self.extract_choices_and_usage(data)
+
+        if text := choices[0].get("text"):
+            self.streaming_texts.append(text)
+            updated = True
+
+        if usage:
+            self.streaming_usage = usage
+
+        return 1 if updated else 0
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming text chunks into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with concatenated text and metrics
+        """
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text="".join(self.streaming_texts),
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def extract_line_data(self, line: str) -> dict[str, Any] | None:
+        """
+        Extract JSON data from a streaming response line.
+
+        :param line: Raw line from the streaming response
+        :return: Parsed JSON data as dictionary, or None if line indicates completion
+        """
+        if line == "data: [DONE]":
+            return None
+
+        if not line or not (line := line.strip()) or not line.startswith("data:"):
+            return {}
+
+        line = line[len("data:") :].strip()
+
+        return json.loads(line)
+
+    def extract_choices_and_usage(
+        self, response: dict
+    ) -> tuple[list[dict], dict[str, int | dict[str, int]]]:
+        """
+        Extract choices and usage data from the API response.
+
+        :param response: Complete API response containing choices and usage data
+        :return: Tuple of choices list and usage dictionary
+        """
+        return response.get("choices", []), response.get("usage", {})
+
+    def extract_metrics(
+        self, usage: dict[str, int | dict[str, int]] | None
+    ) -> tuple[UsageMetrics, UsageMetrics]:
+        """
+        Extract input and output usage metrics from API response usage data.
+
+        :param usage: Usage data dictionary from API response
+        :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
+        """
+        if not usage:
+            return UsageMetrics(), UsageMetrics()
+
+        input_details: dict[str, int] = usage.get("prompt_tokens_details", {}) or {}
+        output_details: dict[str, int] = (
+            usage.get("completion_tokens_details", {}) or {}
+        )
+
+        return UsageMetrics(
+            text_tokens=(
+                input_details.get("prompt_tokens") or usage.get("prompt_tokens")
+            ),
+            image_tokens=input_details.get("image_tokens"),
+            video_tokens=input_details.get("video_tokens"),
+            audio_tokens=input_details.get("audio_tokens"),
+            audio_seconds=input_details.get("seconds"),
+        ), UsageMetrics(
+            text_tokens=(
+                output_details.get("completion_tokens")
+                or usage.get("completion_tokens")
+            ),
+            image_tokens=output_details.get("image_tokens"),
+            video_tokens=output_details.get("video_tokens"),
+            audio_tokens=output_details.get("audio_tokens"),
+            audio_seconds=output_details.get("seconds"),
+        )
+
+
+@GenerationResponseHandlerFactory.register("chat_completions")
+class ChatCompletionsResponseHandler(TextCompletionsResponseHandler):
+    """
+    Response handler for OpenAI-style chat completion endpoints.
+
+    Extends TextCompletionsResponseHandler to handle chat completion responses where
+    generated text is nested within message objects in the choices array. Processes
+    both streaming and non-streaming chat completion responses.
+    """
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: dict
+    ) -> GenerationResponse:
+        """
+        Process a complete chat completion response.
+
+        Extracts content from the message object within choices, handling the nested
+        structure specific to chat completion endpoints.
+
+        :param request: Original generation request
+        :param response: Complete API response containing choices and usage data
+        :return: Standardized GenerationResponse with extracted content and metrics
+        """
+        choices, usage = self.extract_choices_and_usage(response)
+        input_metrics, output_metrics = self.extract_metrics(usage)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text=(choices[0].get("message", {}).get("content", "") if choices else ""),
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a chat completion streaming response.
+
+        Handles the chat completion specific delta structure where content is nested
+        within delta objects in the streaming response chunks.
+
+        :param line: Raw SSE line from the streaming response
+        :return: 1 if content was extracted, 0 if line ignored, None if done
+        """
+        if not (data := self.extract_line_data(line)):
+            return None if data is None else 0
+
+        updated = False
+        choices, usage = self.extract_choices_and_usage(data)
+
+        if choices and (content := choices[0].get("delta", {}).get("content")):
+            self.streaming_texts.append(content)
+            updated = True
+
+        if usage:
+            self.streaming_usage = usage
+
+        return 1 if updated else 0
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming chat completion content into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with concatenated content and metrics
+        """
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text="".join(self.streaming_texts),
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+
+@GenerationResponseHandlerFactory.register(
+    ["audio_transcriptions", "audio_translations"]
+)
+class AudioResponseHandler:
+    """
+    Response handler for audio transcription and translation endpoints.
+
+    Processes responses from audio processing APIs that convert speech to text,
+    handling both transcription and translation services. Manages audio-specific
+    usage metrics including audio tokens and processing duration.
+
+    Example:
+    ::
+        handler = AudioResponseHandler()
+        response = handler.compile_non_streaming(request, api_response)
+    """
+
+    def __init__(self):
+        """
+        Initialize the audio response handler.
+
+        Sets up internal state for accumulating streaming response data including
+        audio buffers, text chunks, and usage metrics.
+        """
+        self.streaming_buffer: bytearray = bytearray()
+        self.streaming_texts: list[str] = []
+        self.streaming_usage: dict[str, int | dict[str, int]] | None = None
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: dict
+    ) -> GenerationResponse:
+        """
+        Process a complete audio transcription or translation response.
+
+        Extracts transcribed or translated text and audio-specific usage metrics
+        including processing duration and token counts for audio content.
+
+        :param request: Original generation request
+        :param response: Complete API response containing text and usage data
+        :return: Standardized GenerationResponse with extracted text and metrics
+        """
+        usage: dict[str, int | dict[str, int]] = response.get("usage", {})
+        input_details: dict[str, int] = usage.get("input_token_details", {}) or {}
+        output_details: dict[str, int] = usage.get("output_token_details", {}) or {}
+        text: str = response.get("text", "")
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text=text,
+            input_metrics=UsageMetrics(
+                text_tokens=input_details.get("text_tokens", usage.get("input_tokens")),
+                audio_tokens=input_details.get(
+                    "audio_tokens", usage.get("input_tokens")
+                ),
+                audio_seconds=input_details.get("seconds", usage.get("seconds")),
+            ),
+            output_metrics=UsageMetrics(
+                text_tokens=output_details.get(
+                    "text_tokens", usage.get("output_tokens")
+                ),
+            ),
+        )
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from an audio streaming response.
+
+        Handles JSON-formatted streaming responses from audio processing endpoints,
+        extracting text content and usage metrics as they become available.
+
+        :param line: Raw JSON line from the streaming response
+        :return: 1 if text content was extracted, 0 if line ignored, None if done
+        """
+        if line == "data: [DONE]":
+            return None
+
+        if not line or not (line := line.strip()) or not line.startswith("{"):
+            return 0
+
+        data: dict[str, Any] = json.loads(line)
+        text: str
+        usage: dict[str, int | dict[str, int]]
+        updated = False
+
+        if text := data.get("text"):
+            self.streaming_texts.append(text)
+            updated = True
+
+        if usage := data.get("usage"):
+            self.streaming_usage = usage
+
+        return 1 if updated else 0
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming audio text into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with concatenated text and metrics
+        """
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            text="".join(self.streaming_texts),
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def extract_metrics(
+        self, usage: dict[str, int | dict[str, int]] | None
+    ) -> tuple[UsageMetrics, UsageMetrics]:
+        """
+        Extract input and output usage metrics from audio API response usage data.
+
+        Handles audio-specific metrics including processing duration and audio tokens
+        in addition to standard text token counts.
+
+        :param usage: Usage data dictionary from audio API response
+        :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
+        """
+        if not usage:
+            return UsageMetrics(), UsageMetrics()
+
+        input_details: dict[str, int] = usage.get("input_token_details", {}) or {}
+        output_details: dict[str, int] = usage.get("output_token_details", {}) or {}
+
+        return UsageMetrics(
+            text_tokens=(input_details.get("text_tokens") or usage.get("input_tokens")),
+            audio_tokens=(
+                input_details.get("audio_tokens") or usage.get("audio_tokens")
+            ),
+            audio_seconds=(input_details.get("seconds") or usage.get("seconds")),
+        ), UsageMetrics(
+            text_tokens=output_details.get("text_tokens") or usage.get("output_tokens"),
+        )
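The streaming contract above is easiest to see end to end: add_streaming_line returns 1 when a line contributed text, 0 when it was ignored, and None on the "data: [DONE]" sentinel. A minimal sketch driving the text-completions handler with canned SSE lines follows; the payloads are illustrative, not captured from a real server.

    # Hedged sketch: exercising TextCompletionsResponseHandler's streaming path
    # by hand, using the names from the new response_handlers.py above.
    from guidellm.backends.response_handlers import TextCompletionsResponseHandler

    handler = TextCompletionsResponseHandler()
    sse_lines = [
        'data: {"choices": [{"text": "Hello"}]}',
        'data: {"choices": [{"text": ", world"}],'
        ' "usage": {"prompt_tokens": 3, "completion_tokens": 2}}',
        "data: [DONE]",
    ]
    for line in sse_lines:
        if handler.add_streaming_line(line) is None:  # None signals [DONE]
            break

    assert handler.streaming_texts == ["Hello", ", world"]
    # handler.compile_streaming(request) would now build a GenerationResponse
    # whose text is "Hello, world" with usage mapped into UsageMetrics;
    # constructing a GenerationRequest is omitted since its fields beyond
    # request_id and arguments are not shown in this diff.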
guidellm/benchmark/__init__.py
CHANGED
@@ -1,19 +1,23 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
+"""
+Benchmark execution and performance analysis framework.
+
+Provides comprehensive benchmarking capabilities for LLM inference workloads,
+including profile-based execution strategies, metrics collection and aggregation,
+progress tracking, and multi-format output generation. Supports synchronous,
+asynchronous, concurrent, sweep, and throughput-based benchmarking profiles for
+evaluating model performance under various load conditions.
+"""
+
+from __future__ import annotations
+
+from .benchmarker import Benchmarker
 from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
-from .output import
+from .output import (
+    GenerativeBenchmarkerConsole,
+    GenerativeBenchmarkerCSV,
+    GenerativeBenchmarkerHTML,
+    GenerativeBenchmarkerOutput,
+)
 from .profile import (
     AsyncProfile,
     ConcurrentProfile,
@@ -22,46 +26,56 @@ from .profile import (
     SweepProfile,
     SynchronousProfile,
     ThroughputProfile,
-    create_profile,
 )
-from .progress import
-
-
-
-
+from .progress import BenchmarkerProgress, GenerativeConsoleBenchmarkerProgress
+from .scenarios import get_builtin_scenarios
+from .schemas import (
+    Benchmark,
+    BenchmarkerArgs,
+    BenchmarkerDict,
+    BenchmarkGenerativeTextArgs,
+    BenchmarkSchedulerStats,
+    EstimatedBenchmarkState,
+    GenerativeAudioMetricsSummary,
+    GenerativeBenchmark,
+    GenerativeBenchmarksReport,
+    GenerativeImageMetricsSummary,
+    GenerativeMetrics,
+    GenerativeMetricsSummary,
+    GenerativeVideoMetricsSummary,
+    SchedulerDict,
+)
 
 __all__ = [
-    "AggregatorT",
     "AsyncProfile",
     "Benchmark",
-    "
-    "
-    "BenchmarkMetrics",
-    "BenchmarkRunStats",
-    "BenchmarkT",
+    "BenchmarkGenerativeTextArgs",
+    "BenchmarkSchedulerStats",
     "Benchmarker",
-    "
-    "
-    "
+    "BenchmarkerArgs",
+    "BenchmarkerDict",
+    "BenchmarkerProgress",
     "ConcurrentProfile",
+    "EstimatedBenchmarkState",
+    "GenerativeAudioMetricsSummary",
     "GenerativeBenchmark",
-    "
-    "
-    "
+    "GenerativeBenchmarkerCSV",
+    "GenerativeBenchmarkerConsole",
+    "GenerativeBenchmarkerHTML",
+    "GenerativeBenchmarkerOutput",
     "GenerativeBenchmarksReport",
+    "GenerativeConsoleBenchmarkerProgress",
+    "GenerativeImageMetricsSummary",
     "GenerativeMetrics",
-    "
-    "
-    "GenerativeTextErrorStats",
-    "GenerativeTextResponseStats",
+    "GenerativeMetricsSummary",
+    "GenerativeVideoMetricsSummary",
     "Profile",
     "ProfileType",
-    "
+    "SchedulerDict",
     "SweepProfile",
    "SynchronousProfile",
     "ThroughputProfile",
     "benchmark_generative_text",
-    "
+    "get_builtin_scenarios",
     "reimport_benchmarks_report",
 ]
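For reference, the reorganized public surface can be imported directly; every name below appears in the new __all__, while the bundled scenarios correspond to the scenarios/chat.json and scenarios/rag.json files added in this release. Call signatures are not shown in this diff, so the sketch stops at imports and one hedged call.

    # Hedged sketch: the new import surface, names taken from __all__ above.
    from guidellm.benchmark import (
        GenerativeBenchmarkerConsole,
        SweepProfile,
        benchmark_generative_text,
        get_builtin_scenarios,
    )

    # get_builtin_scenarios is expected to surface the bundled scenarios
    # (chat.json, rag.json per the file listing); its exact return type and
    # arguments are not shown in this diff.
    print(get_builtin_scenarios())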