guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/backends/response_handlers.py
ADDED
@@ -0,0 +1,428 @@
+"""
+Response handlers for processing API responses from different generation backends.
+
+Provides a pluggable system for handling responses from language model backends,
+supporting both streaming and non-streaming responses. Each handler implements the
+GenerationResponseHandler protocol to parse API responses, extract usage metrics,
+and convert them into standardized GenerationResponse objects.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol, cast
+
+from guidellm.schemas import GenerationRequest, GenerationResponse, UsageMetrics
+from guidellm.utils import RegistryMixin, json
+
+__all__ = [
+    "AudioResponseHandler",
+    "ChatCompletionsResponseHandler",
+    "GenerationResponseHandler",
+    "GenerationResponseHandlerFactory",
+    "TextCompletionsResponseHandler",
+]
+
+
+class GenerationResponseHandler(Protocol):
+    """
+    Protocol for handling generation API responses.
+
+    Defines the interface for processing both streaming and non-streaming responses
+    from backend APIs, converting them into standardized GenerationResponse objects
+    with consistent metrics extraction.
+    """
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: Any
+    ) -> GenerationResponse:
+        """
+        Process a complete non-streaming API response.
+
+        :param request: Original generation request
+        :param response: Raw API response data from the backend
+        :return: Standardized GenerationResponse with extracted metrics
+        """
+        ...
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a streaming response.
+
+        :param line: Raw line from the streaming response
+        :return: 1 if content was updated, 0 if line was ignored, None if done
+        """
+        ...
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming data into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with extracted metrics
+        """
+        ...
+
+
+class GenerationResponseHandlerFactory(RegistryMixin[type[GenerationResponseHandler]]):
+    """
+    Factory for registering and creating response handlers by backend type.
+
+    Registry-based system for associating handler classes with specific backend API
+    types, enabling automatic selection of the appropriate handler for processing
+    responses from different generation services.
+    """
+
+    @classmethod
+    def create(
+        cls,
+        request_type: str,
+        handler_overrides: dict[str, type[GenerationResponseHandler]] | None = None,
+    ) -> GenerationResponseHandler:
+        """
+        Create a response handler class for the given request type.
+
+        :param request_type: The type of generation request (e.g., "text_completions")
+        :param handler_overrides: Optional mapping of request types to handler classes
+            to override the default registry by checking first and then falling back
+            to the registered handlers.
+        :return: The corresponding instantiated GenerationResponseHandler
+        :raises ValueError: When no handler is registered for the request type
+        """
+        if handler_overrides and request_type in handler_overrides:
+            return handler_overrides[request_type]()
+
+        handler_cls = cls.get_registered_object(request_type)
+        if not handler_cls:
+            raise ValueError(
+                f"No response handler registered for type '{request_type}'."
+            )
+
+        return handler_cls()
+
+
+@GenerationResponseHandlerFactory.register("text_completions")
+class TextCompletionsResponseHandler(GenerationResponseHandler):
+    """
+    Response handler for OpenAI-style text completion endpoints.
+
+    Processes responses from text completion APIs that return generated text in the
+    'choices' array with 'text' fields. Handles both streaming and non-streaming
+    responses, extracting usage metrics for input and output tokens.
+
+    Example:
+    ::
+        handler = TextCompletionsResponseHandler()
+        response = handler.compile_non_streaming(request, api_response)
+    """
+
+    def __init__(self):
+        """
+        Initialize the text completions response handler.
+
+        Sets up internal state for accumulating streaming response data including
+        text chunks and usage metrics.
+        """
+        self.streaming_texts: list[str] = []
+        self.streaming_usage: dict[str, int | dict[str, int]] | None = None
+        self.streaming_response_id: str | None = None
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: dict
+    ) -> GenerationResponse:
+        """
+        Process a complete text completion response.
+
+        :param request: Original generation request
+        :param response: Complete API response containing choices and usage data
+        :return: Standardized GenerationResponse with extracted text and metrics
+        """
+        choices, usage = self.extract_choices_and_usage(response)
+        choice = choices[0] if choices else {}
+        text = choice.get("text", "")
+        input_metrics, output_metrics = self.extract_metrics(usage, text)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            response_id=response.get("id"),  # use vLLM ID if available
+            text=text,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a text completion streaming response.
+
+        Parses Server-Sent Events (SSE) formatted lines, extracting text content
+        and usage metrics. Accumulates text chunks for final response compilation.
+
+        :param line: Raw SSE line from the streaming response
+        :return: 1 if text content was extracted, 0 if line ignored, None if done
+        """
+        if not (data := self.extract_line_data(line)):
+            return None if data is None else 0
+
+        if "id" in data and self.streaming_response_id is None:
+            self.streaming_response_id = data["id"]
+
+        updated = False
+        choices, usage = self.extract_choices_and_usage(data)
+        choice = choices[0] if choices else {}
+
+        if choices and (text := choice.get("text")):
+            self.streaming_texts.append(text)
+            updated = True
+
+        if usage:
+            self.streaming_usage = usage
+
+        return 1 if updated else 0
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming text chunks into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with concatenated text and metrics
+        """
+        text = "".join(self.streaming_texts)
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            response_id=self.streaming_response_id,  # use vLLM ID if available
+            text=text,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def extract_line_data(self, line: str) -> dict[str, Any] | None:
+        """
+        Extract JSON data from a streaming response line.
+
+        :param line: Raw line from the streaming response
+        :return: Parsed JSON data as dictionary, or None if line indicates completion
+        """
+        if line == "data: [DONE]":
+            return None
+
+        if not line or not (line := line.strip()) or not line.startswith("data:"):
+            return {}
+
+        line = line[len("data:") :].strip()
+
+        return json.loads(line)
+
+    def extract_choices_and_usage(
+        self, response: dict
+    ) -> tuple[list[dict], dict[str, int | dict[str, int]]]:
+        """
+        Extract choices and usage data from the API response.
+
+        :param response: Complete API response containing choices and usage data
+        :return: Tuple of choices list and usage dictionary
+        """
+        return response.get("choices", []), response.get("usage", {})
+
+    def extract_metrics(
+        self, usage: dict[str, int | dict[str, int]] | None, text: str
+    ) -> tuple[UsageMetrics, UsageMetrics]:
+        """
+        Extract input and output usage metrics from API response usage data.
+
+        :param usage: Usage data dictionary from API response
+        :param text: Generated text for calculating word and character counts
+        :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
+        """
+        if not usage:
+            return UsageMetrics(), UsageMetrics(
+                text_words=len(text.split()) if text else 0,
+                text_characters=len(text) if text else 0,
+            )
+
+        input_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("prompt_tokens_details", {}) or {}
+        )
+        output_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("completion_tokens_details", {}) or {}
+        )
+        usage_metrics: dict[str, int] = cast("dict[str, int]", usage)
+
+        return UsageMetrics(
+            text_tokens=(
+                input_details.get("prompt_tokens")
+                or usage_metrics.get("prompt_tokens")
+                or 0
+            ),
+            image_tokens=input_details.get("image_tokens"),
+            video_tokens=input_details.get("video_tokens"),
+            audio_tokens=input_details.get("audio_tokens"),
+            audio_seconds=input_details.get("seconds"),
+        ), UsageMetrics(
+            text_tokens=(
+                output_details.get("completion_tokens")
+                or usage_metrics.get("completion_tokens")
+                or 0
+            ),
+            text_words=len(text.split()) if text else 0,
+            text_characters=len(text) if text else 0,
+            image_tokens=output_details.get("image_tokens"),
+            video_tokens=output_details.get("video_tokens"),
+            audio_tokens=output_details.get("audio_tokens"),
+            audio_seconds=output_details.get("seconds"),
+        )
+
+
+@GenerationResponseHandlerFactory.register("chat_completions")
+class ChatCompletionsResponseHandler(TextCompletionsResponseHandler):
+    """
+    Response handler for OpenAI-style chat completion endpoints.
+
+    Extends TextCompletionsResponseHandler to handle chat completion responses where
+    generated text is nested within message objects in the choices array. Processes
+    both streaming and non-streaming chat completion responses.
+    """
+
+    def compile_non_streaming(
+        self, request: GenerationRequest, response: dict
+    ) -> GenerationResponse:
+        """
+        Process a complete chat completion response.
+
+        Extracts content from the message object within choices, handling the nested
+        structure specific to chat completion endpoints.
+
+        :param request: Original generation request
+        :param response: Complete API response containing choices and usage data
+        :return: Standardized GenerationResponse with extracted content and metrics
+        """
+        choices, usage = self.extract_choices_and_usage(response)
+        choice: dict[str, dict] = choices[0] if choices else {}
+        text = choice.get("message", {}).get("content", "")
+        input_metrics, output_metrics = self.extract_metrics(usage, text)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            response_id=response.get("id"),  # use vLLM ID if available
+            text=text,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+    def add_streaming_line(self, line: str) -> int | None:
+        """
+        Process a single line from a chat completion streaming response.
+
+        Handles the chat completion specific delta structure where content is nested
+        within delta objects in the streaming response chunks.
+
+        :param line: Raw SSE line from the streaming response
+        :return: 1 if content was extracted, 0 if line ignored, None if done
+        """
+        if not (data := self.extract_line_data(line)):
+            return None if data is None else 0
+
+        if "id" in data and self.streaming_response_id is None:
+            self.streaming_response_id = data["id"]
+
+        updated = False
+        choices, usage = self.extract_choices_and_usage(data)
+        choice: dict[str, dict] = choices[0] if choices else {}
+
+        if choices and (content := choice.get("delta", {}).get("content")):
+            self.streaming_texts.append(content)
+            updated = True
+
+        if usage:
+            self.streaming_usage = usage
+
+        return 1 if updated else 0
+
+    def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
+        """
+        Compile accumulated streaming chat completion content into a final response.
+
+        :param request: Original generation request
+        :return: Standardized GenerationResponse with concatenated content and metrics
+        """
+        text = "".join(self.streaming_texts)
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
+
+        return GenerationResponse(
+            request_id=request.request_id,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else None
+            ),
+            response_id=self.streaming_response_id,  # use vLLM ID if available
+            text=text,
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
+        )
+
+
+@GenerationResponseHandlerFactory.register(
+    ["audio_transcriptions", "audio_translations"]
+)
+class AudioResponseHandler(ChatCompletionsResponseHandler):
+    """
+    Response handler for audio transcription and translation endpoints.
+
+    Processes responses from audio processing APIs that convert speech to text,
+    handling both transcription and translation services. Manages audio-specific
+    usage metrics including audio tokens and processing duration.
+
+    Example:
+    ::
+        handler = AudioResponseHandler()
+        response = handler.compile_non_streaming(request, api_response)
+    """
+
+    def __init__(self):
+        """
+        Initialize the audio response handler.
+
+        Sets up internal state for accumulating streaming response data including
+        audio buffers, text chunks, and usage metrics.
+        """
+        self.streaming_buffer: bytearray = bytearray()
+        self.streaming_texts: list[str] = []
+        self.streaming_usage: dict[str, int | dict[str, int]] | None = None
+        self.streaming_response_id: str | None = None
+
+    def extract_metrics(
+        self, usage: dict[str, int | dict[str, int]] | None, text: str
+    ) -> tuple[UsageMetrics, UsageMetrics]:
+        """
+        Extract input and output usage metrics from audio API response usage data.
+
+        Handles audio-specific metrics including processing duration and audio tokens
+        in addition to standard text token counts.
+
+        :param usage: Usage data dictionary from audio API response
+        :param text: Generated text for calculating word and character counts
+        :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
+        """
+        if not usage:
+            return UsageMetrics(), UsageMetrics(
+                text_words=len(text.split()) if text else 0,
+                text_characters=len(text) if text else 0,
+            )
+
+        usage_metrics: dict[str, int] = cast("dict[str, int]", usage)
+
+        return UsageMetrics(
+            audio_tokens=(usage_metrics.get("prompt_tokens") or 0),
+        ), UsageMetrics(
+            text_tokens=(usage_metrics.get("completion_tokens") or 0),
+            text_words=len(text.split()) if text else 0,
+            text_characters=len(text) if text else 0,
+        )
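
For orientation, a minimal sketch of how these pieces compose, based only on the code above; the SSE payloads are illustrative and the original GenerationRequest argument is elided:

    from guidellm.backends.response_handlers import GenerationResponseHandlerFactory

    # Resolve the handler registered for this request type ("chat_completions" here).
    handler = GenerationResponseHandlerFactory.create("chat_completions")

    # Feed SSE lines as they arrive: add_streaming_line returns 1 when content was
    # accumulated, 0 for ignored lines, and None once "data: [DONE]" is seen.
    for line in (
        'data: {"id": "resp-1", "choices": [{"delta": {"content": "Hello"}}]}',
        'data: {"choices": [], "usage": {"prompt_tokens": 3, "completion_tokens": 2}}',
        "data: [DONE]",
    ):
        if handler.add_streaming_line(line) is None:
            break

    # handler.compile_streaming(request) would then join the accumulated chunks and
    # build a GenerationResponse with UsageMetrics derived from the captured usage.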
guidellm/benchmark/__init__.py
CHANGED
@@ -1,20 +1,24 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
+"""
+Benchmark execution and performance analysis framework.
+
+Provides comprehensive benchmarking capabilities for LLM inference workloads,
+including profile-based execution strategies, metrics collection and aggregation,
+progress tracking, and multi-format output generation. Supports synchronous,
+asynchronous, concurrent, sweep, and throughput-based benchmarking profiles for
+evaluating model performance under various load conditions.
+"""
+
+from __future__ import annotations
+
+from .benchmarker import Benchmarker
 from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
-from .
-
+from .outputs import (
+    GenerativeBenchmarkerConsole,
+    GenerativeBenchmarkerCSV,
+    GenerativeBenchmarkerHTML,
+    GenerativeBenchmarkerOutput,
+)
+from .profiles import (
     AsyncProfile,
     ConcurrentProfile,
     Profile,
@@ -22,46 +26,72 @@ from .profile import (
     SweepProfile,
     SynchronousProfile,
     ThroughputProfile,
-    create_profile,
 )
-from .progress import
-
-
-
-
+from .progress import BenchmarkerProgress, GenerativeConsoleBenchmarkerProgress
+from .scenarios import get_builtin_scenarios
+from .schemas import (
+    Benchmark,
+    BenchmarkAccumulator,
+    BenchmarkAccumulatorT,
+    BenchmarkConfig,
+    BenchmarkGenerativeTextArgs,
+    BenchmarkT,
+    GenerativeAudioMetricsSummary,
+    GenerativeBenchmark,
+    GenerativeBenchmarkAccumulator,
+    GenerativeBenchmarkMetadata,
+    GenerativeBenchmarksReport,
+    GenerativeBenchmarkTimings,
+    GenerativeImageMetricsSummary,
+    GenerativeMetrics,
+    GenerativeMetricsAccumulator,
+    GenerativeMetricsSummary,
+    GenerativeRequestsAccumulator,
+    GenerativeTextMetricsSummary,
+    GenerativeVideoMetricsSummary,
+    RunningMetricStats,
+    SchedulerMetrics,
+    SchedulerMetricsAccumulator,
 )
 
 __all__ = [
-    "AggregatorT",
     "AsyncProfile",
     "Benchmark",
-    "
-    "
-    "
-    "
+    "BenchmarkAccumulator",
+    "BenchmarkAccumulatorT",
+    "BenchmarkConfig",
+    "BenchmarkGenerativeTextArgs",
     "BenchmarkT",
     "Benchmarker",
-    "
-    "BenchmarkerResult",
-    "BenchmarkerTaskProgressState",
+    "BenchmarkerProgress",
     "ConcurrentProfile",
+    "GenerativeAudioMetricsSummary",
     "GenerativeBenchmark",
-    "
-    "
-    "
+    "GenerativeBenchmarkAccumulator",
+    "GenerativeBenchmarkMetadata",
+    "GenerativeBenchmarkTimings",
+    "GenerativeBenchmarkerCSV",
+    "GenerativeBenchmarkerConsole",
+    "GenerativeBenchmarkerHTML",
+    "GenerativeBenchmarkerOutput",
     "GenerativeBenchmarksReport",
+    "GenerativeConsoleBenchmarkerProgress",
+    "GenerativeImageMetricsSummary",
     "GenerativeMetrics",
-    "
-    "
-    "
-    "
+    "GenerativeMetricsAccumulator",
+    "GenerativeMetricsSummary",
+    "GenerativeRequestsAccumulator",
+    "GenerativeTextMetricsSummary",
+    "GenerativeVideoMetricsSummary",
     "Profile",
     "ProfileType",
-    "
+    "RunningMetricStats",
+    "SchedulerMetrics",
+    "SchedulerMetricsAccumulator",
     "SweepProfile",
    "SynchronousProfile",
    "ThroughputProfile",
    "benchmark_generative_text",
-    "
+    "get_builtin_scenarios",
     "reimport_benchmarks_report",
 ]