guidellm-0.3.1-py3-none-any.whl → guidellm-0.6.0a5-py3-none-any.whl
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
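The headline change is a package restructure: `guidellm/backend/` becomes `guidellm/backends/`, `guidellm/config.py` becomes `guidellm/settings.py`, and the old `objects`, `request`, and `dataset` modules are superseded by the new `guidellm/schemas/` and `guidellm/data/` packages. A minimal, version-tolerant import sketch follows; the new `GenerationRequest` location is confirmed by the `schemas/request.py` hunk below, while the old location is an assumption about the removed `guidellm/request/` package:

```python
# Version-tolerant import sketch based on the file moves above. The new
# location is confirmed by the schemas/request.py hunk further down; the
# old location is an assumption about the removed guidellm/request package.
import importlib.util

if importlib.util.find_spec("guidellm.schemas") is not None:
    # 0.6.0a5 layout: request models live in the new schemas package
    from guidellm.schemas.request import GenerationRequest
else:
    # 0.3.1 layout (removed in this diff); symbol location assumed
    from guidellm.request import GenerationRequest
```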
guidellm/schemas/request.py (new file):

```diff
@@ -0,0 +1,235 @@
+"""
+Request schema definitions for generation operations.
+
+Contains request models and data structures used to define and execute generation
+requests across different backend services. Provides standardized interfaces for
+request arguments, usage metrics tracking, and request type definitions that enable
+consistent interaction with various AI generation APIs.
+"""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any, Literal
+
+from pydantic import Field, computed_field
+
+from guidellm.schemas.base import StandardBaseDict, StandardBaseModel
+
+__all__ = [
+    "GenerationRequest",
+    "GenerationRequestArguments",
+    "GenerativeRequestType",
+    "UsageMetrics",
+]
+
+
+GenerativeRequestType = Literal[
+    "text_completions",
+    "chat_completions",
+    "audio_transcriptions",
+    "audio_translations",
+]
+
+
+class GenerationRequestArguments(StandardBaseDict):
+    """
+    HTTP request arguments for generation operations.
+
+    Encapsulates all necessary HTTP request components including method, headers,
+    parameters, and payload data required to execute generation requests against
+    backend services. Supports file uploads and streaming responses.
+    """
+
+    method: str | None = Field(
+        default=None,
+        description="The HTTP method to use for the request (e.g., 'POST', 'GET').",
+    )
+    stream: bool | None = Field(
+        default=None,
+        description="Whether to stream the response, if applicable.",
+    )
+    headers: dict[str, str] | None = Field(
+        default=None,
+        description="Any headers to include in the request, if applicable.",
+    )
+    params: dict[str, Any] | None = Field(
+        default=None,
+        description="Query parameters to include in the request, if applicable.",
+    )
+    body: dict[str, Any] | None = Field(
+        default=None,
+        description="Content to include in the main request body.",
+    )
+    files: dict[str, Any] | None = Field(
+        default=None,
+        description="Files to include in the request, if applicable.",
+    )
+
+    def model_combine(
+        self, additional: GenerationRequestArguments | dict[str, Any]
+    ) -> GenerationRequestArguments:
+        """
+        Merge additional request arguments into the current instance.
+
+        Combines method and stream fields by overwriting, while merging collection
+        fields like headers, params, body, and files by extending existing values.
+
+        :param additional: Additional arguments to merge with current instance
+        :return: Updated instance with merged arguments
+        """
+        additional_dict = (
+            additional.model_dump()
+            if isinstance(additional, GenerationRequestArguments)
+            else additional
+        )
+
+        for overwrite in ("method", "stream"):
+            if (val := additional_dict.get(overwrite)) is not None:
+                setattr(self, overwrite, val)
+
+        for combine in ("headers", "params", "body", "files"):
+            if (val := additional_dict.get(combine)) is not None:
+                current = getattr(self, combine, None) or {}
+                setattr(self, combine, {**current, **val})
+
+        return self
+
+
+class UsageMetrics(StandardBaseDict):
+    """
+    Multimodal usage metrics for generation requests.
+
+    Tracks resource consumption across different modalities including text, images,
+    video, and audio. Provides granular metrics for tokens, bytes, duration, and
+    format-specific measurements to enable comprehensive usage monitoring and billing.
+    """
+
+    # Text stats
+    text_tokens: int | None = Field(
+        default=None, description="Number of text tokens processed/generated."
+    )
+    text_words: int | None = Field(
+        default=None, description="Number of text words processed/generated."
+    )
+    text_characters: int | None = Field(
+        default=None, description="Number of text characters processed/generated."
+    )
+
+    # Vision image stats
+    image_tokens: int | None = Field(
+        default=None, description="Number of image tokens processed/generated."
+    )
+    image_count: int | None = Field(
+        default=None, description="Number of images processed/generated."
+    )
+    image_pixels: int | None = Field(
+        default=None, description="Number of image pixels processed/generated."
+    )
+    image_bytes: int | None = Field(
+        default=None, description="Number of image bytes processed/generated."
+    )
+
+    # Vision video stats
+    video_tokens: int | None = Field(
+        default=None, description="Number of video tokens processed/generated."
+    )
+    video_frames: int | None = Field(
+        default=None, description="Number of video frames processed/generated."
+    )
+    video_seconds: float | None = Field(
+        default=None, description="Duration of video processed/generated in seconds."
+    )
+    video_bytes: int | None = Field(
+        default=None, description="Number of video bytes processed/generated."
+    )
+
+    # Audio stats
+    audio_tokens: int | None = Field(
+        default=None, description="Number of audio tokens processed/generated."
+    )
+    audio_samples: int | None = Field(
+        default=None, description="Number of audio samples processed/generated."
+    )
+    audio_seconds: float | None = Field(
+        default=None, description="Duration of audio processed/generated in seconds."
+    )
+    audio_bytes: int | None = Field(
+        default=None, description="Number of audio bytes processed/generated."
+    )
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def total_tokens(self) -> int | None:
+        """
+        Calculate total tokens across all modalities.
+
+        :return: Sum of text, image, video, and audio tokens, or None if all are None
+        """
+        token_metrics = [
+            self.text_tokens,
+            self.image_tokens,
+            self.video_tokens,
+            self.audio_tokens,
+        ]
+        # NOTE: None should indicate no data rather than zero usage
+        if token_metrics.count(None) == len(token_metrics):
+            return None
+        else:
+            return sum(token or 0 for token in token_metrics)
+
+    def add_text_metrics(self, text):
+        """
+        Adds the metrics from the given text to the fields
+        `text_characters` and `text_words`.
+
+        :param text: Text to add metrics from
+        """
+        self.text_characters = (self.text_characters or 0) + len(text)
+        self.text_words = (self.text_words or 0) + len(text.split())
+
+
+class GenerationRequest(StandardBaseModel):
+    """
+    Complete request specification for backend generation operations.
+
+    Encapsulates all components needed to execute a generation request including
+    unique identification, request type specification, HTTP arguments, and input/output
+    usage metrics. Serves as the primary interface between the scheduler and backend
+    services for coordinating AI generation tasks.
+
+    Example::
+        request = GenerationRequest(
+            request_type="text_completions",
+            arguments=GenerationRequestArguments(
+                method="POST",
+                body={"prompt": "Hello world", "max_tokens": 100}
+            )
+        )
+    """
+
+    request_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for the request.",
+    )
+    request_type: GenerativeRequestType | str = Field(
+        description=(
+            "Type of request. If url is not provided in arguments, "
+            "this will be used to determine the request url."
+        ),
+    )
+    arguments: GenerationRequestArguments = Field(
+        description=(
+            "Payload for the request, structured as a dictionary of arguments to pass "
+            "to the respective backend method. For example, can contain "
+            "'json', 'headers', 'files', etc."
+        )
+    )
+    input_metrics: UsageMetrics = Field(
+        default_factory=UsageMetrics,
+        description="Input statistics including counts, sizes, and durations.",
+    )
+    output_metrics: UsageMetrics = Field(
+        default_factory=UsageMetrics,
+        description="Output statistics including counts, sizes, and durations.",
+    )
```
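A short usage sketch against the module above, exercising the documented `model_combine` semantics (`method`/`stream` overwrite, the dict fields merge) and the `UsageMetrics` convention that `None` means "no data" rather than zero; the argument values are invented for illustration:

```python
from guidellm.schemas.request import (
    GenerationRequest,
    GenerationRequestArguments,
    UsageMetrics,
)

# Shared defaults, with per-request overrides merged in afterwards.
args = GenerationRequestArguments(
    method="POST",
    headers={"Authorization": "Bearer <token>"},
    body={"max_tokens": 100},
)
# method/stream are overwritten; headers/params/body/files are dict-merged.
args.model_combine({"stream": True, "body": {"prompt": "Hello world"}})
assert args.stream is True
assert args.body == {"max_tokens": 100, "prompt": "Hello world"}

request = GenerationRequest(request_type="text_completions", arguments=args)

# None signals "no data", so totals stay None until some modality reports.
metrics = UsageMetrics()
assert metrics.total_tokens is None
metrics.add_text_metrics("Hello world")  # text_characters=11, text_words=2
metrics.text_tokens = 2
assert metrics.total_tokens == 2
```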
guidellm/schemas/request_stats.py (new file):

```diff
@@ -0,0 +1,349 @@
+"""
+Request statistics and metrics for generative AI benchmark analysis.
+
+Provides data structures for capturing and analyzing performance metrics from
+generative AI workloads. The module contains request-level statistics including
+token counts, latency measurements, and throughput calculations essential for
+evaluating text generation benchmark performance. Computed properties enable
+analysis of time-to-first-token, inter-token latency, and token generation rates.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+import numpy as np
+from pydantic import Field, computed_field
+
+from guidellm.schemas.base import StandardBaseDict
+from guidellm.schemas.info import RequestInfo
+from guidellm.schemas.request import GenerativeRequestType, UsageMetrics
+
+__all__ = ["GenerativeRequestStats"]
+
+
+class GenerativeRequestStats(StandardBaseDict):
+    """
+    Request statistics for generative AI text generation workloads.
+
+    Captures comprehensive performance metrics for individual generative requests,
+    including token counts, timing measurements, and derived performance statistics.
+    Provides computed properties for latency analysis, throughput calculations,
+    and token generation metrics essential for benchmark evaluation.
+
+    Example:
+    ::
+        stats = GenerativeRequestStats(
+            request_id="req_123",
+            request_type="text_completion",
+            info=request_info,
+            input_metrics=input_usage,
+            output_metrics=output_usage
+        )
+        throughput = stats.output_tokens_per_second
+    """
+
+    type_: Literal["generative_request_stats"] = "generative_request_stats"
+    request_id: str = Field(description="Unique identifier for the request")
+    request_type: GenerativeRequestType | str = Field(
+        description="Type of generative request (text_completion or chat_completion)"
+    )
+    response_id: str | None = Field(
+        default=None, description="Unique identifier matching vLLM Response ID"
+    )
+    request_args: str | None = Field(
+        default=None, description="Backend arguments used for this request"
+    )
+    output: str | None = Field(
+        default=None, description="Generated text output from the request"
+    )
+    info: RequestInfo = Field(description="Request metadata and timing information")
+    input_metrics: UsageMetrics = Field(
+        description="Token usage statistics for the input prompt"
+    )
+    output_metrics: UsageMetrics = Field(
+        description="Token usage statistics for the generated output"
+    )
+
+    # Request stats
+    @computed_field  # type: ignore[misc]
+    @property
+    def request_start_time(self) -> float | None:
+        """
+        :return: Timestamp when the request started, or None if unavailable
+        """
+        return (
+            self.info.timings.request_start
+            if self.info.timings.request_start is not None
+            else self.info.timings.resolve_start
+        )
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def request_end_time(self) -> float:
+        """
+        :return: Timestamp when the request ended, or None if unavailable
+        """
+        if self.info.timings.resolve_end is None:
+            raise ValueError("resolve_end timings should be set but is None.")
+
+        return (
+            self.info.timings.request_end
+            if self.info.timings.request_end is not None
+            else self.info.timings.resolve_end
+        )
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def request_latency(self) -> float | None:
+        """
+        End-to-end request processing latency in seconds.
+
+        :return: Duration from request start to completion, or None if unavailable
+        """
+        start = self.info.timings.request_start
+        end = self.info.timings.request_end
+        if start is None or end is None:
+            return None
+
+        return end - start
+
+    # General token stats
+    @computed_field  # type: ignore[misc]
+    @property
+    def prompt_tokens(self) -> int | None:
+        """
+        :return: Number of tokens in the input prompt, or None if unavailable
+        """
+        return self.input_metrics.total_tokens
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def output_tokens(self) -> int | None:
+        """
+        :return: Number of tokens in the generated output, or None if unavailable
+        """
+        # Fallback if we did not get usage metrics from the server
+        # NOTE: This assumes each iteration is one token
+        if self.output_metrics.total_tokens is None:
+            return self.info.timings.token_iterations or None
+
+        return self.output_metrics.total_tokens
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def total_tokens(self) -> int | None:
+        """
+        :return: Sum of prompt and output tokens, or None if both unavailable
+        """
+        input_tokens = self.prompt_tokens
+        output_tokens = self.output_tokens
+
+        if input_tokens is None and output_tokens is None:
+            return None
+
+        return (input_tokens or 0) + (output_tokens or 0)
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def time_to_first_token_ms(self) -> float | None:
+        """
+        :return: Time to first token generation in milliseconds, or None if unavailable
+        """
+        first_token = self.first_token_iteration
+        start = self.info.timings.request_start
+        if first_token is None or start is None:
+            return None
+
+        return 1000 * (first_token - start)
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def time_per_output_token_ms(self) -> float | None:
+        """
+        Average time per output token in milliseconds including first token.
+
+        :return: Average milliseconds per output token, or None if unavailable
+        """
+        if (
+            (start := self.info.timings.request_start) is None
+            or (
+                (last_token := self.last_token_iteration or self.request_end_time)
+                is None
+            )
+            or (output_tokens := self.output_tokens) is None
+            or output_tokens == 0
+        ):
+            return None
+
+        return 1000 * (last_token - start) / output_tokens
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def inter_token_latency_ms(self) -> float | None:
+        """
+        Average inter-token latency in milliseconds excluding first token.
+
+        :return: Average milliseconds between token generations, or None if unavailable
+        """
+        first_token = self.first_token_iteration
+        last_token = self.last_token_iteration
+        output_tokens = self.output_tokens
+        if (
+            first_token is None
+            or last_token is None
+            or output_tokens is None
+            or output_tokens <= 1
+        ):
+            return None
+
+        return 1000 * (last_token - first_token) / (output_tokens - 1)
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def tokens_per_second(self) -> float | None:
+        """
+        :return: Total tokens per second throughput, or None if unavailable
+        """
+        if not (latency := self.request_latency) or self.total_tokens is None:
+            return None
+
+        return self.total_tokens / latency
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def output_tokens_per_second(self) -> float | None:
+        """
+        :return: Output token generation throughput, or None if unavailable
+        """
+        if not (latency := self.request_latency) or self.output_tokens is None:
+            return None
+
+        return self.output_tokens / latency
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def iter_tokens_per_iteration(self) -> float | None:
+        """
+        :return: Average tokens per iteration excluding first token, or None if
+            unavailable
+        """
+        if (
+            self.output_tokens is None
+            or self.output_tokens <= 1
+            or self.token_iterations <= 1
+        ):
+            return None
+
+        return (self.output_tokens - 1.0) / (
+            self.token_iterations - 1.0
+        )  # subtract 1 for first token from the prompt, assume first iter is 1 token
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def output_tokens_per_iteration(self) -> float | None:
+        """
+        :return: Average output tokens per iteration, or None if unavailable
+        """
+        if self.output_tokens is None or self.token_iterations < 1:
+            return None
+
+        return self.output_tokens / self.token_iterations
+
+    @property
+    def first_token_iteration(self) -> float | None:
+        """
+        :return: Timestamp of first token generation, or None if unavailable
+        """
+        return self.info.timings.first_token_iteration
+
+    @property
+    def last_token_iteration(self) -> float | None:
+        """
+        :return: Timestamp of last token generation, or None if unavailable
+        """
+        return self.info.timings.last_token_iteration
+
+    @property
+    def token_iterations(self) -> int:
+        """
+        :return: Total number of token generation iterations
+        """
+        return self.info.timings.token_iterations
+
+    @property
+    def prompt_tokens_timing(self) -> tuple[float, float]:
+        """
+        :return: Tuple of (timestamp, token_count) for prompt processing
+        :raises ValueError: If resolve_end timings are not set
+        """
+        return (
+            (
+                self.first_token_iteration
+                if self.first_token_iteration is not None
+                else self.request_end_time
+            ),
+            self.prompt_tokens or 0.0,
+        )
+
+    @property
+    def output_tokens_timings(self) -> list[tuple[float, float]]:
+        """
+        :return: List of (timestamp, token_count) tuples for output token generations
+        :raises ValueError: If resolve_end timings are not set
+        """
+        if (
+            self.first_token_iteration is None
+            or self.last_token_iteration is None
+            or self.token_iterations <= 1
+        ):
+            # No iteration data, return single timing at end with all tokens
+            return [
+                (
+                    (
+                        self.last_token_iteration
+                        if self.last_token_iteration is not None
+                        else self.request_end_time
+                    ),
+                    self.output_tokens or 0.0,
+                )
+            ]
+
+        # Return first token timing as 1 token plus per-iteration timings
+        return [
+            (self.first_token_iteration, 1.0 * bool(self.output_tokens))
+        ] + self.iter_tokens_timings
+
+    @property
+    def iter_tokens_timings(self) -> list[tuple[float, float]]:
+        """
+        :return: List of (timestamp, token_count) tuples for iterations excluding
+            first token
+        """
+        if (
+            self.first_token_iteration is None
+            or self.last_token_iteration is None
+            or (tok_per_iter := self.iter_tokens_per_iteration) is None
+            or self.token_iterations <= 1
+        ):
+            return []
+
+        # evenly space the iterations since we don't have per-iteration timings
+        # / we don't know the individual token counts per iteration
+        iter_times = np.linspace(
+            self.first_token_iteration,
+            self.last_token_iteration,
+            num=self.token_iterations,
+        )[1:]  # skip first iteration
+
+        return [(iter_time, tok_per_iter) for iter_time in iter_times]
+
+    @property
+    def total_tokens_timings(self) -> list[tuple[float, float]]:
+        """
+        :return: List of (timestamp, token_count) tuples for all token generations
+        """
+        prompt_timings = self.prompt_tokens_timing
+        output_timings = self.output_tokens_timings
+
+        return ([prompt_timings] if prompt_timings else []) + output_timings
```
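The computed properties above reduce to arithmetic over a few timing fields carried on `RequestInfo` (defined in `guidellm/schemas/info.py`, which is not part of this diff). A standalone sketch of that arithmetic with invented timestamps, mirroring the formulas in `time_to_first_token_ms`, `inter_token_latency_ms`, `time_per_output_token_ms`, and `iter_tokens_timings`:

```python
# Standalone illustration of the timing math used by GenerativeRequestStats.
# Timestamps are epoch seconds; all values below are invented for the example.
import numpy as np

request_start = 100.00
first_token = 100.25     # timings.first_token_iteration
last_token = 101.15      # timings.last_token_iteration
token_iterations = 10    # streaming chunks observed
output_tokens = 19       # from server usage metrics

ttft_ms = 1000 * (first_token - request_start)                    # 250.0
itl_ms = 1000 * (last_token - first_token) / (output_tokens - 1)  # 50.0
tpot_ms = 1000 * (last_token - request_start) / output_tokens     # ~60.5

# iter_tokens_timings: with no per-chunk timestamps, iterations are evenly
# spaced between first and last token; the first iteration is assumed 1 token.
tok_per_iter = (output_tokens - 1.0) / (token_iterations - 1.0)   # 2.0
iter_times = np.linspace(first_token, last_token, num=token_iterations)[1:]
timings = [(first_token, 1.0)] + [(t, tok_per_iter) for t in iter_times]
assert sum(count for _, count in timings) == output_tokens
```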
guidellm/schemas/response.py (new file):

```diff
@@ -0,0 +1,124 @@
+"""
+Backend response models for request and response handling.
+
+Provides standardized response models for generation operations that capture
+output text, usage metrics, and compilation of request statistics. Ensures
+consistent data handling and statistics aggregation across different backend
+implementations.
+"""
+
+from __future__ import annotations
+
+from pydantic import Field
+
+from guidellm.schemas.base import StandardBaseModel
+from guidellm.schemas.info import RequestInfo
+from guidellm.schemas.request import GenerationRequest, UsageMetrics
+from guidellm.schemas.request_stats import GenerativeRequestStats
+
+__all__ = ["GenerationResponse"]
+
+
+class GenerationResponse(StandardBaseModel):
+    """
+    Response model for backend generation operations.
+
+    Captures the output and metrics from a generation request, providing structured
+    data for text output, token usage statistics, and compilation of detailed
+    request statistics for analysis and monitoring purposes.
+
+    Example:
+    ::
+        response = GenerationResponse(
+            request_id="req-123",
+            text="Generated response text",
+            input_metrics=UsageMetrics(token_count=50),
+            output_metrics=UsageMetrics(token_count=25)
+        )
+        stats = response.compile_stats(request, info)
+    """
+
+    request_id: str = Field(
+        description="Unique identifier matching the original GenerationRequest."
+    )
+    response_id: str | None = Field(
+        default=None,
+        description="Unique identifier matching the original vLLM Response ID.",
+    )
+    request_args: str | None = Field(
+        description="Arguments passed to the backend for request processing."
+    )
+    text: str | None = Field(
+        default=None,
+        description="The generated response text.",
+    )
+    input_metrics: UsageMetrics = Field(
+        default_factory=UsageMetrics,
+        description="Token usage statistics from the input prompt.",
+    )
+    output_metrics: UsageMetrics = Field(
+        default_factory=UsageMetrics,
+        description="Token usage statistics from the generated output.",
+    )
+
+    def compile_stats(
+        self,
+        request: GenerationRequest,
+        info: RequestInfo,
+        prefer_response: bool = True,
+    ) -> GenerativeRequestStats:
+        """
+        Compile and return comprehensive request statistics.
+
+        Merges metrics from the request and response objects to create a complete
+        statistical record, with preference given to response-level metrics when
+        available to ensure accuracy of actual execution data.
+
+        :param request: The original generation request containing input data
+        :param info: Metadata and timing information for the request execution
+        :param prefer_response: Whether to prefer response metrics over request
+            metrics when both are available
+        :return: A GenerativeRequestStats object containing detailed statistics
+        :raises ValueError: When request IDs don't match between objects
+        """
+        if request.request_id != self.request_id:
+            raise ValueError("Mismatched request IDs between request and response.")
+
+        if info.request_id != self.request_id:
+            raise ValueError("Mismatched request IDs between info and response.")
+
+        if info.status != "completed":
+            # clear out request output metrics if the request failed since
+            # those are not valid
+            request.output_metrics = UsageMetrics()
+
+        base_input = request.input_metrics if prefer_response else self.input_metrics
+        override_input = (
+            self.input_metrics if prefer_response else request.input_metrics
+        )
+        base_output = request.output_metrics if prefer_response else self.output_metrics
+        override_output = (
+            self.output_metrics if prefer_response else request.output_metrics
+        )
+
+        input_metrics_dict = base_input.model_dump()
+        for key, value in override_input.model_dump().items():
+            if value is not None:
+                input_metrics_dict[key] = value
+        output_metrics_dict = base_output.model_dump()
+        for key, value in override_output.model_dump().items():
+            if value is not None:
+                output_metrics_dict[key] = value
+
+        return GenerativeRequestStats(
+            request_id=self.request_id,
+            response_id=self.response_id,
+            request_type=request.request_type,
+            request_args=str(
+                request.arguments.model_dump() if request.arguments else {}
+            ),
+            output=self.text,
+            info=info,
+            input_metrics=UsageMetrics(**input_metrics_dict),
+            output_metrics=UsageMetrics(**output_metrics_dict),
+        )
```
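`compile_stats` merges request-side and response-side `UsageMetrics` field by field, treating `None` as "no data" so it never overwrites a real value. A standalone sketch of that merge rule with invented values, mirroring the loop in the method above:

```python
from guidellm.schemas.request import UsageMetrics

# Request-side estimate vs. response-side server-reported usage.
request_side = UsageMetrics(text_tokens=48, text_words=40)
response_side = UsageMetrics(text_tokens=50)  # server reported tokens only

# prefer_response=True: start from the request metrics, then overwrite with
# any non-None response metrics (None means "no data", so it never clobbers).
merged = request_side.model_dump()
for key, value in response_side.model_dump().items():
    if value is not None:
        merged[key] = value

combined = UsageMetrics(**merged)
assert combined.text_tokens == 50  # response wins where it has data
assert combined.text_words == 40   # request-side value survives the merge
```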