guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/schemas/request.py
@@ -0,0 +1,235 @@
+ """
+ Request schema definitions for generation operations.
+
+ Contains request models and data structures used to define and execute generation
+ requests across different backend services. Provides standardized interfaces for
+ request arguments, usage metrics tracking, and request type definitions that enable
+ consistent interaction with various AI generation APIs.
+ """
+
+ from __future__ import annotations
+
+ import uuid
+ from typing import Any, Literal
+
+ from pydantic import Field, computed_field
+
+ from guidellm.schemas.base import StandardBaseDict, StandardBaseModel
+
+ __all__ = [
+     "GenerationRequest",
+     "GenerationRequestArguments",
+     "GenerativeRequestType",
+     "UsageMetrics",
+ ]
+
+
+ GenerativeRequestType = Literal[
+     "text_completions",
+     "chat_completions",
+     "audio_transcriptions",
+     "audio_translations",
+ ]
+
+
+ class GenerationRequestArguments(StandardBaseDict):
+     """
+     HTTP request arguments for generation operations.
+
+     Encapsulates all necessary HTTP request components including method, headers,
+     parameters, and payload data required to execute generation requests against
+     backend services. Supports file uploads and streaming responses.
+     """
+
+     method: str | None = Field(
+         default=None,
+         description="The HTTP method to use for the request (e.g., 'POST', 'GET').",
+     )
+     stream: bool | None = Field(
+         default=None,
+         description="Whether to stream the response, if applicable.",
+     )
+     headers: dict[str, str] | None = Field(
+         default=None,
+         description="Any headers to include in the request, if applicable.",
+     )
+     params: dict[str, Any] | None = Field(
+         default=None,
+         description="Query parameters to include in the request, if applicable.",
+     )
+     body: dict[str, Any] | None = Field(
+         default=None,
+         description="Content to include in the main request body.",
+     )
+     files: dict[str, Any] | None = Field(
+         default=None,
+         description="Files to include in the request, if applicable.",
+     )
+
+     def model_combine(
+         self, additional: GenerationRequestArguments | dict[str, Any]
+     ) -> GenerationRequestArguments:
+         """
+         Merge additional request arguments into the current instance.
+
+         Combines method and stream fields by overwriting, while merging collection
+         fields like headers, params, body, and files by extending existing values.
+
+         :param additional: Additional arguments to merge with current instance
+         :return: Updated instance with merged arguments
+         """
+         additional_dict = (
+             additional.model_dump()
+             if isinstance(additional, GenerationRequestArguments)
+             else additional
+         )
+
+         for overwrite in ("method", "stream"):
+             if (val := additional_dict.get(overwrite)) is not None:
+                 setattr(self, overwrite, val)
+
+         for combine in ("headers", "params", "body", "files"):
+             if (val := additional_dict.get(combine)) is not None:
+                 current = getattr(self, combine, None) or {}
+                 setattr(self, combine, {**current, **val})
+
+         return self
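
A minimal usage sketch of the merge semantics above, assuming GenerationRequestArguments is importable from guidellm.schemas.request as defined in this hunk: scalar fields (method, stream) are overwritten, while mapping fields (headers, params, body, files) are shallow-merged with the new values winning on key collisions.

    from guidellm.schemas.request import GenerationRequestArguments

    args = GenerationRequestArguments(
        method="POST",
        headers={"Authorization": "Bearer abc"},
        body={"prompt": "Hello", "max_tokens": 50},
    )
    # Dicts are merged key-by-key; scalars are replaced outright.
    args.model_combine({"stream": True, "body": {"max_tokens": 100}})
    assert args.stream is True
    assert args.body == {"prompt": "Hello", "max_tokens": 100}
    assert args.headers == {"Authorization": "Bearer abc"}  # untouched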
+
+
+ class UsageMetrics(StandardBaseDict):
+     """
+     Multimodal usage metrics for generation requests.
+
+     Tracks resource consumption across different modalities including text, images,
+     video, and audio. Provides granular metrics for tokens, bytes, duration, and
+     format-specific measurements to enable comprehensive usage monitoring and billing.
+     """
+
+     # Text stats
+     text_tokens: int | None = Field(
+         default=None, description="Number of text tokens processed/generated."
+     )
+     text_words: int | None = Field(
+         default=None, description="Number of text words processed/generated."
+     )
+     text_characters: int | None = Field(
+         default=None, description="Number of text characters processed/generated."
+     )
+
+     # Vision image stats
+     image_tokens: int | None = Field(
+         default=None, description="Number of image tokens processed/generated."
+     )
+     image_count: int | None = Field(
+         default=None, description="Number of images processed/generated."
+     )
+     image_pixels: int | None = Field(
+         default=None, description="Number of image pixels processed/generated."
+     )
+     image_bytes: int | None = Field(
+         default=None, description="Number of image bytes processed/generated."
+     )
+
+     # Vision video stats
+     video_tokens: int | None = Field(
+         default=None, description="Number of video tokens processed/generated."
+     )
+     video_frames: int | None = Field(
+         default=None, description="Number of video frames processed/generated."
+     )
+     video_seconds: float | None = Field(
+         default=None, description="Duration of video processed/generated in seconds."
+     )
+     video_bytes: int | None = Field(
+         default=None, description="Number of video bytes processed/generated."
+     )
+
+     # Audio stats
+     audio_tokens: int | None = Field(
+         default=None, description="Number of audio tokens processed/generated."
+     )
+     audio_samples: int | None = Field(
+         default=None, description="Number of audio samples processed/generated."
+     )
+     audio_seconds: float | None = Field(
+         default=None, description="Duration of audio processed/generated in seconds."
+     )
+     audio_bytes: int | None = Field(
+         default=None, description="Number of audio bytes processed/generated."
+     )
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def total_tokens(self) -> int | None:
+         """
+         Calculate total tokens across all modalities.
+
+         :return: Sum of text, image, video, and audio tokens, or None if all are None
+         """
+         token_metrics = [
+             self.text_tokens,
+             self.image_tokens,
+             self.video_tokens,
+             self.audio_tokens,
+         ]
+         # NOTE: None should indicate no data rather than zero usage
+         if token_metrics.count(None) == len(token_metrics):
+             return None
+         else:
+             return sum(token or 0 for token in token_metrics)
+
+     def add_text_metrics(self, text: str) -> None:
+         """
+         Add character and word counts from the given text to the
+         `text_characters` and `text_words` fields.
+
+         :param text: Text to add metrics from
+         """
+         self.text_characters = (self.text_characters or 0) + len(text)
+         self.text_words = (self.text_words or 0) + len(text.split())
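
A short sketch of the None-versus-zero convention used by total_tokens, and of the accumulator behavior of add_text_metrics, assuming the class is importable as defined above:

    from guidellm.schemas.request import UsageMetrics

    metrics = UsageMetrics()
    assert metrics.total_tokens is None  # no data reported yet, not zero usage

    metrics.text_tokens = 42
    metrics.audio_tokens = 8
    assert metrics.total_tokens == 50  # remaining None modalities count as 0

    metrics.add_text_metrics("hello world")
    assert metrics.text_characters == 11
    assert metrics.text_words == 2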
+
+
+ class GenerationRequest(StandardBaseModel):
+     """
+     Complete request specification for backend generation operations.
+
+     Encapsulates all components needed to execute a generation request including
+     unique identification, request type specification, HTTP arguments, and
+     input/output usage metrics. Serves as the primary interface between the
+     scheduler and backend services for coordinating AI generation tasks.
+
+     Example::
+         request = GenerationRequest(
+             request_type="text_completions",
+             arguments=GenerationRequestArguments(
+                 method="POST",
+                 body={"prompt": "Hello world", "max_tokens": 100}
+             )
+         )
+     """
+
+     request_id: str = Field(
+         default_factory=lambda: str(uuid.uuid4()),
+         description="Unique identifier for the request.",
+     )
+     request_type: GenerativeRequestType | str = Field(
+         description=(
+             "Type of request. If url is not provided in arguments, "
+             "this will be used to determine the request url."
+         ),
+     )
+     arguments: GenerationRequestArguments = Field(
+         description=(
+             "Payload for the request, structured as a dictionary of arguments to pass "
+             "to the respective backend method. For example, can contain "
+             "'body', 'headers', 'files', etc."
+         )
+     )
+     input_metrics: UsageMetrics = Field(
+         default_factory=UsageMetrics,
+         description="Input statistics including counts, sizes, and durations.",
+     )
+     output_metrics: UsageMetrics = Field(
+         default_factory=UsageMetrics,
+         description="Output statistics including counts, sizes, and durations.",
+     )
guidellm/schemas/request_stats.py
@@ -0,0 +1,349 @@
+ """
+ Request statistics and metrics for generative AI benchmark analysis.
+
+ Provides data structures for capturing and analyzing performance metrics from
+ generative AI workloads. The module contains request-level statistics including
+ token counts, latency measurements, and throughput calculations essential for
+ evaluating text generation benchmark performance. Computed properties enable
+ analysis of time-to-first-token, inter-token latency, and token generation rates.
+ """
+
+ from __future__ import annotations
+
+ from typing import Literal
+
+ import numpy as np
+ from pydantic import Field, computed_field
+
+ from guidellm.schemas.base import StandardBaseDict
+ from guidellm.schemas.info import RequestInfo
+ from guidellm.schemas.request import GenerativeRequestType, UsageMetrics
+
+ __all__ = ["GenerativeRequestStats"]
+
+
+ class GenerativeRequestStats(StandardBaseDict):
+     """
+     Request statistics for generative AI text generation workloads.
+
+     Captures comprehensive performance metrics for individual generative requests,
+     including token counts, timing measurements, and derived performance statistics.
+     Provides computed properties for latency analysis, throughput calculations,
+     and token generation metrics essential for benchmark evaluation.
+
+     Example:
+     ::
+         stats = GenerativeRequestStats(
+             request_id="req_123",
+             request_type="text_completions",
+             info=request_info,
+             input_metrics=input_usage,
+             output_metrics=output_usage
+         )
+         throughput = stats.output_tokens_per_second
+     """
+
+     type_: Literal["generative_request_stats"] = "generative_request_stats"
+     request_id: str = Field(description="Unique identifier for the request")
+     request_type: GenerativeRequestType | str = Field(
+         description=(
+             "Type of generative request (e.g., text_completions or chat_completions)"
+         )
+     )
+     response_id: str | None = Field(
+         default=None, description="Unique identifier matching vLLM Response ID"
+     )
+     request_args: str | None = Field(
+         default=None, description="Backend arguments used for this request"
+     )
+     output: str | None = Field(
+         default=None, description="Generated text output from the request"
+     )
+     info: RequestInfo = Field(description="Request metadata and timing information")
+     input_metrics: UsageMetrics = Field(
+         description="Token usage statistics for the input prompt"
+     )
+     output_metrics: UsageMetrics = Field(
+         description="Token usage statistics for the generated output"
+     )
+
+     # Request stats
+     @computed_field  # type: ignore[misc]
+     @property
+     def request_start_time(self) -> float | None:
+         """
+         :return: Timestamp when the request started, or None if unavailable
+         """
+         return (
+             self.info.timings.request_start
+             if self.info.timings.request_start is not None
+             else self.info.timings.resolve_start
+         )
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def request_end_time(self) -> float:
+         """
+         :return: Timestamp when the request ended
+         :raises ValueError: If the resolve_end timing is not set
+         """
+         if self.info.timings.resolve_end is None:
+             raise ValueError("resolve_end timings should be set but is None.")
+
+         return (
+             self.info.timings.request_end
+             if self.info.timings.request_end is not None
+             else self.info.timings.resolve_end
+         )
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def request_latency(self) -> float | None:
+         """
+         End-to-end request processing latency in seconds.
+
+         :return: Duration from request start to completion, or None if unavailable
+         """
+         start = self.info.timings.request_start
+         end = self.info.timings.request_end
+         if start is None or end is None:
+             return None
+
+         return end - start
+
+     # General token stats
+     @computed_field  # type: ignore[misc]
+     @property
+     def prompt_tokens(self) -> int | None:
+         """
+         :return: Number of tokens in the input prompt, or None if unavailable
+         """
+         return self.input_metrics.total_tokens
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def output_tokens(self) -> int | None:
+         """
+         :return: Number of tokens in the generated output, or None if unavailable
+         """
+         # Fallback if we did not get usage metrics from the server
+         # NOTE: This assumes each iteration is one token
+         if self.output_metrics.total_tokens is None:
+             return self.info.timings.token_iterations or None
+
+         return self.output_metrics.total_tokens
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def total_tokens(self) -> int | None:
+         """
+         :return: Sum of prompt and output tokens, or None if both unavailable
+         """
+         input_tokens = self.prompt_tokens
+         output_tokens = self.output_tokens
+
+         if input_tokens is None and output_tokens is None:
+             return None
+
+         return (input_tokens or 0) + (output_tokens or 0)
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def time_to_first_token_ms(self) -> float | None:
+         """
+         :return: Time to first token generation in milliseconds, or None if
+             unavailable
+         """
+         first_token = self.first_token_iteration
+         start = self.info.timings.request_start
+         if first_token is None or start is None:
+             return None
+
+         return 1000 * (first_token - start)
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def time_per_output_token_ms(self) -> float | None:
+         """
+         Average time per output token in milliseconds including first token.
+
+         :return: Average milliseconds per output token, or None if unavailable
+         """
+         if (
+             (start := self.info.timings.request_start) is None
+             or (
+                 (last_token := self.last_token_iteration or self.request_end_time)
+                 is None
+             )
+             or (output_tokens := self.output_tokens) is None
+             or output_tokens == 0
+         ):
+             return None
+
+         return 1000 * (last_token - start) / output_tokens
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def inter_token_latency_ms(self) -> float | None:
+         """
+         Average inter-token latency in milliseconds excluding first token.
+
+         :return: Average milliseconds between token generations, or None if
+             unavailable
+         """
+         first_token = self.first_token_iteration
+         last_token = self.last_token_iteration
+         output_tokens = self.output_tokens
+         if (
+             first_token is None
+             or last_token is None
+             or output_tokens is None
+             or output_tokens <= 1
+         ):
+             return None
+
+         return 1000 * (last_token - first_token) / (output_tokens - 1)
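
To make the three latency definitions above concrete, a worked example with assumed timings (all values hypothetical): request_start at 10.0 s, first and last token iterations at 10.4 s and 12.4 s, and 5 output tokens.

    # Hypothetical timings (seconds), mirroring the formulas above
    request_start, first_token, last_token, output_tokens = 10.0, 10.4, 12.4, 5

    ttft_ms = 1000 * (first_token - request_start)                    # 400.0
    tpot_ms = 1000 * (last_token - request_start) / output_tokens     # 480.0
    itl_ms = 1000 * (last_token - first_token) / (output_tokens - 1)  # 500.0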
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def tokens_per_second(self) -> float | None:
+         """
+         :return: Total tokens per second throughput, or None if unavailable
+         """
+         if not (latency := self.request_latency) or self.total_tokens is None:
+             return None
+
+         return self.total_tokens / latency
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def output_tokens_per_second(self) -> float | None:
+         """
+         :return: Output token generation throughput, or None if unavailable
+         """
+         if not (latency := self.request_latency) or self.output_tokens is None:
+             return None
+
+         return self.output_tokens / latency
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def iter_tokens_per_iteration(self) -> float | None:
+         """
+         :return: Average tokens per iteration excluding first token, or None if
+             unavailable
+         """
+         if (
+             self.output_tokens is None
+             or self.output_tokens <= 1
+             or self.token_iterations <= 1
+         ):
+             return None
+
+         # Subtract 1 from both counts: the first iteration is assumed to
+         # produce exactly one token.
+         return (self.output_tokens - 1.0) / (self.token_iterations - 1.0)
+
+     @computed_field  # type: ignore[misc]
+     @property
+     def output_tokens_per_iteration(self) -> float | None:
+         """
+         :return: Average output tokens per iteration, or None if unavailable
+         """
+         if self.output_tokens is None or self.token_iterations < 1:
+             return None
+
+         return self.output_tokens / self.token_iterations
+
+     @property
+     def first_token_iteration(self) -> float | None:
+         """
+         :return: Timestamp of first token generation, or None if unavailable
+         """
+         return self.info.timings.first_token_iteration
+
+     @property
+     def last_token_iteration(self) -> float | None:
+         """
+         :return: Timestamp of last token generation, or None if unavailable
+         """
+         return self.info.timings.last_token_iteration
+
+     @property
+     def token_iterations(self) -> int:
+         """
+         :return: Total number of token generation iterations
+         """
+         return self.info.timings.token_iterations
+
+     @property
+     def prompt_tokens_timing(self) -> tuple[float, float]:
+         """
+         :return: Tuple of (timestamp, token_count) for prompt processing
+         :raises ValueError: If resolve_end timings are not set
+         """
+         return (
+             (
+                 self.first_token_iteration
+                 if self.first_token_iteration is not None
+                 else self.request_end_time
+             ),
+             self.prompt_tokens or 0.0,
+         )
+
+     @property
+     def output_tokens_timings(self) -> list[tuple[float, float]]:
+         """
+         :return: List of (timestamp, token_count) tuples for output token generations
+         :raises ValueError: If resolve_end timings are not set
+         """
+         if (
+             self.first_token_iteration is None
+             or self.last_token_iteration is None
+             or self.token_iterations <= 1
+         ):
+             # No iteration data, return single timing at end with all tokens
+             return [
+                 (
+                     (
+                         self.last_token_iteration
+                         if self.last_token_iteration is not None
+                         else self.request_end_time
+                     ),
+                     self.output_tokens or 0.0,
+                 )
+             ]
+
+         # Return first token timing as 1 token plus per-iteration timings
+         return [
+             (self.first_token_iteration, 1.0 * bool(self.output_tokens))
+         ] + self.iter_tokens_timings
+
+     @property
+     def iter_tokens_timings(self) -> list[tuple[float, float]]:
+         """
+         :return: List of (timestamp, token_count) tuples for iterations excluding
+             first token
+         """
+         if (
+             self.first_token_iteration is None
+             or self.last_token_iteration is None
+             or (tok_per_iter := self.iter_tokens_per_iteration) is None
+             or self.token_iterations <= 1
+         ):
+             return []
+
+         # Evenly space the iterations since we don't have per-iteration timings
+         # and we don't know the individual token counts per iteration
+         iter_times = np.linspace(
+             self.first_token_iteration,
+             self.last_token_iteration,
+             num=self.token_iterations,
+         )[1:]  # skip first iteration
+
+         return [(iter_time, tok_per_iter) for iter_time in iter_times]
+
+     @property
+     def total_tokens_timings(self) -> list[tuple[float, float]]:
+         """
+         :return: List of (timestamp, token_count) tuples for all token generations
+         """
+         prompt_timings = self.prompt_tokens_timing
+         output_timings = self.output_tokens_timings
+
+         return ([prompt_timings] if prompt_timings else []) + output_timings
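
Since per-iteration timestamps are not recorded, iter_tokens_timings evenly spaces the iterations between the first and last token timestamps with np.linspace. A small sketch of that interpolation, reusing the hypothetical timings from the earlier worked example with 4 iterations:

    import numpy as np

    # linspace includes both endpoints; [1:] drops the first-token slot,
    # matching the skip-first-iteration behavior in iter_tokens_timings.
    iter_times = np.linspace(10.4, 12.4, num=4)[1:]
    # -> array([11.06666667, 11.73333333, 12.4])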
guidellm/schemas/response.py
@@ -0,0 +1,124 @@
+ """
+ Backend response models for request and response handling.
+
+ Provides standardized response models for generation operations that capture
+ output text, usage metrics, and compilation of request statistics. Ensures
+ consistent data handling and statistics aggregation across different backend
+ implementations.
+ """
+
+ from __future__ import annotations
+
+ from pydantic import Field
+
+ from guidellm.schemas.base import StandardBaseModel
+ from guidellm.schemas.info import RequestInfo
+ from guidellm.schemas.request import GenerationRequest, UsageMetrics
+ from guidellm.schemas.request_stats import GenerativeRequestStats
+
+ __all__ = ["GenerationResponse"]
+
+
+ class GenerationResponse(StandardBaseModel):
+     """
+     Response model for backend generation operations.
+
+     Captures the output and metrics from a generation request, providing structured
+     data for text output, token usage statistics, and compilation of detailed
+     request statistics for analysis and monitoring purposes.
+
+     Example:
+     ::
+         response = GenerationResponse(
+             request_id="req-123",
+             request_args=None,
+             text="Generated response text",
+             input_metrics=UsageMetrics(text_tokens=50),
+             output_metrics=UsageMetrics(text_tokens=25)
+         )
+         stats = response.compile_stats(request, info)
+     """
+
+     request_id: str = Field(
+         description="Unique identifier matching the original GenerationRequest."
+     )
+     response_id: str | None = Field(
+         default=None,
+         description="Unique identifier matching the original vLLM Response ID.",
+     )
+     request_args: str | None = Field(
+         description="Arguments passed to the backend for request processing."
+     )
+     text: str | None = Field(
+         default=None,
+         description="The generated response text.",
+     )
+     input_metrics: UsageMetrics = Field(
+         default_factory=UsageMetrics,
+         description="Token usage statistics from the input prompt.",
+     )
+     output_metrics: UsageMetrics = Field(
+         default_factory=UsageMetrics,
+         description="Token usage statistics from the generated output.",
+     )
+
+     def compile_stats(
+         self,
+         request: GenerationRequest,
+         info: RequestInfo,
+         prefer_response: bool = True,
+     ) -> GenerativeRequestStats:
+         """
+         Compile and return comprehensive request statistics.
+
+         Merges metrics from the request and response objects to create a complete
+         statistical record, with preference given to response-level metrics when
+         available to ensure accuracy of actual execution data.
+
+         :param request: The original generation request containing input data
+         :param info: Metadata and timing information for the request execution
+         :param prefer_response: Whether to prefer response metrics over request
+             metrics when both are available
+         :return: A GenerativeRequestStats object containing detailed statistics
+         :raises ValueError: When request IDs don't match between objects
+         """
+         if request.request_id != self.request_id:
+             raise ValueError("Mismatched request IDs between request and response.")
+
+         if info.request_id != self.request_id:
+             raise ValueError("Mismatched request IDs between info and response.")
+
+         if info.status != "completed":
+             # Clear out request output metrics if the request failed since
+             # those are not valid
+             request.output_metrics = UsageMetrics()
+
+         base_input = request.input_metrics if prefer_response else self.input_metrics
+         override_input = (
+             self.input_metrics if prefer_response else request.input_metrics
+         )
+         base_output = (
+             request.output_metrics if prefer_response else self.output_metrics
+         )
+         override_output = (
+             self.output_metrics if prefer_response else request.output_metrics
+         )
+
+         input_metrics_dict = base_input.model_dump()
+         for key, value in override_input.model_dump().items():
+             if value is not None:
+                 input_metrics_dict[key] = value
+         output_metrics_dict = base_output.model_dump()
+         for key, value in override_output.model_dump().items():
+             if value is not None:
+                 output_metrics_dict[key] = value
+
+         return GenerativeRequestStats(
+             request_id=self.request_id,
+             response_id=self.response_id,
+             request_type=request.request_type,
+             request_args=str(
+                 request.arguments.model_dump() if request.arguments else {}
+             ),
+             output=self.text,
+             info=info,
+             input_metrics=UsageMetrics(**input_metrics_dict),
+             output_metrics=UsageMetrics(**output_metrics_dict),
+         )
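
A hedged end-to-end sketch of how compile_stats might be used once a response arrives; the backend call and the info object are hypothetical stand-ins for the real scheduler/backend wiring that lives elsewhere in this release (guidellm/backends/ and guidellm/schemas/info.py):

    # Hypothetical wiring: `backend.resolve` and `info` are placeholders,
    # not confirmed APIs from this package.
    response = backend.resolve(request)
    stats = response.compile_stats(request, info, prefer_response=True)

    # Response-reported metrics override request-side estimates field by
    # field; None values in the response fall back to the request's values.
    print(stats.output_tokens_per_second)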