guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,841 @@
1
+ """
2
+ Real-time metric accumulation for generative benchmark execution.
3
+
4
+ Captures and computes performance metrics during benchmark runs, tracking timing phases,
5
+ request statistics, token throughput, and latency distributions. Components include
6
+ timing trackers for warmup/cooldown phases, running statistical accumulators for
7
+ throughput and latency metrics, and reservoir sampling for request data. Enables
8
+ comprehensive performance measurement including scheduler overhead, time-to-first-token,
9
+ inter-token latency, and token generation rates across completed, errored, and
10
+ incomplete requests.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import random
16
+ import time
17
+ from typing import Literal
18
+
19
+ from pydantic import Field
20
+
21
+ from guidellm.benchmark.schemas.base import BenchmarkAccumulator, BenchmarkConfig
22
+ from guidellm.scheduler import MultiTurnRequestT, SchedulerState
23
+ from guidellm.schemas import (
24
+ GenerationRequest,
25
+ GenerationResponse,
26
+ GenerativeRequestStats,
27
+ RequestInfo,
28
+ RequestTimings,
29
+ StandardBaseModel,
30
+ StatusBreakdown,
31
+ )
32
+
33
+ __all__ = [
34
+ "GenerativeBenchmarkAccumulator",
35
+ "GenerativeBenchmarkTimings",
36
+ "GenerativeMetricsAccumulator",
37
+ "GenerativeRequestsAccumulator",
38
+ "RunningMetricStats",
39
+ "SchedulerMetricsAccumulator",
40
+ ]
41
+
42
+
43
class GenerativeBenchmarkTimings(StandardBaseModel):
    """
    Tracks timing phases and transitions during benchmark execution.

    Monitors timestamps throughout benchmark execution including request submission,
    measurement period boundaries (warmup/active/cooldown), and completion events.
    Provides duration calculations and phase status determination based on configured
    warmup and cooldown periods.

    All timestamp fields start as ``None`` and are advanced monotonically by
    :meth:`update_estimate` as scheduler updates arrive.
    """

    request_start: float | None = Field(
        description="Timestamp when the first request was sent", default=None
    )
    measure_start: float | None = Field(
        description="Timestamp when measurement period started", default=None
    )
    measure_end: float | None = Field(
        description="Timestamp when measurement period ended", default=None
    )
    request_end: float | None = Field(
        description="Timestamp when the last request was completed", default=None
    )
    current_update: float | None = Field(
        description="Most recent timestamp observed during execution", default=None
    )
    current_request: float | None = Field(
        description="Most recent request completion timestamp observed", default=None
    )
    last_update: float | None = Field(
        description="Previous timestamp observed before the current one", default=None
    )
    last_request: float | None = Field(
        description="Previous request completion timestamp before the current one",
        default=None,
    )

    @property
    def status(self) -> Literal["pending", "warmup", "active", "cooldown"]:
        """
        :return: Current execution phase based on timing thresholds
        """
        # No requests observed yet -> nothing has started.
        if self.request_start is None or self.current_update is None:
            return "pending"

        # Before the measurement window opens, everything counts as warmup.
        if self.measure_start is None or self.current_update <= self.measure_start:
            return "warmup"

        if self.measure_end is not None and self.current_update >= self.measure_end:
            return "cooldown"

        return "active"

    @property
    def duration(self) -> float:
        """
        :return: Elapsed time since measurement or request start in seconds
        """
        if self.request_start is None or self.current_update is None:
            return 0.0

        return self.current_update - self.request_start

    @property
    def elapsed_time_last_update(self) -> float:
        """
        :return: Time elapsed between the last two update timestamps in seconds
        """
        if self.current_update is None or self.last_update is None:
            return 0.0

        return self.current_update - self.last_update

    @property
    def elapsed_time_last_request(self) -> float:
        """
        :return: Time elapsed between the last two request completions in seconds
        """
        if self.current_request is None or self.last_request is None:
            return 0.0

        return self.current_request - self.last_request

    @property
    def finalized_request_start(self) -> float:
        """
        :return: Finalized timestamp from the current state for when requests started
        """
        # NOTE(review): `or` maps both None and a literal 0.0 timestamp to -1.0;
        # assumes real timestamps are always positive epoch seconds — confirm.
        return self.request_start or -1.0

    @property
    def finalized_measure_start(self) -> float:
        """
        :return: Finalized timestamp from the current state for when measurement started
        """
        return self.measure_start or self.finalized_request_start

    @property
    def finalized_measure_end(self) -> float:
        """
        :return: Finalized timestamp from the current state for when measurement ended
        """
        return self.measure_end or self.finalized_request_end

    @property
    def finalized_request_end(self) -> float:
        """
        :return: Finalized timestamp from the current state for when requests ended
        """
        return self.request_end or self.current_request or -1.0

    def update_estimate(
        self,
        info: RequestInfo,
        scheduler_state: SchedulerState,
        config: BenchmarkConfig,
    ):
        """
        Update timing estimates based on request info and scheduler state.

        Advances timing markers through benchmark phases (warmup to active to cooldown)
        based on configured thresholds. Updates current/last timestamps for updates and
        request completions, determining measurement period boundaries.

        :param info: Request information containing timing data
        :param scheduler_state: Current scheduler state with progress metrics
        :param config: Benchmark configuration with warmup/cooldown settings
        """
        # First update non terminal timestamps.
        # request_start is re-read from the scheduler on every call rather than
        # set once; the scheduler is treated as the source of truth for it.
        self.request_start = scheduler_state.start_requests_time
        self.last_update = self.current_update
        if (current_time := info.timings.last_reported) is not None:
            # max() keeps current_update monotonic even if updates arrive
            # out of order from concurrent workers.
            self.current_update = (
                current_time
                if self.current_update is None
                else max(self.current_update, current_time)
            )

        # Next update measurement period timestamps, if available and possible
        warmup_active, measure_start = config.warmup.compute_transition_time(
            info=info, state=scheduler_state, period="start"
        )
        if not warmup_active:
            # No warmup, set measure_start to first request start
            self.measure_start = self.request_start
        elif measure_start is not None:
            self.measure_start = measure_start
        cooldown_active, measure_end = config.cooldown.compute_transition_time(
            info=info, state=scheduler_state, period="end"
        )
        if cooldown_active and measure_end is not None:
            self.measure_end = measure_end

        # Update last request terminal timestamps, if request is terminal
        if info.status in {"completed", "errored", "cancelled"}:
            self.last_request = self.current_request
            if info.completed_at is not None and (
                self.current_request is None or info.completed_at > self.current_request
            ):
                self.current_request = info.completed_at

        # Finally, update request stop timestamps, if at that stage and available.
        # request_end is written at most once (guarded by `is None`).
        if scheduler_state.end_processing_time is not None and self.request_end is None:
            self.request_end = (
                scheduler_state.progress.stop_time
                or self.current_request
                or scheduler_state.end_processing_time
            )
            if self.measure_end is None:
                # No cooldown triggered, set measure_end to request_end
                self.measure_end = self.request_end
213
+
214
+
215
class RunningMetricStats(StandardBaseModel):
    """
    Incremental statistics for a metric stream, kept without raw samples.

    Stores only the sample count, running sum, time-weighted sum, total
    duration, and most recent value — enough to derive means, rates, and
    time-weighted averages on demand. Suited to real-time tracking over
    long-running benchmarks where retaining every sample would be wasteful.
    """

    count: int = Field(default=0, description="Number of samples accumulated")
    value_sum: float = Field(
        default=0.0, description="Total sum of accumulated values"
    )
    time_weighted_sum: float = Field(
        default=0.0, description="Time-weighted sum of accumulated values"
    )
    duration: float = Field(
        default=0.0, description="Total duration over which values were accumulated"
    )
    last_value: float | None = Field(
        default=None, description="Most recent value added to the accumulator"
    )

    @property
    def mean(self) -> float | None:
        """
        :return: Arithmetic mean of accumulated values, or None if no samples
        """
        return self.value_sum / self.count if self.count > 0 else None

    @property
    def time_weighted_mean(self) -> float | None:
        """
        :return: Time-weighted mean considering duration between samples, or None
        """
        return self.time_weighted_sum / self.duration if self.duration > 0.0 else None

    @property
    def rate_per_item(self) -> float | None:
        """
        :return: Average value per accumulated item, or None if no samples
        """
        return self.value_sum / self.count if self.count > 0 else None

    @property
    def rate_per_second(self) -> float | None:
        """
        :return: Average value per second of duration, or None if no duration
        """
        return self.value_sum / self.duration if self.duration > 0.0 else None

    def update_estimate(
        self,
        value: float | None,
        count: int = 1,
        duration: float | None = None,
        elapsed: float | None = None,
    ):
        """
        Fold a new metric observation into the running statistics.

        Adjusts count and value sums for the new observation, and advances the
        time-weighted total by crediting the previous value across the elapsed
        interval, capturing sustained metric behavior over time.

        :param value: New metric value to accumulate
        :param count: Number of occurrences this value represents
        :param duration: Total duration to set, overriding incremental elapsed updates
        :param elapsed: Time elapsed since last update for time-weighted calculations
        """
        self.count += count
        self.value_sum += (value or 0.0) * count

        # The previous value is the one that was "in effect" for the elapsed gap.
        if elapsed is not None:
            self.time_weighted_sum += (self.last_value or 0.0) * elapsed

        # An explicit duration overrides incremental accumulation entirely.
        if duration is not None:
            self.duration = duration
        else:
            self.duration += elapsed or 0.0
        self.last_value = value
305
+
306
+
307
class SchedulerMetricsAccumulator(StandardBaseModel):
    """
    Tracks scheduler-level timing and overhead metrics during execution.

    Monitors request lifecycle timing from queuing through completion, capturing delays
    at each stage: queue time, worker start delays, request processing time, and
    finalization overhead. Provides insight into scheduler efficiency and bottleneck
    identification in request orchestration.
    """

    # NOTE(review): the description lists "successful, incomplete, errored, total"
    # but the factory passes successful/errored/incomplete by keyword — confirm the
    # StatusBreakdown type-parameter order matches the description text.
    requests_made: StatusBreakdown[int, int, int, int] = Field(
        description="Request counts by status: successful, incomplete, errored, total",
        default_factory=lambda: StatusBreakdown[int, int, int, int](
            successful=0, errored=0, incomplete=0, total=0
        ),
    )
    # Timings flow:
    # Request scheduling: queued->dequeued->scheduled_at->resolve_start->
    # Request processing: request_start->*_iteration->request_end->
    # Request finalizing: resolve_end->finalized->accumulation update processed
    queued_time: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for time requests spent in the queue",
    )
    resolve_start_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description=(
            "Running stats for delay before worker begins resolving req after dequeue"
        ),
    )
    resolve_targeted_start_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description=(
            "Running stats for delay from targeted start to actual worker start"
        ),
    )
    request_start_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for delay after resolve til request start",
    )
    request_targeted_start_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description=(
            "Running stats for delay from targeted start to actual request start"
        ),
    )
    request_time: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for request processing time",
    )
    resolve_end_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for delay after request end till worker resolves",
    )
    resolve_time: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for time for worker to resolve requests",
    )
    finalized_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Running stats for delay after resolve til finalized in scheduler",
    )
    processed_delay: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description=(
            "Running stats for delay from finalized til request being "
            "processed by accumulation"
        ),
    )

    def update_estimate(
        self, scheduler_state: SchedulerState, stats: GenerativeRequestStats
    ):
        """
        Update scheduler metrics with completed request timing data.

        Extracts timing information from request statistics to update running metrics
        for each scheduler lifecycle stage. Validates that required timing markers are
        present before processing.

        :param scheduler_state: Current scheduler state with request counts
        :param stats: Completed request statistics with detailed timing information
        :raises ValueError: If required timing markers are missing
        """
        # Update request counts (overwritten from the scheduler, not incremented,
        # so repeated calls stay consistent with scheduler-side totals)
        self.requests_made.successful = scheduler_state.successful_requests
        self.requests_made.errored = scheduler_state.errored_requests
        self.requests_made.incomplete = scheduler_state.cancelled_requests
        self.requests_made.total = (
            scheduler_state.successful_requests
            + scheduler_state.errored_requests
            + scheduler_state.cancelled_requests
        )

        # All requests must have queued, dequeued, resolve_end, and finalized timings
        timings: RequestTimings = stats.info.timings
        if any(
            timing is None
            for timing in [
                timings.queued,
                timings.dequeued,
                timings.resolve_end,
                timings.finalized,
            ]
        ):
            raise ValueError(
                "Required timings 'queued', 'dequeued', 'resolve_end', and "
                "'finalized' must not be None"
            )

        # Store validated non-None timings for type safety
        queued: float = timings.queued  # type: ignore[assignment]
        dequeued: float = timings.dequeued  # type: ignore[assignment]
        resolve_end: float = timings.resolve_end  # type: ignore[assignment]
        finalized: float = timings.finalized  # type: ignore[assignment]

        # Update timing metrics in occurrence order; optional markers are only
        # folded in when both endpoints of the interval were recorded
        self.queued_time.update_estimate(value=dequeued - queued)

        if timings.scheduled_at is not None and timings.resolve_start is not None:
            self.resolve_start_delay.update_estimate(
                value=timings.resolve_start - timings.scheduled_at
            )

        if timings.targeted_start is not None and timings.resolve_start is not None:
            self.resolve_targeted_start_delay.update_estimate(
                value=timings.resolve_start - timings.targeted_start
            )

        if timings.resolve_start is not None and timings.request_start is not None:
            self.request_start_delay.update_estimate(
                value=timings.request_start - timings.resolve_start
            )

        if timings.targeted_start is not None and timings.request_start is not None:
            self.request_targeted_start_delay.update_estimate(
                value=timings.request_start - timings.targeted_start
            )

        if timings.request_start is not None and timings.request_end is not None:
            self.request_time.update_estimate(
                value=timings.request_end - timings.request_start
            )

        if timings.request_end is not None:
            self.resolve_end_delay.update_estimate(
                value=resolve_end - timings.request_end
            )

        if timings.resolve_start is not None:
            self.resolve_time.update_estimate(value=resolve_end - timings.resolve_start)

        self.finalized_delay.update_estimate(value=finalized - resolve_end)
        # processed_delay measures wall-clock lag between scheduler finalization
        # and this accumulator observing the request
        self.processed_delay.update_estimate(value=time.time() - finalized)
461
+
462
+
463
class GenerativeMetricsAccumulator(StandardBaseModel):
    """
    Running performance metrics for generative workloads.

    Holds one :class:`RunningMetricStats` per tracked metric — request counts
    and latency, input/output/total token counts, time-to-first-token,
    inter-token latency, and streaming iteration patterns — updated
    incrementally as requests complete.
    """

    requests: RunningMetricStats = Field(
        description="Accumulated request count statistics",
        default_factory=RunningMetricStats,
    )
    request_latency: RunningMetricStats = Field(
        description="Accumulated request latency statistics",
        default_factory=RunningMetricStats,
    )
    prompt_tokens: RunningMetricStats = Field(
        description="Accumulated input token count statistics",
        default_factory=RunningMetricStats,
    )
    output_tokens: RunningMetricStats = Field(
        description="Accumulated output token count statistics",
        default_factory=RunningMetricStats,
    )
    total_tokens: RunningMetricStats = Field(
        description="Accumulated total token count statistics",
        default_factory=RunningMetricStats,
    )
    time_to_first_token_ms: RunningMetricStats = Field(
        description="Accumulated time to first token statistics in milliseconds",
        default_factory=RunningMetricStats,
    )
    time_per_output_token_ms: RunningMetricStats = Field(
        description="Accumulated time per output token statistics in milliseconds",
        default_factory=RunningMetricStats,
    )
    inter_token_latency_ms: RunningMetricStats = Field(
        description="Accumulated inter-token latency statistics in milliseconds",
        default_factory=RunningMetricStats,
    )
    streaming_iterations: RunningMetricStats = Field(
        description="Accumulated streaming iteration count statistics",
        default_factory=RunningMetricStats,
    )
    output_tokens_by_iteration: RunningMetricStats = Field(
        description="Accumulated output tokens per iteration statistics",
        default_factory=RunningMetricStats,
    )
    iter_tokens_by_iteration: RunningMetricStats = Field(
        description="Accumulated iteration tokens per iteration statistics",
        default_factory=RunningMetricStats,
    )

    def update_estimate(self, stats: GenerativeRequestStats, duration: float):
        """
        Fold a completed request's statistics into the running accumulators.

        Incorporates token counts, latency measurements, and streaming
        characteristics from the request with time-weighted calculations.

        :param stats: Request statistics containing token and latency measurements
        :param duration: Current benchmark duration for time-weighted metrics
        """
        # Interval-style metrics are weighted by gaps between events, so their
        # counts use (n or 1) - 1: n-1 gaps for n events, 0 for none.
        output_token_count = int(stats.output_tokens or 0)
        output_token_gaps = int((stats.output_tokens or 1) - 1)
        iteration_count = int(stats.token_iterations or 0)
        iteration_gaps = int((stats.token_iterations or 1) - 1)

        self.requests.update_estimate(1.0, duration=duration)
        self.request_latency.update_estimate(stats.request_latency, duration=duration)
        self.prompt_tokens.update_estimate(stats.prompt_tokens, duration=duration)
        self.output_tokens.update_estimate(stats.output_tokens, duration=duration)
        self.total_tokens.update_estimate(stats.total_tokens, duration=duration)
        self.time_to_first_token_ms.update_estimate(
            stats.time_to_first_token_ms, duration=duration
        )
        self.time_per_output_token_ms.update_estimate(
            stats.time_per_output_token_ms,
            count=output_token_count,
            duration=duration,
        )
        self.inter_token_latency_ms.update_estimate(
            stats.inter_token_latency_ms,
            count=output_token_gaps,
            duration=duration,
        )
        self.streaming_iterations.update_estimate(
            stats.token_iterations, duration=duration
        )
        self.output_tokens_by_iteration.update_estimate(
            stats.output_tokens_per_iteration,
            count=iteration_count,
            duration=duration,
        )
        self.iter_tokens_by_iteration.update_estimate(
            stats.iter_tokens_per_iteration,
            count=iteration_gaps,
            duration=duration,
        )
560
+
561
+
562
+ class GenerativeRequestsAccumulator(StandardBaseModel):
563
+ """
564
+ Manages request statistics collection with optional reservoir sampling.
565
+
566
+ Collects detailed request statistics while optionally sampling to limit memory usage
567
+ in long-running benchmarks. Supports configurable sampling rates and selective data
568
+ retention (clearing request arguments and/or outputs for non-sampled requests).
569
+ """
570
+
571
+ sample_requests: int | None = Field(
572
+ default=None,
573
+ description=(
574
+ "Number of requests to sample and keep in the final benchmark for metrics"
575
+ ),
576
+ )
577
+ requests_stats: list[GenerativeRequestStats] = Field(
578
+ description="List of generative request statistics", default_factory=list
579
+ )
580
+ samples: list[int] | None = Field(
581
+ description="Indices of sampled generative requests", default=None
582
+ )
583
+ clear_nonsampled_request_args: bool = Field(
584
+ default=True,
585
+ description=(
586
+ "Whether to clear request arguments and outputs for non-sampled requests"
587
+ ),
588
+ )
589
+ clear_nonsampled_outputs: bool = Field(
590
+ default=True,
591
+ description=(
592
+ "Whether to clear outputs for non-sampled requests while keeping args"
593
+ ),
594
+ )
595
+
596
+ def get_sampled(self) -> list[GenerativeRequestStats]:
597
+ """
598
+ Retrieve the list of sampled request statistics.
599
+
600
+ :return: List of sampled generative request statistics
601
+ """
602
+ if self.samples is None:
603
+ return self.requests_stats
604
+
605
+ return [self.requests_stats[ind] for ind in self.samples]
606
+
607
+ def get_within_range(
608
+ self, start_time: float, end_time: float
609
+ ) -> list[GenerativeRequestStats]:
610
+ """
611
+ Retrieve request statistics within a specified time range.
612
+
613
+ :param start_time: Start timestamp for filtering (requests must end after this)
614
+ :param end_time: End timestamp for filtering (requests must start before this)
615
+ :return: List of request statistics within the time range
616
+ """
617
+ return [
618
+ stats
619
+ for stats in self.requests_stats
620
+ if (stats.request_end_time >= start_time)
621
+ and (
622
+ (
623
+ stats.request_start_time is not None
624
+ and stats.request_start_time <= end_time
625
+ )
626
+ or (
627
+ stats.request_start_time is None
628
+ and stats.request_end_time <= end_time
629
+ )
630
+ )
631
+ ]
632
+
633
    def update_estimate(
        self,
        response: GenerationResponse | None,
        request: GenerationRequest | MultiTurnRequestT[GenerationRequest],
        info: RequestInfo,
        prefer_response_metrics: bool,
    ) -> GenerativeRequestStats:
        """
        Record request statistics and apply reservoir sampling if configured.

        Compiles statistics from the completed request and adds to the collection.
        Uses reservoir sampling algorithm to maintain uniform sample distribution when
        enabled, clearing non-sampled request data to manage memory.

        :param response: Generation response containing output and metrics
        :param request: Original generation request with input data
        :param info: Request execution information and timing
        :param prefer_response_metrics: Whether to prefer metrics from response
        :return: Compiled request statistics
        """
        stats = self.compile_stats(response, request, info, prefer_response_metrics)

        # Index the new stats will occupy after the append below.
        current_index = len(self.requests_stats)
        self.requests_stats.append(stats)

        if self.sample_requests is None:
            # Keeping all requests, don't need to sample
            self.samples = None
        elif self.sample_requests <= 0:
            # Not keeping any requests, clear out unnecessary memory usage for current
            self.clear_stats_data(stats)
        elif self.sample_requests >= len(self.requests_stats):
            # Add directly to samples, haven't filled yet
            if self.samples is None:
                self.samples = []
            self.samples.append(current_index)
        elif self.sample_requests / len(self.requests_stats) >= random.random():
            # Sampling logic: choose to replace with decreasing probability s / n
            # where s is sample size, n is current number of requests.
            # If chosen, choose random existing sample to replace.
            # P(new item in samples) = s / n
            # P(prev item in samples) = P(item was in samples) * P(not replaced)
            # P(prev item in samples) =
            #     P(before replacement) * P(new item selected) * P(chosen from samples)
            # P(prev item in samples) = (s / (n - 1)) * (s / n) * (1 / s) = s / n
            # P(prev item in samples) = P(new item in samples)
            if self.samples is None:
                self.samples = []
            # NOTE(review): if samples could ever be empty here, randrange(0) would
            # raise ValueError; in practice the fill branch above runs first for any
            # sample_requests > 0, so the guard just before appears unreachable —
            # confirm before relying on it.
            replace_index = random.randrange(len(self.samples))
            self.clear_stats_data(self.samples[replace_index])
            self.samples[replace_index] = current_index

        # NOTE(review): when the reservoir draw rejects the new request, its stats
        # remain in requests_stats with args/outputs intact (clear_stats_data is
        # only applied to the replaced sample) — verify this retention is intended.
        return stats
686
+
687
def clear_stats_data(self, stats: GenerativeRequestStats | int):
    """
    Null out memory-heavy payload fields on a request's statistics.

    Which fields are cleared is controlled by the accumulator's
    ``clear_nonsampled_request_args`` and ``clear_nonsampled_outputs`` flags.

    :param stats: The stats object itself, or its index within requests_stats
    """
    target = self.requests_stats[stats] if isinstance(stats, int) else stats

    if self.clear_nonsampled_request_args:
        target.request_args = None
    if self.clear_nonsampled_outputs:
        target.output = None
@classmethod
def compile_stats(
    cls,
    response: GenerationResponse | None,
    request: GenerationRequest | MultiTurnRequestT[GenerationRequest],
    info: RequestInfo,
    prefer_response_metrics: bool,
) -> GenerativeRequestStats:
    """
    Compile statistics from request, response, and execution info.

    :param response: Generation response with output and metrics, or None
    :param request: Original generation request with input data
    :param info: Request execution information and timing
    :param prefer_response_metrics: Whether to prefer metrics from response
    :return: Compiled generative request statistics
    """
    # Multi-turn requests are sequences; arguments come from the first turn,
    # which may itself be wrapped in a tuple.
    primary_request: GenerationRequest
    if isinstance(request, GenerationRequest):
        primary_request = request
    else:
        head = request[0]
        primary_request = head[0] if isinstance(head, tuple) else head

    # Synthesize an empty response when the request never produced one, so
    # stats compilation has a single uniform code path.
    resolved_response = response
    if resolved_response is None:
        resolved_response = GenerationResponse(
            request_id=info.request_id,
            request_args=str(primary_request.arguments),
        )

    return resolved_response.compile_stats(
        request=primary_request,
        info=info,
        prefer_response=prefer_response_metrics,
    )
+
736
class GenerativeBenchmarkAccumulator(
    BenchmarkAccumulator[GenerationRequest, GenerationResponse]
):
    """
    Primary accumulator for generative benchmark execution metrics and statistics.

    Orchestrates real-time metric collection across timing, scheduler, concurrency, and
    generative performance dimensions. Maintains separate accumulators for completed,
    errored, and incomplete requests while tracking overall metrics. Integrates with
    scheduler state to monitor warmup/cooldown phases and compute time-weighted
    statistics for throughput and latency analysis.
    """

    # Benchmark phase timing (updated on every request via update_estimate)
    timings: GenerativeBenchmarkTimings = Field(
        default_factory=GenerativeBenchmarkTimings,
        description="Timing phases and transitions during benchmark execution",
    )
    # Per-status request collections (each applies its own reservoir sampling)
    completed: GenerativeRequestsAccumulator = Field(
        default_factory=GenerativeRequestsAccumulator,
        description="Accumulator for completed requests",
    )
    errored: GenerativeRequestsAccumulator = Field(
        default_factory=GenerativeRequestsAccumulator,
        description="Accumulator for errored requests",
    )
    incomplete: GenerativeRequestsAccumulator = Field(
        default_factory=GenerativeRequestsAccumulator,
        description="Accumulator for incomplete requests",
    )
    # Aggregate scheduler and concurrency tracking across all requests
    scheduler_metrics: SchedulerMetricsAccumulator = Field(
        default_factory=SchedulerMetricsAccumulator,
        description="Running metrics for scheduler state",
    )
    concurrency_metric: RunningMetricStats = Field(
        default_factory=RunningMetricStats,
        description="Accumulated request concurrency statistics",
    )
    # Running generative metrics: overall plus one accumulator per status
    total_metrics: GenerativeMetricsAccumulator = Field(
        default_factory=GenerativeMetricsAccumulator,
        description="Running metrics for all requests",
    )
    completed_metrics: GenerativeMetricsAccumulator = Field(
        default_factory=GenerativeMetricsAccumulator,
        description="Running metrics for completed requests",
    )
    errored_metrics: GenerativeMetricsAccumulator = Field(
        default_factory=GenerativeMetricsAccumulator,
        description="Running metrics for errored requests",
    )
    incomplete_metrics: GenerativeMetricsAccumulator = Field(
        default_factory=GenerativeMetricsAccumulator,
        description="Running metrics for incomplete requests",
    )
    def update_estimate(
        self,
        response: GenerationResponse | None,
        request: GenerationRequest | MultiTurnRequestT[GenerationRequest],
        info: RequestInfo,
        scheduler_state: SchedulerState,
    ):
        """
        Update all benchmark metrics with a completed request.

        Processes request completion by updating timing phases, concurrency metrics,
        scheduler statistics, and generative performance metrics. Routes request to
        appropriate status-specific accumulator (completed/errored/incomplete) and
        updates aggregate totals. Cancelled requests that never started are ignored.

        :param response: Generation response with output and metrics, or None
        :param request: Original generation request with input data
        :param info: Request execution information and timing
        :param scheduler_state: Current scheduler state for phase tracking
        """
        # Advance timing phases first so duration/elapsed reflect this update
        self.timings.update_estimate(info, scheduler_state, self.config)
        duration = self.timings.duration
        elapsed_time_last_update = self.timings.elapsed_time_last_update
        # Time-weighted concurrency: weight the in-flight request count by the
        # time elapsed since the previous update
        self.concurrency_metric.update_estimate(
            value=scheduler_state.processing_requests,
            duration=duration,
            elapsed=elapsed_time_last_update,
        )

        requests_accumulator: GenerativeRequestsAccumulator
        metrics_accumulator: GenerativeMetricsAccumulator

        # Route to the status-specific accumulator pair
        if info.status == "completed":
            requests_accumulator = self.completed
            metrics_accumulator = self.completed_metrics
        elif info.status == "errored":
            requests_accumulator = self.errored
            metrics_accumulator = self.errored_metrics
        elif info.status == "cancelled" and info.timings.resolve_start is not None:
            # Cancelled after resolution began: count as incomplete work
            requests_accumulator = self.incomplete
            metrics_accumulator = self.incomplete_metrics
        else:
            # Not a terminal status or cancelled before starting
            # Do not include in requests or metrics
            return

        # Compile stats once via the status accumulator, then fold the same
        # stats into the per-status, total, and scheduler metric aggregates
        stats = requests_accumulator.update_estimate(
            response, request, info, self.config.prefer_response_metrics
        )
        metrics_accumulator.update_estimate(stats, duration)
        self.total_metrics.update_estimate(stats, duration)
        self.scheduler_metrics.update_estimate(scheduler_state, stats)