guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +524 -255
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +109 -0
  5. guidellm/backends/openai.py +340 -0
  6. guidellm/backends/response_handlers.py +428 -0
  7. guidellm/benchmark/__init__.py +69 -39
  8. guidellm/benchmark/benchmarker.py +160 -316
  9. guidellm/benchmark/entrypoints.py +560 -127
  10. guidellm/benchmark/outputs/__init__.py +24 -0
  11. guidellm/benchmark/outputs/console.py +633 -0
  12. guidellm/benchmark/outputs/csv.py +721 -0
  13. guidellm/benchmark/outputs/html.py +473 -0
  14. guidellm/benchmark/outputs/output.py +169 -0
  15. guidellm/benchmark/outputs/serialized.py +69 -0
  16. guidellm/benchmark/profiles.py +718 -0
  17. guidellm/benchmark/progress.py +553 -556
  18. guidellm/benchmark/scenarios/__init__.py +40 -0
  19. guidellm/benchmark/scenarios/chat.json +6 -0
  20. guidellm/benchmark/scenarios/rag.json +6 -0
  21. guidellm/benchmark/schemas/__init__.py +66 -0
  22. guidellm/benchmark/schemas/base.py +402 -0
  23. guidellm/benchmark/schemas/generative/__init__.py +55 -0
  24. guidellm/benchmark/schemas/generative/accumulator.py +841 -0
  25. guidellm/benchmark/schemas/generative/benchmark.py +163 -0
  26. guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
  27. guidellm/benchmark/schemas/generative/metrics.py +927 -0
  28. guidellm/benchmark/schemas/generative/report.py +158 -0
  29. guidellm/data/__init__.py +34 -4
  30. guidellm/data/builders.py +541 -0
  31. guidellm/data/collators.py +16 -0
  32. guidellm/data/config.py +120 -0
  33. guidellm/data/deserializers/__init__.py +49 -0
  34. guidellm/data/deserializers/deserializer.py +141 -0
  35. guidellm/data/deserializers/file.py +223 -0
  36. guidellm/data/deserializers/huggingface.py +94 -0
  37. guidellm/data/deserializers/memory.py +194 -0
  38. guidellm/data/deserializers/synthetic.py +246 -0
  39. guidellm/data/entrypoints.py +52 -0
  40. guidellm/data/loaders.py +190 -0
  41. guidellm/data/preprocessors/__init__.py +27 -0
  42. guidellm/data/preprocessors/formatters.py +410 -0
  43. guidellm/data/preprocessors/mappers.py +196 -0
  44. guidellm/data/preprocessors/preprocessor.py +30 -0
  45. guidellm/data/processor.py +29 -0
  46. guidellm/data/schemas.py +175 -0
  47. guidellm/data/utils/__init__.py +6 -0
  48. guidellm/data/utils/dataset.py +94 -0
  49. guidellm/extras/__init__.py +4 -0
  50. guidellm/extras/audio.py +220 -0
  51. guidellm/extras/vision.py +242 -0
  52. guidellm/logger.py +2 -2
  53. guidellm/mock_server/__init__.py +8 -0
  54. guidellm/mock_server/config.py +84 -0
  55. guidellm/mock_server/handlers/__init__.py +17 -0
  56. guidellm/mock_server/handlers/chat_completions.py +280 -0
  57. guidellm/mock_server/handlers/completions.py +280 -0
  58. guidellm/mock_server/handlers/tokenizer.py +142 -0
  59. guidellm/mock_server/models.py +510 -0
  60. guidellm/mock_server/server.py +238 -0
  61. guidellm/mock_server/utils.py +302 -0
  62. guidellm/scheduler/__init__.py +69 -26
  63. guidellm/scheduler/constraints/__init__.py +49 -0
  64. guidellm/scheduler/constraints/constraint.py +325 -0
  65. guidellm/scheduler/constraints/error.py +411 -0
  66. guidellm/scheduler/constraints/factory.py +182 -0
  67. guidellm/scheduler/constraints/request.py +312 -0
  68. guidellm/scheduler/constraints/saturation.py +722 -0
  69. guidellm/scheduler/environments.py +252 -0
  70. guidellm/scheduler/scheduler.py +137 -368
  71. guidellm/scheduler/schemas.py +358 -0
  72. guidellm/scheduler/strategies.py +617 -0
  73. guidellm/scheduler/worker.py +413 -419
  74. guidellm/scheduler/worker_group.py +712 -0
  75. guidellm/schemas/__init__.py +65 -0
  76. guidellm/schemas/base.py +417 -0
  77. guidellm/schemas/info.py +188 -0
  78. guidellm/schemas/request.py +235 -0
  79. guidellm/schemas/request_stats.py +349 -0
  80. guidellm/schemas/response.py +124 -0
  81. guidellm/schemas/statistics.py +1018 -0
  82. guidellm/{config.py → settings.py} +31 -24
  83. guidellm/utils/__init__.py +71 -8
  84. guidellm/utils/auto_importer.py +98 -0
  85. guidellm/utils/cli.py +132 -5
  86. guidellm/utils/console.py +566 -0
  87. guidellm/utils/encoding.py +778 -0
  88. guidellm/utils/functions.py +159 -0
  89. guidellm/utils/hf_datasets.py +1 -2
  90. guidellm/utils/hf_transformers.py +4 -4
  91. guidellm/utils/imports.py +9 -0
  92. guidellm/utils/messaging.py +1118 -0
  93. guidellm/utils/mixins.py +115 -0
  94. guidellm/utils/random.py +3 -4
  95. guidellm/utils/registry.py +220 -0
  96. guidellm/utils/singleton.py +133 -0
  97. guidellm/utils/synchronous.py +159 -0
  98. guidellm/utils/text.py +163 -50
  99. guidellm/utils/typing.py +41 -0
  100. guidellm/version.py +2 -2
  101. guidellm-0.6.0a5.dist-info/METADATA +364 -0
  102. guidellm-0.6.0a5.dist-info/RECORD +109 -0
  103. guidellm/backend/__init__.py +0 -23
  104. guidellm/backend/backend.py +0 -259
  105. guidellm/backend/openai.py +0 -708
  106. guidellm/backend/response.py +0 -136
  107. guidellm/benchmark/aggregator.py +0 -760
  108. guidellm/benchmark/benchmark.py +0 -837
  109. guidellm/benchmark/output.py +0 -997
  110. guidellm/benchmark/profile.py +0 -409
  111. guidellm/benchmark/scenario.py +0 -104
  112. guidellm/data/prideandprejudice.txt.gz +0 -0
  113. guidellm/dataset/__init__.py +0 -22
  114. guidellm/dataset/creator.py +0 -213
  115. guidellm/dataset/entrypoints.py +0 -42
  116. guidellm/dataset/file.py +0 -92
  117. guidellm/dataset/hf_datasets.py +0 -62
  118. guidellm/dataset/in_memory.py +0 -132
  119. guidellm/dataset/synthetic.py +0 -287
  120. guidellm/objects/__init__.py +0 -18
  121. guidellm/objects/pydantic.py +0 -89
  122. guidellm/objects/statistics.py +0 -953
  123. guidellm/preprocess/__init__.py +0 -3
  124. guidellm/preprocess/dataset.py +0 -374
  125. guidellm/presentation/__init__.py +0 -28
  126. guidellm/presentation/builder.py +0 -27
  127. guidellm/presentation/data_models.py +0 -232
  128. guidellm/presentation/injector.py +0 -66
  129. guidellm/request/__init__.py +0 -18
  130. guidellm/request/loader.py +0 -284
  131. guidellm/request/request.py +0 -79
  132. guidellm/request/types.py +0 -10
  133. guidellm/scheduler/queues.py +0 -25
  134. guidellm/scheduler/result.py +0 -155
  135. guidellm/scheduler/strategy.py +0 -495
  136. guidellm-0.3.1.dist-info/METADATA +0 -329
  137. guidellm-0.3.1.dist-info/RECORD +0 -62
  138. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
  139. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
  140. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
  141. {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,722 @@
1
+ """
2
+ Over-saturation detection constraint implementation.
3
+
4
+ This module implements the Over-Saturation Detection (OSD) algorithm for detecting
5
+ when a model becomes over-saturated during benchmarking. Over-saturation occurs when
6
+ the response rate doesn't keep up with the request rate, leading to degraded
7
+ performance.
8
+
9
+ Algorithm Overview:
10
+ -------------------
11
+ The OSD algorithm uses statistical slope detection to identify over-saturation:
12
+
13
+ 1. **Slope Detection**: The algorithm tracks two key metrics over time:
14
+ - Concurrent requests: Number of requests being processed simultaneously
15
+ - Time-to-first-token (TTFT): Latency for the first token of each response
16
+
17
+ 2. **Statistical Analysis**: For each metric, the algorithm:
18
+ - Maintains a sliding window of recent data points
19
+ - Calculates the linear regression slope using online statistics
20
+ - Computes the margin of error (MOE) using t-distribution confidence intervals
21
+ - Detects positive slopes with low MOE, indicating degradation
22
+
23
+ 3. **Detection Criteria**: Over-saturation is detected when:
24
+ - Concurrent requests show a statistically significant positive slope, and TTFT does as well once at least minimum_window_size TTFT samples are available
25
+ - The minimum duration threshold has been met
26
+ - Sufficient data points are available for reliable slope estimation
27
+
28
+ 4. **Window Management**: The algorithm maintains bounded memory by:
29
+ - Limiting window size by time (maximum_window_seconds)
30
+ - Limiting window size by ratio of total requests (maximum_window_ratio)
31
+ - Automatically pruning old data points
32
+
33
+ 5. **Constraint Integration**: When over-saturation is detected, the constraint:
34
+ - Stops request queuing to prevent further degradation
35
+ - Stops processing of existing requests (if enabled)
36
+ - Provides detailed metadata about detection state
37
+
38
+ Key Parameters:
39
+ ---------------
40
+ - minimum_duration: Minimum seconds before checking for over-saturation (default: 30.0)
41
+ - minimum_ttft: Minimum TTFT threshold for violation counting (default: 2.5)
42
+ - maximum_window_seconds: Maximum time window for data retention (default: 120.0)
43
+ - moe_threshold: Margin of error threshold for slope detection (default: 2.0)
44
+ - maximum_window_ratio: Maximum window size as ratio of total requests (default: 0.75)
45
+ - minimum_window_size: Minimum data points required for slope estimation (default: 5)
46
+ - confidence: Statistical confidence level for t-distribution (default: 0.95)
47
+
48
+ The constraint integrates with the scheduler by evaluating each request update and
49
+ providing scheduler actions (continue/stop) based on the current over-saturation state.
50
+ """
51
+
52
+ from __future__ import annotations
53
+
54
+ import math
55
+ import time
56
+ from typing import Any, Literal
57
+
58
+ from pydantic import Field
59
+
60
+ from guidellm.scheduler.constraints.constraint import (
61
+ Constraint,
62
+ PydanticConstraintInitializer,
63
+ )
64
+ from guidellm.scheduler.constraints.factory import ConstraintsInitializerFactory
65
+ from guidellm.scheduler.schemas import (
66
+ SchedulerState,
67
+ SchedulerUpdateAction,
68
+ )
69
+ from guidellm.schemas import RequestInfo
70
+
71
+ __all__ = [
72
+ "OverSaturationConstraint",
73
+ "OverSaturationConstraintInitializer",
74
+ "SlopeChecker",
75
+ "approx_t_ppf",
76
+ ]
77
+
78
+
79
def approx_t_ppf(p: float, df: float) -> float:
    """
    Approximate the percent point function (PPF) for the t-distribution.

    Works in two steps: a standard-normal quantile is obtained from the
    rational approximation in Abramowitz & Stegun formula 26.2.23, and that
    quantile is then corrected toward the t-distribution with the first two
    terms of the asymptotic expansion in formula 26.7.5. Used internally by
    SlopeChecker for calculating confidence intervals and margin of error.

    Reference:
        Milton Abramowitz and Irene A. Stegun (Eds.). (1965).
        Handbook of Mathematical Functions: with Formulas, Graphs,
        and Mathematical Tables. Dover Publications.

        An electronic version of this book is available at:
        https://personal.math.ubc.ca/~cbm/aands/.

    :param p: The probability value (e.g., 0.975 for a 95% confidence interval)
    :param df: The degrees of freedom for the t-distribution
    :return: Approximate t-distribution PPF value. NaN if df <= 0 or if p is
        NaN or outside [0, 1]; -inf for p == 0 and +inf for p == 1
    """
    dof = df
    if dof <= 0:
        return float("nan")

    # The quantile is undefined outside [0, 1] (this comparison is also False
    # for NaN) and diverges at the end points. Handling these explicitly avoids
    # a bare "math domain error" from math.log below.
    if not 0.0 <= p <= 1.0:
        return float("nan")
    if p == 0.0:
        return float("-inf")
    if p == 1.0:
        return float("inf")

    # 1. Approximate the PPF of the Normal distribution (z-score)
    #    Uses Abramowitz & Stegun formula 26.2.23, which is stated for the
    #    smaller tail probability; the sign is restored afterwards.
    c = (2.515517, 0.802853, 0.010328)
    d = (1.432788, 0.189269, 0.001308)

    lower_tail = p < 0.5
    tail = p if lower_tail else 1.0 - p
    t = math.sqrt(-2.0 * math.log(tail))
    magnitude = t - ((c[2] * t + c[1]) * t + c[0]) / (
        ((d[2] * t + d[1]) * t + d[0]) * t + 1.0
    )
    z = -magnitude if lower_tail else magnitude

    # 2. Convert the z-score to a t-score
    #    Abramowitz & Stegun formula 26.7.5:
    #        t ~ z + g1(z) / dof + g2(z) / dof^2
    #        g1(z) = (z^3 + z) / 4
    #        g2(z) = (5 z^5 + 16 z^3 + 3 z) / 96
    #    Both correction terms are odd in z, so ppf(p) == -ppf(1 - p).
    z2 = z * z
    z3 = z2 * z
    z5 = z3 * z2

    g1 = (z3 + z) / 4.0
    g2 = (5.0 * z5 + 16.0 * z3 + 3.0 * z) / 96.0

    # Adjust z using the degrees of freedom (dof)
    return z + g1 / dof + g2 / (dof * dof)
135
+
136
+
137
class SlopeChecker:
    """
    Helper class for online slope detection using linear regression.

    Keeps only running sums (n, sum of x, y, x*y, x^2, y^2), so adding or
    removing a data point is O(1) and no individual points are stored. From
    those sums it derives the least-squares slope of y on x and a relative
    margin of error based on a t-distribution critical value.

    Example:
    ::
        checker = SlopeChecker(moe_threshold=2.0, confidence=0.95)
        checker.add_data_point(1.0, 2.0)
        checker.add_data_point(2.0, 3.0)
        checker.add_data_point(3.0, 4.0)
        is_positive = checker.check_slope(3.0)  # True for positive slope
    """

    def __init__(
        self, moe_threshold: float = 1.0, confidence: float = 0.95, eps: float = 1e-12
    ) -> None:
        """
        Initialize slope checker with statistical parameters.

        :param moe_threshold: Maximum margin of error threshold for slope detection
        :param confidence: Statistical confidence level for t-distribution (0-1)
        :param eps: Epsilon value for numerical stability in calculations
        """
        self.n = 0
        self.sum_x = 0.0
        self.sum_y = 0.0
        self.sum_xy = 0.0
        self.sum_x2 = 0.0
        self.sum_y2 = 0.0
        self.moe_threshold = moe_threshold
        self.eps = eps
        self.confidence = confidence
        # Results of the most recent check_slope call that got past its
        # early-return guards; None until that happens.
        self.slope: float | None = None
        self.margin_of_error: float | None = None

    def add_data_point(self, x_new: float, y_new: float) -> None:
        """
        Integrate a new data point into the accumulated statistics.

        Updates the running sums in O(1) time; the point itself is not stored.

        :param x_new: The new x-coordinate (typically time or duration)
        :param y_new: The new y-coordinate (typically metric value like TTFT
            or concurrent requests)
        """
        self.n += 1
        self.sum_x += x_new
        self.sum_y += y_new
        self.sum_xy += x_new * y_new
        self.sum_x2 += x_new**2
        self.sum_y2 += y_new**2

    def remove_data_point(self, x_old: float, y_old: float) -> None:
        """
        Remove a data point from the accumulated statistics.

        Subtracts the point from the running sums in O(1) time. The caller is
        responsible for passing a point that was previously added.

        :param x_old: The x-coordinate to remove (typically time or duration)
        :param y_old: The y-coordinate to remove (typically metric value)
        """
        self.n -= 1
        if self.n == 0:
            # The window is empty again: snap the sums to exactly zero so that
            # floating-point residue from repeated add/subtract cycles cannot
            # leak into later slope estimates.
            self.sum_x = 0.0
            self.sum_y = 0.0
            self.sum_xy = 0.0
            self.sum_x2 = 0.0
            self.sum_y2 = 0.0
            return
        self.sum_x -= x_old
        self.sum_y -= y_old
        self.sum_xy -= x_old * y_old
        self.sum_x2 -= x_old**2
        self.sum_y2 -= y_old**2

    def check_slope(self, effective_n: float) -> bool:
        """
        Check if there is a statistically significant positive slope.

        Calculates the regression slope and a relative margin of error
        (critical t value times standard error, divided by the slope) from the
        running sums. On a full evaluation the results are stored in the
        ``slope`` and ``margin_of_error`` attributes; when the method returns
        early those attributes keep their previous values.

        :param effective_n: Effective sample size for slope estimation (may differ
            from actual n for correlation adjustment)
        :return: True if positive slope detected with margin of error below
            threshold; False when effective_n is below 3 or no data points
            are currently held
        """
        minimal_n_for_slope_estimation = 3
        # self.n is used as a divisor below, so an empty checker must bail out
        # here even if the caller passes a large effective_n.
        if effective_n < minimal_n_for_slope_estimation or self.n <= 0:
            return False

        # Centered sums of squares and cross-products, derived from the raw
        # running sums. This textbook form can lose precision when the values
        # are large relative to their spread, hence the clamps further down.
        centered_sum_xx = self.sum_x2 - (self.sum_x**2) / self.n
        centered_sum_xy = self.sum_xy - (self.sum_x * self.sum_y) / self.n
        centered_sum_yy = self.sum_y2 - (self.sum_y**2) / self.n

        # Safeguard against division by zero for SS_xx
        centered_sum_xx_safe = max(centered_sum_xx, self.eps)

        slope = centered_sum_xy / centered_sum_xx_safe

        # Residual Sum of Squares (RSS), computed directly from the centered sums
        residual_sum_of_squares = centered_sum_yy - (
            centered_sum_xy**2 / centered_sum_xx_safe
        )

        # Ensure RSS is non-negative due to potential floating point inaccuracies
        residual_sum_of_squares = max(residual_sum_of_squares, 0.0)

        # Degrees of freedom for standard error (n - 2 for simple linear
        # regression); at least 1 because effective_n >= 3 here.
        dof = effective_n - 2

        residual_variance = residual_sum_of_squares / dof
        standard_error = (residual_variance / centered_sum_xx_safe) ** 0.5

        # Two-sided t-critical value for the configured confidence level
        alpha = 1 - self.confidence
        t_crit = approx_t_ppf(1 - alpha / 2, df=dof)

        # Margin of error relative to the slope. A non-positive slope is
        # clamped to eps, which makes the ratio huge; such slopes are rejected
        # by the slope > 0 test below anyway.
        margin_of_error = t_crit * standard_error / max(slope, self.eps)

        self.slope = slope
        self.margin_of_error = margin_of_error
        return (slope > 0) and (margin_of_error < self.moe_threshold)
270
+
271
+
272
class OverSaturationConstraint(Constraint):
    """
    Constraint that stops a run once over-saturation is detected.

    Two sliding windows are kept: one of concurrency samples recorded for
    "in_progress" updates and one of time-to-first-token (TTFT) samples
    recorded for "completed" updates. Each window feeds a SlopeChecker; when
    the checks in ``_check_alert`` pass and the constraint is enabled, the
    returned action stops both request queuing and request processing.
    """

    def __init__(  # noqa: PLR0913
        self,
        minimum_duration: float = 30.0,
        minimum_ttft: float = 2.5,
        maximum_window_seconds: float = 120.0,
        moe_threshold: float = 2.0,
        maximum_window_ratio: float = 0.75,
        minimum_window_size: int = 5,
        confidence: float = 0.95,
        eps: float = 1e-12,
        enabled: bool = True,
    ) -> None:
        """
        Store the detection parameters and initialize tracking state.

        :param minimum_duration: Minimum seconds before checking for over-saturation
            (default: 30.0)
        :param minimum_ttft: Minimum TTFT threshold in seconds for violation counting
            (default: 2.5)
        :param maximum_window_seconds: Maximum time window in seconds for data retention
            (default: 120.0)
        :param moe_threshold: Margin of error threshold for slope detection
            (default: 2.0)
        :param maximum_window_ratio: Maximum window size as ratio of total requests
            (default: 0.75)
        :param minimum_window_size: Minimum data points required for slope estimation
            (default: 5)
        :param confidence: Statistical confidence level for t-distribution (0-1)
            (default: 0.95)
        :param eps: Epsilon for numerical stability in calculations
            (default: 1e-12)
        :param enabled: Whether to actually stop when over-saturation is detected
            (default: True)
        """
        # Timing thresholds
        self.minimum_duration = minimum_duration
        self.minimum_ttft = minimum_ttft
        # Window bounds
        self.maximum_window_seconds = maximum_window_seconds
        self.maximum_window_ratio = maximum_window_ratio
        self.minimum_window_size = minimum_window_size
        # Slope-checker settings
        self.moe_threshold = moe_threshold
        self.confidence = confidence
        self.eps = eps
        self.enabled = enabled
        self.reset()

    @property
    def info(self) -> dict[str, Any]:
        """
        Describe this constraint's configuration.

        :return: Dictionary with the constraint type tag and the configured
            parameters.
        """
        reported = (
            "minimum_duration",
            "minimum_ttft",
            "maximum_window_seconds",
            "maximum_window_ratio",
            "minimum_window_size",
            "moe_threshold",
            "confidence",
            "enabled",
        )
        return {
            "type_": "over_saturation",
            **{name: getattr(self, name) for name in reported},
        }

    def reset(self) -> None:
        """
        Return all tracking state to its initial values.

        Empties both sample windows, zeroes the counters and builds fresh
        slope checkers from the current configuration.
        """
        self.duration = 0.0
        self.started_requests: list[dict[str, Any]] = []
        self.finished_requests: list[dict[str, Any]] = []
        self.ttft_violations_counter = 0
        self.total_finished_ever = 0
        self.total_started_ever = 0
        checker_settings = {
            "moe_threshold": self.moe_threshold,
            "confidence": self.confidence,
            "eps": self.eps,
        }
        self.concurrent_slope_checker = SlopeChecker(**checker_settings)
        self.ttft_slope_checker = SlopeChecker(**checker_settings)

    def _add_finished(self, request: dict[str, Any]) -> None:
        """
        Record a TTFT sample for a finished request.

        Samples whose 'ttft' is None are ignored.

        :param request: Dictionary containing request data with 'ttft' and
            'duration' keys.
        """
        ttft, elapsed = request["ttft"], request["duration"]
        if ttft is None:
            return

        self.total_finished_ever += 1
        self.finished_requests.append(request)
        if ttft > self.minimum_ttft:
            self.ttft_violations_counter += 1
        self.ttft_slope_checker.add_data_point(elapsed, ttft)

    def _remove_finished(self, request: dict[str, Any]) -> None:
        """
        Drop the oldest TTFT sample and undo its contribution.

        The entry at the head of the window is deleted; ``request`` supplies
        the values that are subtracted from the counters and the slope checker.

        :param request: Dictionary containing request data with 'ttft' and
            'duration' keys.
        """
        del self.finished_requests[0]
        ttft, elapsed = request["ttft"], request["duration"]
        if ttft > self.minimum_ttft:
            self.ttft_violations_counter -= 1
        self.ttft_slope_checker.remove_data_point(elapsed, ttft)

    def _add_started(self, request: dict[str, Any]) -> None:
        """
        Record a concurrency sample for a started request.

        Samples whose 'concurrent_requests' is None are ignored.

        :param request: Dictionary containing request data with
            'concurrent_requests' and 'duration' keys.
        """
        in_flight, elapsed = request["concurrent_requests"], request["duration"]
        if in_flight is None:
            return

        self.total_started_ever += 1
        self.started_requests.append(request)
        self.concurrent_slope_checker.add_data_point(elapsed, in_flight)

    def _remove_started(self, request: dict[str, Any]) -> None:
        """
        Drop the oldest concurrency sample and undo its contribution.

        :param request: Dictionary containing request data with
            'concurrent_requests' and 'duration' keys.
        """
        del self.started_requests[0]
        in_flight, elapsed = request["concurrent_requests"], request["duration"]
        self.concurrent_slope_checker.remove_data_point(elapsed, in_flight)

    def _update_duration(self, duration: float) -> None:
        """
        Store the current duration and trim both sample windows.

        Each window is first cut down to at most
        ``int(total samples ever seen * maximum_window_ratio)`` entries, then
        any leading samples older than ``maximum_window_seconds`` relative to
        ``duration`` are dropped. Finished samples are trimmed before started
        samples.

        :param duration: Current duration in seconds since benchmark start.
        """
        self.duration = duration

        windows = (
            (self.finished_requests, self.total_finished_ever, self._remove_finished),
            (self.started_requests, self.total_started_ever, self._remove_started),
        )
        for window, total_seen, evict in windows:
            # Bound by share of everything observed so far.
            size_cap = int(total_seen * self.maximum_window_ratio)
            while len(window) > size_cap:
                evict(window[0])

            # Bound by age of the oldest remaining sample.
            while window and (
                duration - window[0]["duration"] > self.maximum_window_seconds
            ):
                evict(window[0])

    def _check_alert(self) -> bool:
        """
        Decide whether the tracked data currently indicates over-saturation.

        :return: True if over-saturation is detected, False otherwise.
        """
        # Use duration as the maximum n value since requests from the
        # same second are highly correlated, this is simple and good enough
        # given that the MOE has a custom threshold anyway.
        concurrent_n = min(self.duration, self.concurrent_slope_checker.n)
        ttft_n = min(self.duration, self.ttft_slope_checker.n)

        # Preconditions, evaluated in order; any one of them vetoes an alert.
        if self.duration < self.minimum_duration:
            return False
        if self.ttft_slope_checker.n > self.ttft_violations_counter * 2:
            # Fewer than half of the windowed TTFT samples exceed minimum_ttft.
            return False
        if self.duration < self.minimum_ttft:
            return False
        if concurrent_n < self.minimum_window_size:
            return False

        concurrency_rising = self.concurrent_slope_checker.check_slope(concurrent_n)

        # Without enough TTFT samples the concurrency trend decides alone.
        if ttft_n < self.minimum_window_size:
            return concurrency_rising

        # Always evaluate the TTFT checker so its slope attributes are refreshed.
        ttft_rising = self.ttft_slope_checker.check_slope(ttft_n)
        return concurrency_rising and ttft_rising

    def __call__(
        self, state: SchedulerState, request_info: RequestInfo
    ) -> SchedulerUpdateAction:
        """
        Evaluate constraint against current scheduler state.

        Records a sample for the incoming update, trims the windows, runs the
        detection check and reports the outcome together with diagnostics.

        :param state: Current scheduler state.
        :param request_info: Individual request information.
        :return: Action indicating whether to continue or stop operations.
        """
        duration = time.time() - state.start_time
        status = request_info.status

        if status == "in_progress":
            sample = {
                "concurrent_requests": state.processing_requests,
                "duration": duration,
            }
            self._add_started(sample)
        elif status == "completed":
            timings = request_info.timings
            # Truthiness checks: a sample is recorded only when both
            # timestamps are present and non-zero.
            if timings and timings.first_token_iteration and timings.request_start:
                ttft = timings.first_token_iteration - timings.request_start
                self._add_finished({"ttft": ttft, "duration": duration})

        self._update_duration(duration)
        is_over_saturated = self._check_alert()
        should_stop = is_over_saturated and self.enabled

        ttft_checker = self.ttft_slope_checker
        concurrent_checker = self.concurrent_slope_checker
        diagnostics = {
            "ttft_slope": ttft_checker.slope,
            "ttft_slope_moe": ttft_checker.margin_of_error,
            "ttft_n": ttft_checker.n,
            "ttft_violations": self.ttft_violations_counter,
            "concurrent_slope": concurrent_checker.slope,
            "concurrent_slope_moe": concurrent_checker.margin_of_error,
            "concurrent_n": concurrent_checker.n,
            "is_over_saturated": is_over_saturated,
        }

        if should_stop:
            queuing, processing = "stop", "stop_all"
        else:
            queuing, processing = "continue", "continue"

        return SchedulerUpdateAction(
            request_queuing=queuing,
            request_processing=processing,
            metadata=diagnostics,
        )
560
+
561
+
562
@ConstraintsInitializerFactory.register(  # type: ignore[arg-type]
    ["over_saturation", "detect_saturation"]
)
class OverSaturationConstraintInitializer(PydanticConstraintInitializer):
    """
    Pydantic-backed initializer that builds OverSaturationConstraint objects.

    Registered with the constraints factory under the keys "over_saturation"
    and "detect_saturation". The fields hold the detection settings, and
    ``create_constraint`` forwards them to OverSaturationConstraint.

    Example:
    ::
        # Configuration with defaults
        initializer = OverSaturationConstraintInitializer(enabled=True)
        constraint = initializer.create_constraint()

        # Detailed configuration
        initializer = OverSaturationConstraintInitializer(
            enabled=True,
            min_seconds=60.0,
            max_window_seconds=300.0,
            moe_threshold=1.5
        )
        constraint = initializer.create_constraint()

    :cvar type_: Always "over_saturation" to identify this constraint type
    :cvar enabled: Whether to stop the benchmark if over-saturation is detected
    :cvar min_seconds: Minimum seconds before checking for over-saturation
    :cvar max_window_seconds: Maximum time window for data retention
    :cvar moe_threshold: Margin of error threshold for slope detection
    :cvar minimum_ttft: Minimum TTFT threshold for violation counting
    :cvar maximum_window_ratio: Maximum window size as ratio of total requests
    :cvar minimum_window_size: Minimum data points required for slope estimation
    :cvar confidence: Statistical confidence level for t-distribution
    """

    type_: Literal["over_saturation"] = "over_saturation"  # type: ignore[assignment]
    enabled: bool = Field(
        default=True,
        description="Whether to stop the benchmark if the model is over-saturated",
    )
    min_seconds: int | float = Field(
        default=30.0,
        ge=0,
        description="Minimum seconds before checking for over-saturation",
    )
    max_window_seconds: int | float = Field(
        default=120.0,
        ge=0,
        description="Maximum over-saturation checking window size in seconds",
    )
    moe_threshold: float = Field(
        default=2.0,
        ge=0,
        description="Margin of error threshold for slope detection",
    )
    minimum_ttft: float = Field(
        default=2.5,
        ge=0,
        description="Minimum TTFT threshold for violation counting",
    )
    maximum_window_ratio: float = Field(
        default=0.75,
        ge=0,
        le=1.0,
        description="Maximum window size as ratio of total requests",
    )
    minimum_window_size: int = Field(
        default=5,
        ge=0,
        description="Minimum data points required for slope estimation",
    )
    confidence: float = Field(
        default=0.95,
        ge=0,
        le=1.0,
        description="Statistical confidence level for t-distribution",
    )

    def create_constraint(self, **_kwargs) -> Constraint:
        """
        Build an OverSaturationConstraint from this initializer's fields.

        ``min_seconds`` and ``max_window_seconds`` are passed on as
        ``minimum_duration`` and ``maximum_window_seconds``; the remaining
        fields keep their names. ``eps`` is left at the constraint's default.

        :param _kwargs: Additional keyword arguments (unused)
        :return: Configured OverSaturationConstraint instance ready for use
        """
        settings: dict[str, Any] = {
            "minimum_duration": self.min_seconds,
            "minimum_ttft": self.minimum_ttft,
            "maximum_window_seconds": self.max_window_seconds,
            "moe_threshold": self.moe_threshold,
            "maximum_window_ratio": self.maximum_window_ratio,
            "minimum_window_size": self.minimum_window_size,
            "confidence": self.confidence,
            "enabled": self.enabled,
        }
        return OverSaturationConstraint(**settings)

    @classmethod
    def validated_kwargs(
        cls, over_saturation: dict[str, Any] | None = None, **kwargs
    ) -> dict[str, Any]:
        """
        Normalize the supported input shapes into initializer kwargs.

        Resolution order:

        1. A non-None value found in ``kwargs`` under "over_saturation" or
           "detect_saturation" (checked in that order) replaces
           ``over_saturation``.
        2. Otherwise, if nothing was given but ``kwargs`` contains any of the
           initializer's field names, those entries are collected into a dict
           (happens when a dict is unpacked on its way through the factory).
        3. If there is still no value, ``{"enabled": False}`` is returned.

        :param over_saturation: Dictionary with configuration parameters
            (min_seconds, max_window_seconds, etc.)
        :param kwargs: Additional keyword arguments supporting aliases like
            "detect_saturation" for compatibility, or unpacked dict values when
            dict is passed to factory
        :return: Validated dictionary with constraint configuration ready for
            initializer creation
        :raises TypeError: If the resolved value is neither a dict nor None
        """
        result: dict[str, Any] | None = over_saturation

        # Step 1: the first alias carrying a non-None value wins.
        alias_override = next(
            (
                kwargs[alias]
                for alias in ("over_saturation", "detect_saturation")
                if kwargs.get(alias) is not None
            ),
            None,
        )
        if alias_override is not None:
            result = alias_override

        # Step 2: rebuild a config dict from loose field-named kwargs.
        if result is None:
            field_names = (
                "enabled",
                "min_seconds",
                "max_window_seconds",
                "moe_threshold",
                "minimum_ttft",
                "maximum_window_ratio",
                "minimum_window_size",
                "confidence",
            )
            collected = {
                name: kwargs[name] for name in field_names if name in kwargs
            }
            if collected:
                result = collected

        # Step 3: nothing supplied at all means the constraint is disabled.
        if result is None:
            return {"enabled": False}

        if not isinstance(result, dict):
            # Type signature only accepts dict or None, so this should never happen
            raise TypeError(
                f"over_saturation must be a dict or None, got {type(result).__name__}"
            )

        # Returned unchanged; omitted keys fall back to the field defaults.
        return result