guidellm 0.3.1__py3-none-any.whl → 0.6.0a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guidellm/__init__.py +5 -2
- guidellm/__main__.py +524 -255
- guidellm/backends/__init__.py +33 -0
- guidellm/backends/backend.py +109 -0
- guidellm/backends/openai.py +340 -0
- guidellm/backends/response_handlers.py +428 -0
- guidellm/benchmark/__init__.py +69 -39
- guidellm/benchmark/benchmarker.py +160 -316
- guidellm/benchmark/entrypoints.py +560 -127
- guidellm/benchmark/outputs/__init__.py +24 -0
- guidellm/benchmark/outputs/console.py +633 -0
- guidellm/benchmark/outputs/csv.py +721 -0
- guidellm/benchmark/outputs/html.py +473 -0
- guidellm/benchmark/outputs/output.py +169 -0
- guidellm/benchmark/outputs/serialized.py +69 -0
- guidellm/benchmark/profiles.py +718 -0
- guidellm/benchmark/progress.py +553 -556
- guidellm/benchmark/scenarios/__init__.py +40 -0
- guidellm/benchmark/scenarios/chat.json +6 -0
- guidellm/benchmark/scenarios/rag.json +6 -0
- guidellm/benchmark/schemas/__init__.py +66 -0
- guidellm/benchmark/schemas/base.py +402 -0
- guidellm/benchmark/schemas/generative/__init__.py +55 -0
- guidellm/benchmark/schemas/generative/accumulator.py +841 -0
- guidellm/benchmark/schemas/generative/benchmark.py +163 -0
- guidellm/benchmark/schemas/generative/entrypoints.py +381 -0
- guidellm/benchmark/schemas/generative/metrics.py +927 -0
- guidellm/benchmark/schemas/generative/report.py +158 -0
- guidellm/data/__init__.py +34 -4
- guidellm/data/builders.py +541 -0
- guidellm/data/collators.py +16 -0
- guidellm/data/config.py +120 -0
- guidellm/data/deserializers/__init__.py +49 -0
- guidellm/data/deserializers/deserializer.py +141 -0
- guidellm/data/deserializers/file.py +223 -0
- guidellm/data/deserializers/huggingface.py +94 -0
- guidellm/data/deserializers/memory.py +194 -0
- guidellm/data/deserializers/synthetic.py +246 -0
- guidellm/data/entrypoints.py +52 -0
- guidellm/data/loaders.py +190 -0
- guidellm/data/preprocessors/__init__.py +27 -0
- guidellm/data/preprocessors/formatters.py +410 -0
- guidellm/data/preprocessors/mappers.py +196 -0
- guidellm/data/preprocessors/preprocessor.py +30 -0
- guidellm/data/processor.py +29 -0
- guidellm/data/schemas.py +175 -0
- guidellm/data/utils/__init__.py +6 -0
- guidellm/data/utils/dataset.py +94 -0
- guidellm/extras/__init__.py +4 -0
- guidellm/extras/audio.py +220 -0
- guidellm/extras/vision.py +242 -0
- guidellm/logger.py +2 -2
- guidellm/mock_server/__init__.py +8 -0
- guidellm/mock_server/config.py +84 -0
- guidellm/mock_server/handlers/__init__.py +17 -0
- guidellm/mock_server/handlers/chat_completions.py +280 -0
- guidellm/mock_server/handlers/completions.py +280 -0
- guidellm/mock_server/handlers/tokenizer.py +142 -0
- guidellm/mock_server/models.py +510 -0
- guidellm/mock_server/server.py +238 -0
- guidellm/mock_server/utils.py +302 -0
- guidellm/scheduler/__init__.py +69 -26
- guidellm/scheduler/constraints/__init__.py +49 -0
- guidellm/scheduler/constraints/constraint.py +325 -0
- guidellm/scheduler/constraints/error.py +411 -0
- guidellm/scheduler/constraints/factory.py +182 -0
- guidellm/scheduler/constraints/request.py +312 -0
- guidellm/scheduler/constraints/saturation.py +722 -0
- guidellm/scheduler/environments.py +252 -0
- guidellm/scheduler/scheduler.py +137 -368
- guidellm/scheduler/schemas.py +358 -0
- guidellm/scheduler/strategies.py +617 -0
- guidellm/scheduler/worker.py +413 -419
- guidellm/scheduler/worker_group.py +712 -0
- guidellm/schemas/__init__.py +65 -0
- guidellm/schemas/base.py +417 -0
- guidellm/schemas/info.py +188 -0
- guidellm/schemas/request.py +235 -0
- guidellm/schemas/request_stats.py +349 -0
- guidellm/schemas/response.py +124 -0
- guidellm/schemas/statistics.py +1018 -0
- guidellm/{config.py → settings.py} +31 -24
- guidellm/utils/__init__.py +71 -8
- guidellm/utils/auto_importer.py +98 -0
- guidellm/utils/cli.py +132 -5
- guidellm/utils/console.py +566 -0
- guidellm/utils/encoding.py +778 -0
- guidellm/utils/functions.py +159 -0
- guidellm/utils/hf_datasets.py +1 -2
- guidellm/utils/hf_transformers.py +4 -4
- guidellm/utils/imports.py +9 -0
- guidellm/utils/messaging.py +1118 -0
- guidellm/utils/mixins.py +115 -0
- guidellm/utils/random.py +3 -4
- guidellm/utils/registry.py +220 -0
- guidellm/utils/singleton.py +133 -0
- guidellm/utils/synchronous.py +159 -0
- guidellm/utils/text.py +163 -50
- guidellm/utils/typing.py +41 -0
- guidellm/version.py +2 -2
- guidellm-0.6.0a5.dist-info/METADATA +364 -0
- guidellm-0.6.0a5.dist-info/RECORD +109 -0
- guidellm/backend/__init__.py +0 -23
- guidellm/backend/backend.py +0 -259
- guidellm/backend/openai.py +0 -708
- guidellm/backend/response.py +0 -136
- guidellm/benchmark/aggregator.py +0 -760
- guidellm/benchmark/benchmark.py +0 -837
- guidellm/benchmark/output.py +0 -997
- guidellm/benchmark/profile.py +0 -409
- guidellm/benchmark/scenario.py +0 -104
- guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm/dataset/__init__.py +0 -22
- guidellm/dataset/creator.py +0 -213
- guidellm/dataset/entrypoints.py +0 -42
- guidellm/dataset/file.py +0 -92
- guidellm/dataset/hf_datasets.py +0 -62
- guidellm/dataset/in_memory.py +0 -132
- guidellm/dataset/synthetic.py +0 -287
- guidellm/objects/__init__.py +0 -18
- guidellm/objects/pydantic.py +0 -89
- guidellm/objects/statistics.py +0 -953
- guidellm/preprocess/__init__.py +0 -3
- guidellm/preprocess/dataset.py +0 -374
- guidellm/presentation/__init__.py +0 -28
- guidellm/presentation/builder.py +0 -27
- guidellm/presentation/data_models.py +0 -232
- guidellm/presentation/injector.py +0 -66
- guidellm/request/__init__.py +0 -18
- guidellm/request/loader.py +0 -284
- guidellm/request/request.py +0 -79
- guidellm/request/types.py +0 -10
- guidellm/scheduler/queues.py +0 -25
- guidellm/scheduler/result.py +0 -155
- guidellm/scheduler/strategy.py +0 -495
- guidellm-0.3.1.dist-info/METADATA +0 -329
- guidellm-0.3.1.dist-info/RECORD +0 -62
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/WHEEL +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/entry_points.txt +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/licenses/LICENSE +0 -0
- {guidellm-0.3.1.dist-info → guidellm-0.6.0a5.dist-info}/top_level.txt +0 -0
guidellm/scheduler/constraints/saturation.py
@@ -0,0 +1,722 @@
"""
Over-saturation detection constraint implementation.

This module implements the Over-Saturation Detection (OSD) algorithm for detecting
when a model becomes over-saturated during benchmarking. Over-saturation occurs when
the response rate doesn't keep up with the request rate, leading to degraded
performance.

Algorithm Overview:
-------------------
The OSD algorithm uses statistical slope detection to identify over-saturation:

1. **Slope Detection**: The algorithm tracks two key metrics over time:
   - Concurrent requests: Number of requests being processed simultaneously
   - Time-to-first-token (TTFT): Latency for the first token of each response

2. **Statistical Analysis**: For each metric, the algorithm:
   - Maintains a sliding window of recent data points
   - Calculates the linear regression slope using online statistics
   - Computes the margin of error (MOE) using t-distribution confidence intervals
   - Detects positive slopes with low MOE, indicating degradation

3. **Detection Criteria**: Over-saturation is detected when:
   - Both concurrent requests and TTFT show statistically significant positive slopes
   - The minimum duration threshold has been met
   - Sufficient data points are available for reliable slope estimation

4. **Window Management**: The algorithm maintains bounded memory by:
   - Limiting window size by time (maximum_window_seconds)
   - Limiting window size by ratio of total requests (maximum_window_ratio)
   - Automatically pruning old data points

5. **Constraint Integration**: When over-saturation is detected, the constraint:
   - Stops request queuing to prevent further degradation
   - Stops processing of existing requests (if enabled)
   - Provides detailed metadata about detection state

Key Parameters:
---------------
- minimum_duration: Minimum seconds before checking for over-saturation (default: 30.0)
- minimum_ttft: Minimum TTFT threshold for violation counting (default: 2.5)
- maximum_window_seconds: Maximum time window for data retention (default: 120.0)
- moe_threshold: Margin of error threshold for slope detection (default: 2.0)
- maximum_window_ratio: Maximum window size as ratio of total requests (default: 0.75)
- minimum_window_size: Minimum data points required for slope estimation (default: 5)
- confidence: Statistical confidence level for t-distribution (default: 0.95)

The constraint integrates with the scheduler by evaluating each request update and
providing scheduler actions (continue/stop) based on the current over-saturation state.
"""

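# --- Illustrative sketch (editorial note, not part of the released file) ---
# The "Detection Criteria" above reduce to the same two-part test applied to each
# tracked metric. Given a fitted regression slope and its relative margin of
# error (both produced by SlopeChecker below):
#
#     metric_degrading = (slope > 0) and (margin_of_error < moe_threshold)
#
# Over-saturation is flagged only after minimum_duration seconds, once the
# concurrent-request window holds at least minimum_window_size points, and when
# the test holds for the concurrent series (the TTFT series joins the test once
# its own window is large enough).
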
from __future__ import annotations

import math
import time
from typing import Any, Literal

from pydantic import Field

from guidellm.scheduler.constraints.constraint import (
    Constraint,
    PydanticConstraintInitializer,
)
from guidellm.scheduler.constraints.factory import ConstraintsInitializerFactory
from guidellm.scheduler.schemas import (
    SchedulerState,
    SchedulerUpdateAction,
)
from guidellm.schemas import RequestInfo

__all__ = [
    "OverSaturationConstraint",
    "OverSaturationConstraintInitializer",
    "SlopeChecker",
    "approx_t_ppf",
]


def approx_t_ppf(p: float, df: float) -> float:
    """
    Approximate the percent point function (PPF) for the t-distribution.

    Provides a fast approximation of the t-distribution PPF using numerical
    methods from Abramowitz & Stegun. This function is significantly faster
    than scipy.stats.t.ppf while providing sufficient accuracy for statistical
    slope detection in over-saturation detection. Used internally by SlopeChecker
    for calculating confidence intervals and margin of error.

    Reference:
        Milton Abramowitz and Irene A. Stegun (Eds.). (1965).
        Handbook of Mathematical Functions: with Formulas, Graphs,
        and Mathematical Tables. Dover Publications.

        An electronic version of this book is available at:
        https://personal.math.ubc.ca/~cbm/aands/.

    :param p: The probability value (e.g., 0.975 for a 95% confidence interval)
    :param df: The degrees of freedom for the t-distribution
    :return: Approximate t-distribution PPF value, or NaN if df <= 0
    """
    dof = df
    if dof <= 0:
        return float("nan")

    # 1. Approximate the PPF of the Normal distribution (z-score)
    # Uses Abramowitz & Stegun formula 26.2.23.
    c = [2.515517, 0.802853, 0.010328]
    d = [1.432788, 0.189269, 0.001308]

    numerical_stability_threshold = 0.5
    if p < numerical_stability_threshold:
        t = math.sqrt(-2.0 * math.log(p))
        z = -(
            t
            - ((c[2] * t + c[1]) * t + c[0])
            / (((d[2] * t + d[1]) * t + d[0]) * t + 1.0)
        )
    else:
        t = math.sqrt(-2.0 * math.log(1.0 - p))
        z = t - ((c[2] * t + c[1]) * t + c[0]) / (
            ((d[2] * t + d[1]) * t + d[0]) * t + 1.0
        )

    # 2. Convert the z-score to a t-score
    # Uses the Cornish-Fisher expansion (first few terms).
    z2 = z * z
    z3 = z2 * z
    z4 = z3 * z

    g1 = (z3 + z) / 4.0
    g2 = (5.0 * z4 + 16.0 * z3 + 3.0 * z2) / 96.0

    # Adjust z using the degrees of freedom (dof)
    return z + g1 / dof + g2 / (dof * dof)

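# --- Illustrative check (editorial note, not part of the released file) ---
# For large degrees of freedom the t-distribution approaches the standard
# normal, so the approximation should converge to the familiar 97.5% z-score:
#
#     approx_t_ppf(0.975, 1_000_000)  # ~1.96
#     approx_t_ppf(0.975, 0)          # nan (invalid degrees of freedom)
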
class SlopeChecker:
    """
    Helper class for online slope detection using linear regression.

    Maintains running statistics for efficient O(1) updates and provides
    statistical slope detection with margin of error calculation. Uses online
    algorithms to compute linear regression statistics incrementally without
    storing all data points, enabling memory-efficient slope detection for
    over-saturation detection. Supports adding and removing data points
    dynamically while maintaining accurate statistical measures.

    Example:
    ::
        checker = SlopeChecker(moe_threshold=2.0, confidence=0.95)
        checker.add_data_point(1.0, 2.0)
        checker.add_data_point(2.0, 3.0)
        checker.add_data_point(3.0, 4.0)
        is_positive = checker.check_slope(3.0)  # True for positive slope
    """

    def __init__(
        self, moe_threshold: float = 1.0, confidence: float = 0.95, eps: float = 1e-12
    ) -> None:
        """
        Initialize slope checker with statistical parameters.

        :param moe_threshold: Maximum margin of error threshold for slope detection
        :param confidence: Statistical confidence level for t-distribution (0-1)
        :param eps: Epsilon value for numerical stability in calculations
        """
        self.n = 0
        self.sum_x = 0.0
        self.sum_y = 0.0
        self.sum_xy = 0.0
        self.sum_x2 = 0.0
        self.sum_y2 = 0.0
        self.moe_threshold = moe_threshold
        self.eps = eps
        self.confidence = confidence
        self.slope: float | None = None
        self.margin_of_error: float | None = None

    def add_data_point(self, x_new: float, y_new: float) -> None:
        """
        Integrate a new data point into the accumulated statistics.

        Updates running sums for linear regression calculation in O(1) time.
        The data point is incorporated into the statistical model without
        storing the individual value, enabling memory-efficient slope detection.

        :param x_new: The new x-coordinate (typically time or duration)
        :param y_new: The new y-coordinate (typically metric value like TTFT
            or concurrent requests)
        """
        self.n += 1
        self.sum_x += x_new
        self.sum_y += y_new
        self.sum_xy += x_new * y_new
        self.sum_x2 += x_new**2
        self.sum_y2 += y_new**2

    def remove_data_point(self, x_old: float, y_old: float) -> None:
        """
        Remove a data point from the accumulated statistics.

        Updates running sums by subtracting the specified data point in O(1) time.
        Used for window management when pruning old data points to maintain
        bounded memory usage while preserving statistical accuracy.

        :param x_old: The x-coordinate to remove (typically time or duration)
        :param y_old: The y-coordinate to remove (typically metric value)
        """
        self.n -= 1
        self.sum_x -= x_old
        self.sum_y -= y_old
        self.sum_xy -= x_old * y_old
        self.sum_x2 -= x_old**2
        self.sum_y2 -= y_old**2

    def check_slope(self, effective_n: float) -> bool:
        """
        Check if there is a statistically significant positive slope.

        Calculates linear regression slope and margin of error using online
        statistics. Returns True if the slope is positive and the margin of
        error is below the threshold, indicating statistically significant
        degradation. Updates internal slope and margin_of_error attributes
        for external inspection.

        :param effective_n: Effective sample size for slope estimation (may differ
            from actual n for correlation adjustment)
        :return: True if positive slope detected with margin of error below threshold
        """
        minimal_n_for_slope_estimation = 3
        if effective_n < minimal_n_for_slope_estimation:
            return False

        # Calculate sums of squares and cross-products
        # These formulas are numerically stable for online calculation.
        centered_sum_xx = self.sum_x2 - (self.sum_x**2) / self.n
        centered_sum_xy = self.sum_xy - (self.sum_x * self.sum_y) / self.n
        centered_sum_yy = self.sum_y2 - (self.sum_y**2) / self.n

        # Safeguard against division by zero for SS_xx
        centered_sum_xx_safe = max(centered_sum_xx, self.eps)

        slope = centered_sum_xy / centered_sum_xx_safe

        # Calculate Residual Sum of Squares (RSS)
        # This is a direct calculation using the sums of squares.
        residual_sum_of_squares = centered_sum_yy - (
            centered_sum_xy**2 / centered_sum_xx_safe
        )

        # Ensure RSS is non-negative due to potential floating point inaccuracies
        residual_sum_of_squares = max(residual_sum_of_squares, 0.0)

        # Degrees of freedom for standard error (n - 2 for simple linear regression)
        dof = effective_n - 2

        residual_variance = residual_sum_of_squares / dof
        standard_error = (residual_variance / centered_sum_xx_safe) ** 0.5

        # t-critical value
        alpha = 1 - self.confidence
        t_crit = approx_t_ppf(1 - alpha / 2, df=dof)

        # Margin Of Error
        margin_of_error = t_crit * standard_error / max(slope, self.eps)

        self.slope = slope
        self.margin_of_error = margin_of_error
        return (slope > 0) and (margin_of_error < self.moe_threshold)


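# --- Illustrative sketch (editorial note, not part of the released file) ---
# On perfectly collinear data the residual sum of squares is zero, so the margin
# of error is zero and any positive slope is significant; a flat series fails
# the slope > 0 check:
#
#     rising = SlopeChecker(moe_threshold=2.0)
#     for i in range(5):
#         rising.add_data_point(float(i), 2.0 * i)  # y = 2x -> slope 2, MOE 0
#     assert rising.check_slope(5) is True
#
#     flat = SlopeChecker(moe_threshold=2.0)
#     for i in range(5):
#         flat.add_data_point(float(i), 1.0)  # constant y -> slope 0
#     assert flat.check_slope(5) is False

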
class OverSaturationConstraint(Constraint):
    """
    Constraint that detects and stops execution when over-saturation is detected.

    This constraint implements the Over-Saturation Detection (OSD) algorithm to
    identify when a model becomes over-saturated (response rate doesn't keep up with
    request rate). When over-saturation is detected, the constraint stops request
    queuing and optionally stops processing of existing requests.

    The constraint maintains internal state for tracking concurrent requests and
    time-to-first-token (TTFT) metrics, using statistical slope detection to identify
    performance degradation patterns.
    """

    def __init__(
        self,
        minimum_duration: float = 30.0,
        minimum_ttft: float = 2.5,
        maximum_window_seconds: float = 120.0,
        moe_threshold: float = 2.0,
        maximum_window_ratio: float = 0.75,
        minimum_window_size: int = 5,
        confidence: float = 0.95,
        eps: float = 1e-12,
        enabled: bool = True,
    ) -> None:  # noqa: PLR0913
        """
        Initialize the over-saturation constraint.

        Creates a new constraint instance with specified detection parameters.
        The constraint will track concurrent requests and TTFT metrics, using
        statistical slope detection to identify when the model becomes
        over-saturated. All parameters have sensible defaults suitable for
        most benchmarking scenarios.

        :param minimum_duration: Minimum seconds before checking for over-saturation
            (default: 30.0)
        :param minimum_ttft: Minimum TTFT threshold in seconds for violation counting
            (default: 2.5)
        :param maximum_window_seconds: Maximum time window in seconds for data retention
            (default: 120.0)
        :param moe_threshold: Margin of error threshold for slope detection
            (default: 2.0)
        :param maximum_window_ratio: Maximum window size as ratio of total requests
            (default: 0.75)
        :param minimum_window_size: Minimum data points required for slope estimation
            (default: 5)
        :param confidence: Statistical confidence level for t-distribution (0-1)
            (default: 0.95)
        :param eps: Epsilon for numerical stability in calculations
            (default: 1e-12)
        :param enabled: Whether to actually stop when over-saturation is detected
            (default: True)
        """
        self.minimum_duration = minimum_duration
        self.minimum_ttft = minimum_ttft
        self.maximum_window_seconds = maximum_window_seconds
        self.maximum_window_ratio = maximum_window_ratio
        self.minimum_window_size = minimum_window_size
        self.moe_threshold = moe_threshold
        self.confidence = confidence
        self.eps = eps
        self.enabled = enabled
        self.reset()

    @property
    def info(self) -> dict[str, Any]:
        """
        Get current constraint configuration and state information.
        :return: Dictionary containing configuration parameters.
        """

        return {
            "type_": "over_saturation",
            "minimum_duration": self.minimum_duration,
            "minimum_ttft": self.minimum_ttft,
            "maximum_window_seconds": self.maximum_window_seconds,
            "maximum_window_ratio": self.maximum_window_ratio,
            "minimum_window_size": self.minimum_window_size,
            "moe_threshold": self.moe_threshold,
            "confidence": self.confidence,
            "enabled": self.enabled,
        }

    def reset(self) -> None:
        """
        Reset all internal state to initial values.

        Clears all tracked requests, resets counters, and reinitializes slope
        checkers. Useful for reusing constraint instances across multiple
        benchmark runs or resetting state after configuration changes.
        """
        self.duration = 0.0
        self.started_requests: list[dict[str, Any]] = []
        self.finished_requests: list[dict[str, Any]] = []
        self.ttft_violations_counter = 0
        self.total_finished_ever = 0
        self.total_started_ever = 0
        self.concurrent_slope_checker = SlopeChecker(
            moe_threshold=self.moe_threshold, confidence=self.confidence, eps=self.eps
        )
        self.ttft_slope_checker = SlopeChecker(
            moe_threshold=self.moe_threshold, confidence=self.confidence, eps=self.eps
        )

    def _add_finished(self, request: dict[str, Any]) -> None:
        """
        Add a finished request to tracking.

        :param request: Dictionary containing request data with 'ttft' and
            'duration' keys.
        """
        ttft = request["ttft"]
        duration = request["duration"]
        if ttft is not None:
            self.total_finished_ever += 1
            self.finished_requests.append(request)
            if ttft > self.minimum_ttft:
                self.ttft_violations_counter += 1
            self.ttft_slope_checker.add_data_point(duration, ttft)

    def _remove_finished(self, request: dict[str, Any]) -> None:
        """
        Remove a finished request from tracking.

        :param request: Dictionary containing request data with 'ttft' and
            'duration' keys.
        """
        del self.finished_requests[0]
        ttft = request["ttft"]
        duration = request["duration"]
        if ttft > self.minimum_ttft:
            self.ttft_violations_counter -= 1
        self.ttft_slope_checker.remove_data_point(duration, ttft)

    def _add_started(self, request: dict[str, Any]) -> None:
        """
        Add a started request to tracking.

        :param request: Dictionary containing request data with
            'concurrent_requests' and 'duration' keys.
        """
        concurrent = request["concurrent_requests"]
        duration = request["duration"]
        if concurrent is not None:
            self.total_started_ever += 1
            self.started_requests.append(request)
            self.concurrent_slope_checker.add_data_point(duration, concurrent)

    def _remove_started(self, request: dict[str, Any]) -> None:
        """
        Remove a started request from tracking.

        :param request: Dictionary containing request data with
            'concurrent_requests' and 'duration' keys.
        """
        del self.started_requests[0]
        concurrent = request["concurrent_requests"]
        duration = request["duration"]
        self.concurrent_slope_checker.remove_data_point(duration, concurrent)

    def _update_duration(self, duration: float) -> None:
        """
        Update duration and prune old data points.

        Updates the current duration and removes data points that exceed the maximum
        window size (by ratio or time) to maintain bounded memory usage.

        :param duration: Current duration in seconds since benchmark start.
        """
        self.duration = duration

        maximum_finished_window_size = int(
            self.total_finished_ever * self.maximum_window_ratio
        )
        while len(self.finished_requests) > maximum_finished_window_size:
            self._remove_finished(self.finished_requests[0])

        while (len(self.finished_requests) > 0) and (
            (
                time_since_earliest_request := duration
                - self.finished_requests[0]["duration"]
            )
            > self.maximum_window_seconds
        ):
            self._remove_finished(self.finished_requests[0])

        maximum_started_window_size = int(
            self.total_started_ever * self.maximum_window_ratio
        )
        while len(self.started_requests) > maximum_started_window_size:
            self._remove_started(self.started_requests[0])

        while (len(self.started_requests) > 0) and (
            (
                time_since_earliest_request := duration  # noqa: F841
                - self.started_requests[0]["duration"]
            )
            > self.maximum_window_seconds
        ):
            self._remove_started(self.started_requests[0])

    def _check_alert(self) -> bool:
        """
        Check if over-saturation is currently detected.

        :return: True if over-saturation is detected, False otherwise.
        """
        # Use duration as the maximum n value since requests from the
        # same second are highly correlated, this is simple and good enough
        # given that the MOE has a custom threshold anyway.
        concurrent_n = min(self.duration, self.concurrent_slope_checker.n)
        ttft_n = min(self.duration, self.ttft_slope_checker.n)

        if (
            (self.duration < self.minimum_duration)
            or (self.ttft_slope_checker.n > self.ttft_violations_counter * 2)
            or (self.duration < self.minimum_ttft)
            or (concurrent_n < self.minimum_window_size)
        ):
            return False

        is_concurrent_slope_positive = self.concurrent_slope_checker.check_slope(
            concurrent_n
        )

        if ttft_n < self.minimum_window_size:
            return is_concurrent_slope_positive

        is_ttft_slope_positive = self.ttft_slope_checker.check_slope(ttft_n)

        return is_concurrent_slope_positive and is_ttft_slope_positive

    def __call__(
        self, state: SchedulerState, request_info: RequestInfo
    ) -> SchedulerUpdateAction:
        """
        Evaluate constraint against current scheduler state.

        :param state: Current scheduler state.
        :param request_info: Individual request information.
        :return: Action indicating whether to continue or stop operations.
        """
        duration = time.time() - state.start_time

        if request_info.status == "in_progress":
            concurrent_requests = state.processing_requests
            self._add_started(
                {"concurrent_requests": concurrent_requests, "duration": duration}
            )
        elif (
            request_info.status == "completed"
            and request_info.timings
            and request_info.timings.first_token_iteration
            and request_info.timings.request_start
        ):
            ttft = (
                request_info.timings.first_token_iteration
                - request_info.timings.request_start
            )
            self._add_finished({"ttft": ttft, "duration": duration})

        self._update_duration(duration)
        is_over_saturated = self._check_alert()

        ttft_slope = self.ttft_slope_checker.slope
        ttft_slope_moe = self.ttft_slope_checker.margin_of_error
        ttft_n = self.ttft_slope_checker.n
        ttft_violations = self.ttft_violations_counter
        concurrent_slope = self.concurrent_slope_checker.slope
        concurrent_slope_moe = self.concurrent_slope_checker.margin_of_error
        concurrent_n = self.concurrent_slope_checker.n

        should_stop = is_over_saturated and self.enabled
        return SchedulerUpdateAction(
            request_queuing="stop" if should_stop else "continue",
            request_processing="stop_all" if should_stop else "continue",
            metadata={
                "ttft_slope": ttft_slope,
                "ttft_slope_moe": ttft_slope_moe,
                "ttft_n": ttft_n,
                "ttft_violations": ttft_violations,
                "concurrent_slope": concurrent_slope,
                "concurrent_slope_moe": concurrent_slope_moe,
                "concurrent_n": concurrent_n,
                "is_over_saturated": is_over_saturated,
            },
        )


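# --- Illustrative sketch (editorial note, not part of the released file) ---
# Per request update, the scheduler evaluates the constraint and acts on the
# returned SchedulerUpdateAction; the field names below come directly from
# __call__ above (constructing real SchedulerState / RequestInfo objects is
# the scheduler's job):
#
#     action = constraint(state, request_info)
#     if action.request_queuing == "stop":
#         ...  # stop queuing; request_processing will be "stop_all" as well
#     saturated = action.metadata["is_over_saturated"]

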
@ConstraintsInitializerFactory.register(  # type: ignore[arg-type]
    ["over_saturation", "detect_saturation"]
)
class OverSaturationConstraintInitializer(PydanticConstraintInitializer):
    """
    Factory for creating OverSaturationConstraint instances from configuration.

    Provides a Pydantic-based initializer for over-saturation detection constraints
    with support for flexible configuration patterns. Supports detailed configuration
    dictionaries, enabling easy integration with CLI arguments, configuration files,
    and programmatic constraint creation.

    Example:
    ::
        # Configuration with defaults
        initializer = OverSaturationConstraintInitializer(enabled=True)
        constraint = initializer.create_constraint()

        # Detailed configuration
        initializer = OverSaturationConstraintInitializer(
            enabled=True,
            min_seconds=60.0,
            max_window_seconds=300.0,
            moe_threshold=1.5
        )
        constraint = initializer.create_constraint()

    :cvar type_: Always "over_saturation" to identify this constraint type
    :cvar enabled: Whether to stop the benchmark if over-saturation is detected
    :cvar min_seconds: Minimum seconds before checking for over-saturation
    :cvar max_window_seconds: Maximum time window for data retention
    :cvar moe_threshold: Margin of error threshold for slope detection
    :cvar minimum_ttft: Minimum TTFT threshold for violation counting
    :cvar maximum_window_ratio: Maximum window size as ratio of total requests
    :cvar minimum_window_size: Minimum data points required for slope estimation
    :cvar confidence: Statistical confidence level for t-distribution
    """

    type_: Literal["over_saturation"] = "over_saturation"  # type: ignore[assignment]
    enabled: bool = Field(
        default=True,
        description="Whether to stop the benchmark if the model is over-saturated",
    )
    min_seconds: int | float = Field(
        default=30.0,
        ge=0,
        description="Minimum seconds before checking for over-saturation",
    )
    max_window_seconds: int | float = Field(
        default=120.0,
        ge=0,
        description="Maximum over-saturation checking window size in seconds",
    )
    moe_threshold: float = Field(
        default=2.0,
        ge=0,
        description="Margin of error threshold for slope detection",
    )
    minimum_ttft: float = Field(
        default=2.5,
        ge=0,
        description="Minimum TTFT threshold for violation counting",
    )
    maximum_window_ratio: float = Field(
        default=0.75,
        ge=0,
        le=1.0,
        description="Maximum window size as ratio of total requests",
    )
    minimum_window_size: int = Field(
        default=5,
        ge=0,
        description="Minimum data points required for slope estimation",
    )
    confidence: float = Field(
        default=0.95,
        ge=0,
        le=1.0,
        description="Statistical confidence level for t-distribution",
    )

    def create_constraint(self, **_kwargs) -> Constraint:
        """
        Create an OverSaturationConstraint instance from this initializer.

        Constructs a new OverSaturationConstraint with the configuration parameters
        specified in this initializer. The constraint will be ready for evaluation
        against scheduler state and requests.

        :param _kwargs: Additional keyword arguments (unused)
        :return: Configured OverSaturationConstraint instance ready for use
        """
        return OverSaturationConstraint(
            minimum_duration=self.min_seconds,
            minimum_ttft=self.minimum_ttft,
            maximum_window_seconds=self.max_window_seconds,
            moe_threshold=self.moe_threshold,
            maximum_window_ratio=self.maximum_window_ratio,
            minimum_window_size=self.minimum_window_size,
            confidence=self.confidence,
            enabled=self.enabled,
        )

    @classmethod
    def validated_kwargs(
        cls, over_saturation: dict[str, Any] | None = None, **kwargs
    ) -> dict[str, Any]:
        """
        Validate and process arguments for OverSaturationConstraint creation.

        Processes flexible input formats to create validated constraint
        configuration. Supports dictionary inputs for detailed configuration, and
        alias parameters for compatibility. Handles parameter normalization and
        default value application.

        :param over_saturation: Dictionary with configuration parameters
            (min_seconds, max_window_seconds, etc.)
        :param kwargs: Additional keyword arguments supporting aliases like
            "detect_saturation" for compatibility, or unpacked dict values when
            dict is passed to factory
        :return: Validated dictionary with constraint configuration ready for
            initializer creation
        """
        # Check for aliases in kwargs
        aliases = ["over_saturation", "detect_saturation"]
        result: dict[str, Any] | None = over_saturation

        for alias in aliases:
            alias_value = kwargs.get(alias)
            if alias_value is not None:
                result = alias_value
                break

        # If over_saturation is None but kwargs contain constraint parameters,
        # treat kwargs as an unpacked dict (happens when dict is passed to factory)
        if result is None and kwargs:
            constraint_keys = {
                "enabled",
                "min_seconds",
                "max_window_seconds",
                "moe_threshold",
                "minimum_ttft",
                "maximum_window_ratio",
                "minimum_window_size",
                "confidence",
            }
            if any(key in kwargs for key in constraint_keys):
                # Reconstruct dict from kwargs
                result = {key: kwargs[key] for key in constraint_keys if key in kwargs}

        if result is None:
            return {"enabled": False}

        if isinstance(result, dict):
            # Return dict as-is, defaults come from fields above
            return result
        else:
            # Type signature only accepts dict or None, so this should never happen
            raise TypeError(
                f"over_saturation must be a dict or None, got {type(result).__name__}"
            )
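
The hunk above is the new `guidellm/scheduler/constraints/saturation.py` from the file list at the top. As a rough usage sketch, assuming only the APIs defined in this file: the initializer can be driven programmatically as below. How the factory resolves the `over_saturation`/`detect_saturation` aliases end to end lives in `guidellm/scheduler/constraints/factory.py`, which is not shown in this hunk.

```python
from guidellm.scheduler.constraints.saturation import (
    OverSaturationConstraintInitializer,
)

# validated_kwargs normalizes flexible inputs; a dict is returned as-is and
# missing fields fall back to the Field defaults declared on the initializer.
kwargs = OverSaturationConstraintInitializer.validated_kwargs(
    over_saturation={"enabled": True, "min_seconds": 60.0, "moe_threshold": 1.5}
)
initializer = OverSaturationConstraintInitializer(**kwargs)
constraint = initializer.create_constraint()

# info reflects the mapped parameter names (min_seconds -> minimum_duration).
print(constraint.info["minimum_duration"])  # 60.0
```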