liter_llm 1.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +239 -0
- data/ext/liter_llm_rb/extconf.rb +65 -0
- data/ext/liter_llm_rb/native/.cargo/config.toml +23 -0
- data/ext/liter_llm_rb/native/Cargo.lock +3713 -0
- data/ext/liter_llm_rb/native/Cargo.toml +32 -0
- data/ext/liter_llm_rb/native/build.rs +15 -0
- data/ext/liter_llm_rb/native/src/lib.rs +1079 -0
- data/lib/liter_llm.rb +8 -0
- data/sig/liter_llm.rbs +416 -0
- data/vendor/Cargo.toml +54 -0
- data/vendor/liter-llm/Cargo.toml +92 -0
- data/vendor/liter-llm/README.md +252 -0
- data/vendor/liter-llm/schemas/pricing.json +40 -0
- data/vendor/liter-llm/schemas/providers.json +1662 -0
- data/vendor/liter-llm/src/auth/azure_ad.rs +264 -0
- data/vendor/liter-llm/src/auth/bedrock_sts.rs +353 -0
- data/vendor/liter-llm/src/auth/mod.rs +68 -0
- data/vendor/liter-llm/src/auth/vertex_oauth.rs +353 -0
- data/vendor/liter-llm/src/client/config.rs +351 -0
- data/vendor/liter-llm/src/client/managed.rs +622 -0
- data/vendor/liter-llm/src/client/mod.rs +864 -0
- data/vendor/liter-llm/src/cost.rs +212 -0
- data/vendor/liter-llm/src/error.rs +190 -0
- data/vendor/liter-llm/src/http/eventstream.rs +860 -0
- data/vendor/liter-llm/src/http/mod.rs +12 -0
- data/vendor/liter-llm/src/http/request.rs +438 -0
- data/vendor/liter-llm/src/http/retry.rs +72 -0
- data/vendor/liter-llm/src/http/streaming.rs +289 -0
- data/vendor/liter-llm/src/lib.rs +37 -0
- data/vendor/liter-llm/src/provider/anthropic.rs +2250 -0
- data/vendor/liter-llm/src/provider/azure.rs +579 -0
- data/vendor/liter-llm/src/provider/bedrock.rs +1543 -0
- data/vendor/liter-llm/src/provider/cohere.rs +654 -0
- data/vendor/liter-llm/src/provider/custom.rs +404 -0
- data/vendor/liter-llm/src/provider/google_ai.rs +281 -0
- data/vendor/liter-llm/src/provider/mistral.rs +188 -0
- data/vendor/liter-llm/src/provider/mod.rs +616 -0
- data/vendor/liter-llm/src/provider/vertex.rs +1504 -0
- data/vendor/liter-llm/src/tests.rs +1425 -0
- data/vendor/liter-llm/src/tokenizer.rs +281 -0
- data/vendor/liter-llm/src/tower/budget.rs +599 -0
- data/vendor/liter-llm/src/tower/cache.rs +502 -0
- data/vendor/liter-llm/src/tower/cache_opendal.rs +270 -0
- data/vendor/liter-llm/src/tower/cooldown.rs +231 -0
- data/vendor/liter-llm/src/tower/cost.rs +404 -0
- data/vendor/liter-llm/src/tower/fallback.rs +121 -0
- data/vendor/liter-llm/src/tower/health.rs +219 -0
- data/vendor/liter-llm/src/tower/hooks.rs +369 -0
- data/vendor/liter-llm/src/tower/mod.rs +77 -0
- data/vendor/liter-llm/src/tower/rate_limit.rs +300 -0
- data/vendor/liter-llm/src/tower/router.rs +436 -0
- data/vendor/liter-llm/src/tower/service.rs +181 -0
- data/vendor/liter-llm/src/tower/tests.rs +539 -0
- data/vendor/liter-llm/src/tower/tests_common.rs +252 -0
- data/vendor/liter-llm/src/tower/tracing.rs +209 -0
- data/vendor/liter-llm/src/tower/types.rs +170 -0
- data/vendor/liter-llm/src/types/audio.rs +52 -0
- data/vendor/liter-llm/src/types/batch.rs +77 -0
- data/vendor/liter-llm/src/types/chat.rs +214 -0
- data/vendor/liter-llm/src/types/common.rs +244 -0
- data/vendor/liter-llm/src/types/embedding.rs +84 -0
- data/vendor/liter-llm/src/types/files.rs +58 -0
- data/vendor/liter-llm/src/types/image.rs +40 -0
- data/vendor/liter-llm/src/types/mod.rs +27 -0
- data/vendor/liter-llm/src/types/models.rs +21 -0
- data/vendor/liter-llm/src/types/moderation.rs +80 -0
- data/vendor/liter-llm/src/types/ocr.rs +87 -0
- data/vendor/liter-llm/src/types/rerank.rs +46 -0
- data/vendor/liter-llm/src/types/responses.rs +55 -0
- data/vendor/liter-llm/src/types/search.rs +45 -0
- data/vendor/liter-llm/tests/contract.rs +332 -0
- data/vendor/liter-llm-ffi/Cargo.toml +30 -0
- data/vendor/liter-llm-ffi/build.rs +66 -0
- data/vendor/liter-llm-ffi/cbindgen.toml +60 -0
- data/vendor/liter-llm-ffi/liter_llm.h +850 -0
- data/vendor/liter-llm-ffi/src/lib.rs +2488 -0
- metadata +286 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
//! Per-model rate limiting middleware.
|
|
2
|
+
//!
|
|
3
|
+
//! [`ModelRateLimitLayer`] wraps any [`Service<LlmRequest>`] and enforces
|
|
4
|
+
//! per-model request-per-minute (RPM) and token-per-minute (TPM) limits using
|
|
5
|
+
//! a fixed window. When a model exceeds its configured limit the middleware
|
|
6
|
+
//! returns [`LiterLlmError::RateLimited`] without forwarding the request to the
|
|
7
|
+
//! inner service. After a successful response, token usage is extracted and
|
|
8
|
+
//! added to the running count.
|
|
9
|
+
//!
|
|
10
|
+
//! Rate state is tracked per model name in a [`DashMap`] so that independent
|
|
11
|
+
//! models do not interfere with each other.
|
|
12
|
+
|
|
13
|
+
use std::sync::Arc;
|
|
14
|
+
use std::task::{Context, Poll};
|
|
15
|
+
use std::time::{Duration, Instant};
|
|
16
|
+
|
|
17
|
+
use dashmap::DashMap;
|
|
18
|
+
use tower::{Layer, Service};
|
|
19
|
+
|
|
20
|
+
use super::types::{LlmRequest, LlmResponse};
|
|
21
|
+
use crate::client::BoxFuture;
|
|
22
|
+
use crate::error::{LiterLlmError, Result};
|
|
23
|
+
|
|
24
|
+
// ---- Config ----------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
/// Limits applied to each model by the rate-limit middleware.
///
/// Both limits are optional; a `None` limit is never enforced. The
/// [`Default`] value has no limits and a 60-second window.
#[derive(Debug, Clone)]
pub struct RateLimitConfig {
    /// Upper bound on requests accepted within one window (`None` = no bound).
    pub rpm: Option<u32>,
    /// Upper bound on tokens recorded within one window (`None` = no bound).
    pub tpm: Option<u64>,
    /// Length of the fixed accounting window.
    pub window: Duration,
}

impl Default for RateLimitConfig {
    /// Returns a configuration with no request or token limit and a
    /// 60-second window.
    fn default() -> Self {
        let window = Duration::from_secs(60);
        Self {
            rpm: None,
            tpm: None,
            window,
        }
    }
}
|
|
46
|
+
|
|
47
|
+
// ---- State -----------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
/// Running totals for one model within the window that began at
/// `window_start`.
struct ModelRateState {
    /// Requests counted in the current window.
    request_count: u64,
    /// Tokens counted in the current window.
    token_count: u64,
    /// Moment the current window began.
    window_start: Instant,
}

impl ModelRateState {
    /// Creates zeroed counters with a window that starts now.
    fn new() -> Self {
        let window_start = Instant::now();
        Self {
            request_count: 0,
            token_count: 0,
            window_start,
        }
    }

    /// Starts a fresh window (zero counters, `window_start` = now) once at
    /// least `window` has elapsed since `window_start`; otherwise leaves the
    /// state untouched.
    fn maybe_reset(&mut self, window: Duration) {
        if self.window_start.elapsed() < window {
            return;
        }
        // A fresh state is exactly "zero counters, window starting now".
        *self = Self::new();
    }
}
|
|
74
|
+
|
|
75
|
+
// ---- Layer -----------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
/// Tower [`Layer`] that enforces per-model rate limits.
|
|
78
|
+
pub struct ModelRateLimitLayer {
|
|
79
|
+
config: RateLimitConfig,
|
|
80
|
+
state: Arc<DashMap<String, ModelRateState>>,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
impl ModelRateLimitLayer {
|
|
84
|
+
/// Create a new rate-limit layer with the given configuration.
|
|
85
|
+
#[must_use]
|
|
86
|
+
pub fn new(config: RateLimitConfig) -> Self {
|
|
87
|
+
Self {
|
|
88
|
+
config,
|
|
89
|
+
state: Arc::new(DashMap::new()),
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
impl<S> Layer<S> for ModelRateLimitLayer {
|
|
95
|
+
type Service = ModelRateLimitService<S>;
|
|
96
|
+
|
|
97
|
+
fn layer(&self, inner: S) -> Self::Service {
|
|
98
|
+
ModelRateLimitService {
|
|
99
|
+
inner,
|
|
100
|
+
config: self.config.clone(),
|
|
101
|
+
state: Arc::clone(&self.state),
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// ---- Service ---------------------------------------------------------------
|
|
107
|
+
|
|
108
|
+
/// Tower service produced by [`ModelRateLimitLayer`].
|
|
109
|
+
pub struct ModelRateLimitService<S> {
|
|
110
|
+
inner: S,
|
|
111
|
+
config: RateLimitConfig,
|
|
112
|
+
state: Arc<DashMap<String, ModelRateState>>,
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
impl<S: Clone> Clone for ModelRateLimitService<S> {
|
|
116
|
+
fn clone(&self) -> Self {
|
|
117
|
+
Self {
|
|
118
|
+
inner: self.inner.clone(),
|
|
119
|
+
config: self.config.clone(),
|
|
120
|
+
state: Arc::clone(&self.state),
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
impl<S> Service<LlmRequest> for ModelRateLimitService<S>
|
|
126
|
+
where
|
|
127
|
+
S: Service<LlmRequest, Response = LlmResponse, Error = LiterLlmError> + Send + 'static,
|
|
128
|
+
S::Future: Send + 'static,
|
|
129
|
+
{
|
|
130
|
+
type Response = LlmResponse;
|
|
131
|
+
type Error = LiterLlmError;
|
|
132
|
+
type Future = BoxFuture<'static, LlmResponse>;
|
|
133
|
+
|
|
134
|
+
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
|
|
135
|
+
self.inner.poll_ready(cx)
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
fn call(&mut self, req: LlmRequest) -> Self::Future {
|
|
139
|
+
let model = req.model().unwrap_or("unknown").to_owned();
|
|
140
|
+
let config = self.config.clone();
|
|
141
|
+
let state = Arc::clone(&self.state);
|
|
142
|
+
|
|
143
|
+
// --- Pre-flight: check RPM limit ---
|
|
144
|
+
{
|
|
145
|
+
let mut entry = state.entry(model.clone()).or_insert_with(ModelRateState::new);
|
|
146
|
+
entry.maybe_reset(config.window);
|
|
147
|
+
|
|
148
|
+
if let Some(rpm) = config.rpm
|
|
149
|
+
&& entry.request_count >= u64::from(rpm)
|
|
150
|
+
{
|
|
151
|
+
return Box::pin(async move {
|
|
152
|
+
Err(LiterLlmError::RateLimited {
|
|
153
|
+
message: format!(
|
|
154
|
+
"model {model} exceeded {rpm} requests per {:.0}s window",
|
|
155
|
+
config.window.as_secs_f64()
|
|
156
|
+
),
|
|
157
|
+
retry_after: Some(config.window),
|
|
158
|
+
})
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if let Some(tpm) = config.tpm
|
|
163
|
+
&& entry.token_count >= tpm
|
|
164
|
+
{
|
|
165
|
+
return Box::pin(async move {
|
|
166
|
+
Err(LiterLlmError::RateLimited {
|
|
167
|
+
message: format!(
|
|
168
|
+
"model {model} exceeded {tpm} tokens per {:.0}s window",
|
|
169
|
+
config.window.as_secs_f64()
|
|
170
|
+
),
|
|
171
|
+
retry_after: Some(config.window),
|
|
172
|
+
})
|
|
173
|
+
});
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Increment request count optimistically.
|
|
177
|
+
entry.request_count += 1;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
let fut = self.inner.call(req);
|
|
181
|
+
|
|
182
|
+
Box::pin(async move {
|
|
183
|
+
let resp = fut.await?;
|
|
184
|
+
|
|
185
|
+
// --- Post-flight: update token count ---
|
|
186
|
+
if let Some(usage) = resp.usage() {
|
|
187
|
+
let total_tokens = usage.prompt_tokens + usage.completion_tokens;
|
|
188
|
+
if let Some(mut entry) = state.get_mut(&model) {
|
|
189
|
+
entry.maybe_reset(config.window);
|
|
190
|
+
entry.token_count += total_tokens;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
Ok(resp)
|
|
195
|
+
})
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// ---- Tests -----------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
#[cfg(test)]
mod tests {
    use tower::{Layer as _, Service as _};

    use super::*;
    use crate::tower::tests_common::{MockClient, chat_req};

    use crate::tower::service::LlmService;
    use crate::tower::types::LlmRequest;

    /// Config with the given limits and a 60-second window.
    fn limits(rpm: Option<u32>, tpm: Option<u64>) -> RateLimitConfig {
        RateLimitConfig {
            rpm,
            tpm,
            window: Duration::from_secs(60),
        }
    }

    /// Chat request addressed to `model`.
    fn chat(model: &'static str) -> LlmRequest {
        LlmRequest::Chat(chat_req(model))
    }

    #[tokio::test]
    async fn allows_requests_under_rpm_limit() {
        let layer = ModelRateLimitLayer::new(limits(Some(5), None));
        let mut svc = layer.layer(LlmService::new(MockClient::ok()));

        for _ in 0..5 {
            let resp = svc.call(chat("gpt-4")).await;
            assert!(resp.is_ok(), "requests under limit should succeed");
        }
    }

    #[tokio::test]
    async fn rejects_requests_over_rpm_limit() {
        let layer = ModelRateLimitLayer::new(limits(Some(2), None));
        let mut svc = layer.layer(LlmService::new(MockClient::ok()));

        // The first two requests fit within the limit.
        for _ in 0..2 {
            svc.call(chat("gpt-4")).await.unwrap();
        }

        // The third one must be turned away.
        let err = svc
            .call(chat("gpt-4"))
            .await
            .expect_err("should be rate limited");
        assert!(matches!(err, LiterLlmError::RateLimited { .. }));
    }

    #[tokio::test]
    async fn independent_models_have_separate_limits() {
        let layer = ModelRateLimitLayer::new(limits(Some(1), None));
        let mut svc = layer.layer(LlmService::new(MockClient::ok()));

        svc.call(chat("gpt-4")).await.unwrap();
        // A different model has its own counters, so it is still allowed.
        svc.call(chat("gpt-3.5-turbo")).await.unwrap();
    }

    #[tokio::test]
    async fn tpm_limit_rejects_after_threshold() {
        // Very low threshold — the mock returns 15 total tokens.
        let layer = ModelRateLimitLayer::new(limits(None, Some(10)));
        let mut svc = layer.layer(LlmService::new(MockClient::ok()));

        // The first call goes through and records 15 tokens (over the 10 limit).
        svc.call(chat("gpt-4")).await.unwrap();

        // The next call is rejected because the token count is now >= tpm.
        let err = svc
            .call(chat("gpt-4"))
            .await
            .expect_err("should be rate limited by TPM");
        assert!(matches!(err, LiterLlmError::RateLimited { .. }));
    }

    #[tokio::test]
    async fn unlimited_config_allows_all_requests() {
        let layer = ModelRateLimitLayer::new(RateLimitConfig::default());
        let mut svc = layer.layer(LlmService::new(MockClient::ok()));

        for _ in 0..100 {
            assert!(svc.call(chat("gpt-4")).await.is_ok());
        }
    }
}
|