liter_llm 1.0.0.pre.rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 99f9baa37507b2b56d9d5300e3a53ede447fbfe898e57a93f757a0d36b6aa20d
4
- data.tar.gz: 9fdadec249c006f4b368c545f879f5512b60759bc19d74206ab945017efb4826
3
+ metadata.gz: 6018429462859ec2ef1f3fbc99ae4791f46830e255b57c45556955b0976c5c57
4
+ data.tar.gz: ce5f9971a7d0e52df86452fd6f2bb6b1219e0bc5c7b88790c368b1a7f91c88e3
5
5
  SHA512:
6
- metadata.gz: 6cf1243668cc3e7852198f92da30a92991072cca691c09b91600f8ab874d32d55e55f6e2156f785f6ddf2e8bf2d2d5a5f17c8d931e4c226c98740463153659a7
7
- data.tar.gz: b6eb0d41eb748f0f6c2d4eee706cb23c04da9370681f7532335b99a6fea2dd2035421d44048ae79c7adfece41973dcf4883d9ea931cf31583e96ac016f8e899f
6
+ metadata.gz: 714ce2b67cbb3867c161fb153a0701a1135bcd6970eb48d204cd72a2c2ccc61b50bf838df8243ad58c1879bdc874b2c7644fb7488aaeb1a038e4cd793b4a0782
7
+ data.tar.gz: 2a83fe3b6b266a9f2a8e198d90052bbcd02d46854b7b2b83a552e93ba57d3b386bc19093d7fe8acbeca8397bc067f7e7277058a63b5ef5b917e9a8794ab947cd
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "liter-llm-rb"
3
- version = "1.0.0-rc.9"
3
+ version = "1.0.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["liter-llm", "liter-llm-ffi"]
3
3
 
4
4
  [workspace.package]
5
- version = "1.0.0-rc.9"
5
+ version = "1.0.0"
6
6
  edition = "2024"
7
7
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
8
8
  license = "MIT"
@@ -24,7 +24,7 @@ memchr = "2"
24
24
  napi = { version = "3", features = ["napi4", "serde-json", "async"] }
25
25
  napi-build = "2"
26
26
  napi-derive = "3"
27
- opendal = { version = "0.53", features = ["services-memory"], default-features = false }
27
+ opendal = { version = "0.53", features = ["services-memory", "services-redis", "services-fs"], default-features = false }
28
28
  opentelemetry = "0.31"
29
29
  pin-project-lite = "0.2"
30
30
  pyo3 = { version = "0.28", features = ["abi3-py310"] }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "liter-llm"
3
- version = "1.0.0-rc.9"
3
+ version = "1.0.0"
4
4
  edition = "2024"
5
5
  license = "MIT"
6
6
  repository.workspace = true
@@ -71,7 +71,7 @@ futures-core = "0.3"
71
71
  futures-util = { version = "0.3", optional = true }
72
72
  jsonwebtoken = { version = "10", features = ["use_pem"], default-features = false, optional = true }
73
73
  memchr = { version = "2", optional = true }
74
- opendal = { version = "0.53", features = ["services-memory"], default-features = false, optional = true }
74
+ opendal = { version = "0.53", features = ["services-memory", "services-redis", "services-fs"], default-features = false, optional = true }
75
75
  opentelemetry = { version = "0.31", optional = true }
76
76
  pin-project-lite = "0.2"
77
77
  reqwest = { version = "0.13", features = ["json", "stream", "rustls", "multipart", "form"], default-features = false, optional = true }
@@ -0,0 +1,202 @@
1
+ //! Cache backend integration tests.
2
+ //!
3
+ //! Tests for InMemoryStore (in-process), filesystem cache via OpenDAL,
4
+ //! and Redis cache via OpenDAL (requires Docker, gated with `#[ignore]`).
5
+
6
+ #![cfg(feature = "tower")]
7
+
8
+ use std::time::Duration;
9
+
10
+ use liter_llm::tower::{CacheConfig, CacheStore, CachedResponse, InMemoryStore};
11
+ use liter_llm::types::{AssistantMessage, ChatCompletionResponse, Choice, FinishReason};
12
+
13
+ // ---- Helpers ---------------------------------------------------------------
14
+
15
+ fn dummy_response(id: &str) -> CachedResponse {
16
+ CachedResponse::Chat(ChatCompletionResponse {
17
+ id: id.into(),
18
+ object: "chat.completion".into(),
19
+ created: 1_700_000_000,
20
+ model: "gpt-4".into(),
21
+ choices: vec![Choice {
22
+ index: 0,
23
+ message: AssistantMessage {
24
+ content: Some("Hello!".into()),
25
+ name: None,
26
+ tool_calls: None,
27
+ refusal: None,
28
+ function_call: None,
29
+ },
30
+ finish_reason: Some(FinishReason::Stop),
31
+ }],
32
+ usage: None,
33
+ system_fingerprint: None,
34
+ service_tier: None,
35
+ })
36
+ }
37
+
38
+ // ---- InMemoryStore: LRU eviction under load --------------------------------
39
+
40
+ #[tokio::test]
41
+ async fn in_memory_lru_eviction_under_load() {
42
+ let config = CacheConfig {
43
+ max_entries: 10,
44
+ ttl: Duration::from_secs(300),
45
+ ..Default::default()
46
+ };
47
+ let store = InMemoryStore::new(&config);
48
+
49
+ // Fill the cache to max_entries.
50
+ for i in 0..10u64 {
51
+ let body = format!("request-{i}");
52
+ store.put(i, body, dummy_response(&format!("resp-{i}"))).await;
53
+ }
54
+
55
+ // Verify all 10 entries are present.
56
+ for i in 0..10u64 {
57
+ let body = format!("request-{i}");
58
+ let result = store.get(i, &body).await;
59
+ assert!(result.is_some(), "entry {i} should still be in cache before eviction");
60
+ }
61
+
62
+ // Add one more entry — should evict key=0 (the oldest).
63
+ store.put(10, "request-10".into(), dummy_response("resp-10")).await;
64
+
65
+ // Key=0 should be evicted.
66
+ let evicted = store.get(0, "request-0").await;
67
+ assert!(evicted.is_none(), "oldest entry (key=0) should have been evicted");
68
+
69
+ // Key=1 through key=10 should still be present.
70
+ for i in 1..=10u64 {
71
+ let body = format!("request-{i}");
72
+ let result = store.get(i, &body).await;
73
+ assert!(
74
+ result.is_some(),
75
+ "entry {i} should still be in cache after eviction of key=0"
76
+ );
77
+ }
78
+ }
79
+
80
+ /// Cache key collision guard: put with key=1 body="A", get with key=1 body="B"
81
+ /// should return None because the request bodies do not match.
82
+ #[tokio::test]
83
+ async fn cache_key_collision_guard() {
84
+ let config = CacheConfig {
85
+ max_entries: 100,
86
+ ttl: Duration::from_secs(300),
87
+ ..Default::default()
88
+ };
89
+ let store = InMemoryStore::new(&config);
90
+
91
+ store.put(1, "request-body-A".into(), dummy_response("resp-A")).await;
92
+
93
+ // Same key, different body — should be a miss (collision detected).
94
+ let result = store.get(1, "request-body-B").await;
95
+ assert!(
96
+ result.is_none(),
97
+ "get with different request body should return None (collision guard)"
98
+ );
99
+
100
+ // Same key, same body — should be a hit.
101
+ let result = store.get(1, "request-body-A").await;
102
+ assert!(
103
+ result.is_some(),
104
+ "get with matching request body should return the cached response"
105
+ );
106
+ }
107
+
108
+ // ---- OpenDAL cache backend tests -------------------------------------------
109
+
110
#[cfg(feature = "opendal-cache")]
mod opendal_tests {
    use super::*;
    use liter_llm::tower::OpenDalCacheStore;
    use std::collections::HashMap;

    /// Exercises put, get, the request-body collision guard, and remove on an
    /// `OpenDalCacheStore` built with the `memory` scheme, which needs no
    /// external service.
    #[tokio::test]
    async fn opendal_memory_put_get_remove() {
        let ttl = Duration::from_secs(300);
        let store = OpenDalCacheStore::from_config("memory", HashMap::new(), "cache/", ttl)
            .expect("memory backend should build");

        // Store one entry.
        store
            .put(42, "opendal-request-body".into(), dummy_response("opendal-resp"))
            .await;

        // Matching key and body: must come back as the chat variant we stored.
        match store.get(42, "opendal-request-body").await {
            Some(CachedResponse::Chat(r)) => assert_eq!(r.id, "opendal-resp"),
            Some(_) => panic!("expected CachedResponse::Chat"),
            None => panic!("OpenDAL memory cache should return stored entry"),
        }

        // Matching key, different body: the collision guard must reject it.
        let mismatched = store.get(42, "different-body").await;
        assert!(
            mismatched.is_none(),
            "OpenDAL memory cache should return None for mismatched request body"
        );

        // After removal the original lookup must miss.
        store.remove(42).await;
        let after_remove = store.get(42, "opendal-request-body").await;
        assert!(after_remove.is_none(), "entry should be gone after remove");
    }

    /// TTL expiry on the `memory` scheme: with a zero-second TTL, an entry is
    /// expected to be unreadable once the clock has moved past the second in
    /// which it was written.
    #[tokio::test]
    async fn opendal_memory_ttl_expiry() {
        let no_ttl = Duration::from_secs(0);
        let store = OpenDalCacheStore::from_config("memory", HashMap::new(), "cache/", no_ttl)
            .expect("memory backend should build");

        store.put(99, "ttl-body".into(), dummy_response("ttl-resp")).await;

        // Sleep just over one second so the wall clock crosses a second boundary.
        tokio::time::sleep(Duration::from_millis(1100)).await;

        let expired = store.get(99, "ttl-body").await;
        assert!(expired.is_none(), "expired entry should return None");
    }

    /// Put / get / collision guard / remove against a real Redis through
    /// OpenDAL.
    ///
    /// Ignored by default: it needs Redis listening on localhost:6379
    /// (e.g. `docker compose up -d redis`, see docker-compose.yml).
    #[tokio::test]
    #[ignore = "requires Redis on localhost:6379"]
    async fn redis_cache_put_get_ttl_remove() {
        let mut redis_options = HashMap::new();
        redis_options.insert("connection_string".into(), "redis://localhost:6379".into());

        let ttl = Duration::from_secs(300);
        let store = OpenDalCacheStore::from_config("redis", redis_options, "liter-test/", ttl)
            .expect("redis backend should build");

        // Store one entry.
        store.put(1, "redis-body".into(), dummy_response("redis-resp")).await;

        // Matching key and body: must come back as the chat variant we stored.
        match store.get(1, "redis-body").await {
            Some(CachedResponse::Chat(r)) => assert_eq!(r.id, "redis-resp"),
            Some(_) => panic!("expected CachedResponse::Chat"),
            None => panic!("redis cache should return stored entry"),
        }

        // Matching key, different body: must miss.
        let mismatched = store.get(1, "wrong-body").await;
        assert!(mismatched.is_none(), "redis cache should miss on body mismatch");

        // After removal the original lookup must miss.
        store.remove(1).await;
        let after_remove = store.get(1, "redis-body").await;
        assert!(after_remove.is_none(), "entry should be gone after remove");
    }
}
@@ -0,0 +1,379 @@
1
+ //! Concurrency tests for tower middleware layers.
2
+ //!
3
+ //! These tests verify that BudgetLayer, CacheLayer, and ModelRateLimitLayer
4
+ //! handle concurrent access correctly — no panics, no data corruption, no
5
+ //! deadlocks.
6
+
7
+ #![cfg(feature = "tower")]
8
+
9
+ use std::sync::Arc;
10
+ use std::time::Duration;
11
+
12
+ use liter_llm::error::LiterLlmError;
13
+ use liter_llm::tower::{
14
+ BudgetConfig, BudgetLayer, BudgetState, CacheConfig, CacheLayer, Enforcement, LlmRequest, LlmService,
15
+ ModelRateLimitLayer, RateLimitConfig,
16
+ };
17
+ use tokio::task::JoinSet;
18
+ use tower::{Service, ServiceBuilder};
19
+
20
+ // ---- Helpers ---------------------------------------------------------------
21
+
22
+ /// Minimal mock client that always returns a successful chat response with
23
+ /// usage: prompt_tokens=10, completion_tokens=5.
24
+ #[derive(Clone)]
25
+ struct ConcurrencyMockClient;
26
+
27
+ impl liter_llm::client::LlmClient for ConcurrencyMockClient {
28
+ fn chat(
29
+ &self,
30
+ req: liter_llm::types::ChatCompletionRequest,
31
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ChatCompletionResponse> {
32
+ let resp = liter_llm::types::ChatCompletionResponse {
33
+ id: "conc-test".into(),
34
+ object: "chat.completion".into(),
35
+ created: 0,
36
+ model: req.model.clone(),
37
+ choices: vec![liter_llm::types::Choice {
38
+ index: 0,
39
+ message: liter_llm::types::AssistantMessage {
40
+ content: Some("ok".into()),
41
+ name: None,
42
+ tool_calls: None,
43
+ refusal: None,
44
+ function_call: None,
45
+ },
46
+ finish_reason: Some(liter_llm::types::FinishReason::Stop),
47
+ }],
48
+ usage: Some(liter_llm::types::Usage {
49
+ prompt_tokens: 10,
50
+ completion_tokens: 5,
51
+ total_tokens: 15,
52
+ }),
53
+ system_fingerprint: None,
54
+ service_tier: None,
55
+ };
56
+ Box::pin(async move { Ok(resp) })
57
+ }
58
+
59
+ fn chat_stream(
60
+ &self,
61
+ _req: liter_llm::types::ChatCompletionRequest,
62
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::client::BoxStream<'_, liter_llm::types::ChatCompletionChunk>> {
63
+ Box::pin(async move {
64
+ let stream: liter_llm::client::BoxStream<'_, liter_llm::types::ChatCompletionChunk> =
65
+ Box::pin(futures_util::stream::empty());
66
+ Ok(stream)
67
+ })
68
+ }
69
+
70
+ fn embed(
71
+ &self,
72
+ req: liter_llm::types::EmbeddingRequest,
73
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::EmbeddingResponse> {
74
+ let resp = liter_llm::types::EmbeddingResponse {
75
+ object: "list".into(),
76
+ data: vec![],
77
+ model: req.model.clone(),
78
+ usage: Some(liter_llm::types::Usage {
79
+ prompt_tokens: 4,
80
+ completion_tokens: 0,
81
+ total_tokens: 4,
82
+ }),
83
+ };
84
+ Box::pin(async move { Ok(resp) })
85
+ }
86
+
87
+ fn list_models(&self) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ModelsListResponse> {
88
+ Box::pin(async move {
89
+ Ok(liter_llm::types::ModelsListResponse {
90
+ object: "list".into(),
91
+ data: vec![],
92
+ })
93
+ })
94
+ }
95
+
96
+ fn image_generate(
97
+ &self,
98
+ _req: liter_llm::types::image::CreateImageRequest,
99
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::image::ImagesResponse> {
100
+ Box::pin(async move {
101
+ Ok(liter_llm::types::image::ImagesResponse {
102
+ created: 0,
103
+ data: vec![],
104
+ })
105
+ })
106
+ }
107
+
108
+ fn speech(
109
+ &self,
110
+ _req: liter_llm::types::audio::CreateSpeechRequest,
111
+ ) -> liter_llm::client::BoxFuture<'_, bytes::Bytes> {
112
+ Box::pin(async move { Ok(bytes::Bytes::new()) })
113
+ }
114
+
115
+ fn transcribe(
116
+ &self,
117
+ _req: liter_llm::types::audio::CreateTranscriptionRequest,
118
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::audio::TranscriptionResponse> {
119
+ Box::pin(async move {
120
+ Ok(liter_llm::types::audio::TranscriptionResponse {
121
+ text: String::new(),
122
+ language: None,
123
+ duration: None,
124
+ segments: None,
125
+ })
126
+ })
127
+ }
128
+
129
+ fn moderate(
130
+ &self,
131
+ _req: liter_llm::types::moderation::ModerationRequest,
132
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::moderation::ModerationResponse> {
133
+ Box::pin(async move {
134
+ Ok(liter_llm::types::moderation::ModerationResponse {
135
+ id: String::new(),
136
+ model: String::new(),
137
+ results: vec![],
138
+ })
139
+ })
140
+ }
141
+
142
+ fn rerank(
143
+ &self,
144
+ _req: liter_llm::types::rerank::RerankRequest,
145
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::rerank::RerankResponse> {
146
+ Box::pin(async move {
147
+ Ok(liter_llm::types::rerank::RerankResponse {
148
+ id: None,
149
+ results: vec![],
150
+ meta: None,
151
+ })
152
+ })
153
+ }
154
+
155
+ fn search(
156
+ &self,
157
+ _req: liter_llm::types::search::SearchRequest,
158
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::search::SearchResponse> {
159
+ Box::pin(async {
160
+ Err(liter_llm::error::LiterLlmError::EndpointNotSupported {
161
+ endpoint: "search".into(),
162
+ provider: "mock".into(),
163
+ })
164
+ })
165
+ }
166
+
167
+ fn ocr(
168
+ &self,
169
+ _req: liter_llm::types::ocr::OcrRequest,
170
+ ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ocr::OcrResponse> {
171
+ Box::pin(async {
172
+ Err(liter_llm::error::LiterLlmError::EndpointNotSupported {
173
+ endpoint: "ocr".into(),
174
+ provider: "mock".into(),
175
+ })
176
+ })
177
+ }
178
+ }
179
+
180
+ fn chat_req(model: &str) -> liter_llm::types::ChatCompletionRequest {
181
+ serde_json::from_value(serde_json::json!({
182
+ "model": model,
183
+ "messages": [{"role": "system", "content": "test"}]
184
+ }))
185
+ .expect("test request should deserialize")
186
+ }
187
+
188
+ // ---- Tests -----------------------------------------------------------------
189
+
190
+ /// Spawn 100 concurrent requests through BudgetLayer. Verify that the final
191
+ /// accumulated spend equals the expected sum (within the documented overshoot
192
+ /// tolerance for hard enforcement — concurrent in-flight requests may all pass
193
+ /// the pre-flight check before any of them record their cost).
194
+ #[tokio::test]
195
+ async fn concurrent_budget_tracking() {
196
+ let state = Arc::new(BudgetState::new());
197
+ let config = BudgetConfig {
198
+ global_limit: Some(100.0), // High enough to not reject any request.
199
+ enforcement: Enforcement::Soft,
200
+ ..Default::default()
201
+ };
202
+
203
+ let svc = ServiceBuilder::new()
204
+ .layer(BudgetLayer::new(config, Arc::clone(&state)))
205
+ .service(LlmService::new(ConcurrencyMockClient));
206
+
207
+ let svc = Arc::new(tokio::sync::Mutex::new(svc));
208
+ let mut tasks = JoinSet::new();
209
+
210
+ for _ in 0..100 {
211
+ let svc = Arc::clone(&svc);
212
+ tasks.spawn(async move {
213
+ let mut s = svc.lock().await.clone();
214
+ s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
215
+ });
216
+ }
217
+
218
+ let mut ok_count = 0u64;
219
+ while let Some(result) = tasks.join_next().await {
220
+ let inner = result.expect("task should not panic");
221
+ if inner.is_ok() {
222
+ ok_count += 1;
223
+ }
224
+ }
225
+
226
+ assert_eq!(ok_count, 100, "all 100 requests should succeed under soft enforcement");
227
+ // Budget state should reflect all 100 calls. The exact value depends on
228
+ // cost::completion_cost for gpt-4 with prompt=10, completion=5. We just
229
+ // verify it is positive and non-zero.
230
+ assert!(
231
+ state.global_spend() > 0.0,
232
+ "global spend should be positive after 100 calls, got {}",
233
+ state.global_spend()
234
+ );
235
+ }
236
+
237
+ /// Spawn 50 concurrent identical requests through CacheLayer + LlmService.
238
+ /// Verify no panics, no corruption in InMemoryStore, and all callers receive
239
+ /// valid responses.
240
+ #[tokio::test]
241
+ async fn concurrent_cache_writes() {
242
+ let config = CacheConfig {
243
+ max_entries: 256,
244
+ ttl: Duration::from_secs(60),
245
+ ..Default::default()
246
+ };
247
+
248
+ let svc = ServiceBuilder::new()
249
+ .layer(CacheLayer::new(config))
250
+ .service(LlmService::new(ConcurrencyMockClient));
251
+
252
+ let svc = Arc::new(tokio::sync::Mutex::new(svc));
253
+ let mut tasks = JoinSet::new();
254
+
255
+ for _ in 0..50 {
256
+ let svc = Arc::clone(&svc);
257
+ tasks.spawn(async move {
258
+ let mut s = svc.lock().await.clone();
259
+ s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
260
+ });
261
+ }
262
+
263
+ let mut ok_count = 0u64;
264
+ while let Some(result) = tasks.join_next().await {
265
+ let inner = result.expect("task should not panic");
266
+ let resp = inner.expect("each request should succeed");
267
+ // Verify the response is a Chat variant with valid content.
268
+ match resp {
269
+ liter_llm::tower::LlmResponse::Chat(r) => {
270
+ assert_eq!(r.model, "gpt-4", "response model should match request");
271
+ }
272
+ other => panic!("expected LlmResponse::Chat, got {other:?}"),
273
+ }
274
+ ok_count += 1;
275
+ }
276
+
277
+ assert_eq!(ok_count, 50, "all 50 requests should return valid responses");
278
+ }
279
+
280
+ /// Spawn 20 concurrent requests with RPM=5. Verify exactly 5 succeed and 15
281
+ /// are rejected with RateLimited.
282
+ #[tokio::test]
283
+ async fn concurrent_rate_limit() {
284
+ let config = RateLimitConfig {
285
+ rpm: Some(5),
286
+ tpm: None,
287
+ window: Duration::from_secs(60),
288
+ };
289
+
290
+ let svc = ServiceBuilder::new()
291
+ .layer(ModelRateLimitLayer::new(config))
292
+ .service(LlmService::new(ConcurrencyMockClient));
293
+
294
+ // Rate limiting uses a shared DashMap, so concurrent access to the *same*
295
+ // mutable service requires serialisation. We clone the service for each
296
+ // task — the Arc<DashMap> is shared.
297
+ let svc = Arc::new(tokio::sync::Mutex::new(svc));
298
+ let mut tasks = JoinSet::new();
299
+
300
+ for _ in 0..20 {
301
+ let svc = Arc::clone(&svc);
302
+ tasks.spawn(async move {
303
+ let mut s = svc.lock().await.clone();
304
+ s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
305
+ });
306
+ }
307
+
308
+ let mut successes = 0u64;
309
+ let mut rate_limited = 0u64;
310
+ while let Some(result) = tasks.join_next().await {
311
+ let inner = result.expect("task should not panic");
312
+ match inner {
313
+ Ok(_) => successes += 1,
314
+ Err(LiterLlmError::RateLimited { .. }) => rate_limited += 1,
315
+ Err(other) => panic!("unexpected error: {other:?}"),
316
+ }
317
+ }
318
+
319
+ assert_eq!(successes, 5, "exactly 5 requests should succeed (RPM=5)");
320
+ assert_eq!(rate_limited, 15, "exactly 15 requests should be rate-limited");
321
+ }
322
+
323
+ /// Cache + Budget + RateLimit all active. Spawn 10 requests. Verify the full
324
+ /// middleware stack handles concurrent access without deadlocks or panics.
325
+ /// The test completes within a timeout to guard against deadlocks.
326
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
327
+ async fn concurrent_full_stack() {
328
+ let budget_state = Arc::new(BudgetState::new());
329
+ let budget_config = BudgetConfig {
330
+ global_limit: Some(100.0),
331
+ enforcement: Enforcement::Soft,
332
+ ..Default::default()
333
+ };
334
+ let cache_config = CacheConfig {
335
+ max_entries: 64,
336
+ ttl: Duration::from_secs(60),
337
+ ..Default::default()
338
+ };
339
+ let rate_config = RateLimitConfig {
340
+ rpm: Some(100), // High enough not to reject.
341
+ tpm: None,
342
+ window: Duration::from_secs(60),
343
+ };
344
+
345
+ let svc = ServiceBuilder::new()
346
+ .layer(CacheLayer::new(cache_config))
347
+ .layer(BudgetLayer::new(budget_config, Arc::clone(&budget_state)))
348
+ .layer(ModelRateLimitLayer::new(rate_config))
349
+ .service(LlmService::new(ConcurrencyMockClient));
350
+
351
+ let svc = Arc::new(tokio::sync::Mutex::new(svc));
352
+ let mut tasks = JoinSet::new();
353
+
354
+ for i in 0..10 {
355
+ let svc = Arc::clone(&svc);
356
+ // Use two different models to exercise separate rate-limit buckets.
357
+ let model = if i % 2 == 0 { "gpt-4" } else { "gpt-3.5-turbo" };
358
+ tasks.spawn(async move {
359
+ let mut s = svc.lock().await.clone();
360
+ s.call(LlmRequest::Chat(chat_req(model))).await
361
+ });
362
+ }
363
+
364
+ // Wrap in a timeout to catch deadlocks.
365
+ let result = tokio::time::timeout(Duration::from_secs(10), async {
366
+ let mut ok_count = 0u64;
367
+ while let Some(result) = tasks.join_next().await {
368
+ let inner = result.expect("task should not panic");
369
+ assert!(inner.is_ok(), "request should succeed: {inner:?}");
370
+ ok_count += 1;
371
+ }
372
+ ok_count
373
+ })
374
+ .await;
375
+
376
+ let ok_count = result.expect("full stack should complete within 10s (no deadlock)");
377
+ assert_eq!(ok_count, 10, "all 10 requests should succeed");
378
+ assert!(budget_state.global_spend() > 0.0, "budget should have recorded spend");
379
+ }