RubyGems - liter_llm - Versions diffs - 1.0.0.pre.rc.9 → 1.0.0 - Mend

liter_llm 1.0.0.pre.rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/ext/liter_llm_rb/native/Cargo.toml +1 -1
data/vendor/Cargo.toml +2 -2
data/vendor/liter-llm/Cargo.toml +2 -2
data/vendor/liter-llm/tests/cache_integration.rs +202 -0
data/vendor/liter-llm/tests/concurrency.rs +379 -0
data/vendor/liter-llm/tests/middleware_integration.rs +981 -0
data/vendor/liter-llm/tests/operations_integration.rs +641 -0
data/vendor/liter-llm/tests/routing_integration.rs +463 -0
data/vendor/liter-llm-ffi/Cargo.toml +3 -3
data/vendor/liter-llm-ffi/liter_llm.h +1 -1
metadata +6 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 99f9baa37507b2b56d9d5300e3a53ede447fbfe898e57a93f757a0d36b6aa20d
-  data.tar.gz: 9fdadec249c006f4b368c545f879f5512b60759bc19d74206ab945017efb4826
+  metadata.gz: 6018429462859ec2ef1f3fbc99ae4791f46830e255b57c45556955b0976c5c57
+  data.tar.gz: ce5f9971a7d0e52df86452fd6f2bb6b1219e0bc5c7b88790c368b1a7f91c88e3
 SHA512:
-  metadata.gz: 6cf1243668cc3e7852198f92da30a92991072cca691c09b91600f8ab874d32d55e55f6e2156f785f6ddf2e8bf2d2d5a5f17c8d931e4c226c98740463153659a7
-  data.tar.gz: b6eb0d41eb748f0f6c2d4eee706cb23c04da9370681f7532335b99a6fea2dd2035421d44048ae79c7adfece41973dcf4883d9ea931cf31583e96ac016f8e899f
+  metadata.gz: 714ce2b67cbb3867c161fb153a0701a1135bcd6970eb48d204cd72a2c2ccc61b50bf838df8243ad58c1879bdc874b2c7644fb7488aaeb1a038e4cd793b4a0782
+  data.tar.gz: 2a83fe3b6b266a9f2a8e198d90052bbcd02d46854b7b2b83a552e93ba57d3b386bc19093d7fe8acbeca8397bc067f7e7277058a63b5ef5b917e9a8794ab947cd

data/ext/liter_llm_rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "liter-llm-rb"
-version = "1.0.0-rc.9"
+version = "1.0.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["liter-llm", "liter-llm-ffi"]
 [workspace.package]
-version = "1.0.0-rc.9"
+version = "1.0.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"
@@ -24,7 +24,7 @@ memchr = "2"
 napi = { version = "3", features = ["napi4", "serde-json", "async"] }
 napi-build = "2"
 napi-derive = "3"
-opendal = { version = "0.53", features = ["services-memory"], default-features = false }
+opendal = { version = "0.53", features = ["services-memory", "services-redis", "services-fs"], default-features = false }
 opentelemetry = "0.31"
 pin-project-lite = "0.2"
 pyo3 = { version = "0.28", features = ["abi3-py310"] }

data/vendor/liter-llm/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "liter-llm"
-version = "1.0.0-rc.9"
+version = "1.0.0"
 edition = "2024"
 license = "MIT"
 repository.workspace = true
@@ -71,7 +71,7 @@ futures-core = "0.3"
 futures-util = { version = "0.3", optional = true }
 jsonwebtoken = { version = "10", features = ["use_pem"], default-features = false, optional = true }
 memchr = { version = "2", optional = true }
-opendal = { version = "0.53", features = ["services-memory"], default-features = false, optional = true }
+opendal = { version = "0.53", features = ["services-memory", "services-redis", "services-fs"], default-features = false, optional = true }
 opentelemetry = { version = "0.31", optional = true }
 pin-project-lite = "0.2"
 reqwest = { version = "0.13", features = ["json", "stream", "rustls", "multipart", "form"], default-features = false, optional = true }

data/vendor/liter-llm/tests/cache_integration.rs ADDED Viewed

@@ -0,0 +1,202 @@
+//! Cache backend integration tests.
+//!
+//! Tests for InMemoryStore (in-process), the OpenDAL memory backend, and
+//! Redis cache via OpenDAL (requires a running Redis instance, e.g. via
+//! Docker; gated with `#[ignore]`).
+#![cfg(feature = "tower")]
+use std::time::Duration;
+use liter_llm::tower::{CacheConfig, CacheStore, CachedResponse, InMemoryStore};
+use liter_llm::types::{AssistantMessage, ChatCompletionResponse, Choice, FinishReason};
+// ---- Helpers ---------------------------------------------------------------
+fn dummy_response(id: &str) -> CachedResponse {
+    CachedResponse::Chat(ChatCompletionResponse {
+        id: id.into(),
+        object: "chat.completion".into(),
+        created: 1_700_000_000,
+        model: "gpt-4".into(),
+        choices: vec![Choice {
+            index: 0,
+            message: AssistantMessage {
+                content: Some("Hello!".into()),
+                name: None,
+                tool_calls: None,
+                refusal: None,
+                function_call: None,
+            },
+            finish_reason: Some(FinishReason::Stop),
+        }],
+        usage: None,
+        system_fingerprint: None,
+        service_tier: None,
+    })
+}
+// ---- InMemoryStore: LRU eviction under load --------------------------------
+#[tokio::test]
+async fn in_memory_lru_eviction_under_load() {
+    let config = CacheConfig {
+        max_entries: 10,
+        ttl: Duration::from_secs(300),
+        ..Default::default()
+    };
+    let store = InMemoryStore::new(&config);
+    // Fill the cache to max_entries.
+    for i in 0..10u64 {
+        let body = format!("request-{i}");
+        store.put(i, body, dummy_response(&format!("resp-{i}"))).await;
+    }
+    // Verify all 10 entries are present.
+    for i in 0..10u64 {
+        let body = format!("request-{i}");
+        let result = store.get(i, &body).await;
+        assert!(result.is_some(), "entry {i} should still be in cache before eviction");
+    }
+    // Add one more entry — should evict key=0 (the oldest).
+    store.put(10, "request-10".into(), dummy_response("resp-10")).await;
+    // Key=0 should be evicted.
+    let evicted = store.get(0, "request-0").await;
+    assert!(evicted.is_none(), "oldest entry (key=0) should have been evicted");
+    // Key=1 through key=10 should still be present.
+    for i in 1..=10u64 {
+        let body = format!("request-{i}");
+        let result = store.get(i, &body).await;
+        assert!(
+            result.is_some(),
+            "entry {i} should still be in cache after eviction of key=0"
+        );
+    }
+}
+/// Cache key collision guard: put with key=1 body="A", get with key=1 body="B"
+/// should return None because the request bodies do not match.
+#[tokio::test]
+async fn cache_key_collision_guard() {
+    let config = CacheConfig {
+        max_entries: 100,
+        ttl: Duration::from_secs(300),
+        ..Default::default()
+    };
+    let store = InMemoryStore::new(&config);
+    store.put(1, "request-body-A".into(), dummy_response("resp-A")).await;
+    // Same key, different body — should be a miss (collision detected).
+    let result = store.get(1, "request-body-B").await;
+    assert!(
+        result.is_none(),
+        "get with different request body should return None (collision guard)"
+    );
+    // Same key, same body — should be a hit.
+    let result = store.get(1, "request-body-A").await;
+    assert!(
+        result.is_some(),
+        "get with matching request body should return the cached response"
+    );
+}
+// ---- OpenDAL cache backend tests -------------------------------------------
+#[cfg(feature = "opendal-cache")]
+mod opendal_tests {
+    use super::*;
+    use liter_llm::tower::OpenDalCacheStore;
+    use std::collections::HashMap;
+    /// OpenDAL memory backend: put/get round-trip, collision guard, and remove.
+    ///
+    /// Uses the in-process `memory` scheme (always available — no external
+    /// dependencies) to exercise the `OpenDalCacheStore` code paths that are
+    /// shared across all OpenDAL backends.
+    #[tokio::test]
+    async fn opendal_memory_put_get_remove() {
+        let store = OpenDalCacheStore::from_config("memory", HashMap::new(), "cache/", Duration::from_secs(300))
+            .expect("memory backend should build");
+        // Put
+        store
+            .put(42, "opendal-request-body".into(), dummy_response("opendal-resp"))
+            .await;
+        // Get — should hit.
+        let result = store.get(42, "opendal-request-body").await;
+        assert!(result.is_some(), "OpenDAL memory cache should return stored entry");
+        match result.unwrap() {
+            CachedResponse::Chat(r) => assert_eq!(r.id, "opendal-resp"),
+            _ => panic!("expected CachedResponse::Chat"),
+        }
+        // Get with wrong body — collision guard.
+        let miss = store.get(42, "different-body").await;
+        assert!(
+            miss.is_none(),
+            "OpenDAL memory cache should return None for mismatched request body"
+        );
+        // Remove
+        store.remove(42).await;
+        let after_remove = store.get(42, "opendal-request-body").await;
+        assert!(after_remove.is_none(), "entry should be gone after remove");
+    }
+    /// OpenDAL memory backend: TTL expiry. Uses 0-second TTL so entries expire
+    /// on the next second boundary.
+    #[tokio::test]
+    async fn opendal_memory_ttl_expiry() {
+        // 0-second TTL: entries expire immediately (on next second boundary).
+        let store = OpenDalCacheStore::from_config("memory", HashMap::new(), "cache/", Duration::from_secs(0))
+            .expect("memory backend should build");
+        store.put(99, "ttl-body".into(), dummy_response("ttl-resp")).await;
+        // Wait for the wall clock to advance past the expires_at timestamp.
+        tokio::time::sleep(Duration::from_millis(1100)).await;
+        let result = store.get(99, "ttl-body").await;
+        assert!(result.is_none(), "expired entry should return None");
+    }
+    /// Redis cache via OpenDAL: put/get round-trip, collision guard, and remove.
+    ///
+    /// Requires a running Redis instance at localhost:6379 (e.g. via
+    /// `docker compose up -d redis`; see docker-compose.yml), so the test is
+    /// `#[ignore]`d by default.
+    #[tokio::test]
+    #[ignore = "requires Redis on localhost:6379"]
+    async fn redis_cache_put_get_ttl_remove() {
+        let mut config = HashMap::new();
+        config.insert("connection_string".into(), "redis://localhost:6379".into());
+        let store = OpenDalCacheStore::from_config("redis", config, "liter-test/", Duration::from_secs(300))
+            .expect("redis backend should build");
+        // Put
+        store.put(1, "redis-body".into(), dummy_response("redis-resp")).await;
+        // Get — should hit.
+        let result = store.get(1, "redis-body").await;
+        assert!(result.is_some(), "redis cache should return stored entry");
+        match result.unwrap() {
+            CachedResponse::Chat(r) => assert_eq!(r.id, "redis-resp"),
+            _ => panic!("expected CachedResponse::Chat"),
+        }
+        // Collision guard.
+        let miss = store.get(1, "wrong-body").await;
+        assert!(miss.is_none(), "redis cache should miss on body mismatch");
+        // Remove.
+        store.remove(1).await;
+        let after_remove = store.get(1, "redis-body").await;
+        assert!(after_remove.is_none(), "entry should be gone after remove");
+    }
+}

data/vendor/liter-llm/tests/concurrency.rs ADDED Viewed

@@ -0,0 +1,379 @@
+//! Concurrency tests for tower middleware layers.
+//!
+//! These tests verify that BudgetLayer, CacheLayer, and ModelRateLimitLayer
+//! handle concurrent access correctly — no panics, no data corruption, no
+//! deadlocks.
+#![cfg(feature = "tower")]
+use std::sync::Arc;
+use std::time::Duration;
+use liter_llm::error::LiterLlmError;
+use liter_llm::tower::{
+    BudgetConfig, BudgetLayer, BudgetState, CacheConfig, CacheLayer, Enforcement, LlmRequest, LlmService,
+    ModelRateLimitLayer, RateLimitConfig,
+};
+use tokio::task::JoinSet;
+use tower::{Service, ServiceBuilder};
+// ---- Helpers ---------------------------------------------------------------
+/// Minimal mock client that always returns a successful chat response with
+/// usage: prompt_tokens=10, completion_tokens=5.
+#[derive(Clone)]
+struct ConcurrencyMockClient;
+impl liter_llm::client::LlmClient for ConcurrencyMockClient {
+    fn chat(
+        &self,
+        req: liter_llm::types::ChatCompletionRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ChatCompletionResponse> {
+        let resp = liter_llm::types::ChatCompletionResponse {
+            id: "conc-test".into(),
+            object: "chat.completion".into(),
+            created: 0,
+            model: req.model.clone(),
+            choices: vec![liter_llm::types::Choice {
+                index: 0,
+                message: liter_llm::types::AssistantMessage {
+                    content: Some("ok".into()),
+                    name: None,
+                    tool_calls: None,
+                    refusal: None,
+                    function_call: None,
+                },
+                finish_reason: Some(liter_llm::types::FinishReason::Stop),
+            }],
+            usage: Some(liter_llm::types::Usage {
+                prompt_tokens: 10,
+                completion_tokens: 5,
+                total_tokens: 15,
+            }),
+            system_fingerprint: None,
+            service_tier: None,
+        };
+        Box::pin(async move { Ok(resp) })
+    }
+    fn chat_stream(
+        &self,
+        _req: liter_llm::types::ChatCompletionRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::client::BoxStream<'_, liter_llm::types::ChatCompletionChunk>> {
+        Box::pin(async move {
+            let stream: liter_llm::client::BoxStream<'_, liter_llm::types::ChatCompletionChunk> =
+                Box::pin(futures_util::stream::empty());
+            Ok(stream)
+        })
+    }
+    fn embed(
+        &self,
+        req: liter_llm::types::EmbeddingRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::EmbeddingResponse> {
+        let resp = liter_llm::types::EmbeddingResponse {
+            object: "list".into(),
+            data: vec![],
+            model: req.model.clone(),
+            usage: Some(liter_llm::types::Usage {
+                prompt_tokens: 4,
+                completion_tokens: 0,
+                total_tokens: 4,
+            }),
+        };
+        Box::pin(async move { Ok(resp) })
+    }
+    fn list_models(&self) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ModelsListResponse> {
+        Box::pin(async move {
+            Ok(liter_llm::types::ModelsListResponse {
+                object: "list".into(),
+                data: vec![],
+            })
+        })
+    }
+    fn image_generate(
+        &self,
+        _req: liter_llm::types::image::CreateImageRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::image::ImagesResponse> {
+        Box::pin(async move {
+            Ok(liter_llm::types::image::ImagesResponse {
+                created: 0,
+                data: vec![],
+            })
+        })
+    }
+    fn speech(
+        &self,
+        _req: liter_llm::types::audio::CreateSpeechRequest,
+    ) -> liter_llm::client::BoxFuture<'_, bytes::Bytes> {
+        Box::pin(async move { Ok(bytes::Bytes::new()) })
+    }
+    fn transcribe(
+        &self,
+        _req: liter_llm::types::audio::CreateTranscriptionRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::audio::TranscriptionResponse> {
+        Box::pin(async move {
+            Ok(liter_llm::types::audio::TranscriptionResponse {
+                text: String::new(),
+                language: None,
+                duration: None,
+                segments: None,
+            })
+        })
+    }
+    fn moderate(
+        &self,
+        _req: liter_llm::types::moderation::ModerationRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::moderation::ModerationResponse> {
+        Box::pin(async move {
+            Ok(liter_llm::types::moderation::ModerationResponse {
+                id: String::new(),
+                model: String::new(),
+                results: vec![],
+            })
+        })
+    }
+    fn rerank(
+        &self,
+        _req: liter_llm::types::rerank::RerankRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::rerank::RerankResponse> {
+        Box::pin(async move {
+            Ok(liter_llm::types::rerank::RerankResponse {
+                id: None,
+                results: vec![],
+                meta: None,
+            })
+        })
+    }
+    fn search(
+        &self,
+        _req: liter_llm::types::search::SearchRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::search::SearchResponse> {
+        Box::pin(async {
+            Err(liter_llm::error::LiterLlmError::EndpointNotSupported {
+                endpoint: "search".into(),
+                provider: "mock".into(),
+            })
+        })
+    }
+    fn ocr(
+        &self,
+        _req: liter_llm::types::ocr::OcrRequest,
+    ) -> liter_llm::client::BoxFuture<'_, liter_llm::types::ocr::OcrResponse> {
+        Box::pin(async {
+            Err(liter_llm::error::LiterLlmError::EndpointNotSupported {
+                endpoint: "ocr".into(),
+                provider: "mock".into(),
+            })
+        })
+    }
+}
+fn chat_req(model: &str) -> liter_llm::types::ChatCompletionRequest {
+    serde_json::from_value(serde_json::json!({
+        "model": model,
+        "messages": [{"role": "system", "content": "test"}]
+    }))
+    .expect("test request should deserialize")
+}
+// ---- Tests -----------------------------------------------------------------
+/// Spawn 100 concurrent requests through BudgetLayer under soft enforcement.
+/// Verify that every request succeeds and that the shared BudgetState records
+/// a positive global spend; the exact total is not asserted. (Under hard
+/// enforcement, concurrent in-flight requests may all pass the pre-flight
+/// check before any of them record their cost, so some overshoot is expected.)
+#[tokio::test]
+async fn concurrent_budget_tracking() {
+    let state = Arc::new(BudgetState::new());
+    let config = BudgetConfig {
+        global_limit: Some(100.0), // High enough to not reject any request.
+        enforcement: Enforcement::Soft,
+        ..Default::default()
+    };
+    let svc = ServiceBuilder::new()
+        .layer(BudgetLayer::new(config, Arc::clone(&state)))
+        .service(LlmService::new(ConcurrencyMockClient));
+    let svc = Arc::new(tokio::sync::Mutex::new(svc));
+    let mut tasks = JoinSet::new();
+    for _ in 0..100 {
+        let svc = Arc::clone(&svc);
+        tasks.spawn(async move {
+            let mut s = svc.lock().await.clone();
+            s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
+        });
+    }
+    let mut ok_count = 0u64;
+    while let Some(result) = tasks.join_next().await {
+        let inner = result.expect("task should not panic");
+        if inner.is_ok() {
+            ok_count += 1;
+        }
+    }
+    assert_eq!(ok_count, 100, "all 100 requests should succeed under soft enforcement");
+    // Budget state should reflect all 100 calls. The exact value depends on
+    // cost::completion_cost for gpt-4 with prompt=10, completion=5, so we only
+    // verify that it is strictly positive.
+    assert!(
+        state.global_spend() > 0.0,
+        "global spend should be positive after 100 calls, got {}",
+        state.global_spend()
+    );
+}
+/// Spawn 50 concurrent identical requests through CacheLayer + LlmService.
+/// Verify no panics, no corruption in InMemoryStore, and all callers receive
+/// valid responses.
+#[tokio::test]
+async fn concurrent_cache_writes() {
+    let config = CacheConfig {
+        max_entries: 256,
+        ttl: Duration::from_secs(60),
+        ..Default::default()
+    };
+    let svc = ServiceBuilder::new()
+        .layer(CacheLayer::new(config))
+        .service(LlmService::new(ConcurrencyMockClient));
+    let svc = Arc::new(tokio::sync::Mutex::new(svc));
+    let mut tasks = JoinSet::new();
+    for _ in 0..50 {
+        let svc = Arc::clone(&svc);
+        tasks.spawn(async move {
+            let mut s = svc.lock().await.clone();
+            s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
+        });
+    }
+    let mut ok_count = 0u64;
+    while let Some(result) = tasks.join_next().await {
+        let inner = result.expect("task should not panic");
+        let resp = inner.expect("each request should succeed");
+        // Verify the response is a Chat variant with valid content.
+        match resp {
+            liter_llm::tower::LlmResponse::Chat(r) => {
+                assert_eq!(r.model, "gpt-4", "response model should match request");
+            }
+            other => panic!("expected LlmResponse::Chat, got {other:?}"),
+        }
+        ok_count += 1;
+    }
+    assert_eq!(ok_count, 50, "all 50 requests should return valid responses");
+}
+/// Spawn 20 concurrent requests with RPM=5. Verify exactly 5 succeed and 15
+/// are rejected with RateLimited.
+#[tokio::test]
+async fn concurrent_rate_limit() {
+    let config = RateLimitConfig {
+        rpm: Some(5),
+        tpm: None,
+        window: Duration::from_secs(60),
+    };
+    let svc = ServiceBuilder::new()
+        .layer(ModelRateLimitLayer::new(config))
+        .service(LlmService::new(ConcurrencyMockClient));
+    // `Service::call` takes `&mut self`, so tasks cannot drive the *same*
+    // service instance concurrently. Each task clones the service out of the
+    // mutex instead — the rate-limit state (an Arc<DashMap>) is shared by all
+    // clones.
+    let svc = Arc::new(tokio::sync::Mutex::new(svc));
+    let mut tasks = JoinSet::new();
+    for _ in 0..20 {
+        let svc = Arc::clone(&svc);
+        tasks.spawn(async move {
+            let mut s = svc.lock().await.clone();
+            s.call(LlmRequest::Chat(chat_req("gpt-4"))).await
+        });
+    }
+    let mut successes = 0u64;
+    let mut rate_limited = 0u64;
+    while let Some(result) = tasks.join_next().await {
+        let inner = result.expect("task should not panic");
+        match inner {
+            Ok(_) => successes += 1,
+            Err(LiterLlmError::RateLimited { .. }) => rate_limited += 1,
+            Err(other) => panic!("unexpected error: {other:?}"),
+        }
+    }
+    assert_eq!(successes, 5, "exactly 5 requests should succeed (RPM=5)");
+    assert_eq!(rate_limited, 15, "exactly 15 requests should be rate-limited");
+}
+/// Cache + Budget + RateLimit all active. Spawn 10 requests. Verify the full
+/// middleware stack handles concurrent access without deadlocks or panics.
+/// The join loop is wrapped in a 10-second timeout to guard against deadlocks.
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn concurrent_full_stack() {
+    let budget_state = Arc::new(BudgetState::new());
+    let budget_config = BudgetConfig {
+        global_limit: Some(100.0),
+        enforcement: Enforcement::Soft,
+        ..Default::default()
+    };
+    let cache_config = CacheConfig {
+        max_entries: 64,
+        ttl: Duration::from_secs(60),
+        ..Default::default()
+    };
+    let rate_config = RateLimitConfig {
+        rpm: Some(100), // High enough not to reject.
+        tpm: None,
+        window: Duration::from_secs(60),
+    };
+    let svc = ServiceBuilder::new()
+        .layer(CacheLayer::new(cache_config))
+        .layer(BudgetLayer::new(budget_config, Arc::clone(&budget_state)))
+        .layer(ModelRateLimitLayer::new(rate_config))
+        .service(LlmService::new(ConcurrencyMockClient));
+    let svc = Arc::new(tokio::sync::Mutex::new(svc));
+    let mut tasks = JoinSet::new();
+    for i in 0..10 {
+        let svc = Arc::clone(&svc);
+        // Use two different models to exercise separate rate-limit buckets.
+        let model = if i % 2 == 0 { "gpt-4" } else { "gpt-3.5-turbo" };
+        tasks.spawn(async move {
+            let mut s = svc.lock().await.clone();
+            s.call(LlmRequest::Chat(chat_req(model))).await
+        });
+    }
+    // Wrap in a timeout to catch deadlocks.
+    let result = tokio::time::timeout(Duration::from_secs(10), async {
+        let mut ok_count = 0u64;
+        while let Some(result) = tasks.join_next().await {
+            let inner = result.expect("task should not panic");
+            assert!(inner.is_ok(), "request should succeed: {inner:?}");
+            ok_count += 1;
+        }
+        ok_count
+    })
+    .await;
+    let ok_count = result.expect("full stack should complete within 10s (no deadlock)");
+    assert_eq!(ok_count, 10, "all 10 requests should succeed");
+    assert!(budget_state.global_spend() > 0.0, "budget should have recorded spend");
+}