liter_llm 1.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +239 -0
  3. data/ext/liter_llm_rb/extconf.rb +65 -0
  4. data/ext/liter_llm_rb/native/.cargo/config.toml +23 -0
  5. data/ext/liter_llm_rb/native/Cargo.lock +3713 -0
  6. data/ext/liter_llm_rb/native/Cargo.toml +32 -0
  7. data/ext/liter_llm_rb/native/build.rs +15 -0
  8. data/ext/liter_llm_rb/native/src/lib.rs +1079 -0
  9. data/lib/liter_llm.rb +8 -0
  10. data/sig/liter_llm.rbs +416 -0
  11. data/vendor/Cargo.toml +54 -0
  12. data/vendor/liter-llm/Cargo.toml +92 -0
  13. data/vendor/liter-llm/README.md +252 -0
  14. data/vendor/liter-llm/schemas/pricing.json +40 -0
  15. data/vendor/liter-llm/schemas/providers.json +1662 -0
  16. data/vendor/liter-llm/src/auth/azure_ad.rs +264 -0
  17. data/vendor/liter-llm/src/auth/bedrock_sts.rs +353 -0
  18. data/vendor/liter-llm/src/auth/mod.rs +68 -0
  19. data/vendor/liter-llm/src/auth/vertex_oauth.rs +353 -0
  20. data/vendor/liter-llm/src/client/config.rs +351 -0
  21. data/vendor/liter-llm/src/client/managed.rs +622 -0
  22. data/vendor/liter-llm/src/client/mod.rs +864 -0
  23. data/vendor/liter-llm/src/cost.rs +212 -0
  24. data/vendor/liter-llm/src/error.rs +190 -0
  25. data/vendor/liter-llm/src/http/eventstream.rs +860 -0
  26. data/vendor/liter-llm/src/http/mod.rs +12 -0
  27. data/vendor/liter-llm/src/http/request.rs +438 -0
  28. data/vendor/liter-llm/src/http/retry.rs +72 -0
  29. data/vendor/liter-llm/src/http/streaming.rs +289 -0
  30. data/vendor/liter-llm/src/lib.rs +37 -0
  31. data/vendor/liter-llm/src/provider/anthropic.rs +2250 -0
  32. data/vendor/liter-llm/src/provider/azure.rs +579 -0
  33. data/vendor/liter-llm/src/provider/bedrock.rs +1543 -0
  34. data/vendor/liter-llm/src/provider/cohere.rs +654 -0
  35. data/vendor/liter-llm/src/provider/custom.rs +404 -0
  36. data/vendor/liter-llm/src/provider/google_ai.rs +281 -0
  37. data/vendor/liter-llm/src/provider/mistral.rs +188 -0
  38. data/vendor/liter-llm/src/provider/mod.rs +616 -0
  39. data/vendor/liter-llm/src/provider/vertex.rs +1504 -0
  40. data/vendor/liter-llm/src/tests.rs +1425 -0
  41. data/vendor/liter-llm/src/tokenizer.rs +281 -0
  42. data/vendor/liter-llm/src/tower/budget.rs +599 -0
  43. data/vendor/liter-llm/src/tower/cache.rs +502 -0
  44. data/vendor/liter-llm/src/tower/cache_opendal.rs +270 -0
  45. data/vendor/liter-llm/src/tower/cooldown.rs +231 -0
  46. data/vendor/liter-llm/src/tower/cost.rs +404 -0
  47. data/vendor/liter-llm/src/tower/fallback.rs +121 -0
  48. data/vendor/liter-llm/src/tower/health.rs +219 -0
  49. data/vendor/liter-llm/src/tower/hooks.rs +369 -0
  50. data/vendor/liter-llm/src/tower/mod.rs +77 -0
  51. data/vendor/liter-llm/src/tower/rate_limit.rs +300 -0
  52. data/vendor/liter-llm/src/tower/router.rs +436 -0
  53. data/vendor/liter-llm/src/tower/service.rs +181 -0
  54. data/vendor/liter-llm/src/tower/tests.rs +539 -0
  55. data/vendor/liter-llm/src/tower/tests_common.rs +252 -0
  56. data/vendor/liter-llm/src/tower/tracing.rs +209 -0
  57. data/vendor/liter-llm/src/tower/types.rs +170 -0
  58. data/vendor/liter-llm/src/types/audio.rs +52 -0
  59. data/vendor/liter-llm/src/types/batch.rs +77 -0
  60. data/vendor/liter-llm/src/types/chat.rs +214 -0
  61. data/vendor/liter-llm/src/types/common.rs +244 -0
  62. data/vendor/liter-llm/src/types/embedding.rs +84 -0
  63. data/vendor/liter-llm/src/types/files.rs +58 -0
  64. data/vendor/liter-llm/src/types/image.rs +40 -0
  65. data/vendor/liter-llm/src/types/mod.rs +27 -0
  66. data/vendor/liter-llm/src/types/models.rs +21 -0
  67. data/vendor/liter-llm/src/types/moderation.rs +80 -0
  68. data/vendor/liter-llm/src/types/ocr.rs +87 -0
  69. data/vendor/liter-llm/src/types/rerank.rs +46 -0
  70. data/vendor/liter-llm/src/types/responses.rs +55 -0
  71. data/vendor/liter-llm/src/types/search.rs +45 -0
  72. data/vendor/liter-llm/tests/contract.rs +332 -0
  73. data/vendor/liter-llm-ffi/Cargo.toml +30 -0
  74. data/vendor/liter-llm-ffi/build.rs +66 -0
  75. data/vendor/liter-llm-ffi/cbindgen.toml +60 -0
  76. data/vendor/liter-llm-ffi/liter_llm.h +850 -0
  77. data/vendor/liter-llm-ffi/src/lib.rs +2488 -0
  78. metadata +286 -0
@@ -0,0 +1,270 @@
1
+ //! OpenDAL-backed cache store for the response cache.
2
+ //!
3
+ //! Implements [`CacheStore`] using an [`opendal::Operator`] for persistence.
4
+ //! Supports any OpenDAL backend (S3, Redis, GCS, local filesystem, etc.).
5
+
6
+ use std::collections::HashMap;
7
+ use std::future::Future;
8
+ use std::pin::Pin;
9
+ use std::str::FromStr;
10
+ use std::time::{Duration, SystemTime, UNIX_EPOCH};
11
+
12
+ use opendal::Operator;
13
+ use serde::{Deserialize, Serialize};
14
+
15
+ use super::cache::{CacheStore, CachedResponse};
16
+
17
+ /// A cache entry as persisted through OpenDAL (serialized to JSON), carrying the response
18
+ /// plus the metadata used for the TTL check and collision detection on read.
19
+ #[derive(Serialize, Deserialize)]
20
+ struct StoredEntry {
21
+ request_body: String, // compared with the caller's request body on `get`; a mismatch is a miss
22
+ response: CachedResponse, // payload handed back on a cache hit
23
+ /// Unix timestamp (seconds) after which this entry is treated as expired (`now > expires_at`).
24
+ expires_at: u64,
25
+ }
26
+
27
+ /// Cache store backed by an [`opendal::Operator`].
28
+ ///
29
+ /// Entries are stored as JSON files under `{prefix}/{key}`. TTL is embedded
30
+ /// in the stored entry and checked on read. Backend failures are non-fatal:
31
+ /// they log a warning and behave as a cache miss / no-op.
32
+ pub struct OpenDalCacheStore {
33
+ operator: Operator,
34
+ prefix: String,
35
+ ttl: Duration,
36
+ }
37
+
38
+ impl OpenDalCacheStore {
39
+ /// Create a new OpenDAL cache store.
40
+ ///
41
+ /// `operator` must be a fully configured OpenDAL operator.
42
+ /// `prefix` is prepended to all cache keys (e.g. `"llm-cache/"`).
43
+ /// `ttl` controls how long entries are valid.
44
+ pub fn new(operator: Operator, prefix: impl Into<String>, ttl: Duration) -> Self {
45
+ Self {
46
+ operator,
47
+ prefix: prefix.into(),
48
+ ttl,
49
+ }
50
+ }
51
+
52
+ /// Build an OpenDAL operator from a scheme name and config map.
53
+ ///
54
+ /// # Errors
55
+ /// Returns an error if the scheme is unknown or the config is invalid.
56
+ pub fn from_config(
57
+ scheme: &str,
58
+ config: HashMap<String, String>,
59
+ prefix: impl Into<String>,
60
+ ttl: Duration,
61
+ ) -> crate::error::Result<Self> {
62
+ let parsed_scheme =
63
+ opendal::Scheme::from_str(scheme).map_err(|e| crate::error::LiterLlmError::InternalError {
64
+ message: format!("unknown OpenDAL scheme '{scheme}': {e}"),
65
+ })?;
66
+ let operator =
67
+ Operator::via_iter(parsed_scheme, config).map_err(|e| crate::error::LiterLlmError::InternalError {
68
+ message: format!("failed to build OpenDAL operator for '{scheme}': {e}"),
69
+ })?;
70
+ Ok(Self::new(operator, prefix, ttl))
71
+ }
72
+
73
+ fn key_path(&self, key: u64) -> String {
74
+ format!("{}{key}", self.prefix)
75
+ }
76
+
77
+ fn now_secs() -> u64 {
78
+ SystemTime::now()
79
+ .duration_since(UNIX_EPOCH)
80
+ .unwrap_or_default()
81
+ .as_secs()
82
+ }
83
+ }
84
+
85
+ impl CacheStore for OpenDalCacheStore {
86
+ fn get(&self, key: u64, request_body: &str) -> Pin<Box<dyn Future<Output = Option<CachedResponse>> + Send + '_>> {
87
+ let path = self.key_path(key);
88
+ let request_body = request_body.to_owned();
89
+ Box::pin(async move {
90
+ let bytes = match self.operator.read(&path).await {
91
+ Ok(b) => b,
92
+ Err(_) => return None,
93
+ };
94
+ let entry: StoredEntry = match serde_json::from_slice(bytes.to_bytes().as_ref()) {
95
+ Ok(e) => e,
96
+ Err(_) => return None,
97
+ };
98
+ // Check TTL
99
+ if Self::now_secs() > entry.expires_at {
100
+ // Lazily delete expired entry
101
+ let _ = self.operator.delete(&path).await;
102
+ return None;
103
+ }
104
+ // Verify request body matches (collision guard)
105
+ if entry.request_body != request_body {
106
+ return None;
107
+ }
108
+ Some(entry.response)
109
+ })
110
+ }
111
+
112
+ fn put(
113
+ &self,
114
+ key: u64,
115
+ request_body: String,
116
+ response: CachedResponse,
117
+ ) -> Pin<Box<dyn Future<Output = ()> + Send + '_>> {
118
+ let path = self.key_path(key);
119
+ let entry = StoredEntry {
120
+ request_body,
121
+ response,
122
+ expires_at: Self::now_secs() + self.ttl.as_secs(),
123
+ };
124
+ Box::pin(async move {
125
+ let bytes = match serde_json::to_vec(&entry) {
126
+ Ok(b) => b,
127
+ Err(e) => {
128
+ tracing::warn!("OpenDAL cache: failed to serialize entry: {e}");
129
+ return;
130
+ }
131
+ };
132
+ if let Err(e) = self.operator.write(&path, bytes).await {
133
+ tracing::warn!("OpenDAL cache: failed to write {path}: {e}");
134
+ }
135
+ })
136
+ }
137
+
138
+ fn remove(&self, key: u64) -> Pin<Box<dyn Future<Output = ()> + Send + '_>> {
139
+ let path = self.key_path(key);
140
+ Box::pin(async move {
141
+ if let Err(e) = self.operator.delete(&path).await {
142
+ tracing::warn!("OpenDAL cache: failed to delete {path}: {e}");
143
+ }
144
+ })
145
+ }
146
+ }
147
+
148
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tower::cache::{CacheStore, CachedResponse};
    use crate::types::{AssistantMessage, ChatCompletionResponse, Choice, FinishReason};

    /// Build a store on OpenDAL's in-memory backend with the given TTL.
    fn memory_store(ttl_secs: u64) -> OpenDalCacheStore {
        let op = Operator::via_iter(opendal::Scheme::Memory, std::iter::empty::<(String, String)>())
            .expect("memory backend should always build");
        OpenDalCacheStore::new(op, "test/", Duration::from_secs(ttl_secs))
    }

    /// Minimal chat completion used as the cached payload.
    fn dummy_response() -> CachedResponse {
        CachedResponse::Chat(ChatCompletionResponse {
            id: "test-resp-001".into(),
            object: "chat.completion".into(),
            created: 1_700_000_000,
            model: "gpt-4".into(),
            choices: vec![Choice {
                index: 0,
                message: AssistantMessage {
                    content: Some("Hello!".into()),
                    name: None,
                    tool_calls: None,
                    refusal: None,
                    function_call: None,
                },
                finish_reason: Some(FinishReason::Stop),
            }],
            usage: None,
            system_fingerprint: None,
            service_tier: None,
        })
    }

    #[tokio::test]
    async fn put_and_get_round_trip() {
        let store = memory_store(300);
        store.put(42, "request-body-a".into(), dummy_response()).await;
        let cached = store.get(42, "request-body-a").await;
        assert!(cached.is_some(), "expected a cached response after put");
        match cached.unwrap() {
            CachedResponse::Chat(resp) => {
                assert_eq!(resp.id, "test-resp-001");
                assert_eq!(resp.model, "gpt-4");
            }
            _ => panic!("expected CachedResponse::Chat variant"),
        }
    }

    #[tokio::test]
    async fn get_returns_none_for_missing_key() {
        let store = memory_store(300);
        let result = store.get(999, "any-body").await;
        assert!(result.is_none(), "expected None for a key that was never stored");
    }

    #[tokio::test]
    async fn get_returns_none_for_wrong_request_body() {
        let store = memory_store(300);
        store.put(1, "body-alpha".into(), dummy_response()).await;
        // Same key but different request body should miss (collision guard).
        let result = store.get(1, "body-beta").await;
        assert!(result.is_none(), "expected None when request body does not match");
    }

    #[tokio::test]
    async fn expired_entry_returns_none() {
        let store = memory_store(300);
        let path = store.key_path(1);
        // Write an entry whose expiry is the Unix epoch itself. This makes the
        // test deterministic and instant: no sleeping until the whole-second
        // wall clock moves past `expires_at`.
        let expired = StoredEntry {
            request_body: "req".into(),
            response: dummy_response(),
            expires_at: 0,
        };
        let bytes = serde_json::to_vec(&expired).expect("entry should serialize");
        store
            .operator
            .write(&path, bytes)
            .await
            .expect("memory backend write should succeed");

        let result = store.get(1, "req").await;
        assert!(result.is_none(), "expected None for expired entry");
        // The expired entry is deleted lazily by `get`.
        assert!(
            store.operator.read(&path).await.is_err(),
            "expected expired entry to be deleted on read"
        );
    }

    #[tokio::test]
    async fn remove_deletes_entry() {
        let store = memory_store(300);
        store.put(7, "req".into(), dummy_response()).await;
        // Confirm it exists first.
        assert!(store.get(7, "req").await.is_some());
        // Remove and verify it is gone.
        store.remove(7).await;
        assert!(store.get(7, "req").await.is_none(), "expected None after remove");
    }

    #[tokio::test]
    async fn overwrite_replaces_previous_entry() {
        let store = memory_store(300);
        store.put(1, "req".into(), dummy_response()).await;

        // Overwrite with a different response.
        let replacement = CachedResponse::Chat(ChatCompletionResponse {
            id: "test-resp-002".into(),
            object: "chat.completion".into(),
            created: 1_700_000_001,
            model: "gpt-4o".into(),
            choices: vec![],
            usage: None,
            system_fingerprint: None,
            service_tier: None,
        });
        store.put(1, "req".into(), replacement).await;

        match store.get(1, "req").await {
            Some(CachedResponse::Chat(resp)) => assert_eq!(resp.id, "test-resp-002"),
            _ => panic!("expected updated CachedResponse::Chat variant"),
        }
    }

    #[test]
    fn from_config_rejects_unknown_scheme() {
        let result = OpenDalCacheStore::from_config(
            "nonexistent_backend_xyz",
            std::collections::HashMap::new(),
            "prefix/",
            Duration::from_secs(60),
        );
        assert!(result.is_err(), "expected error for unknown scheme");
    }
}
@@ -0,0 +1,231 @@
1
+ //! Deployment cooldown middleware.
2
+ //!
3
+ //! [`CooldownLayer`] wraps a service and implements a cooldown period after
4
+ //! transient errors. When the inner service returns a transient error (as
5
+ //! determined by [`LiterLlmError::is_transient`]), the service is marked as
6
+ //! cooling down for a configurable duration. During the cooldown period,
7
+ //! incoming requests are immediately rejected with
8
+ //! [`LiterLlmError::ServiceUnavailable`] without calling the inner service.
9
+
10
+ use std::sync::Arc;
11
+ use std::task::{Context, Poll};
12
+ use std::time::{Duration, Instant};
13
+
14
+ use tokio::sync::RwLock;
15
+ use tower::{Layer, Service};
16
+
17
+ use super::types::{LlmRequest, LlmResponse};
18
+ use crate::client::BoxFuture;
19
+ use crate::error::{LiterLlmError, Result};
20
+
21
+ // ---- State -----------------------------------------------------------------
22
+
23
+ struct CooldownState { // cooldown bookkeeping, shared behind a lock by all clones of one service
24
+ /// `None` when not cooling down; `Some(start)` holds the instant at which a transient error started a cooldown.
25
+ cooldown_start: Option<Instant>,
26
+ }
27
+
28
+ // ---- Layer -----------------------------------------------------------------
29
+
30
/// Tower [`Layer`] that applies a cooldown period after transient errors.
///
/// The layer only carries the configured cooldown duration, so it is cheap to
/// clone and can be inspected with `{:?}`.
#[derive(Debug, Clone)]
pub struct CooldownLayer {
    /// How long requests are rejected after a transient error.
    duration: Duration,
}

impl CooldownLayer {
    /// Create a new cooldown layer.
    ///
    /// After a transient error, the wrapped service will reject all requests
    /// for `duration` before allowing traffic through again.
    #[must_use]
    pub fn new(duration: Duration) -> Self {
        Self { duration }
    }
}
45
+
46
+ impl<S> Layer<S> for CooldownLayer {
47
+ type Service = CooldownService<S>;
48
+
49
+ fn layer(&self, inner: S) -> Self::Service {
50
+ CooldownService {
51
+ inner,
52
+ duration: self.duration,
53
+ state: Arc::new(RwLock::new(CooldownState { cooldown_start: None })),
54
+ }
55
+ }
56
+ }
57
+
58
+ // ---- Service ---------------------------------------------------------------
59
+
60
+ /// Tower service produced by [`CooldownLayer`].
61
+ pub struct CooldownService<S> {
62
+ inner: S,
63
+ duration: Duration,
64
+ state: Arc<RwLock<CooldownState>>,
65
+ }
66
+
67
+ impl<S: Clone> Clone for CooldownService<S> {
68
+ fn clone(&self) -> Self {
69
+ Self {
70
+ inner: self.inner.clone(),
71
+ duration: self.duration,
72
+ state: Arc::clone(&self.state),
73
+ }
74
+ }
75
+ }
76
+
77
+ impl<S> Service<LlmRequest> for CooldownService<S>
78
+ where
79
+ S: Service<LlmRequest, Response = LlmResponse, Error = LiterLlmError> + Send + Clone + 'static,
80
+ S::Future: Send + 'static,
81
+ {
82
+ type Response = LlmResponse;
83
+ type Error = LiterLlmError;
84
+ type Future = BoxFuture<'static, LlmResponse>;
85
+
86
+ fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<()>> {
87
+ self.inner.poll_ready(cx)
88
+ }
89
+
90
+ fn call(&mut self, req: LlmRequest) -> Self::Future {
91
+ let state = Arc::clone(&self.state);
92
+ let duration = self.duration;
93
+ // IMPORTANT: do NOT call self.inner.call(req) here — the inner service
94
+ // must only be invoked *after* the cooldown check passes inside the
95
+ // async block. Calling it eagerly would send the request even when the
96
+ // service is in a cooldown period.
97
+ let mut inner = self.inner.clone();
98
+
99
+ Box::pin(async move {
100
+ // Check whether we are in a cooldown period.
101
+ {
102
+ let read = state.read().await;
103
+ if let Some(start) = read.cooldown_start {
104
+ if start.elapsed() < duration {
105
+ return Err(LiterLlmError::ServiceUnavailable {
106
+ message: format!(
107
+ "service is cooling down for {:.0}s after a transient error",
108
+ duration.as_secs_f64()
109
+ ),
110
+ });
111
+ }
112
+ // Cooldown has expired — we need to reset it.
113
+ // Drop the read lock first, then take the write lock.
114
+ drop(read);
115
+ let mut write = state.write().await;
116
+ // Double-check under write lock (another task may have reset it).
117
+ if let Some(s) = write.cooldown_start
118
+ && s.elapsed() >= duration
119
+ {
120
+ write.cooldown_start = None;
121
+ }
122
+ }
123
+ }
124
+
125
+ // Only call the inner service after cooldown check passes.
126
+ match inner.call(req).await {
127
+ Ok(resp) => Ok(resp),
128
+ Err(e) if e.is_transient() => {
129
+ // Enter cooldown.
130
+ let mut write = state.write().await;
131
+ write.cooldown_start = Some(Instant::now());
132
+ Err(e)
133
+ }
134
+ Err(e) => Err(e),
135
+ }
136
+ })
137
+ }
138
+ }
139
+
140
+ // ---- Tests -----------------------------------------------------------------
141
+
142
#[cfg(test)]
mod tests {
    use tower::{Layer as _, Service as _};

    use super::*;
    use crate::tower::service::LlmService;
    use crate::tower::tests_common::{MockClient, chat_req};
    use crate::tower::types::LlmRequest;

    /// The chat request sent by every test in this module.
    fn gpt4_request() -> LlmRequest {
        LlmRequest::Chat(chat_req("gpt-4"))
    }

    /// A successful inner call is returned unchanged.
    #[tokio::test]
    async fn passes_through_on_success() {
        let mut svc = CooldownLayer::new(Duration::from_secs(10)).layer(LlmService::new(MockClient::ok()));

        let resp = svc.call(gpt4_request()).await;
        assert!(resp.is_ok());
    }

    /// A transient error is passed through once, then requests are rejected.
    #[tokio::test]
    async fn enters_cooldown_after_transient_error() {
        let mut svc =
            CooldownLayer::new(Duration::from_secs(60)).layer(LlmService::new(MockClient::failing_timeout()));

        // The first request reaches the inner service and fails transiently.
        let first = svc.call(gpt4_request()).await;
        let err = first.expect_err("should fail");
        assert!(matches!(err, LiterLlmError::Timeout));

        // The follow-up request is answered by the cooldown, not the inner service.
        let second = svc.call(gpt4_request()).await;
        let err = second.expect_err("should be in cooldown");
        assert!(
            matches!(err, LiterLlmError::ServiceUnavailable { .. }),
            "expected ServiceUnavailable during cooldown, got {err:?}"
        );
    }

    /// Once the cooldown has elapsed, requests reach the inner service again.
    #[tokio::test]
    async fn cooldown_expires_after_duration() {
        // A zero-length cooldown is over as soon as it starts.
        let mut svc =
            CooldownLayer::new(Duration::from_millis(0)).layer(LlmService::new(MockClient::failing_timeout()));

        // Trigger the cooldown with a transient failure.
        let first = svc.call(gpt4_request()).await;
        first.expect_err("should fail");

        // The cooldown has already expired, so this request must reach the
        // inner service and surface its Timeout rather than ServiceUnavailable.
        let second = svc.call(gpt4_request()).await;
        let err = second.expect_err("should fail");
        assert!(
            matches!(err, LiterLlmError::Timeout),
            "expected Timeout (cooldown expired), got {err:?}"
        );
    }

    /// Errors that are not transient never start a cooldown.
    #[tokio::test]
    async fn non_transient_error_does_not_trigger_cooldown() {
        let mut svc =
            CooldownLayer::new(Duration::from_secs(60)).layer(LlmService::new(MockClient::failing_auth()));

        // A non-transient failure from the inner service.
        let first = svc.call(gpt4_request()).await;
        first.expect_err("should fail");

        // The next request must still reach the inner service.
        let second = svc.call(gpt4_request()).await;
        let err = second.expect_err("should fail with auth, not cooldown");
        assert!(
            matches!(err, LiterLlmError::BadRequest { .. }),
            "expected BadRequest (auth), not ServiceUnavailable, got {err:?}"
        );
    }
}