model-library 0.1.3-py3-none-any.whl → 0.1.5-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (37)
  1. model_library/base/base.py +2 -0
  2. model_library/base/output.py +24 -9
  3. model_library/base/utils.py +27 -5
  4. model_library/config/README.md +169 -0
  5. model_library/config/ai21labs_models.yaml +11 -11
  6. model_library/config/alibaba_models.yaml +21 -22
  7. model_library/config/all_models.json +4623 -2599
  8. model_library/config/amazon_models.yaml +100 -102
  9. model_library/config/anthropic_models.yaml +43 -52
  10. model_library/config/cohere_models.yaml +25 -24
  11. model_library/config/deepseek_models.yaml +28 -25
  12. model_library/config/dummy_model.yaml +9 -7
  13. model_library/config/fireworks_models.yaml +86 -56
  14. model_library/config/google_models.yaml +146 -126
  15. model_library/config/inception_models.yaml +6 -6
  16. model_library/config/kimi_models.yaml +13 -14
  17. model_library/config/minimax_models.yaml +37 -0
  18. model_library/config/mistral_models.yaml +85 -29
  19. model_library/config/openai_models.yaml +192 -150
  20. model_library/config/perplexity_models.yaml +10 -23
  21. model_library/config/together_models.yaml +115 -104
  22. model_library/config/xai_models.yaml +47 -79
  23. model_library/config/zai_models.yaml +23 -15
  24. model_library/exceptions.py +7 -16
  25. model_library/providers/amazon.py +32 -17
  26. model_library/providers/minimax.py +33 -0
  27. model_library/providers/mistral.py +10 -1
  28. model_library/providers/openai.py +2 -6
  29. model_library/register_models.py +36 -36
  30. model_library/registry_utils.py +78 -16
  31. model_library/utils.py +2 -2
  32. {model_library-0.1.3.dist-info → model_library-0.1.5.dist-info}/METADATA +2 -2
  33. model_library-0.1.5.dist-info/RECORD +64 -0
  34. model_library-0.1.3.dist-info/RECORD +0 -61
  35. {model_library-0.1.3.dist-info → model_library-0.1.5.dist-info}/WHEEL +0 -0
  36. {model_library-0.1.3.dist-info → model_library-0.1.5.dist-info}/licenses/LICENSE +0 -0
  37. {model_library-0.1.3.dist-info → model_library-0.1.5.dist-info}/top_level.txt +0 -0
model_library/config/xai_models.yaml

```diff
@@ -2,20 +2,21 @@ base-config:
   company: xAI
   documentation_url: https://docs.x.ai/docs#models
   open_source: false
-  class_properties:
-    supports_images: true
+  supports:
+    images: true
+    files: false
+    tools: true
+  metadata:
     available_as_evaluator: false
-    supports_files: false
     available_for_everyone: true
     ignored_for_cost: false
-    supports_tools: true
   properties:
     reasoning_model: false
 
 xai-models:
   base-config:
-    class_properties:
-      supports_temperature: true
+    supports:
+      temperature: true
     costs_per_million_token:
       cache:
         read_discount: 0.25
```
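The recurring change across these config files is a schema migration: the flat `class_properties` block with `supports_*` booleans is split into a `supports:` mapping for capability flags and a `metadata:` mapping for bookkeeping flags like `deprecated`. A minimal sketch of how a loader might read the new shape — `ModelCapabilities` and `load_capabilities` are illustrative names, not part of model-library's API:

```python
# Hypothetical sketch of reading the new `supports:` / `metadata:` layout.
from dataclasses import dataclass

import yaml  # assumes PyYAML is available


@dataclass
class ModelCapabilities:
    images: bool = False
    files: bool = False
    tools: bool = False
    temperature: bool = False
    deprecated: bool = False


def load_capabilities(raw: dict) -> ModelCapabilities:
    # New schema: capability flags live under `supports`,
    # bookkeeping flags under `metadata`.
    supports = raw.get("supports", {})
    metadata = raw.get("metadata", {})
    return ModelCapabilities(
        images=supports.get("images", False),
        files=supports.get("files", False),
        tools=supports.get("tools", False),
        temperature=supports.get("temperature", False),
        deprecated=metadata.get("deprecated", False),
    )


doc = yaml.safe_load("""
base-config:
  supports:
    images: true
    tools: true
  metadata:
    deprecated: false
""")
print(load_capabilities(doc["base-config"]))  # images/tools True, rest False
```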
model_library/config/xai_models.yaml (continued)

```diff
@@ -29,19 +30,16 @@ xai-models:
     release_date: 2025-08-25
     properties:
       context_window: 256_000
-      max_token_output: 40_000
+      max_tokens: 40_000
       reasoning_model: true
-    class_properties:
-      supports_images: false
+    supports:
+      images: false
     costs_per_million_token:
       input: 0.20
       output: 1.50
       cache:
         read: 0.02
     documentation_url: https://docs.x.ai/docs/models/grok-code-fast-1
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 40000
     alternative_keys:
       - grok/grok-code-fast
       - grok/grok-code-fast-1-0825
@@ -51,16 +49,12 @@ xai-models:
     description: Latest advancement in cost-efficient reasoning models with unified architecture. Handles complex requests with deep chain-of-thought reasoning. Features 2M token context window and native tool use.
     release_date: 2025-09-19
     open_source: false
-    class_properties:
-      supports_images: true
-      available_as_evaluator: true
-      supports_metadata: true
-      supports_files: false
-      available_for_everyone: true
-      ignored_for_cost: false
+    supports:
+      images: true
+      files: false
     properties:
       context_window: 2_000_000
-      max_token_output: 2_000_000 # from openrouter
+      max_tokens: 2_000_000
       training_cutoff: null
       reasoning_model: true
     documentation_url: https://docs.x.ai/docs/models/grok-4-fast-reasoning
@@ -73,9 +67,6 @@ xai-models:
         threshold: 128_000
         input: 0.4
         output: 1.0
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 128000
     alternative_keys:
       - grok/grok-4-fast
       - grok/grok-4-fast-reasoning-latest
@@ -83,18 +74,14 @@ xai-models:
   grok/grok-4-1-fast-reasoning:
     label: Grok 4.1 Fast (Reasoning)
     description: ""
-    release_date: 2025-10-19
+    release_date: 2025-11-19
     open_source: false
-    class_properties:
-      supports_images: true
-      available_as_evaluator: true
-      supports_metadata: true
-      supports_files: false
-      available_for_everyone: true
-      ignored_for_cost: false
+    supports:
+      images: true
+      files: false
     properties:
       context_window: 2_000_000
-      max_token_output: 2_000_000 # from openrouter
+      max_tokens: 2_000_000 # from openrouter
       training_cutoff: null
       reasoning_model: true
     documentation_url: ""
@@ -107,25 +94,18 @@ xai-models:
         threshold: 128_000
         input: 0.4
         output: 1.0
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 128000
 
   grok/grok-4-1-fast-non-reasoning:
     label: Grok 4.1 Fast Non-Reasoning
     description: ""
-    release_date: 2025-10-19
+    release_date: 2025-11-19
     open_source: false
-    class_properties:
-      supports_images: true
-      available_as_evaluator: true
-      supports_metadata: true
-      supports_files: false
-      available_for_everyone: true
-      ignored_for_cost: false
+    supports:
+      images: true
+      files: false
     properties:
       context_window: 2_000_000
-      max_token_output: 2_000_000 # from openrouter
+      max_tokens: 2_000_000 # from openrouter
       training_cutoff: null
       reasoning_model: false
     documentation_url: ""
@@ -138,25 +118,18 @@ xai-models:
         threshold: 128_000
         input: 0.4
         output: 1.0
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 128000
 
   grok/grok-4-fast-non-reasoning:
     label: Grok 4 Fast (Non-Reasoning)
     description: Cost-efficient model focused on speed and efficiency for straightforward tasks like summarization or classification without deep logical processing. Unified architecture with reasoning variant, steered via system prompts.
     release_date: 2025-09-19
     open_source: false
-    class_properties:
-      supports_images: true
-      available_as_evaluator: true
-      supports_metadata: true
-      supports_files: false
-      available_for_everyone: true
-      ignored_for_cost: false
+    supports:
+      images: true
+      files: false
     properties:
       context_window: 2_000_000
-      max_token_output: 2_000_000 # from openrouter
+      max_tokens: 2_000_000
       training_cutoff: null
       reasoning_model: false
     documentation_url: https://docs.x.ai/docs/models/grok-4-fast-non-reasoning
@@ -169,9 +142,6 @@ xai-models:
         threshold: 128_000
         input: 0.4
         output: 1.0
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 2000000
     alternative_keys:
       - grok/grok-4-fast-non-reasoning-latest
 
@@ -179,12 +149,12 @@ xai-models:
     label: Grok 4
     description: Latest and greatest flagship model offering unparalleled performance in natural language, math and reasoning. The perfect jack of all trades with native tool use and structured outputs support.
     release_date: 2025-07-09
-    class_properties:
-      supports_images: true
-      available_for_everyone: false
+    supports:
+      images: true
+      tools: true
     properties:
       context_window: 256_000
-      max_token_output: 128_000
+      max_tokens: 128_000
       training_cutoff: null
       reasoning_model: true
     documentation_url: https://docs.x.ai/docs/models/grok-4-0709
@@ -197,9 +167,6 @@ xai-models:
         threshold: 128_000
         input: 6.00
         output: 30.00
-    default_parameters:
-      temperature: 0.7
-      max_output_tokens: 128000
     alternative_keys:
       - grok/grok-4
       - grok/grok-4-latest
@@ -210,15 +177,15 @@ xai-models:
     release_date: 2025-04-09
     properties:
       context_window: 131_072
-      max_token_output: null
+      max_tokens: null
       training_cutoff: null
       reasoning_model: true
-    class_properties:
+    metadata:
       deprecated: true
     costs_per_million_token:
       input: 0.30
       output: 0.50
-      cached:
+      cache:
         read: 0.075
     documentation_url: https://docs.x.ai/docs/models/grok-3-mini
     default_parameters:
@@ -248,7 +215,7 @@ xai-models:
     release_date: 2025-04-09
     properties:
       context_window: 131_072
-      max_token_output: null
+      max_tokens: null
       training_cutoff: null
     costs_per_million_token:
       input: 3.00
@@ -271,10 +238,10 @@ xai-models:
     release_date: 2024-12-12
     properties:
       context_window: 8_192
-      max_token_output: null
+      max_tokens: null
       training_cutoff: null
-    class_properties:
-      supports_images: true
+    supports:
+      images: true
     costs_per_million_token:
       input: 2.00
       output: 10.00
@@ -288,9 +255,9 @@ xai-models:
     release_date: 2024-12-11
     properties:
       context_window: 131_072
-      max_token_output: null
+      max_tokens: null
      training_cutoff: null
-    class_properties:
+    metadata:
       deprecated: true
     costs_per_million_token:
       input: 2.00
@@ -302,10 +269,11 @@ xai-models:
     release_date: 2024-12-12
     properties:
       context_window: 8_192
-      max_token_output: null
+      max_tokens: null
       training_cutoff: null
-    class_properties:
-      supports_images: true
+    supports:
+      images: true
+    metadata:
       deprecated: true
     costs_per_million_token:
       input: 5.00
@@ -317,9 +285,9 @@ xai-models:
     release_date: 2024-12-11
     properties:
      context_window: 131_072
-      max_token_output: null
+      max_tokens: null
       training_cutoff: null
-    class_properties:
+    metadata:
       deprecated: true
     costs_per_million_token:
       input: 5.00
```
model_library/config/zai_models.yaml

```diff
@@ -2,12 +2,13 @@ base-config:
   company: zAI
   open_source: true
   documentation_url: https://docs.z.ai/
-  class_properties:
-    supports_images: false
-    supports_files: false
+  supports:
+    images: false
+    files: false
+    temperature: true
+    tools: true
+  properties:
     reasoning_model: true
-    supports_temperature: true
-    supports_tools: true
   default_parameters:
     temperature: 0.6
     top_p: 1
@@ -23,14 +24,17 @@ zai-models:
     release_date: 2025-07-28
     properties:
       context_window: 128_000
-      max_token_output: 81_920
+      max_tokens: 81_920
     costs_per_million_token:
       input: 0.6
       output: 2.2
       cache:
         read: 0.11
     alternative_keys:
-      - fireworks/glm-4p5
+      - fireworks/glm-4p5:
+          costs_per_million_token:
+            input: 0.55
+            output: 2.19
 
   zai/glm-4.5-air:
     label: GLM 4.5 Air
@@ -38,14 +42,17 @@ zai-models:
     release_date: 2025-07-28
     properties:
       context_window: 128_000
-      max_token_output: 81_920
+      max_tokens: 81_920
     costs_per_million_token:
       input: 0.2
       output: 1.1
-    cache:
+      cache:
         read: 0.03
     alternative_keys:
-      - together/zai-org/GLM-4.5-Air-FP8
+      - together/zai-org/GLM-4.5-Air-FP8:
+          costs_per_million_token:
+            input: 0.22
+            output: 0.88
 
   zai/glm-4.6:
     label: GLM 4.6
@@ -53,13 +60,14 @@ zai-models:
     release_date: 2025-09-30
     properties:
       context_window: 200_000
-      max_token_output: 122_880
+      max_tokens: 122_880
     costs_per_million_token:
       input: 0.6
       output: 2.2
-    cache:
+      cache:
         read: 0.11
     alternative_keys:
-      - fireworks/glm-4p6
-
-
+      - fireworks/glm-4p6:
+          costs_per_million_token:
+            input: 0.55
+            output: 2.19
```
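The `alternative_keys` entries change shape here: a plain string key becomes a single-key mapping carrying per-provider cost overrides. A loader that accepts both forms could look like this sketch — `normalize_alternative_key` is an illustrative helper, not model-library's actual API:

```python
# Hypothetical sketch: accept both the old (string) and new (mapping with
# overrides) forms of an `alternative_keys` entry.
from typing import Any


def normalize_alternative_key(entry: str | dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Return (key, overrides); overrides are empty for plain-string entries."""
    if isinstance(entry, str):
        return entry, {}
    # New form: a single-key mapping, e.g.
    # {"fireworks/glm-4p5": {"costs_per_million_token": {"input": 0.55, ...}}}
    (key, overrides), = entry.items()
    return key, overrides or {}


print(normalize_alternative_key("fireworks/glm-4p5"))
print(normalize_alternative_key(
    {"fireworks/glm-4p5": {"costs_per_million_token": {"input": 0.55, "output": 2.19}}}
))
```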
model_library/exceptions.py

```diff
@@ -9,6 +9,7 @@ from anthropic import InternalServerError
 from anthropic import RateLimitError as AnthropicRateLimitError
 from backoff._typing import Details
 from httpcore import ReadError as HTTPCoreReadError
+from httpx import ConnectError as HTTPXConnectError
 from httpx import ReadError as HTTPXReadError
 from httpx import RemoteProtocolError
 from openai import APIConnectionError as OpenAIAPIConnectionError
@@ -54,20 +55,6 @@ class MaxOutputTokensExceededError(Exception):
         super().__init__(message or MaxOutputTokensExceededError.DEFAULT_MESSAGE)
 
 
-class MaxInputTokensExceededError(Exception):
-    """
-    Raised when the input exceeds the allowed max input tokens limit
-    """
-
-    DEFAULT_MESSAGE: str = (
-        "Input exceeded the maximum allowed input tokens. "
-        "Consider reducing the input size."
-    )
-
-    def __init__(self, message: str | None = None):
-        super().__init__(message or MaxInputTokensExceededError.DEFAULT_MESSAGE)
-
-
 class MaxContextWindowExceededError(Exception):
     """
     Raised when the context window exceeds the allowed max context window limit
@@ -98,7 +85,9 @@ CONTEXT_WINDOW_PATTERN = re.compile(
     r"sent message larger than max|"
     r"input tokens exceeded|"
     r"(messages?|total length).*too long|"
-    r"payload.*too large"
+    r"payload.*too large|"
+    r"string too long|"
+    r"input exceeded the context window"
 )
 
 
```
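The extended `CONTEXT_WINDOW_PATTERN` now also catches "string too long" and "input exceeded the context window" provider messages. A quick sketch of the pattern in isolation — the `re.IGNORECASE` flag and lower-casing are assumptions about how the library normalizes error text:

```python
import re

# Stand-alone copy of the extended pattern; the flag is an assumption.
CONTEXT_WINDOW_PATTERN = re.compile(
    r"sent message larger than max|"
    r"input tokens exceeded|"
    r"(messages?|total length).*too long|"
    r"payload.*too large|"
    r"string too long|"
    r"input exceeded the context window",
    re.IGNORECASE,
)

for msg in (
    "400: string too long, please shorten it",
    "Input exceeded the context window of the model",
    "sent message larger than max allowed",
):
    # search() rather than match(): the phrase can appear anywhere in the error
    assert CONTEXT_WINDOW_PATTERN.search(msg), msg
print("all messages classified as context-window errors")
```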
model_library/exceptions.py (continued)

```diff
@@ -171,6 +160,7 @@ RETRIABLE_EXCEPTIONS = [
     AI21RateLimitError,
     RemoteProtocolError,  # httpx connection closing when running models from sdk
     HTTPXReadError,
+    HTTPXConnectError,
     HTTPCoreReadError,
 ]
 
@@ -188,12 +178,13 @@ RETRIABLE_EXCEPTION_CODES = [
     "connection_error",
     "service_unavailable",
     "rate_limit",
+    "rate limit",
     "internal_error",
     "server_error",
     "overloaded",
     "throttling",  # AWS throttling errors
-    "throttlingexception",  # AWS throttling errors
     "internal server error",
+    "InternalServerError",
 ]
 
 
```
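Dropping `"throttlingexception"` makes sense if these codes are matched as case-normalized substrings, since any message containing it already contains `"throttling"`. A hedged sketch of that matching style — `is_retriable_code` is an illustrative helper; the library's real lookup may normalize differently, which would explain keeping both `"internal server error"` and `"InternalServerError"`:

```python
RETRIABLE_EXCEPTION_CODES = [
    "connection_error",
    "service_unavailable",
    "rate_limit",
    "rate limit",
    "internal_error",
    "server_error",
    "overloaded",
    "throttling",
    "internal server error",
    "InternalServerError",
]


def is_retriable_code(error_text: str) -> bool:
    # Substring match: "ThrottlingException" already contains "throttling"
    # once lower-cased, so a separate "throttlingexception" entry is redundant.
    lowered = error_text.lower()
    return any(code.lower() in lowered for code in RETRIABLE_EXCEPTION_CODES)


assert is_retriable_code("ThrottlingException: Rate exceeded")
assert is_retriable_code("Too Many Requests: rate limit reached")
```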
model_library/providers/amazon.py

```diff
@@ -26,6 +26,7 @@ from model_library.base import (
     ToolDefinition,
     ToolResult,
 )
+from model_library.base.input import FileBase
 from model_library.exceptions import (
     BadInputError,
     MaxOutputTokensExceededError,
@@ -60,11 +61,13 @@ class AmazonModel(LLM):
         config: LLMConfig | None = None,
     ):
         super().__init__(model_name, provider, config=config)
-        if self.model_name.endswith("-thinking"):
-            self.model_name = self.model_name.replace("-thinking", "")
-            self.reasoning = True
-            if self.max_tokens < 1024:
-                self.max_tokens = 2048
+        self.supports_cache = "amazon" in self.model_name or "claude" in self.model_name
+        self.supports_cache = (
+            self.supports_cache and "v2" not in self.model_name
+        )  # supported but no access yet
+        self.supports_tool_cache = self.supports_cache and "claude" in self.model_name
+
+    cache_control = {"type": "default"}
 
     @override
     async def parse_input(
@@ -120,6 +123,10 @@ class AmazonModel(LLM):
                 new_input.append(item)
 
         if content_user:
+            if self.supports_cache:
+                if not isinstance(input[-1], FileBase):
+                    # last item cannot be file
+                    content_user.append({"cachePoint": self.cache_control})
             new_input.append({"role": "user", "content": content_user})
 
         return new_input
```
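The new `supports_cache` path injects Bedrock prompt-caching checkpoints into the Converse payload. A hedged sketch of what the resulting user message looks like on the wire — the request shape follows the Bedrock Converse API, and the text content is illustrative:

```python
# Illustrative Converse-style message after parse_input appends a cache point.
# The {"cachePoint": {"type": "default"}} block asks Bedrock to cache the
# prompt prefix up to this marker; it must come after the content it caches,
# which is why the code skips it when the final input item is a file.
message = {
    "role": "user",
    "content": [
        {"text": "Summarize the attached report."},
        {"cachePoint": {"type": "default"}},
    ],
}
```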
model_library/providers/amazon.py (continued)

```diff
@@ -174,6 +181,8 @@ class AmazonModel(LLM):
                 }
             }
         )
+        if parsed_tools and self.supports_tool_cache:
+            parsed_tools.append({"cachePoint": self.cache_control})
         return parsed_tools
 
     @override
@@ -203,8 +212,12 @@ class AmazonModel(LLM):
 
         if "system_prompt" in kwargs:
             body["system"] = [{"text": kwargs.pop("system_prompt")}]
+            if self.supports_cache:
+                body["system"].append({"cachePoint": self.cache_control})
 
         if self.reasoning:
+            if self.max_tokens < 1024:
+                self.max_tokens = 2048
             budget_tokens = kwargs.pop(
                 "budget_tokens", get_default_budget_tokens(self.max_tokens)
             )
@@ -244,9 +257,8 @@ class AmazonModel(LLM):
         tool_calls: dict[str, Any] = {}
 
         messages: dict[str, Any] = {"content": []}
-        input_tokens = 0
-        output_tokens = 0
         stop_reason: str = ""
+        metadata = QueryResultMetadata()
 
         for chunk in response["stream"]:
             key = list(chunk.keys())[0]
@@ -281,8 +293,16 @@ class AmazonModel(LLM):
                        tool_calls["input"] += delta["toolUse"]["input"]
 
                 case "metadata":
-                    input_tokens = value["usage"]["inputTokens"]
-                    output_tokens = value["usage"]["outputTokens"]
+                    metadata = QueryResultMetadata(
+                        in_tokens=value["usage"]["inputTokens"],
+                        out_tokens=value["usage"]["outputTokens"],
+                    )
+                    metadata.cache_read_tokens = value["usage"].get(
+                        "cacheReadInputTokens", None
+                    )
+                    metadata.cache_write_tokens = value["usage"].get(
+                        "cacheWriteInputTokens", None
+                    )
 
                 case "contentBlockStop":
                     if tool_calls:
@@ -308,7 +328,7 @@ class AmazonModel(LLM):
                 case "messageStop":
                     stop_reason = value["stopReason"]
 
-        return messages, stop_reason, input_tokens, output_tokens
+        return messages, stop_reason, metadata
 
     # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html#
     @override
@@ -326,9 +346,7 @@ class AmazonModel(LLM):
             **body,
         )
 
-        messages, stop_reason, input_tokens, output_tokens = await self.stream_response(
-            response
-        )
+        messages, stop_reason, metadata = await self.stream_response(response)
 
         text = " ".join([i["text"] for i in messages["content"] if "text" in i])
         reasoning = " ".join(
@@ -361,10 +379,7 @@ class AmazonModel(LLM):
         return QueryResult(
             output_text=text,
             reasoning=reasoning,
-            metadata=QueryResultMetadata(
-                in_tokens=input_tokens,
-                out_tokens=output_tokens,
-            ),
+            metadata=metadata,
             tool_calls=tool_calls,
             history=[*input, messages],
         )
```
model_library/providers/minimax.py (new file)

```diff
@@ -0,0 +1,33 @@
+from typing import Literal
+
+from model_library import model_library_settings
+from model_library.base import (
+    DelegateOnly,
+    LLMConfig,
+)
+from model_library.providers.openai import OpenAIModel
+from model_library.register_models import register_provider
+from model_library.utils import create_openai_client_with_defaults
+
+
+@register_provider("minimax")
+class MinimaxModel(DelegateOnly):
+    def __init__(
+        self,
+        model_name: str,
+        provider: Literal["minimax"] = "minimax",
+        *,
+        config: LLMConfig | None = None,
+    ):
+        super().__init__(model_name, provider, config=config)
+
+        self.delegate = OpenAIModel(
+            model_name=self.model_name,
+            provider=self.provider,
+            config=config,
+            custom_client=create_openai_client_with_defaults(
+                api_key=model_library_settings.MINIMAX_API_KEY,
+                base_url="https://api.minimax.io/v1",
+            ),
+            use_completions=True,
+        )
```
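The new MiniMax provider is a thin delegate over MiniMax's OpenAI-compatible chat completions endpoint. A hedged usage sketch — the model key and the `query` call shape are assumptions, not confirmed by this diff; the real keys live in the new `minimax_models.yaml`:

```python
# Hypothetical usage sketch; model key and query interface are assumptions.
import asyncio

from model_library.providers.minimax import MinimaxModel


async def main() -> None:
    # Requires MINIMAX_API_KEY to be set in model-library's settings/env.
    model = MinimaxModel("MiniMax-M1")
    result = await model.query("Say hello in one short sentence.")
    print(result.output_text)


asyncio.run(main())
```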
model_library/providers/mistral.py

```diff
@@ -29,6 +29,7 @@ from model_library.base import (
 from model_library.exceptions import (
     BadInputError,
     MaxOutputTokensExceededError,
+    ModelNoOutputError,
 )
 from model_library.file_utils import trim_images
 from model_library.register_models import register_provider
@@ -250,9 +251,17 @@ class MistralModel(LLM):
             self.logger.error(f"Error: {e}", exc_info=True)
             raise e
 
-        if finish_reason == "length" and not text and not reasoning:
+        if (
+            finish_reason == "length"
+            and not text
+            and not reasoning
+            and not raw_tool_calls
+        ):
             raise MaxOutputTokensExceededError()
 
+        if not text and not reasoning and not raw_tool_calls:
+            raise ModelNoOutputError()
+
         tool_calls: list[ToolCall] = []
 
         for tool_call in raw_tool_calls or []:
```
model_library/providers/openai.py

```diff
@@ -521,10 +521,6 @@ class OpenAIModel(LLM):
         metadata: QueryResultMetadata = QueryResultMetadata()
         raw_tool_calls: list[ChatCompletionMessageToolCall] = []
 
-        # enable usage data in streaming responses
-        if "stream_options" not in body:
-            body["stream_options"] = {"include_usage": True}
-
         stream = await self.get_client().chat.completions.create(
             **body,  # pyright: ignore[reportAny]
             stream=True,
@@ -587,7 +583,7 @@ class OpenAIModel(LLM):
                 cache_read_tokens = (
                     chunk.usage.prompt_tokens_details.cached_tokens or 0
                     if chunk.usage.prompt_tokens_details
-                    else 0
+                    else getattr(chunk.usage, "cached_tokens", 0)  # for kimi
                 )
                 metadata = QueryResultMetadata(
                     in_tokens=chunk.usage.prompt_tokens - cache_read_tokens,
```
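The `getattr` fallback covers OpenAI-compatible backends (Kimi here) that report cached tokens as a top-level `usage.cached_tokens` field instead of under `usage.prompt_tokens_details`. A small sketch of the two shapes — the `SimpleNamespace` objects stand in for the real SDK usage objects:

```python
# Illustrative only: SimpleNamespace stands in for the SDK's usage objects.
from types import SimpleNamespace

openai_style = SimpleNamespace(
    prompt_tokens=100,
    prompt_tokens_details=SimpleNamespace(cached_tokens=40),
)
kimi_style = SimpleNamespace(
    prompt_tokens=100,
    cached_tokens=40,
    prompt_tokens_details=None,
)

for usage in (openai_style, kimi_style):
    cache_read_tokens = (
        usage.prompt_tokens_details.cached_tokens or 0
        if usage.prompt_tokens_details
        else getattr(usage, "cached_tokens", 0)
    )
    print(cache_read_tokens)  # 40 in both cases
```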
model_library/providers/openai.py (continued)

```diff
@@ -625,7 +621,7 @@ class OpenAIModel(LLM):
             if raw_tool_calls
             else None,
         )
-        if hasattr(final_message, "reasoning_content") and reasoning_text:
+        if reasoning_text:
             setattr(final_message, "reasoning_content", reasoning_text)
 
         return QueryResult(
```