model-library 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_library/base/base.py +141 -62
- model_library/base/delegate_only.py +77 -10
- model_library/base/output.py +43 -0
- model_library/base/utils.py +35 -0
- model_library/config/alibaba_models.yaml +49 -57
- model_library/config/all_models.json +353 -120
- model_library/config/anthropic_models.yaml +2 -1
- model_library/config/kimi_models.yaml +30 -3
- model_library/config/mistral_models.yaml +2 -0
- model_library/config/openai_models.yaml +15 -23
- model_library/config/together_models.yaml +2 -0
- model_library/config/xiaomi_models.yaml +43 -0
- model_library/config/zai_models.yaml +27 -3
- model_library/exceptions.py +3 -77
- model_library/providers/ai21labs.py +12 -8
- model_library/providers/alibaba.py +17 -8
- model_library/providers/amazon.py +49 -16
- model_library/providers/anthropic.py +128 -48
- model_library/providers/azure.py +22 -10
- model_library/providers/cohere.py +7 -7
- model_library/providers/deepseek.py +8 -8
- model_library/providers/fireworks.py +7 -8
- model_library/providers/google/batch.py +14 -10
- model_library/providers/google/google.py +57 -30
- model_library/providers/inception.py +7 -7
- model_library/providers/kimi.py +18 -8
- model_library/providers/minimax.py +15 -17
- model_library/providers/mistral.py +20 -8
- model_library/providers/openai.py +99 -22
- model_library/providers/openrouter.py +34 -0
- model_library/providers/perplexity.py +7 -7
- model_library/providers/together.py +7 -8
- model_library/providers/vals.py +12 -6
- model_library/providers/vercel.py +34 -0
- model_library/providers/xai.py +47 -42
- model_library/providers/xiaomi.py +34 -0
- model_library/providers/zai.py +38 -8
- model_library/register_models.py +5 -0
- model_library/registry_utils.py +48 -17
- model_library/retriers/__init__.py +0 -0
- model_library/retriers/backoff.py +73 -0
- model_library/retriers/base.py +225 -0
- model_library/retriers/token.py +427 -0
- model_library/retriers/utils.py +11 -0
- model_library/settings.py +1 -1
- model_library/utils.py +17 -7
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/METADATA +2 -1
- model_library-0.1.9.dist-info/RECORD +73 -0
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/WHEEL +1 -1
- model_library-0.1.7.dist-info/RECORD +0 -64
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/top_level.txt +0 -0
model_library/config/openai_models.yaml
CHANGED

@@ -35,6 +35,19 @@ gpt-5-models:
     training_cutoff: "2024-09"
     reasoning_model: true

+  openai/gpt-5.2-codex:
+    label: GPT 5.2 Codex
+    documentation_url: https://platform.openai.com/docs/models/gpt-5.2-codex
+    description: GPT 5.2 optimized for code
+    release_date: 2025-12-11
+    properties:
+      context_window: 400_000
+    costs_per_million_token:
+      input: 1.75
+      output: 14
+      cache:
+        read: 0.175
+
   openai/gpt-5.2-2025-12-11:
     label: GPT 5.2
     documentation_url: https://platform.openai.com/docs/models/gpt-5.2

@@ -70,7 +83,7 @@ gpt-5-models:
   openai/gpt-5.1-codex-max:
     label: GPT 5.1 Codex Max
     release_date: 2025-12-04
-    description:
+    description: GPT 5.1 optimized for code
     costs_per_million_token:
       input: 1.25
       output: 10.0

@@ -79,7 +92,7 @@ gpt-5-models:
   openai/gpt-5.1-codex:
     label: GPT 5.1 Codex
     documentation_url: https://platform.openai.com/docs/models/gpt-5.1-codex
-    description:
+    description: GPT 5.1 optimized for code
     release_date: 2025-11-13
     costs_per_million_token:
       input: 1.25

@@ -841,24 +854,3 @@ gpt-3.5-models:
       input: 1.5
       output: 2.0
     documentation_url: https://platform.openai.com/docs/models/gpt-3.5-turbo-instruct
-
-databricks-models:
-  base-config:
-    company: Databricks
-
-  databricks/databricks-dbrx-instruct:
-    label: DBRX Instruct
-    description: Databricks Instruct model.
-    release_date: 2024-03-27
-    properties:
-      context_window: 32_768
-      max_tokens: 4_096
-      training_cutoff: "2023-12"
-    metadata:
-      available_for_everyone: false
-      deprecated: true
-    costs_per_million_token:
-      input: 2.25
-      output: 6.75
-    alternative_keys:
-      - databricks/dbrx-instruct
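
The gpt-5.2-codex entry above is priced per million tokens, with a discounted rate for cached input reads. A minimal sketch (a hypothetical helper, not part of model_library) of how such an entry converts token counts into a dollar figure:

    # Hypothetical helper: turns a costs_per_million_token block like the
    # gpt-5.2-codex entry above into a per-call cost in USD.
    def query_cost(
        input_tokens: int,
        output_tokens: int,
        cached_input_tokens: int = 0,
        *,
        input_rate: float = 1.75,       # USD per 1M uncached input tokens
        output_rate: float = 14.0,      # USD per 1M output tokens
        cache_read_rate: float = 0.175, # USD per 1M cached input tokens
    ) -> float:
        uncached = input_tokens - cached_input_tokens
        return (
            uncached * input_rate
            + cached_input_tokens * cache_read_rate
            + output_tokens * output_rate
        ) / 1_000_000


    if __name__ == "__main__":
        # 100k prompt tokens, 40k of them served from cache, 2k completion tokens
        print(f"${query_cost(100_000, 2_000, cached_input_tokens=40_000):.4f}")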
model_library/config/xiaomi_models.yaml
ADDED

@@ -0,0 +1,43 @@
+base-config:
+  company: Xiaomi
+  open_source: true
+  documentation_url: https://platform.xiaomimimo.com/#/docs/
+  supports:
+    images: false
+    files: false
+    tools: true
+  metadata:
+    available_as_evaluator: false
+    available_for_everyone: true
+    ignored_for_cost: false
+  properties:
+    training_cutoff: December 2024
+
+xiaomi-models:
+  base-config:
+    properties:
+      context_window: 256000
+    supports:
+      temperature: true
+      top_p: true
+    default_parameters:
+      temperature: 0.3
+      top_p: 0.95
+
+  xiaomi/mimo-v2-flash:
+    label: MiMo V2 Flash
+    description:
+      MiMo V2 Flash is Xiaomi's Mixture-of-Experts (MoE) language model with
+      309B total parameters and 15B active parameters. Designed for high-speed
+      reasoning and agentic workflows, it utilizes a novel hybrid attention
+      architecture and Multi-Token Prediction (MTP) to achieve state-of-the-art
+      performance while significantly reducing inference costs.
+    release_date: 2025-12-17
+    properties:
+      context_window: 256000
+      max_tokens: 64000
+    costs_per_million_token:
+      input: 0.10
+      output: 0.30
+      cache:
+        read: 0.01
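
The new file stacks three layers of defaults: a file-level base-config, a group-level base-config under xiaomi-models, and the per-model entry. Assuming the registry deep-merges these layers with the most specific value winning (the actual loader is not shown in this diff), the idea looks roughly like this:

    # Illustrative sketch only; not model_library's real loader.
    from copy import deepcopy
    from typing import Any


    def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
        # Recursively merge dicts; override values win over base values.
        merged = deepcopy(base)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = deep_merge(merged[key], value)
            else:
                merged[key] = value
        return merged


    file_base = {"company": "Xiaomi", "properties": {"training_cutoff": "December 2024"}}
    group_base = {
        "properties": {"context_window": 256000},
        "default_parameters": {"temperature": 0.3, "top_p": 0.95},
    }
    model = {"label": "MiMo V2 Flash", "properties": {"context_window": 256000, "max_tokens": 64000}}

    resolved = deep_merge(deep_merge(file_base, group_base), model)
    print(resolved["properties"])
    # {'training_cutoff': 'December 2024', 'context_window': 256000, 'max_tokens': 64000}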
model_library/config/zai_models.yaml
CHANGED

@@ -18,6 +18,25 @@ base-config:
     write_markup: 1

 zai-models:
+  zai/glm-4.7-flashx:
+    label: GLM 4.7 Flash
+    description: "z.AI lightweight fast model"
+    release_date: 2026-01-19
+    properties:
+      context_window: 200_000
+      max_tokens: 128_000
+    costs_per_million_token:
+      input: 0.07
+      output: 0.4
+      cache:
+        read: 0.01
+    default_parameters:
+      # from https://huggingface.co/zai-org/GLM-4.7-Flash
+      temperature: 1
+      top_p: 0.95
+    provider_properties:
+      clear_thinking: false
+
   zai/glm-4.7:
     label: GLM 4.7
     description: "Latest model from ZAI"

@@ -32,6 +51,9 @@ zai-models:
         read: 0.11
     default_parameters:
       temperature: 1
+    alternative_keys:
+      - vercel/zai/glm-4.7
+
   zai/glm-4.5:
     label: GLM 4.5
     description: "z.AI old model"

@@ -46,9 +68,11 @@ zai-models:
         read: 0.11
     alternative_keys:
       - fireworks/glm-4p5:
-
-
-
+          metadata:
+            deprecated: true
+          costs_per_million_token:
+            input: 0.55
+            output: 2.19

   zai/glm-4.5-air:
     label: GLM 4.5 Air
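
Both zai hunks lean on alternative_keys, so that requests for vercel/zai/glm-4.7 or fireworks/glm-4p5 resolve to the canonical entry. A hedged sketch of such an alias lookup (the real logic lives in model_library/registry_utils.py, which this diff only summarizes):

    # Sketch of alternative_keys resolution; not the registry's actual code.
    CANONICAL = {
        "zai/glm-4.7": {"alternative_keys": ["vercel/zai/glm-4.7"]},
        "zai/glm-4.5": {"alternative_keys": ["fireworks/glm-4p5"]},
    }

    # Invert the mapping: every alias points back to its canonical key.
    ALIASES = {
        alias: canonical
        for canonical, entry in CANONICAL.items()
        for alias in entry.get("alternative_keys", [])
    }


    def resolve(key: str) -> str:
        return ALIASES.get(key, key)


    print(resolve("vercel/zai/glm-4.7"))   # -> zai/glm-4.7
    print(resolve("zai/glm-4.7-flashx"))   # unchanged: already canonical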
model_library/exceptions.py
CHANGED

@@ -1,13 +1,9 @@
-import logging
-import random
 import re
-from typing import Any
+from typing import Any

-import backoff
 from ai21 import TooManyRequestsError as AI21RateLimitError
 from anthropic import InternalServerError
 from anthropic import RateLimitError as AnthropicRateLimitError
-from backoff._typing import Details
 from httpcore import ReadError as HTTPCoreReadError
 from httpx import ConnectError as HTTPXConnectError
 from httpx import ReadError as HTTPXReadError

@@ -75,12 +71,14 @@ CONTEXT_WINDOW_PATTERN = re.compile(
     r"maximum context length is \d+ tokens|"
     r"context length is \d+ tokens|"
     r"exceed.* context (limit|window|length)|"
+    r"context window exceeds|"
     r"exceeds maximum length|"
     r"too long.*tokens.*maximum|"
     r"too large for model with \d+ maximum context length|"
     r"longer than the model's context length|"
     r"too many tokens.*size limit exceeded|"
     r"prompt is too long|"
+    r"maximum prompt length|"
     r"input length should be|"
     r"sent message larger than max|"
     r"input tokens exceeded|"

@@ -222,75 +220,3 @@ def exception_message(exception: Exception | Any) -> str:
         if str(exception)
         else type(exception).__name__
     )
-
-
-RETRY_MAX_TRIES: int = 20
-RETRY_INITIAL: float = 10.0
-RETRY_EXPO: float = 1.4
-RETRY_MAX_BACKOFF_WAIT: float = 240.0  # 4 minutes (more with jitter)
-
-
-def jitter(wait: float) -> float:
-    """
-    Increase or decrease the wait time by up to 20%.
-    """
-    jitter_fraction = 0.2
-    min_wait = wait * (1 - jitter_fraction)
-    max_wait = wait * (1 + jitter_fraction)
-    return random.uniform(min_wait, max_wait)
-
-
-def retry_llm_call(
-    logger: logging.Logger,
-    max_tries: int = RETRY_MAX_TRIES,
-    max_time: float | None = None,
-    backoff_callback: (
-        Callable[[int, Exception | None, float, float], None] | None
-    ) = None,
-):
-    def on_backoff(details: Details):
-        exception = details.get("exception")
-        tries = details.get("tries", 0)
-        elapsed = details.get("elapsed", 0.0)
-        wait = details.get("wait", 0.0)
-
-        logger.warning(
-            f"[Retrying] Exception: {exception_message(exception)} | Attempt: {tries} | "
-            + f"Elapsed: {elapsed:.1f}s | Next wait: {wait:.1f}s"
-        )
-
-        if backoff_callback:
-            backoff_callback(tries, exception, elapsed, wait)
-
-    def giveup(e: Exception) -> bool:
-        return not is_retriable_error(e)
-
-    def on_giveup(details: Details) -> None:
-        exception: Exception | None = details.get("exception", None)
-        if not exception:
-            return
-
-        logger.error(
-            f"Giving up after retries. Final exception: {exception_message(exception)}"
-        )
-
-        if is_context_window_error(exception):
-            message = exception.args[0] if exception.args else str(exception)
-            raise MaxContextWindowExceededError(message)
-
-        raise exception
-
-    return backoff.on_exception(
-        wait_gen=lambda: backoff.expo(
-            base=RETRY_EXPO,
-            factor=RETRY_INITIAL,
-            max_value=RETRY_MAX_BACKOFF_WAIT,
-        ),
-        exception=Exception,
-        max_tries=max_tries,
-        max_time=max_time,
-        giveup=giveup,
-        on_backoff=on_backoff,
-        on_giveup=on_giveup,
-        jitter=jitter,
-    )
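
The retry helpers removed here (per the file list, retry logic now lives under model_library/retriers/) encoded a specific policy: 10 s initial wait, exponential base 1.4, a 240 s cap, ±20% jitter, and up to 20 tries. A self-contained sketch of that policy, not the new retriers code itself:

    # Minimal async sketch of the retry behaviour the removed helpers encoded.
    import asyncio
    import random

    RETRY_MAX_TRIES = 20
    RETRY_INITIAL = 10.0
    RETRY_EXPO = 1.4
    RETRY_MAX_BACKOFF_WAIT = 240.0


    def jitter(wait: float) -> float:
        # Increase or decrease the wait time by up to 20%.
        return random.uniform(wait * 0.8, wait * 1.2)


    async def retry(call, is_retriable=lambda exc: True):
        for attempt in range(RETRY_MAX_TRIES):
            try:
                return await call()
            except Exception as exc:
                if attempt == RETRY_MAX_TRIES - 1 or not is_retriable(exc):
                    raise
                # 10s, 14s, 19.6s, ... capped at 240s, then jittered.
                wait = min(RETRY_INITIAL * RETRY_EXPO**attempt, RETRY_MAX_BACKOFF_WAIT)
                await asyncio.sleep(jitter(wait))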
model_library/providers/ai21labs.py
CHANGED

@@ -16,13 +16,13 @@ from model_library.base import (
     LLMConfig,
     QueryResult,
     QueryResultMetadata,
+    RawResponse,
     TextInput,
     ToolBody,
     ToolCall,
     ToolDefinition,
     ToolResult,
 )
-from model_library.base.input import RawResponse
 from model_library.exceptions import (
     BadInputError,
     MaxOutputTokensExceededError,

@@ -34,17 +34,21 @@ from model_library.utils import default_httpx_client

 @register_provider("ai21labs")
 class AI21LabsModel(LLM):
-
+    @override
+    def _get_default_api_key(self) -> str:
+        return model_library_settings.AI21LABS_API_KEY

     @override
-    def get_client(self) -> AsyncAI21Client:
-        if not
-
-
+    def get_client(self, api_key: str | None = None) -> AsyncAI21Client:
+        if not self.has_client():
+            assert api_key
+            client = AsyncAI21Client(
+                api_key=api_key,
                 http_client=default_httpx_client(),
-            num_retries=
+                num_retries=3,
             )
-
+            self.assign_client(client)
+        return super().get_client()

     def __init__(
         self,
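
The new get_client(api_key) override follows a lazy-caching pattern: build the SDK client on first use, store it via assign_client, and return the cached instance afterwards. A standalone sketch of the pattern (has_client/assign_client/super().get_client() belong to the LLM base class, whose code is not in this diff):

    # Standalone illustration of the lazy client pattern; not model_library code.
    class LazyClientMixin:
        _client = None

        def has_client(self) -> bool:
            return self._client is not None

        def assign_client(self, client) -> None:
            self._client = client

        def get_client(self, api_key: str | None = None):
            if not self.has_client():
                assert api_key, "an API key is required on first use"
                # Stand-in for building AsyncAI21Client(api_key=..., num_retries=3, ...)
                self.assign_client({"api_key": api_key})
            return self._client


    model = LazyClientMixin()
    client = model.get_client(api_key="sk-example")
    assert model.get_client() is client  # later calls reuse the cached client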
model_library/providers/alibaba.py
CHANGED

@@ -1,17 +1,17 @@
-from typing import Literal
+from typing import Any, Literal

+from pydantic import SecretStr
 from typing_extensions import override

 from model_library import model_library_settings
 from model_library.base import (
+    DelegateConfig,
     DelegateOnly,
     LLMConfig,
     QueryResultCost,
     QueryResultMetadata,
 )
-from model_library.providers.openai import OpenAIModel
 from model_library.register_models import register_provider
-from model_library.utils import create_openai_client_with_defaults


 @register_provider("alibaba")

@@ -26,17 +26,26 @@ class AlibabaModel(DelegateOnly):
         super().__init__(model_name, provider, config=config)

         # https://www.alibabacloud.com/help/en/model-studio/first-api-call-to-qwen
-        self.
-            model_name=self.model_name,
-            provider=self.provider,
+        self.init_delegate(
             config=config,
-
-            api_key=model_library_settings.DASHSCOPE_API_KEY,
+            delegate_config=DelegateConfig(
                 base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
+                api_key=SecretStr(model_library_settings.DASHSCOPE_API_KEY),
             ),
             use_completions=True,
+            delegate_provider="openai",
         )

+    @override
+    def _get_extra_body(self) -> dict[str, Any]:
+        """Build extra body parameters for Qwen-specific features."""
+        extra: dict[str, Any] = {}
+        # Enable thinking mode for Qwen3 reasoning models
+        # https://www.alibabacloud.com/help/en/model-studio/use-qwen-by-calling-api
+        if self.reasoning:
+            extra["enable_thinking"] = True
+        return extra
+
     @override
     async def _calculate_cost(
         self,
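
AlibabaModel now delegates to an OpenAI-compatible client pointed at DashScope and injects Qwen-specific flags through _get_extra_body. A hedged sketch of how such an extra_body reaches the endpoint using the openai SDK directly (model name and key are placeholders; the delegate wiring itself is not shown here):

    # Sketch only: direct openai-SDK call against DashScope's compatible endpoint.
    import asyncio

    from openai import AsyncOpenAI


    async def main() -> None:
        client = AsyncOpenAI(
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
            api_key="DASHSCOPE_API_KEY_PLACEHOLDER",  # placeholder, not a real key
        )
        response = await client.chat.completions.create(
            model="qwen3-max",  # placeholder model name
            messages=[{"role": "user", "content": "Hello"}],
            # Qwen-specific flag passed through the OpenAI-compatible API
            extra_body={"enable_thinking": True},
        )
        print(response.choices[0].message.content)


    if __name__ == "__main__":
        asyncio.run(main())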
model_library/providers/amazon.py
CHANGED

@@ -11,6 +11,7 @@ import botocore
 from botocore.client import BaseClient
 from typing_extensions import override

+from model_library import model_library_settings
 from model_library.base import (
     LLM,
     FileBase,

@@ -41,20 +42,46 @@ from model_library.register_models import register_provider
 @register_provider("amazon")
 @register_provider("bedrock")
 class AmazonModel(LLM):
-
+    @override
+    def _get_default_api_key(self) -> str:
+        if getattr(model_library_settings, "AWS_ACCESS_KEY_ID", None):
+            return json.dumps(
+                {
+                    "AWS_ACCESS_KEY_ID": model_library_settings.AWS_ACCESS_KEY_ID,
+                    "AWS_SECRET_ACCESS_KEY": model_library_settings.AWS_SECRET_ACCESS_KEY,
+                    "AWS_DEFAULT_REGION": model_library_settings.AWS_DEFAULT_REGION,
+                }
+            )
+        return "using-environment"

     @override
-    def get_client(self) -> BaseClient:
-        if not
-
-
-
-
-
-
-
-
-
+    def get_client(self, api_key: str | None = None) -> BaseClient:
+        if not self.has_client():
+            assert api_key
+            if api_key != "using-environment":
+                creds = json.loads(api_key)
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
+                        aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"],
+                        region_name=creds["AWS_DEFAULT_REGION"],
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+            else:
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        # default connection pool is 10
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+
+            self.assign_client(client)
+        return super().get_client()

     def __init__(
         self,

@@ -70,6 +97,11 @@ class AmazonModel(LLM):
         )  # supported but no access yet
         self.supports_tool_cache = self.supports_cache and "claude" in self.model_name

+        if config and config.custom_api_key:
+            raise Exception(
+                "custom_api_key is not currently supported for Amazon models"
+            )
+
     cache_control = {"type": "default"}

     async def get_tool_call_ids(self, input: Sequence[InputItem]) -> list[str]:

@@ -238,7 +270,7 @@ class AmazonModel(LLM):
         if self.supports_cache:
             body["system"].append({"cachePoint": self.cache_control})

-        if self.reasoning:
+        if self.reasoning and self.max_tokens:
             if self.max_tokens < 1024:
                 self.max_tokens = 2048
             budget_tokens = kwargs.pop(

@@ -251,9 +283,10 @@ class AmazonModel(LLM):
             }
         }

-        inference: dict[str, Any] = {
-
-
+        inference: dict[str, Any] = {}
+
+        if self.max_tokens:
+            inference["maxTokens"] = self.max_tokens

         # Only set temperature for models where supports_temperature is True.
         # For example, "thinking" models don't support temperature: https://docs.claude.com/en/docs/build-with-claude/extended-thinking#feature-compatibility