model-library 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. model_library/base/base.py +141 -62
  2. model_library/base/delegate_only.py +77 -10
  3. model_library/base/output.py +43 -0
  4. model_library/base/utils.py +35 -0
  5. model_library/config/alibaba_models.yaml +49 -57
  6. model_library/config/all_models.json +353 -120
  7. model_library/config/anthropic_models.yaml +2 -1
  8. model_library/config/kimi_models.yaml +30 -3
  9. model_library/config/mistral_models.yaml +2 -0
  10. model_library/config/openai_models.yaml +15 -23
  11. model_library/config/together_models.yaml +2 -0
  12. model_library/config/xiaomi_models.yaml +43 -0
  13. model_library/config/zai_models.yaml +27 -3
  14. model_library/exceptions.py +3 -77
  15. model_library/providers/ai21labs.py +12 -8
  16. model_library/providers/alibaba.py +17 -8
  17. model_library/providers/amazon.py +49 -16
  18. model_library/providers/anthropic.py +128 -48
  19. model_library/providers/azure.py +22 -10
  20. model_library/providers/cohere.py +7 -7
  21. model_library/providers/deepseek.py +8 -8
  22. model_library/providers/fireworks.py +7 -8
  23. model_library/providers/google/batch.py +14 -10
  24. model_library/providers/google/google.py +57 -30
  25. model_library/providers/inception.py +7 -7
  26. model_library/providers/kimi.py +18 -8
  27. model_library/providers/minimax.py +15 -17
  28. model_library/providers/mistral.py +20 -8
  29. model_library/providers/openai.py +99 -22
  30. model_library/providers/openrouter.py +34 -0
  31. model_library/providers/perplexity.py +7 -7
  32. model_library/providers/together.py +7 -8
  33. model_library/providers/vals.py +12 -6
  34. model_library/providers/vercel.py +34 -0
  35. model_library/providers/xai.py +47 -42
  36. model_library/providers/xiaomi.py +34 -0
  37. model_library/providers/zai.py +38 -8
  38. model_library/register_models.py +5 -0
  39. model_library/registry_utils.py +48 -17
  40. model_library/retriers/__init__.py +0 -0
  41. model_library/retriers/backoff.py +73 -0
  42. model_library/retriers/base.py +225 -0
  43. model_library/retriers/token.py +427 -0
  44. model_library/retriers/utils.py +11 -0
  45. model_library/settings.py +1 -1
  46. model_library/utils.py +17 -7
  47. {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/METADATA +2 -1
  48. model_library-0.1.9.dist-info/RECORD +73 -0
  49. {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/WHEEL +1 -1
  50. model_library-0.1.7.dist-info/RECORD +0 -64
  51. {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/licenses/LICENSE +0 -0
  52. {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/top_level.txt +0 -0
@@ -38,6 +38,8 @@ magistral-models:
     costs_per_million_token:
       input: 2
       output: 5
+    metadata:
+      deprecated: true
 
   mistralai/magistral-medium-2509:
     label: Magistral Medium 1.2 (09/2025)
@@ -35,6 +35,19 @@ gpt-5-models:
     training_cutoff: "2024-09"
     reasoning_model: true
 
+  openai/gpt-5.2-codex:
+    label: GPT 5.2 Codex
+    documentation_url: https://platform.openai.com/docs/models/gpt-5.2-codex
+    description: GPT 5.2 optimized for code
+    release_date: 2025-12-11
+    properties:
+      context_window: 400_000
+    costs_per_million_token:
+      input: 1.75
+      output: 14
+      cache:
+        read: 0.175
+
   openai/gpt-5.2-2025-12-11:
     label: GPT 5.2
     documentation_url: https://platform.openai.com/docs/models/gpt-5.2
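For reference, the gpt-5.2-codex pricing added above works out as follows for a hypothetical request; the token counts below are invented for illustration, only the per-million rates come from the diff:

    # Worked example of the new gpt-5.2-codex pricing (USD per million tokens).
    input_cost = 120_000 / 1_000_000 * 1.75    # $0.21 for uncached input
    output_cost = 8_000 / 1_000_000 * 14       # $0.112 for output
    cached_cost = 50_000 / 1_000_000 * 0.175   # $0.00875 for cache reads
    total = input_cost + output_cost + cached_cost
    assert round(total, 5) == 0.33075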
@@ -70,7 +83,7 @@ gpt-5-models:
   openai/gpt-5.1-codex-max:
     label: GPT 5.1 Codex Max
     release_date: 2025-12-04
-    description: OpenAI's frontier agentic coding model. Good at long-running coding tasks.
+    description: GPT 5.1 optimized for code
     costs_per_million_token:
       input: 1.25
       output: 10.0
@@ -79,7 +92,7 @@ gpt-5-models:
   openai/gpt-5.1-codex:
     label: GPT 5.1 Codex
     documentation_url: https://platform.openai.com/docs/models/gpt-5.1-codex
-    description: OpenAI's latest coding model
+    description: GPT 5.1 optimized for code
     release_date: 2025-11-13
     costs_per_million_token:
       input: 1.25
@@ -841,24 +854,3 @@ gpt-3.5-models:
       input: 1.5
       output: 2.0
     documentation_url: https://platform.openai.com/docs/models/gpt-3.5-turbo-instruct
-
-databricks-models:
-  base-config:
-    company: Databricks
-
-  databricks/databricks-dbrx-instruct:
-    label: DBRX Instruct
-    description: Databricks Instruct model.
-    release_date: 2024-03-27
-    properties:
-      context_window: 32_768
-      max_tokens: 4_096
-      training_cutoff: "2023-12"
-    metadata:
-      available_for_everyone: false
-      deprecated: true
-    costs_per_million_token:
-      input: 2.25
-      output: 6.75
-    alternative_keys:
-      - databricks/dbrx-instruct
@@ -45,6 +45,8 @@ kimi-models:
     costs_per_million_token:
       input: 1.00
       output: 3.00
+    metadata:
+      deprecated: true
 
 # Meta Llama Models
 llama-4-models:
@@ -0,0 +1,43 @@
+base-config:
+  company: Xiaomi
+  open_source: true
+  documentation_url: https://platform.xiaomimimo.com/#/docs/
+  supports:
+    images: false
+    files: false
+    tools: true
+  metadata:
+    available_as_evaluator: false
+    available_for_everyone: true
+    ignored_for_cost: false
+  properties:
+    training_cutoff: December 2024
+
+xiaomi-models:
+  base-config:
+    properties:
+      context_window: 256000
+    supports:
+      temperature: true
+      top_p: true
+    default_parameters:
+      temperature: 0.3
+      top_p: 0.95
+
+  xiaomi/mimo-v2-flash:
+    label: MiMo V2 Flash
+    description:
+      MiMo V2 Flash is Xiaomi's Mixture-of-Experts (MoE) language model with
+      309B total parameters and 15B active parameters. Designed for high-speed
+      reasoning and agentic workflows, it utilizes a novel hybrid attention
+      architecture and Multi-Token Prediction (MTP) to achieve state-of-the-art
+      performance while significantly reducing inference costs.
+    release_date: 2025-12-17
+    properties:
+      context_window: 256000
+      max_tokens: 64000
+    costs_per_million_token:
+      input: 0.10
+      output: 0.30
+      cache:
+        read: 0.01
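The new file is ordinary YAML; a minimal sanity check of the structure above, assuming PyYAML (the loader model_library actually uses is not visible in this diff):

    import yaml

    with open("model_library/config/xiaomi_models.yaml") as f:
        cfg = yaml.safe_load(f)

    mimo = cfg["xiaomi-models"]["xiaomi/mimo-v2-flash"]
    assert mimo["properties"]["context_window"] == 256000
    assert mimo["costs_per_million_token"]["output"] == 0.30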
@@ -18,6 +18,25 @@ base-config:
     write_markup: 1
 
 zai-models:
+  zai/glm-4.7-flashx:
+    label: GLM 4.7 Flash
+    description: "z.AI lightweight fast model"
+    release_date: 2026-01-19
+    properties:
+      context_window: 200_000
+      max_tokens: 128_000
+    costs_per_million_token:
+      input: 0.07
+      output: 0.4
+      cache:
+        read: 0.01
+    default_parameters:
+      # from https://huggingface.co/zai-org/GLM-4.7-Flash
+      temperature: 1
+      top_p: 0.95
+    provider_properties:
+      clear_thinking: false
+
   zai/glm-4.7:
     label: GLM 4.7
     description: "Latest model from ZAI"
@@ -32,6 +51,9 @@ zai-models:
         read: 0.11
     default_parameters:
       temperature: 1
+    alternative_keys:
+      - vercel/zai/glm-4.7
+
   zai/glm-4.5:
     label: GLM 4.5
     description: "z.AI old model"
@@ -46,9 +68,11 @@ zai-models:
         read: 0.11
     alternative_keys:
       - fireworks/glm-4p5:
-          costs_per_million_token:
-            input: 0.55
-            output: 2.19
+          metadata:
+            deprecated: true
+          costs_per_million_token:
+            input: 0.55
+            output: 2.19
 
   zai/glm-4.5-air:
     label: GLM 4.5 Air
@@ -1,13 +1,9 @@
-import logging
-import random
 import re
-from typing import Any, Callable
+from typing import Any
 
-import backoff
 from ai21 import TooManyRequestsError as AI21RateLimitError
 from anthropic import InternalServerError
 from anthropic import RateLimitError as AnthropicRateLimitError
-from backoff._typing import Details
 from httpcore import ReadError as HTTPCoreReadError
 from httpx import ConnectError as HTTPXConnectError
 from httpx import ReadError as HTTPXReadError
@@ -75,12 +71,14 @@ CONTEXT_WINDOW_PATTERN = re.compile(
     r"maximum context length is \d+ tokens|"
     r"context length is \d+ tokens|"
     r"exceed.* context (limit|window|length)|"
+    r"context window exceeds|"
     r"exceeds maximum length|"
     r"too long.*tokens.*maximum|"
     r"too large for model with \d+ maximum context length|"
     r"longer than the model's context length|"
     r"too many tokens.*size limit exceeded|"
     r"prompt is too long|"
+    r"maximum prompt length|"
     r"input length should be|"
     r"sent message larger than max|"
     r"input tokens exceeded|"
@@ -222,75 +220,3 @@ def exception_message(exception: Exception | Any) -> str:
         if str(exception)
         else type(exception).__name__
     )
-
-
-RETRY_MAX_TRIES: int = 20
-RETRY_INITIAL: float = 10.0
-RETRY_EXPO: float = 1.4
-RETRY_MAX_BACKOFF_WAIT: float = 240.0  # 4 minutes (more with jitter)
-
-
-def jitter(wait: float) -> float:
-    """
-    Increase or decrease the wait time by up to 20%.
-    """
-    jitter_fraction = 0.2
-    min_wait = wait * (1 - jitter_fraction)
-    max_wait = wait * (1 + jitter_fraction)
-    return random.uniform(min_wait, max_wait)
-
-
-def retry_llm_call(
-    logger: logging.Logger,
-    max_tries: int = RETRY_MAX_TRIES,
-    max_time: float | None = None,
-    backoff_callback: (
-        Callable[[int, Exception | None, float, float], None] | None
-    ) = None,
-):
-    def on_backoff(details: Details):
-        exception = details.get("exception")
-        tries = details.get("tries", 0)
-        elapsed = details.get("elapsed", 0.0)
-        wait = details.get("wait", 0.0)
-
-        logger.warning(
-            f"[Retrying] Exception: {exception_message(exception)} | Attempt: {tries} | "
-            + f"Elapsed: {elapsed:.1f}s | Next wait: {wait:.1f}s"
-        )
-
-        if backoff_callback:
-            backoff_callback(tries, exception, elapsed, wait)
-
-    def giveup(e: Exception) -> bool:
-        return not is_retriable_error(e)
-
-    def on_giveup(details: Details) -> None:
-        exception: Exception | None = details.get("exception", None)
-        if not exception:
-            return
-
-        logger.error(
-            f"Giving up after retries. Final exception: {exception_message(exception)}"
-        )
-
-        if is_context_window_error(exception):
-            message = exception.args[0] if exception.args else str(exception)
-            raise MaxContextWindowExceededError(message)
-
-        raise exception
-
-    return backoff.on_exception(
-        wait_gen=lambda: backoff.expo(
-            base=RETRY_EXPO,
-            factor=RETRY_INITIAL,
-            max_value=RETRY_MAX_BACKOFF_WAIT,
-        ),
-        exception=Exception,
-        max_tries=max_tries,
-        max_time=max_time,
-        giveup=giveup,
-        on_backoff=on_backoff,
-        on_giveup=on_giveup,
-        jitter=jitter,
-    )
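The retry helpers removed here are superseded by the new model_library/retriers/ package listed in the file summary (backoff.py, base.py, token.py, utils.py). For the record, the schedule the removed code configured via backoff.expo was exponential with a cap and a +/-20% jitter; a minimal standalone sketch of that schedule (the new retriers may behave differently):

    import random

    RETRY_INITIAL, RETRY_EXPO, RETRY_MAX_BACKOFF_WAIT = 10.0, 1.4, 240.0

    def wait_before_retry(n: int) -> float:
        """Jittered wait before the n-th retry (n starts at 0), as backoff.expo
        computed it: factor * base**n, capped at max_value, then jittered 20%."""
        base_wait = min(RETRY_INITIAL * RETRY_EXPO**n, RETRY_MAX_BACKOFF_WAIT)
        return random.uniform(base_wait * 0.8, base_wait * 1.2)

    # Pre-jitter waits: 10.0, 14.0, 19.6, 27.44, ... capped at 240.0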
@@ -16,13 +16,13 @@ from model_library.base import (
     LLMConfig,
     QueryResult,
     QueryResultMetadata,
+    RawResponse,
     TextInput,
     ToolBody,
     ToolCall,
     ToolDefinition,
     ToolResult,
 )
-from model_library.base.input import RawResponse
 from model_library.exceptions import (
     BadInputError,
     MaxOutputTokensExceededError,
@@ -34,17 +34,21 @@ from model_library.utils import default_httpx_client
 
 @register_provider("ai21labs")
 class AI21LabsModel(LLM):
-    _client: AsyncAI21Client | None = None
+    @override
+    def _get_default_api_key(self) -> str:
+        return model_library_settings.AI21LABS_API_KEY
 
     @override
-    def get_client(self) -> AsyncAI21Client:
-        if not AI21LabsModel._client:
-            AI21LabsModel._client = AsyncAI21Client(
-                api_key=model_library_settings.AI21LABS_API_KEY,
+    def get_client(self, api_key: str | None = None) -> AsyncAI21Client:
+        if not self.has_client():
+            assert api_key
+            client = AsyncAI21Client(
+                api_key=api_key,
                 http_client=default_httpx_client(),
-                num_retries=1,
+                num_retries=3,
             )
-        return AI21LabsModel._client
+            self.assign_client(client)
+        return super().get_client()
 
     def __init__(
         self,
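Several providers in this release (AI21, Amazon, and others below) switch from a class-level _client singleton to an instance-level cache driven by has_client()/assign_client()/get_client(). The base class lives in model_library/base/base.py, which this diff does not show, so the sketch below is an assumed reconstruction of that contract rather than the actual implementation:

    from typing import Any

    class ClientCaching:
        """Hypothetical sketch of the caching contract the provider code uses."""

        _client: Any = None

        def has_client(self) -> bool:
            return self._client is not None

        def assign_client(self, client: Any) -> None:
            self._client = client

        def get_client(self, api_key: str | None = None) -> Any:
            # Subclasses build the client on first call, assign it, then defer
            # back to the base class to return the cached instance.
            assert self._client is not None, "subclass must assign_client() first"
            return self._client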
@@ -1,17 +1,17 @@
-from typing import Literal
+from typing import Any, Literal
 
+from pydantic import SecretStr
 from typing_extensions import override
 
 from model_library import model_library_settings
 from model_library.base import (
+    DelegateConfig,
     DelegateOnly,
     LLMConfig,
     QueryResultCost,
     QueryResultMetadata,
 )
-from model_library.providers.openai import OpenAIModel
 from model_library.register_models import register_provider
-from model_library.utils import create_openai_client_with_defaults
 
 
 @register_provider("alibaba")
@@ -26,17 +26,26 @@ class AlibabaModel(DelegateOnly):
         super().__init__(model_name, provider, config=config)
 
         # https://www.alibabacloud.com/help/en/model-studio/first-api-call-to-qwen
-        self.delegate = OpenAIModel(
-            model_name=self.model_name,
-            provider=self.provider,
+        self.init_delegate(
             config=config,
-            custom_client=create_openai_client_with_defaults(
-                api_key=model_library_settings.DASHSCOPE_API_KEY,
+            delegate_config=DelegateConfig(
                 base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
+                api_key=SecretStr(model_library_settings.DASHSCOPE_API_KEY),
             ),
             use_completions=True,
+            delegate_provider="openai",
         )
 
+    @override
+    def _get_extra_body(self) -> dict[str, Any]:
+        """Build extra body parameters for Qwen-specific features."""
+        extra: dict[str, Any] = {}
+        # Enable thinking mode for Qwen3 reasoning models
+        # https://www.alibabacloud.com/help/en/model-studio/use-qwen-by-calling-api
+        if self.reasoning:
+            extra["enable_thinking"] = True
+        return extra
+
     @override
     async def _calculate_cost(
         self,
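The new _get_extra_body hook feeds provider-specific parameters into the OpenAI-compatible delegate. A sketch of where a body like {"enable_thinking": True} ends up when calling DashScope's compatible-mode endpoint directly; the model name and plumbing here are illustrative, while extra_body itself is a real OpenAI-SDK parameter:

    from openai import AsyncOpenAI

    async def ask_qwen(api_key: str, prompt: str) -> str:
        client = AsyncOpenAI(
            api_key=api_key,
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        )
        response = await client.chat.completions.create(
            model="qwen3-max",  # illustrative model name
            messages=[{"role": "user", "content": prompt}],
            # what AlibabaModel._get_extra_body would return for a reasoning model
            extra_body={"enable_thinking": True},
        )
        return response.choices[0].message.content or ""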
@@ -11,6 +11,7 @@ import botocore
 from botocore.client import BaseClient
 from typing_extensions import override
 
+from model_library import model_library_settings
 from model_library.base import (
     LLM,
     FileBase,
@@ -41,20 +42,46 @@ from model_library.register_models import register_provider
 @register_provider("amazon")
 @register_provider("bedrock")
 class AmazonModel(LLM):
-    _client: BaseClient | None = None
+    @override
+    def _get_default_api_key(self) -> str:
+        if getattr(model_library_settings, "AWS_ACCESS_KEY_ID", None):
+            return json.dumps(
+                {
+                    "AWS_ACCESS_KEY_ID": model_library_settings.AWS_ACCESS_KEY_ID,
+                    "AWS_SECRET_ACCESS_KEY": model_library_settings.AWS_SECRET_ACCESS_KEY,
+                    "AWS_DEFAULT_REGION": model_library_settings.AWS_DEFAULT_REGION,
+                }
+            )
+        return "using-environment"
 
     @override
-    def get_client(self) -> BaseClient:
-        if not AmazonModel._client:
-            AmazonModel._client = cast(
-                BaseClient,
-                boto3.client(
-                    "bedrock-runtime",
-                    # default connection pool is 10
-                    config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
-                ),
-            )  # pyright: ignore[reportUnknownMemberType]
-        return AmazonModel._client
+    def get_client(self, api_key: str | None = None) -> BaseClient:
+        if not self.has_client():
+            assert api_key
+            if api_key != "using-environment":
+                creds = json.loads(api_key)
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
+                        aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"],
+                        region_name=creds["AWS_DEFAULT_REGION"],
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+            else:
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        # default connection pool is 10
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+
+            self.assign_client(client)
+        return super().get_client()
 
     def __init__(
         self,
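Because the base class's api_key channel is a plain string, the Amazon provider smuggles a full credential set through it as JSON, with "using-environment" as a sentinel for boto3's default credential chain. A standalone round-trip of that encoding (the helper name is ours and the values are placeholders):

    import json

    def encode_aws_api_key(access_key: str | None, secret_key: str, region: str) -> str:
        if access_key:
            return json.dumps(
                {
                    "AWS_ACCESS_KEY_ID": access_key,
                    "AWS_SECRET_ACCESS_KEY": secret_key,
                    "AWS_DEFAULT_REGION": region,
                }
            )
        return "using-environment"

    api_key = encode_aws_api_key("AKIA-EXAMPLE", "example-secret", "us-east-1")
    assert json.loads(api_key)["AWS_DEFAULT_REGION"] == "us-east-1"
    assert encode_aws_api_key(None, "", "") == "using-environment"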
@@ -70,6 +97,11 @@ class AmazonModel(LLM):
         )  # supported but no access yet
         self.supports_tool_cache = self.supports_cache and "claude" in self.model_name
 
+        if config and config.custom_api_key:
+            raise Exception(
+                "custom_api_key is not currently supported for Amazon models"
+            )
+
     cache_control = {"type": "default"}
 
     async def get_tool_call_ids(self, input: Sequence[InputItem]) -> list[str]:
@@ -238,7 +270,7 @@ class AmazonModel(LLM):
         if self.supports_cache:
             body["system"].append({"cachePoint": self.cache_control})
 
-        if self.reasoning:
+        if self.reasoning and self.max_tokens:
             if self.max_tokens < 1024:
                 self.max_tokens = 2048
             budget_tokens = kwargs.pop(
@@ -251,9 +283,10 @@ class AmazonModel(LLM):
             }
         }
 
-        inference: dict[str, Any] = {
-            "maxTokens": self.max_tokens,
-        }
+        inference: dict[str, Any] = {}
+
+        if self.max_tokens:
+            inference["maxTokens"] = self.max_tokens
 
         # Only set temperature for models where supports_temperature is True.
         # For example, "thinking" models don't support temperature: https://docs.claude.com/en/docs/build-with-claude/extended-thinking#feature-compatibility
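The effect of this last change is that "maxTokens" is omitted from the Bedrock Converse inference config when max_tokens is unset, letting the service default apply instead of sending an invalid value. A minimal sketch of the resulting behavior (the helper name is ours):

    from typing import Any

    def build_inference_config(max_tokens: int | None) -> dict[str, Any]:
        inference: dict[str, Any] = {}
        if max_tokens:
            inference["maxTokens"] = max_tokens  # Bedrock Converse parameter
        return inference

    assert build_inference_config(4096) == {"maxTokens": 4096}
    assert build_inference_config(None) == {}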