model-library 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_library/base/base.py +141 -62
- model_library/base/delegate_only.py +77 -10
- model_library/base/output.py +43 -0
- model_library/base/utils.py +35 -0
- model_library/config/alibaba_models.yaml +49 -57
- model_library/config/all_models.json +353 -120
- model_library/config/anthropic_models.yaml +2 -1
- model_library/config/kimi_models.yaml +30 -3
- model_library/config/mistral_models.yaml +2 -0
- model_library/config/openai_models.yaml +15 -23
- model_library/config/together_models.yaml +2 -0
- model_library/config/xiaomi_models.yaml +43 -0
- model_library/config/zai_models.yaml +27 -3
- model_library/exceptions.py +3 -77
- model_library/providers/ai21labs.py +12 -8
- model_library/providers/alibaba.py +17 -8
- model_library/providers/amazon.py +49 -16
- model_library/providers/anthropic.py +128 -48
- model_library/providers/azure.py +22 -10
- model_library/providers/cohere.py +7 -7
- model_library/providers/deepseek.py +8 -8
- model_library/providers/fireworks.py +7 -8
- model_library/providers/google/batch.py +14 -10
- model_library/providers/google/google.py +57 -30
- model_library/providers/inception.py +7 -7
- model_library/providers/kimi.py +18 -8
- model_library/providers/minimax.py +15 -17
- model_library/providers/mistral.py +20 -8
- model_library/providers/openai.py +99 -22
- model_library/providers/openrouter.py +34 -0
- model_library/providers/perplexity.py +7 -7
- model_library/providers/together.py +7 -8
- model_library/providers/vals.py +12 -6
- model_library/providers/vercel.py +34 -0
- model_library/providers/xai.py +47 -42
- model_library/providers/xiaomi.py +34 -0
- model_library/providers/zai.py +38 -8
- model_library/register_models.py +5 -0
- model_library/registry_utils.py +48 -17
- model_library/retriers/__init__.py +0 -0
- model_library/retriers/backoff.py +73 -0
- model_library/retriers/base.py +225 -0
- model_library/retriers/token.py +427 -0
- model_library/retriers/utils.py +11 -0
- model_library/settings.py +1 -1
- model_library/utils.py +17 -7
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/METADATA +2 -1
- model_library-0.1.9.dist-info/RECORD +73 -0
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/WHEEL +1 -1
- model_library-0.1.7.dist-info/RECORD +0 -64
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {model_library-0.1.7.dist-info → model_library-0.1.9.dist-info}/top_level.txt +0 -0
model_library/config/openai_models.yaml
CHANGED

@@ -35,6 +35,19 @@ gpt-5-models:
     training_cutoff: "2024-09"
     reasoning_model: true

+  openai/gpt-5.2-codex:
+    label: GPT 5.2 Codex
+    documentation_url: https://platform.openai.com/docs/models/gpt-5.2-codex
+    description: GPT 5.2 optimized for code
+    release_date: 2025-12-11
+    properties:
+      context_window: 400_000
+    costs_per_million_token:
+      input: 1.75
+      output: 14
+      cache:
+        read: 0.175
+
   openai/gpt-5.2-2025-12-11:
     label: GPT 5.2
     documentation_url: https://platform.openai.com/docs/models/gpt-5.2

@@ -70,7 +83,7 @@ gpt-5-models:
   openai/gpt-5.1-codex-max:
     label: GPT 5.1 Codex Max
     release_date: 2025-12-04
-    description:
+    description: GPT 5.1 optimized for code
     costs_per_million_token:
       input: 1.25
       output: 10.0

@@ -79,7 +92,7 @@ gpt-5-models:
   openai/gpt-5.1-codex:
     label: GPT 5.1 Codex
     documentation_url: https://platform.openai.com/docs/models/gpt-5.1-codex
-    description:
+    description: GPT 5.1 optimized for code
     release_date: 2025-11-13
     costs_per_million_token:
       input: 1.25

@@ -841,24 +854,3 @@ gpt-3.5-models:
       input: 1.5
       output: 2.0
     documentation_url: https://platform.openai.com/docs/models/gpt-3.5-turbo-instruct
-
-databricks-models:
-  base-config:
-    company: Databricks
-
-  databricks/databricks-dbrx-instruct:
-    label: DBRX Instruct
-    description: Databricks Instruct model.
-    release_date: 2024-03-27
-    properties:
-      context_window: 32_768
-      max_tokens: 4_096
-      training_cutoff: "2023-12"
-    metadata:
-      available_for_everyone: false
-      deprecated: true
-    costs_per_million_token:
-      input: 2.25
-      output: 6.75
-    alternative_keys:
-      - databricks/dbrx-instruct
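
The gpt-5.2-codex entry above is priced per million tokens, with a discounted rate for cached input reads. A minimal sketch (a hypothetical helper, not part of model_library) of how such an entry converts token counts into a dollar figure:

    # Hypothetical helper: turns a costs_per_million_token block like the
    # gpt-5.2-codex entry above into a per-call cost in USD.
    def query_cost(
        input_tokens: int,
        output_tokens: int,
        cached_input_tokens: int = 0,
        *,
        input_rate: float = 1.75,       # USD per 1M uncached input tokens
        output_rate: float = 14.0,      # USD per 1M output tokens
        cache_read_rate: float = 0.175, # USD per 1M cached input tokens
    ) -> float:
        uncached = input_tokens - cached_input_tokens
        return (
            uncached * input_rate
            + cached_input_tokens * cache_read_rate
            + output_tokens * output_rate
        ) / 1_000_000


    if __name__ == "__main__":
        # 100k prompt tokens, 40k of them served from cache, 2k completion tokens
        print(f"${query_cost(100_000, 2_000, cached_input_tokens=40_000):.4f}")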
model_library/config/xiaomi_models.yaml
ADDED

@@ -0,0 +1,43 @@
+base-config:
+  company: Xiaomi
+  open_source: true
+  documentation_url: https://platform.xiaomimimo.com/#/docs/
+  supports:
+    images: false
+    files: false
+    tools: true
+  metadata:
+    available_as_evaluator: false
+    available_for_everyone: true
+    ignored_for_cost: false
+  properties:
+    training_cutoff: December 2024
+
+xiaomi-models:
+  base-config:
+    properties:
+      context_window: 256000
+    supports:
+      temperature: true
+      top_p: true
+    default_parameters:
+      temperature: 0.3
+      top_p: 0.95
+
+  xiaomi/mimo-v2-flash:
+    label: MiMo V2 Flash
+    description:
+      MiMo V2 Flash is Xiaomi's Mixture-of-Experts (MoE) language model with
+      309B total parameters and 15B active parameters. Designed for high-speed
+      reasoning and agentic workflows, it utilizes a novel hybrid attention
+      architecture and Multi-Token Prediction (MTP) to achieve state-of-the-art
+      performance while significantly reducing inference costs.
+    release_date: 2025-12-17
+    properties:
+      context_window: 256000
+      max_tokens: 64000
+    costs_per_million_token:
+      input: 0.10
+      output: 0.30
+      cache:
+        read: 0.01
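
The new file stacks three layers of defaults: a file-level base-config, a group-level base-config under xiaomi-models, and the per-model entry. Assuming the registry deep-merges these layers with the most specific value winning (the actual loader is not shown in this diff), the idea looks roughly like this:

    # Illustrative sketch only; not model_library's real loader.
    from copy import deepcopy
    from typing import Any


    def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
        # Recursively merge dicts; override values win over base values.
        merged = deepcopy(base)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = deep_merge(merged[key], value)
            else:
                merged[key] = value
        return merged


    file_base = {"company": "Xiaomi", "properties": {"training_cutoff": "December 2024"}}
    group_base = {
        "properties": {"context_window": 256000},
        "default_parameters": {"temperature": 0.3, "top_p": 0.95},
    }
    model = {"label": "MiMo V2 Flash", "properties": {"context_window": 256000, "max_tokens": 64000}}

    resolved = deep_merge(deep_merge(file_base, group_base), model)
    print(resolved["properties"])
    # {'training_cutoff': 'December 2024', 'context_window': 256000, 'max_tokens': 64000}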
model_library/config/zai_models.yaml
CHANGED

@@ -18,6 +18,25 @@ base-config:
     write_markup: 1

 zai-models:
+  zai/glm-4.7-flashx:
+    label: GLM 4.7 Flash
+    description: "z.AI lightweight fast model"
+    release_date: 2026-01-19
+    properties:
+      context_window: 200_000
+      max_tokens: 128_000
+    costs_per_million_token:
+      input: 0.07
+      output: 0.4
+      cache:
+        read: 0.01
+    default_parameters:
+      # from https://huggingface.co/zai-org/GLM-4.7-Flash
+      temperature: 1
+      top_p: 0.95
+    provider_properties:
+      clear_thinking: false
+
   zai/glm-4.7:
     label: GLM 4.7
     description: "Latest model from ZAI"

@@ -32,6 +51,9 @@ zai-models:
         read: 0.11
     default_parameters:
       temperature: 1
+    alternative_keys:
+      - vercel/zai/glm-4.7
+
   zai/glm-4.5:
     label: GLM 4.5
     description: "z.AI old model"

@@ -46,9 +68,11 @@ zai-models:
         read: 0.11
     alternative_keys:
       - fireworks/glm-4p5:
-
-
-
+          metadata:
+            deprecated: true
+          costs_per_million_token:
+            input: 0.55
+            output: 2.19

   zai/glm-4.5-air:
     label: GLM 4.5 Air
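
Both zai hunks lean on alternative_keys, so that requests for vercel/zai/glm-4.7 or fireworks/glm-4p5 resolve to the canonical entry. A hedged sketch of such an alias lookup (the real logic lives in model_library/registry_utils.py, which this diff only summarizes):

    # Sketch of alternative_keys resolution; not the registry's actual code.
    CANONICAL = {
        "zai/glm-4.7": {"alternative_keys": ["vercel/zai/glm-4.7"]},
        "zai/glm-4.5": {"alternative_keys": ["fireworks/glm-4p5"]},
    }

    # Invert the mapping: every alias points back to its canonical key.
    ALIASES = {
        alias: canonical
        for canonical, entry in CANONICAL.items()
        for alias in entry.get("alternative_keys", [])
    }


    def resolve(key: str) -> str:
        return ALIASES.get(key, key)


    print(resolve("vercel/zai/glm-4.7"))   # -> zai/glm-4.7
    print(resolve("zai/glm-4.7-flashx"))   # unchanged: already canonical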
model_library/exceptions.py
CHANGED

@@ -1,13 +1,9 @@
-import logging
-import random
 import re
-from typing import Any
+from typing import Any

-import backoff
 from ai21 import TooManyRequestsError as AI21RateLimitError
 from anthropic import InternalServerError
 from anthropic import RateLimitError as AnthropicRateLimitError
-from backoff._typing import Details
 from httpcore import ReadError as HTTPCoreReadError
 from httpx import ConnectError as HTTPXConnectError
 from httpx import ReadError as HTTPXReadError

@@ -75,12 +71,14 @@ CONTEXT_WINDOW_PATTERN = re.compile(
     r"maximum context length is \d+ tokens|"
     r"context length is \d+ tokens|"
     r"exceed.* context (limit|window|length)|"
+    r"context window exceeds|"
     r"exceeds maximum length|"
     r"too long.*tokens.*maximum|"
     r"too large for model with \d+ maximum context length|"
     r"longer than the model's context length|"
     r"too many tokens.*size limit exceeded|"
     r"prompt is too long|"
+    r"maximum prompt length|"
     r"input length should be|"
     r"sent message larger than max|"
     r"input tokens exceeded|"

@@ -222,75 +220,3 @@ def exception_message(exception: Exception | Any) -> str:
         if str(exception)
         else type(exception).__name__
     )
-
-
-RETRY_MAX_TRIES: int = 20
-RETRY_INITIAL: float = 10.0
-RETRY_EXPO: float = 1.4
-RETRY_MAX_BACKOFF_WAIT: float = 240.0  # 4 minutes (more with jitter)
-
-
-def jitter(wait: float) -> float:
-    """
-    Increase or decrease the wait time by up to 20%.
-    """
-    jitter_fraction = 0.2
-    min_wait = wait * (1 - jitter_fraction)
-    max_wait = wait * (1 + jitter_fraction)
-    return random.uniform(min_wait, max_wait)
-
-
-def retry_llm_call(
-    logger: logging.Logger,
-    max_tries: int = RETRY_MAX_TRIES,
-    max_time: float | None = None,
-    backoff_callback: (
-        Callable[[int, Exception | None, float, float], None] | None
-    ) = None,
-):
-    def on_backoff(details: Details):
-        exception = details.get("exception")
-        tries = details.get("tries", 0)
-        elapsed = details.get("elapsed", 0.0)
-        wait = details.get("wait", 0.0)
-
-        logger.warning(
-            f"[Retrying] Exception: {exception_message(exception)} | Attempt: {tries} | "
-            + f"Elapsed: {elapsed:.1f}s | Next wait: {wait:.1f}s"
-        )
-
-        if backoff_callback:
-            backoff_callback(tries, exception, elapsed, wait)
-
-    def giveup(e: Exception) -> bool:
-        return not is_retriable_error(e)
-
-    def on_giveup(details: Details) -> None:
-        exception: Exception | None = details.get("exception", None)
-        if not exception:
-            return
-
-        logger.error(
-            f"Giving up after retries. Final exception: {exception_message(exception)}"
-        )
-
-        if is_context_window_error(exception):
-            message = exception.args[0] if exception.args else str(exception)
-            raise MaxContextWindowExceededError(message)
-
-        raise exception
-
-    return backoff.on_exception(
-        wait_gen=lambda: backoff.expo(
-            base=RETRY_EXPO,
-            factor=RETRY_INITIAL,
-            max_value=RETRY_MAX_BACKOFF_WAIT,
-        ),
-        exception=Exception,
-        max_tries=max_tries,
-        max_time=max_time,
-        giveup=giveup,
-        on_backoff=on_backoff,
-        on_giveup=on_giveup,
-        jitter=jitter,
-    )
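
The retry helpers removed here (per the file list, retry logic now lives under model_library/retriers/) encoded a specific policy: 10 s initial wait, exponential base 1.4, a 240 s cap, ±20% jitter, and up to 20 tries. A self-contained sketch of that policy, not the new retriers code itself:

    # Minimal async sketch of the retry behaviour the removed helpers encoded.
    import asyncio
    import random

    RETRY_MAX_TRIES = 20
    RETRY_INITIAL = 10.0
    RETRY_EXPO = 1.4
    RETRY_MAX_BACKOFF_WAIT = 240.0


    def jitter(wait: float) -> float:
        # Increase or decrease the wait time by up to 20%.
        return random.uniform(wait * 0.8, wait * 1.2)


    async def retry(call, is_retriable=lambda exc: True):
        for attempt in range(RETRY_MAX_TRIES):
            try:
                return await call()
            except Exception as exc:
                if attempt == RETRY_MAX_TRIES - 1 or not is_retriable(exc):
                    raise
                # 10s, 14s, 19.6s, ... capped at 240s, then jittered.
                wait = min(RETRY_INITIAL * RETRY_EXPO**attempt, RETRY_MAX_BACKOFF_WAIT)
                await asyncio.sleep(jitter(wait))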
model_library/providers/ai21labs.py
CHANGED

@@ -16,13 +16,13 @@ from model_library.base import (
     LLMConfig,
     QueryResult,
     QueryResultMetadata,
+    RawResponse,
     TextInput,
     ToolBody,
     ToolCall,
     ToolDefinition,
     ToolResult,
 )
-from model_library.base.input import RawResponse
 from model_library.exceptions import (
     BadInputError,
     MaxOutputTokensExceededError,

@@ -34,17 +34,21 @@ from model_library.utils import default_httpx_client

 @register_provider("ai21labs")
 class AI21LabsModel(LLM):
-
+    @override
+    def _get_default_api_key(self) -> str:
+        return model_library_settings.AI21LABS_API_KEY

     @override
-    def get_client(self) -> AsyncAI21Client:
-        if not
-
-
+    def get_client(self, api_key: str | None = None) -> AsyncAI21Client:
+        if not self.has_client():
+            assert api_key
+            client = AsyncAI21Client(
+                api_key=api_key,
                 http_client=default_httpx_client(),
-            num_retries=
+                num_retries=3,
             )
-
+            self.assign_client(client)
+        return super().get_client()

     def __init__(
         self,
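
The new get_client(api_key) override follows a lazy-caching pattern: build the SDK client on first use, store it via assign_client, and return the cached instance afterwards. A standalone sketch of the pattern (has_client/assign_client/super().get_client() belong to the LLM base class, whose code is not in this diff):

    # Standalone illustration of the lazy client pattern; not model_library code.
    class LazyClientMixin:
        _client = None

        def has_client(self) -> bool:
            return self._client is not None

        def assign_client(self, client) -> None:
            self._client = client

        def get_client(self, api_key: str | None = None):
            if not self.has_client():
                assert api_key, "an API key is required on first use"
                # Stand-in for building AsyncAI21Client(api_key=..., num_retries=3, ...)
                self.assign_client({"api_key": api_key})
            return self._client


    model = LazyClientMixin()
    client = model.get_client(api_key="sk-example")
    assert model.get_client() is client  # later calls reuse the cached client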
model_library/providers/alibaba.py
CHANGED

@@ -1,17 +1,17 @@
-from typing import Literal
+from typing import Any, Literal

+from pydantic import SecretStr
 from typing_extensions import override

 from model_library import model_library_settings
 from model_library.base import (
+    DelegateConfig,
     DelegateOnly,
     LLMConfig,
     QueryResultCost,
     QueryResultMetadata,
 )
-from model_library.providers.openai import OpenAIModel
 from model_library.register_models import register_provider
-from model_library.utils import create_openai_client_with_defaults


 @register_provider("alibaba")

@@ -26,17 +26,26 @@ class AlibabaModel(DelegateOnly):
         super().__init__(model_name, provider, config=config)

         # https://www.alibabacloud.com/help/en/model-studio/first-api-call-to-qwen
-        self.
-            model_name=self.model_name,
-            provider=self.provider,
+        self.init_delegate(
             config=config,
-
-            api_key=model_library_settings.DASHSCOPE_API_KEY,
+            delegate_config=DelegateConfig(
                 base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
+                api_key=SecretStr(model_library_settings.DASHSCOPE_API_KEY),
             ),
             use_completions=True,
+            delegate_provider="openai",
         )

+    @override
+    def _get_extra_body(self) -> dict[str, Any]:
+        """Build extra body parameters for Qwen-specific features."""
+        extra: dict[str, Any] = {}
+        # Enable thinking mode for Qwen3 reasoning models
+        # https://www.alibabacloud.com/help/en/model-studio/use-qwen-by-calling-api
+        if self.reasoning:
+            extra["enable_thinking"] = True
+        return extra
+
     @override
     async def _calculate_cost(
         self,
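
AlibabaModel now delegates to an OpenAI-compatible client pointed at DashScope and injects Qwen-specific flags through _get_extra_body. A hedged sketch of how such an extra_body reaches the endpoint using the openai SDK directly (model name and key are placeholders; the delegate wiring itself is not shown here):

    # Sketch only: direct openai-SDK call against DashScope's compatible endpoint.
    import asyncio

    from openai import AsyncOpenAI


    async def main() -> None:
        client = AsyncOpenAI(
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
            api_key="DASHSCOPE_API_KEY_PLACEHOLDER",  # placeholder, not a real key
        )
        response = await client.chat.completions.create(
            model="qwen3-max",  # placeholder model name
            messages=[{"role": "user", "content": "Hello"}],
            # Qwen-specific flag passed through the OpenAI-compatible API
            extra_body={"enable_thinking": True},
        )
        print(response.choices[0].message.content)


    if __name__ == "__main__":
        asyncio.run(main())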
model_library/providers/amazon.py
CHANGED

@@ -11,6 +11,7 @@ import botocore
 from botocore.client import BaseClient
 from typing_extensions import override

+from model_library import model_library_settings
 from model_library.base import (
     LLM,
     FileBase,

@@ -41,20 +42,46 @@ from model_library.register_models import register_provider
 @register_provider("amazon")
 @register_provider("bedrock")
 class AmazonModel(LLM):
-
+    @override
+    def _get_default_api_key(self) -> str:
+        if getattr(model_library_settings, "AWS_ACCESS_KEY_ID", None):
+            return json.dumps(
+                {
+                    "AWS_ACCESS_KEY_ID": model_library_settings.AWS_ACCESS_KEY_ID,
+                    "AWS_SECRET_ACCESS_KEY": model_library_settings.AWS_SECRET_ACCESS_KEY,
+                    "AWS_DEFAULT_REGION": model_library_settings.AWS_DEFAULT_REGION,
+                }
+            )
+        return "using-environment"

     @override
-    def get_client(self) -> BaseClient:
-        if not
-
-
-
-
-
-
-
-
-
+    def get_client(self, api_key: str | None = None) -> BaseClient:
+        if not self.has_client():
+            assert api_key
+            if api_key != "using-environment":
+                creds = json.loads(api_key)
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        aws_access_key_id=creds["AWS_ACCESS_KEY_ID"],
+                        aws_secret_access_key=creds["AWS_SECRET_ACCESS_KEY"],
+                        region_name=creds["AWS_DEFAULT_REGION"],
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+            else:
+                client = cast(
+                    BaseClient,
+                    boto3.client(
+                        "bedrock-runtime",
+                        # default connection pool is 10
+                        config=botocore.config.Config(max_pool_connections=1000),  # pyright: ignore[reportAttributeAccessIssue]
+                    ),
+                )
+
+            self.assign_client(client)
+        return super().get_client()

     def __init__(
         self,

@@ -70,6 +97,11 @@ class AmazonModel(LLM):
         )  # supported but no access yet
         self.supports_tool_cache = self.supports_cache and "claude" in self.model_name

+        if config and config.custom_api_key:
+            raise Exception(
+                "custom_api_key is not currently supported for Amazon models"
+            )
+
     cache_control = {"type": "default"}

     async def get_tool_call_ids(self, input: Sequence[InputItem]) -> list[str]:

@@ -238,7 +270,7 @@ class AmazonModel(LLM):
         if self.supports_cache:
             body["system"].append({"cachePoint": self.cache_control})

-        if self.reasoning:
+        if self.reasoning and self.max_tokens:
             if self.max_tokens < 1024:
                 self.max_tokens = 2048
             budget_tokens = kwargs.pop(

@@ -251,9 +283,10 @@ class AmazonModel(LLM):
             }
         }

-        inference: dict[str, Any] = {
-
-
+        inference: dict[str, Any] = {}
+
+        if self.max_tokens:
+            inference["maxTokens"] = self.max_tokens

         # Only set temperature for models where supports_temperature is True.
         # For example, "thinking" models don't support temperature: https://docs.claude.com/en/docs/build-with-claude/extended-thinking#feature-compatibility