model-library 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. model_library/base/base.py +139 -62
  2. model_library/base/delegate_only.py +77 -10
  3. model_library/base/output.py +43 -0
  4. model_library/base/utils.py +35 -0
  5. model_library/config/alibaba_models.yaml +44 -57
  6. model_library/config/all_models.json +253 -126
  7. model_library/config/kimi_models.yaml +30 -3
  8. model_library/config/openai_models.yaml +15 -23
  9. model_library/config/zai_models.yaml +24 -3
  10. model_library/exceptions.py +3 -77
  11. model_library/providers/ai21labs.py +12 -8
  12. model_library/providers/alibaba.py +17 -8
  13. model_library/providers/amazon.py +49 -16
  14. model_library/providers/anthropic.py +93 -40
  15. model_library/providers/azure.py +22 -10
  16. model_library/providers/cohere.py +7 -7
  17. model_library/providers/deepseek.py +8 -8
  18. model_library/providers/fireworks.py +7 -8
  19. model_library/providers/google/batch.py +14 -10
  20. model_library/providers/google/google.py +48 -29
  21. model_library/providers/inception.py +7 -7
  22. model_library/providers/kimi.py +18 -8
  23. model_library/providers/minimax.py +15 -17
  24. model_library/providers/mistral.py +20 -8
  25. model_library/providers/openai.py +99 -22
  26. model_library/providers/openrouter.py +34 -0
  27. model_library/providers/perplexity.py +7 -7
  28. model_library/providers/together.py +7 -8
  29. model_library/providers/vals.py +12 -6
  30. model_library/providers/xai.py +47 -42
  31. model_library/providers/zai.py +38 -8
  32. model_library/registry_utils.py +39 -15
  33. model_library/retriers/__init__.py +0 -0
  34. model_library/retriers/backoff.py +73 -0
  35. model_library/retriers/base.py +225 -0
  36. model_library/retriers/token.py +427 -0
  37. model_library/retriers/utils.py +11 -0
  38. model_library/settings.py +1 -1
  39. model_library/utils.py +13 -0
  40. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/METADATA +2 -1
  41. model_library-0.1.8.dist-info/RECORD +70 -0
  42. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/WHEEL +1 -1
  43. model_library-0.1.7.dist-info/RECORD +0 -64
  44. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/licenses/LICENSE +0 -0
  45. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/top_level.txt +0 -0
--- a/model_library/base/base.py
+++ b/model_library/base/base.py
@@ -1,9 +1,12 @@
+import hashlib
 import io
 import logging
+import threading
 import time
 import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Awaitable
+from math import ceil
 from pprint import pformat
 from typing import (
     Any,
@@ -14,7 +17,7 @@ from typing import (
 )
 
 import tiktoken
-from pydantic import model_serializer
+from pydantic import SecretStr, model_serializer
 from pydantic.main import BaseModel
 from tiktoken.core import Encoding
 from typing_extensions import override
@@ -34,15 +37,15 @@ from model_library.base.output import (
     QueryResult,
     QueryResultCost,
     QueryResultMetadata,
+    RateLimit,
 )
 from model_library.base.utils import (
     get_pretty_input_types,
     serialize_for_tokenizing,
 )
-from model_library.exceptions import (
-    ImmediateRetryException,
-    retry_llm_call,
-)
+from model_library.retriers.backoff import ExponentialBackoffRetrier
+from model_library.retriers.base import BaseRetrier, R, RetrierType, retry_decorator
+from model_library.retriers.token import TokenRetrier
 from model_library.utils import truncate_str
 
 PydanticT = TypeVar("PydanticT", bound=BaseModel)
@@ -56,11 +59,18 @@ class ProviderConfig(BaseModel):
         return self.__dict__
 
 
-DEFAULT_MAX_TOKENS = 2048
+class TokenRetryParams(BaseModel):
+    input_modifier: float
+    output_modifier: float
+
+    use_dynamic_estimate: bool = True
+
+    limit: int
+    limit_refresh_seconds: Literal[60] = 60
 
 
 class LLMConfig(BaseModel):
-    max_tokens: int = DEFAULT_MAX_TOKENS
+    max_tokens: int | None = None
     temperature: float | None = None
     top_p: float | None = None
     top_k: int | None = None
@@ -75,11 +85,18 @@ class LLMConfig(BaseModel):
     native: bool = True
     provider_config: ProviderConfig | None = None
     registry_key: str | None = None
+    custom_api_key: SecretStr | None = None
+
 
+class DelegateConfig(BaseModel):
+    base_url: str
+    api_key: SecretStr
 
-RetrierType = Callable[[Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Any]]]
 
-R = TypeVar("R")  # return type
+# shared across all subclasses and instances
+# hash(provider + api_key) -> client
+client_registry_lock = threading.Lock()
+client_registry: dict[tuple[str, str], Any] = {}
 
 
 class LLM(ABC):
@@ -88,6 +105,34 @@ class LLM(ABC):
     LLM call errors should be raised as exceptions
     """
 
+    @abstractmethod
+    def get_client(self, api_key: str | None = None) -> Any:
+        """
+        Returns the cached instance of the appropriate SDK client.
+        Subclasses should implement this method and:
+        - if api_key is provided, initialize their client and call assign_client(client).
+        - else return super().get_client()
+        """
+        global client_registry
+        return client_registry[self._client_registry_key]
+
+    def assign_client(self, client: object) -> None:
+        """Thread-safe assignment to the client registry"""
+        global client_registry
+
+        if self._client_registry_key not in client_registry:
+            with client_registry_lock:
+                if self._client_registry_key not in client_registry:
+                    client_registry[self._client_registry_key] = client
+
+    def has_client(self) -> bool:
+        return self._client_registry_key in client_registry
+
+    @abstractmethod
+    def _get_default_api_key(self) -> str:
+        """Return the api key from model_library.settings"""
+        ...
+
     def __init__(
         self,
         model_name: str,
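The new `assign_client` is double-checked locking: the unlocked membership test keeps the common already-cached path lock-free, and the second test under the lock prevents two racing threads from each constructing and registering a client. A minimal standalone sketch of the same pattern (`make_client` is a hypothetical factory, not part of the library):

```python
import threading
from typing import Any, Callable

_lock = threading.Lock()
_registry: dict[tuple[str, str], Any] = {}

def get_or_create(key: tuple[str, str], make_client: Callable[[], Any]) -> Any:
    if key not in _registry:          # fast path: no lock once cached
        with _lock:
            if key not in _registry:  # re-check: another thread may have won the race
                _registry[key] = make_client()
    return _registry[key]
```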
@@ -103,7 +148,7 @@ class LLM(ABC):
         config = config or LLMConfig()
         self._registry_key = config.registry_key
 
-        self.max_tokens: int = config.max_tokens
+        self.max_tokens: int | None = config.max_tokens
         self.temperature: float | None = config.temperature
         self.top_p: float | None = config.top_p
         self.top_k: int | None = config.top_k
@@ -131,21 +176,33 @@ class LLM(ABC):
         self.logger: logging.Logger = logging.getLogger(
             f"llm.{provider}.{model_name}<instance={self.instance_id}>"
         )
-        self.custom_retrier: Callable[..., RetrierType] | None = retry_llm_call
+        self.custom_retrier: RetrierType | None = None
+
+        self.token_retry_params = None
+        # set _client_registry_key after initializing delegate
+        if not self.native:
+            return
+
+        if config.custom_api_key:
+            raw_key = config.custom_api_key.get_secret_value()
+        else:
+            raw_key = self._get_default_api_key()
+
+        key_hash = hashlib.sha256(raw_key.encode()).hexdigest()
+        self._client_registry_key = (self.provider, key_hash)
+        self._client_registry_key_model_specific = (
+            f"{self.provider}.{self.model_name}",
+            key_hash,
+        )
+        self.get_client(api_key=raw_key)
 
     @override
     def __repr__(self):
         attrs = vars(self).copy()
         attrs.pop("logger", None)
         attrs.pop("custom_retrier", None)
-        attrs.pop("_key", None)
         return f"{self.__class__.__name__}(\n{pformat(attrs, indent=2, sort_dicts=False)}\n)"
 
-    @abstractmethod
-    def get_client(self) -> object:
-        """Return the instance of the appropriate SDK client."""
-        ...
-
     @staticmethod
     async def timer_wrapper(func: Callable[[], Awaitable[R]]) -> tuple[R, float]:
         """
@@ -155,43 +212,6 @@ class LLM(ABC):
         result = await func()
         return result, round(time.perf_counter() - start, 4)
 
-    @staticmethod
-    async def immediate_retry_wrapper(
-        func: Callable[[], Awaitable[R]],
-        logger: logging.Logger,
-    ) -> R:
-        """
-        Retry the query immediately
-        """
-        MAX_IMMEDIATE_RETRIES = 10
-        retries = 0
-        while True:
-            try:
-                return await func()
-            except ImmediateRetryException as e:
-                if retries >= MAX_IMMEDIATE_RETRIES:
-                    logger.error(f"Query reached max immediate retries {retries}: {e}")
-                    raise Exception(
-                        f"Query reached max immediate retries {retries}: {e}"
-                    ) from e
-                retries += 1
-
-                logger.warning(
-                    f"Query retried immediately {retries}/{MAX_IMMEDIATE_RETRIES}: {e}"
-                )
-
-    @staticmethod
-    async def backoff_retry_wrapper(
-        func: Callable[..., Awaitable[R]],
-        backoff_retrier: RetrierType | None,
-    ) -> R:
-        """
-        Retry the query with backoff
-        """
-        if not backoff_retrier:
-            return await func()
-        return await backoff_retrier(func)()
-
     async def delegate_query(
         self,
         input: Sequence[InputItem],
@@ -276,15 +296,38 @@ class LLM(ABC):
             return await LLM.timer_wrapper(query_func)
 
         async def immediate_retry() -> tuple[QueryResult, float]:
-            return await LLM.immediate_retry_wrapper(timed_query, query_logger)
-
-        async def backoff_retry() -> tuple[QueryResult, float]:
-            backoff_retrier = (
-                self.custom_retrier(query_logger) if self.custom_retrier else None
-            )
-            return await LLM.backoff_retry_wrapper(immediate_retry, backoff_retrier)
+            return await BaseRetrier.immediate_retry_wrapper(timed_query, query_logger)
+
+        async def default_retry() -> tuple[QueryResult, float]:
+            if self.token_retry_params:
+                (
+                    estimate_input_tokens,
+                    estimate_output_tokens,
+                ) = await self.estimate_query_tokens(
+                    input,
+                    tools=tools,
+                    **kwargs,
+                )
+                retrier = TokenRetrier(
+                    logger=query_logger,
+                    client_registry_key=self._client_registry_key_model_specific,
+                    estimate_input_tokens=estimate_input_tokens,
+                    estimate_output_tokens=estimate_output_tokens,
+                    dynamic_estimate_instance_id=self.instance_id
+                    if self.token_retry_params.use_dynamic_estimate
+                    else None,
+                )
+            else:
+                retrier = ExponentialBackoffRetrier(logger=query_logger)
+            return await retry_decorator(retrier)(immediate_retry)()
+
+        run_with_retry = (
+            default_retry
+            if not self.custom_retrier
+            else self.custom_retrier(immediate_retry)
+        )
 
-        output, duration = await backoff_retry()
+        output, duration = await run_with_retry()
         output.metadata.duration_seconds = duration
         output.metadata.cost = await self._calculate_cost(output.metadata)
 
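`custom_retrier` also changed shape: it used to be a factory taking a logger; now it is applied directly to `immediate_retry`, i.e. a decorator from an async callable to an async callable. The new `RetrierType` lives in `model_library.retriers.base`, which this diff does not show, so the exact alias is inferred from the call site. A hedged sketch of a user-supplied retrier under that assumption (the retry policy is invented for illustration):

```python
import asyncio
import logging
from collections.abc import Awaitable
from typing import Any, Callable

logger = logging.getLogger("llm.custom_retry")

def naive_retrier(
    func: Callable[..., Awaitable[Any]],
) -> Callable[..., Awaitable[Any]]:
    async def wrapped(*args: Any, **kwargs: Any) -> Any:
        # Illustrative policy: three attempts, two seconds apart.
        for attempt in range(1, 3):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.warning(f"attempt {attempt} failed, retrying: {e}")
                await asyncio.sleep(2)
        return await func(*args, **kwargs)  # final attempt propagates errors
    return wrapped

# model.custom_retrier = naive_retrier  # replaces the default retriers entirely
```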
@@ -293,6 +336,16 @@ class LLM(ABC):
 
         return output
 
+    async def init_token_retry(self, token_retry_params: TokenRetryParams) -> None:
+        self.token_retry_params = token_retry_params
+        await TokenRetrier.init_remaining_tokens(
+            client_registry_key=self._client_registry_key_model_specific,
+            limit=self.token_retry_params.limit,
+            limit_refresh_seconds=self.token_retry_params.limit_refresh_seconds,
+            get_rate_limit_func=self.get_rate_limit,
+            logger=self.logger,
+        )
+
     async def _calculate_cost(
         self,
         metadata: QueryResultMetadata,
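Callers opt in by awaiting `init_token_retry` once per model instance; queries are then budgeted against the shared remaining-token counter. A usage sketch with illustrative numbers (`model` is a hypothetical instance; the modifiers and the 450k limit are not from this diff):

```python
from model_library.base.base import TokenRetryParams

await model.init_token_retry(
    TokenRetryParams(
        input_modifier=1.2,   # pad the tokenizer's count by 20%
        output_modifier=0.5,  # assume output is about half the padded input
        limit=450_000,        # provider tokens-per-minute budget
    )
)
```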
@@ -438,6 +491,30 @@ class LLM(ABC):
         """Upload a file to the model provider"""
         ...
 
+    async def get_rate_limit(self) -> RateLimit | None:
+        """Get the rate limit for the model provider"""
+        return None
+
+    async def estimate_query_tokens(
+        self,
+        input: Sequence[InputItem],
+        *,
+        tools: list[ToolDefinition] = [],
+        **kwargs: object,
+    ) -> tuple[int, int]:
+        """Pessimistically estimate the number of tokens required for a query"""
+        assert self.token_retry_params
+
+        # TODO: when passing in images and files, we really need to take that into account when calculating the output tokens!!
+
+        input_tokens = (
+            await self.count_tokens(input, history=[], tools=tools, **kwargs)
+            * self.token_retry_params.input_modifier
+        )
+
+        output_tokens = input_tokens * self.token_retry_params.output_modifier
+        return ceil(input_tokens), ceil(output_tokens)
+
     async def get_encoding(self) -> Encoding:
         """Get the appropriate tokenizer"""
 
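The pessimistic estimate is plain arithmetic over the tokenizer count. Worked through with illustrative numbers:

```python
from math import ceil

counted = 1_000                     # count_tokens() result (illustrative)
input_tokens = counted * 1.2        # input_modifier = 1.2 -> 1200.0
output_tokens = input_tokens * 0.5  # output_modifier = 0.5 -> 600.0
assert (ceil(input_tokens), ceil(output_tokens)) == (1200, 600)
```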
--- a/model_library/base/delegate_only.py
+++ b/model_library/base/delegate_only.py
@@ -13,6 +13,7 @@ from model_library.base import (
     QueryResult,
     ToolDefinition,
 )
+from model_library.base.base import DelegateConfig
 
 
 class DelegateOnlyException(Exception):
@@ -21,17 +22,51 @@ class DelegateOnlyException(Exception):
     delegate-only model.
     """
 
-    DEFAULT_MESSAGE: str = "This model supports only delegate-only functionality. Only the query() method should be used."
+    DEFAULT_MESSAGE: str = "This model is running in delegate-only mode; certain functionality is not supported."
 
     def __init__(self, message: str | None = None):
         super().__init__(message or DelegateOnlyException.DEFAULT_MESSAGE)
 
 
 class DelegateOnly(LLM):
-    @override
-    def get_client(self) -> None:
+    def _get_default_api_key(self) -> str:
         raise DelegateOnlyException()
 
+    @override
+    def get_client(self, api_key: str | None = None) -> None:
+        assert self.delegate
+        return self.delegate.get_client()
+
+    def init_delegate(
+        self,
+        config: LLMConfig | None,
+        delegate_config: DelegateConfig,
+        delegate_provider: Literal["openai", "anthropic"],
+        use_completions: bool = True,
+    ) -> None:
+        from model_library.providers.anthropic import AnthropicModel
+        from model_library.providers.openai import OpenAIModel
+
+        match delegate_provider:
+            case "openai":
+                self.delegate = OpenAIModel(
+                    model_name=self.model_name,
+                    provider=self.provider,
+                    config=config,
+                    use_completions=use_completions,
+                    delegate_config=delegate_config,
+                )
+            case "anthropic":
+                self.delegate = AnthropicModel(
+                    model_name=self.model_name,
+                    provider=self.provider,
+                    config=config,
+                    delegate_config=delegate_config,
+                )
+        self._client_registry_key_model_specific = (
+            self.delegate._client_registry_key_model_specific
+        )
+
     def __init__(
         self,
         model_name: str,
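`init_delegate` wires a DelegateOnly model to a concrete OpenAI- or Anthropic-backed delegate using the new `DelegateConfig`. A hedged usage sketch (the subclass, endpoint, and key are invented for illustration; only the `init_delegate` signature comes from this diff):

```python
from pydantic import SecretStr

from model_library.base.base import DelegateConfig, LLMConfig

# SomeDelegateOnlyModel stands in for a concrete DelegateOnly subclass.
model = SomeDelegateOnlyModel(model_name="some-model", provider="some-provider")
model.init_delegate(
    config=LLMConfig(),
    delegate_config=DelegateConfig(
        base_url="https://api.example.com/v1",   # placeholder endpoint
        api_key=SecretStr("sk-example"),         # placeholder secret
    ),
    delegate_provider="openai",  # routes through OpenAIModel (chat completions)
)
```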
@@ -42,6 +77,11 @@ class DelegateOnly(LLM):
         config = config or LLMConfig()
         config.native = False
         super().__init__(model_name, provider, config=config)
+        config.native = True
+
+    def _get_extra_body(self) -> dict[str, Any]:
+        """Build extra body parameters for delegate-specific features."""
+        return {}
 
     @override
     async def _query_impl(
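The empty `_get_extra_body` hook gives provider subclasses a seam for request-body extras without overriding `_query_impl`. A hypothetical override (the class name and flag are invented for illustration):

```python
from typing import Any

from model_library.base.delegate_only import DelegateOnly

class ExampleProviderModel(DelegateOnly):
    def _get_extra_body(self) -> dict[str, Any]:
        # Forwarded into delegate_query(extra_body=...) on every call.
        return {"enable_thinking": True}  # illustrative vendor flag, not a real API field
```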
@@ -53,9 +93,12 @@ class DelegateOnly(LLM):
         **kwargs: object,
     ) -> QueryResult:
         assert self.delegate
-
         return await self.delegate_query(
-            input, tools=tools, query_logger=query_logger, **kwargs
+            input,
+            tools=tools,
+            query_logger=query_logger,
+            extra_body=self._get_extra_body(),
+            **kwargs,
         )
 
     @override
@@ -66,7 +109,8 @@ class DelegateOnly(LLM):
         tools: list[ToolDefinition],
         **kwargs: object,
     ) -> dict[str, Any]:
-        raise DelegateOnlyException()
+        assert self.delegate
+        return await self.delegate.build_body(input, tools=tools, **kwargs)
 
     @override
     async def parse_input(
  async def parse_input(
@@ -74,28 +118,32 @@ class DelegateOnly(LLM):
74
118
  input: Sequence[InputItem],
75
119
  **kwargs: Any,
76
120
  ) -> Any:
77
- raise DelegateOnlyException()
121
+ assert self.delegate
122
+ return await self.delegate.parse_input(input, **kwargs)
78
123
 
79
124
  @override
80
125
  async def parse_image(
81
126
  self,
82
127
  image: FileInput,
83
128
  ) -> Any:
84
- raise DelegateOnlyException()
129
+ assert self.delegate
130
+ return await self.delegate.parse_image(image)
85
131
 
86
132
  @override
87
133
  async def parse_file(
88
134
  self,
89
135
  file: FileInput,
90
136
  ) -> Any:
91
- raise DelegateOnlyException()
137
+ assert self.delegate
138
+ return await self.delegate.parse_file(file)
92
139
 
93
140
  @override
94
141
  async def parse_tools(
95
142
  self,
96
143
  tools: list[ToolDefinition],
97
144
  ) -> Any:
98
- raise DelegateOnlyException()
145
+ assert self.delegate
146
+ return await self.delegate.parse_tools(tools)
99
147
 
100
148
  @override
101
149
  async def upload_file(
@@ -106,3 +154,22 @@ class DelegateOnly(LLM):
         type: Literal["image", "file"] = "file",
     ) -> FileWithId:
         raise DelegateOnlyException()
+
+    @override
+    async def get_rate_limit(self) -> Any:
+        assert self.delegate
+        return await self.delegate.get_rate_limit()
+
+    @override
+    async def count_tokens(
+        self,
+        input: Sequence[InputItem],
+        *,
+        history: Sequence[InputItem] = [],
+        tools: list[ToolDefinition] = [],
+        **kwargs: object,
+    ) -> int:
+        assert self.delegate
+        return await self.delegate.count_tokens(
+            input, history=history, tools=tools, **kwargs
+        )
--- a/model_library/base/output.py
+++ b/model_library/base/output.py
@@ -118,6 +118,48 @@ class QueryResultCost(BaseModel):
     )
 
 
+class RateLimit(BaseModel):
+    """Rate limit information"""
+
+    request_limit: int | None = None
+    request_remaining: int | None = None
+
+    token_limit: int | None = None
+    token_limit_input: int | None = None
+    token_limit_output: int | None = None
+
+    token_remaining: int | None = None
+    token_remaining_input: int | None = None
+    token_remaining_output: int | None = None
+
+    unix_timestamp: float
+    raw: Any
+
+    @computed_field
+    @property
+    def token_limit_total(self) -> int:
+        if self.token_limit:
+            return self.token_limit
+        else:
+            return (self.token_limit_input or 0) + (self.token_limit_output or 0)
+
+    @computed_field
+    @property
+    def token_remaining_total(self) -> int:
+        if self.token_remaining:
+            return self.token_remaining
+        else:
+            return (self.token_remaining_input or 0) + (
+                self.token_remaining_output or 0
+            )
+
+    @override
+    def __repr__(self):
+        attrs = vars(self).copy()
+        attrs.pop("raw", None)
+        return f"{self.__class__.__name__}(\n{pformat(attrs, indent=2, sort_dicts=False)}\n)"
+
+
 class QueryResultMetadata(BaseModel):
     """
     Metadata for a query: token usage and timing.
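The computed totals fall back to summing the split input/output figures when a provider reports no combined number. For example (header values are illustrative):

```python
from model_library.base.output import RateLimit

rl = RateLimit(
    token_limit_input=200_000,
    token_limit_output=50_000,
    token_remaining_input=120_000,
    token_remaining_output=50_000,
    unix_timestamp=1_767_995_881.0,
    raw={},
)
assert rl.token_limit_total == 250_000      # no token_limit -> input + output
assert rl.token_remaining_total == 170_000
```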
@@ -131,6 +173,7 @@ class QueryResultMetadata(BaseModel):
     reasoning_tokens: int | None = None
     cache_read_tokens: int | None = None
     cache_write_tokens: int | None = None
+    extra: dict[str, Any] = {}
 
     @property
     def default_duration_seconds(self) -> float:
--- a/model_library/base/utils.py
+++ b/model_library/base/utils.py
@@ -1,4 +1,6 @@
 import json
+import re
+from datetime import datetime, timedelta
 from typing import Any, Sequence, TypeVar
 
 from pydantic import BaseModel
@@ -77,3 +79,36 @@ def get_pretty_input_types(input: Sequence["InputItem"], verbose: bool = False)
 
     processed_items = [f"  {process_item(item)}" for item in input]
     return "\n" + "\n".join(processed_items) if processed_items else ""
+
+
+TIME_PATTERN = re.compile(r"^(\d+(?:\.\d+)?)([a-zA-Z]+)$")
+UNIT_TO_SECONDS = {
+    "ms": 0.001,
+    "s": 1,
+    "m": 60,
+    "h": 3600,
+}
+
+
+def to_timestamp(input_str: str, server_now: datetime) -> int:
+    """Converts a header string into a server-relative Unix timestamp in ms."""
+    input_str = input_str.strip()
+
+    # ISO Timestamp (e.g. 2026-01-09T21:58:01Z)
+    if "T" in input_str and "-" in input_str:
+        try:
+            dt = datetime.fromisoformat(input_str.replace("Z", "+00:00"))
+            return int(dt.timestamp() * 1000)
+        except ValueError:
+            pass
+
+    # Duration (e.g. 10s, 6ms)
+    match = TIME_PATTERN.match(input_str)
+    if match:
+        value, unit = match.groups()
+        offset_seconds = float(value) * UNIT_TO_SECONDS.get(unit.lower(), 0)
+        # Add duration to the SERVER'S provided date
+        dt = server_now + timedelta(seconds=offset_seconds)
+        return int(dt.timestamp() * 1000)
+
+    raise ValueError(f"Unsupported time format: {input_str}")
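`to_timestamp` accepts either an absolute ISO-8601 value or a relative duration, anchoring durations to the server's clock rather than local time. Both branches, using the function's own example timestamp:

```python
from datetime import datetime, timezone

from model_library.base.utils import to_timestamp

server_now = datetime(2026, 1, 9, 21, 57, 51, tzinfo=timezone.utc)

to_timestamp("2026-01-09T21:58:01Z", server_now)  # 1767995881000 (absolute)
to_timestamp("10s", server_now)                   # 1767995881000 (server_now + 10s)
```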
--- a/model_library/config/alibaba_models.yaml
+++ b/model_library/config/alibaba_models.yaml
@@ -1,17 +1,51 @@
-qwen-models:
-  base-config:
-    company: Alibaba
-    open_source: false
+base-config:
+  company: Alibaba
+  open_source: false
+  supports:
+    temperature: true
+  metadata:
+    available_for_everyone: false
+    available_as_evaluator: false
+  default_parameters:
+    temperature: 0.7
+  properties:
+    reasoning_model: false
 
+qwen-3-vl-models:
+  base-config:
     supports:
-      temperature: true
+      images: true
+
+  alibaba/qwen3-vl-plus-2025-09-23:
+    label: Qwen 3 VL Plus
+    open_source: true
+    description: Qwen 3 VL Plus (2025-09-23)
+    release_date: 2025-09-23
     metadata:
-      available_for_everyone: false
-      available_as_evaluator: false
-    default_parameters:
-      temperature: 0.7
+      deprecated: true
     properties:
+      context_window: 262_144
+      max_tokens: 32_768
+      training_cutoff: ""
       reasoning_model: false
+    costs_per_million_token:
+      input: 0.2
+      output: 1.6
+
+qwen-3-max-models:
+  base-config:
+    supports:
+      tools: true
+      images: false
+
+  alibaba/qwen3-max-2026-01-23:
+    label: Qwen 3 Max Thinking
+    description: Qwen 3 Max with enhanced reasoning capabilities
+    release_date: 2026-01-23
+    properties:
+      context_window: 256_000
+      max_tokens: 32_000
+      reasoning_model: true
 
   alibaba/qwen3-max-preview:
     label: Qwen 3 Max Preview
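The YAML reshuffle moves shared defaults (`supports`, `metadata`, `default_parameters`, `properties`) into a file-level `base-config`, with per-group `base-config` blocks and per-model overrides layered on top. A minimal sketch of that layering, assuming later, more specific layers win on a recursive merge (an inference from the file shape, not a documented loader API):

```python
from typing import Any

def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Overlay override onto base, recursing into nested dicts."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

file_base = {"company": "Alibaba", "properties": {"reasoning_model": False}}
group_base = {"supports": {"tools": True, "images": False}}
model_entry = {"properties": {"reasoning_model": True, "max_tokens": 32_000}}

resolved = deep_merge(deep_merge(file_base, group_base), model_entry)
assert resolved["properties"]["reasoning_model"] is True
```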
@@ -20,15 +54,7 @@ qwen-models:
     properties:
       context_window: 262_144
       max_tokens: 65_536
-      training_cutoff: ""
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
+      reasoning_model: true
 
   alibaba/qwen3-max-2025-09-23:
     label: Qwen 3 Max 2025-09-23
@@ -39,14 +65,6 @@ qwen-models:
       max_tokens: 65_536
       training_cutoff: ""
       reasoning_model: true
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
 
   alibaba/qwen3-max:
     label: Qwen 3 Max
@@ -57,34 +75,3 @@ qwen-models:
       max_tokens: 65_536
       training_cutoff: ""
       reasoning_model: false
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-      cache:
-        read_discount: 0.8
-        write_markup: 1
-      context:
-        threshold: 32_000
-        input: 2.4
-        output: 12
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
-
-  alibaba/qwen3-vl-plus-2025-09-23:
-    label: Qwen 3 VL Plus
-    open_source: true
-    description: Qwen 3 VL Plus (2025-09-23)
-    release_date: 2025-09-23
-    properties:
-      context_window: 262_144
-      max_tokens: 32_768
-      training_cutoff: ""
-      reasoning_model: false
-    costs_per_million_token:
-      input: 0.2
-      output: 1.6
-    supports:
-      images: true