model-library 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. model_library/base/base.py +139 -62
  2. model_library/base/delegate_only.py +77 -10
  3. model_library/base/output.py +43 -0
  4. model_library/base/utils.py +35 -0
  5. model_library/config/alibaba_models.yaml +44 -57
  6. model_library/config/all_models.json +253 -126
  7. model_library/config/kimi_models.yaml +30 -3
  8. model_library/config/openai_models.yaml +15 -23
  9. model_library/config/zai_models.yaml +24 -3
  10. model_library/exceptions.py +3 -77
  11. model_library/providers/ai21labs.py +12 -8
  12. model_library/providers/alibaba.py +17 -8
  13. model_library/providers/amazon.py +49 -16
  14. model_library/providers/anthropic.py +93 -40
  15. model_library/providers/azure.py +22 -10
  16. model_library/providers/cohere.py +7 -7
  17. model_library/providers/deepseek.py +8 -8
  18. model_library/providers/fireworks.py +7 -8
  19. model_library/providers/google/batch.py +14 -10
  20. model_library/providers/google/google.py +48 -29
  21. model_library/providers/inception.py +7 -7
  22. model_library/providers/kimi.py +18 -8
  23. model_library/providers/minimax.py +15 -17
  24. model_library/providers/mistral.py +20 -8
  25. model_library/providers/openai.py +99 -22
  26. model_library/providers/openrouter.py +34 -0
  27. model_library/providers/perplexity.py +7 -7
  28. model_library/providers/together.py +7 -8
  29. model_library/providers/vals.py +12 -6
  30. model_library/providers/xai.py +47 -42
  31. model_library/providers/zai.py +38 -8
  32. model_library/registry_utils.py +39 -15
  33. model_library/retriers/__init__.py +0 -0
  34. model_library/retriers/backoff.py +73 -0
  35. model_library/retriers/base.py +225 -0
  36. model_library/retriers/token.py +427 -0
  37. model_library/retriers/utils.py +11 -0
  38. model_library/settings.py +1 -1
  39. model_library/utils.py +13 -0
  40. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/METADATA +2 -1
  41. model_library-0.1.8.dist-info/RECORD +70 -0
  42. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/WHEEL +1 -1
  43. model_library-0.1.7.dist-info/RECORD +0 -64
  44. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/licenses/LICENSE +0 -0
  45. {model_library-0.1.7.dist-info → model_library-0.1.8.dist-info}/top_level.txt +0 -0
--- a/model_library/base/base.py
+++ b/model_library/base/base.py
@@ -1,9 +1,12 @@
+import hashlib
 import io
 import logging
+import threading
 import time
 import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Awaitable
+from math import ceil
 from pprint import pformat
 from typing import (
     Any,
@@ -14,7 +17,7 @@ from typing import (
 )
 
 import tiktoken
-from pydantic import model_serializer
+from pydantic import SecretStr, model_serializer
 from pydantic.main import BaseModel
 from tiktoken.core import Encoding
 from typing_extensions import override
@@ -34,15 +37,15 @@ from model_library.base.output import (
     QueryResult,
     QueryResultCost,
     QueryResultMetadata,
+    RateLimit,
 )
 from model_library.base.utils import (
     get_pretty_input_types,
     serialize_for_tokenizing,
 )
-from model_library.exceptions import (
-    ImmediateRetryException,
-    retry_llm_call,
-)
+from model_library.retriers.backoff import ExponentialBackoffRetrier
+from model_library.retriers.base import BaseRetrier, R, RetrierType, retry_decorator
+from model_library.retriers.token import TokenRetrier
 from model_library.utils import truncate_str
 
 PydanticT = TypeVar("PydanticT", bound=BaseModel)
@@ -56,11 +59,18 @@ class ProviderConfig(BaseModel):
         return self.__dict__
 
 
-DEFAULT_MAX_TOKENS = 2048
+class TokenRetryParams(BaseModel):
+    input_modifier: float
+    output_modifier: float
+
+    use_dynamic_estimate: bool = True
+
+    limit: int
+    limit_refresh_seconds: Literal[60] = 60
 
 
 class LLMConfig(BaseModel):
-    max_tokens: int = DEFAULT_MAX_TOKENS
+    max_tokens: int | None = None
     temperature: float | None = None
     top_p: float | None = None
     top_k: int | None = None
@@ -75,11 +85,18 @@ class LLMConfig(BaseModel):
     native: bool = True
     provider_config: ProviderConfig | None = None
     registry_key: str | None = None
+    custom_api_key: SecretStr | None = None
+
 
+class DelegateConfig(BaseModel):
+    base_url: str
+    api_key: SecretStr
 
-RetrierType = Callable[[Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Any]]]
 
-R = TypeVar("R")  # return type
+# shared across all subclasses and instances
+# hash(provider + api_key) -> client
+client_registry_lock = threading.Lock()
+client_registry: dict[tuple[str, str], Any] = {}
 
 
 class LLM(ABC):
@@ -88,6 +105,34 @@ class LLM(ABC):
     LLM call errors should be raised as exceptions
     """
 
+    @abstractmethod
+    def get_client(self, api_key: str | None = None) -> Any:
+        """
+        Returns the cached instance of the appropriate SDK client.
+        Subclasses should implement this method and:
+        - if api_key is provided, initialize their client and call assign_client(client).
+        - else return super().get_client()
+        """
+        global client_registry
+        return client_registry[self._client_registry_key]
+
+    def assign_client(self, client: object) -> None:
+        """Thread-safe assignment to the client registry"""
+        global client_registry
+
+        if self._client_registry_key not in client_registry:
+            with client_registry_lock:
+                if self._client_registry_key not in client_registry:
+                    client_registry[self._client_registry_key] = client
+
+    def has_client(self) -> bool:
+        return self._client_registry_key in client_registry
+
+    @abstractmethod
+    def _get_default_api_key(self) -> str:
+        """Return the api key from model_library.settings"""
+        ...
+
     def __init__(
         self,
         model_name: str,
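The new `assign_client` is double-checked locking: the unlocked membership test keeps the common already-cached path lock-free, and the second test under the lock prevents two racing threads from each constructing and registering a client. A minimal standalone sketch of the same pattern (`make_client` is a hypothetical factory, not part of the library):

```python
import threading
from typing import Any, Callable

_lock = threading.Lock()
_registry: dict[tuple[str, str], Any] = {}

def get_or_create(key: tuple[str, str], make_client: Callable[[], Any]) -> Any:
    if key not in _registry:          # fast path: no lock once cached
        with _lock:
            if key not in _registry:  # re-check: another thread may have won the race
                _registry[key] = make_client()
    return _registry[key]
```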
@@ -103,7 +148,7 @@ class LLM(ABC):
         config = config or LLMConfig()
         self._registry_key = config.registry_key
 
-        self.max_tokens: int = config.max_tokens
+        self.max_tokens: int | None = config.max_tokens
         self.temperature: float | None = config.temperature
         self.top_p: float | None = config.top_p
         self.top_k: int | None = config.top_k
@@ -131,21 +176,33 @@ class LLM(ABC):
         self.logger: logging.Logger = logging.getLogger(
             f"llm.{provider}.{model_name}<instance={self.instance_id}>"
         )
-        self.custom_retrier: Callable[..., RetrierType] | None = retry_llm_call
+        self.custom_retrier: RetrierType | None = None
+
+        self.token_retry_params = None
+        # set _client_registry_key after initializing delegate
+        if not self.native:
+            return
+
+        if config.custom_api_key:
+            raw_key = config.custom_api_key.get_secret_value()
+        else:
+            raw_key = self._get_default_api_key()
+
+        key_hash = hashlib.sha256(raw_key.encode()).hexdigest()
+        self._client_registry_key = (self.provider, key_hash)
+        self._client_registry_key_model_specific = (
+            f"{self.provider}.{self.model_name}",
+            key_hash,
+        )
+        self.get_client(api_key=raw_key)
 
     @override
     def __repr__(self):
         attrs = vars(self).copy()
         attrs.pop("logger", None)
         attrs.pop("custom_retrier", None)
-        attrs.pop("_key", None)
         return f"{self.__class__.__name__}(\n{pformat(attrs, indent=2, sort_dicts=False)}\n)"
 
-    @abstractmethod
-    def get_client(self) -> object:
-        """Return the instance of the appropriate SDK client."""
-        ...
-
     @staticmethod
     async def timer_wrapper(func: Callable[[], Awaitable[R]]) -> tuple[R, float]:
         """
@@ -155,43 +212,6 @@ class LLM(ABC):
         result = await func()
         return result, round(time.perf_counter() - start, 4)
 
-    @staticmethod
-    async def immediate_retry_wrapper(
-        func: Callable[[], Awaitable[R]],
-        logger: logging.Logger,
-    ) -> R:
-        """
-        Retry the query immediately
-        """
-        MAX_IMMEDIATE_RETRIES = 10
-        retries = 0
-        while True:
-            try:
-                return await func()
-            except ImmediateRetryException as e:
-                if retries >= MAX_IMMEDIATE_RETRIES:
-                    logger.error(f"Query reached max immediate retries {retries}: {e}")
-                    raise Exception(
-                        f"Query reached max immediate retries {retries}: {e}"
-                    ) from e
-                retries += 1
-
-                logger.warning(
-                    f"Query retried immediately {retries}/{MAX_IMMEDIATE_RETRIES}: {e}"
-                )
-
-    @staticmethod
-    async def backoff_retry_wrapper(
-        func: Callable[..., Awaitable[R]],
-        backoff_retrier: RetrierType | None,
-    ) -> R:
-        """
-        Retry the query with backoff
-        """
-        if not backoff_retrier:
-            return await func()
-        return await backoff_retrier(func)()
-
     async def delegate_query(
         self,
         input: Sequence[InputItem],
@@ -276,15 +296,38 @@ class LLM(ABC):
             return await LLM.timer_wrapper(query_func)
 
         async def immediate_retry() -> tuple[QueryResult, float]:
-            return await LLM.immediate_retry_wrapper(timed_query, query_logger)
-
-        async def backoff_retry() -> tuple[QueryResult, float]:
-            backoff_retrier = (
-                self.custom_retrier(query_logger) if self.custom_retrier else None
-            )
-            return await LLM.backoff_retry_wrapper(immediate_retry, backoff_retrier)
+            return await BaseRetrier.immediate_retry_wrapper(timed_query, query_logger)
+
+        async def default_retry() -> tuple[QueryResult, float]:
+            if self.token_retry_params:
+                (
+                    estimate_input_tokens,
+                    estimate_output_tokens,
+                ) = await self.estimate_query_tokens(
+                    input,
+                    tools=tools,
+                    **kwargs,
+                )
+                retrier = TokenRetrier(
+                    logger=query_logger,
+                    client_registry_key=self._client_registry_key_model_specific,
+                    estimate_input_tokens=estimate_input_tokens,
+                    estimate_output_tokens=estimate_output_tokens,
+                    dynamic_estimate_instance_id=self.instance_id
+                    if self.token_retry_params.use_dynamic_estimate
+                    else None,
+                )
+            else:
+                retrier = ExponentialBackoffRetrier(logger=query_logger)
+            return await retry_decorator(retrier)(immediate_retry)()
+
+        run_with_retry = (
+            default_retry
+            if not self.custom_retrier
+            else self.custom_retrier(immediate_retry)
+        )
 
-        output, duration = await backoff_retry()
+        output, duration = await run_with_retry()
         output.metadata.duration_seconds = duration
         output.metadata.cost = await self._calculate_cost(output.metadata)
 
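`custom_retrier` also changed shape: it used to be a factory taking a logger; now it is applied directly to `immediate_retry`, i.e. a decorator from an async callable to an async callable. The new `RetrierType` lives in `model_library.retriers.base`, which this diff does not show, so the exact alias is inferred from the call site. A hedged sketch of a user-supplied retrier under that assumption (the retry policy is invented for illustration):

```python
import asyncio
import logging
from collections.abc import Awaitable
from typing import Any, Callable

logger = logging.getLogger("llm.custom_retry")

def naive_retrier(
    func: Callable[..., Awaitable[Any]],
) -> Callable[..., Awaitable[Any]]:
    async def wrapped(*args: Any, **kwargs: Any) -> Any:
        # Illustrative policy: three attempts, two seconds apart.
        for attempt in range(1, 3):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.warning(f"attempt {attempt} failed, retrying: {e}")
                await asyncio.sleep(2)
        return await func(*args, **kwargs)  # final attempt propagates errors
    return wrapped

# model.custom_retrier = naive_retrier  # replaces the default retriers entirely
```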
@@ -293,6 +336,16 @@ class LLM(ABC):
 
         return output
 
+    async def init_token_retry(self, token_retry_params: TokenRetryParams) -> None:
+        self.token_retry_params = token_retry_params
+        await TokenRetrier.init_remaining_tokens(
+            client_registry_key=self._client_registry_key_model_specific,
+            limit=self.token_retry_params.limit,
+            limit_refresh_seconds=self.token_retry_params.limit_refresh_seconds,
+            get_rate_limit_func=self.get_rate_limit,
+            logger=self.logger,
+        )
+
     async def _calculate_cost(
         self,
         metadata: QueryResultMetadata,
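Callers opt in by awaiting `init_token_retry` once per model instance; queries are then budgeted against the shared remaining-token counter. A usage sketch with illustrative numbers (`model` is a hypothetical instance; the modifiers and the 450k limit are not from this diff):

```python
from model_library.base.base import TokenRetryParams

await model.init_token_retry(
    TokenRetryParams(
        input_modifier=1.2,   # pad the tokenizer's count by 20%
        output_modifier=0.5,  # assume output is about half the padded input
        limit=450_000,        # provider tokens-per-minute budget
    )
)
```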
@@ -438,6 +491,30 @@ class LLM(ABC):
         """Upload a file to the model provider"""
         ...
 
+    async def get_rate_limit(self) -> RateLimit | None:
+        """Get the rate limit for the model provider"""
+        return None
+
+    async def estimate_query_tokens(
+        self,
+        input: Sequence[InputItem],
+        *,
+        tools: list[ToolDefinition] = [],
+        **kwargs: object,
+    ) -> tuple[int, int]:
+        """Pessimistically estimate the number of tokens required for a query"""
+        assert self.token_retry_params
+
+        # TODO: when passing in images and files, we really need to take that into account when calculating the output tokens!!
+
+        input_tokens = (
+            await self.count_tokens(input, history=[], tools=tools, **kwargs)
+            * self.token_retry_params.input_modifier
+        )
+
+        output_tokens = input_tokens * self.token_retry_params.output_modifier
+        return ceil(input_tokens), ceil(output_tokens)
+
     async def get_encoding(self) -> Encoding:
         """Get the appropriate tokenizer"""
 
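The pessimistic estimate is plain arithmetic over the tokenizer count. Worked through with illustrative numbers:

```python
from math import ceil

counted = 1_000                     # count_tokens() result (illustrative)
input_tokens = counted * 1.2        # input_modifier = 1.2 -> 1200.0
output_tokens = input_tokens * 0.5  # output_modifier = 0.5 -> 600.0
assert (ceil(input_tokens), ceil(output_tokens)) == (1200, 600)
```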
--- a/model_library/base/delegate_only.py
+++ b/model_library/base/delegate_only.py
@@ -13,6 +13,7 @@ from model_library.base import (
     QueryResult,
     ToolDefinition,
 )
+from model_library.base.base import DelegateConfig
 
 
 class DelegateOnlyException(Exception):
@@ -21,17 +22,51 @@ class DelegateOnlyException(Exception):
     delegate-only model.
     """
 
-    DEFAULT_MESSAGE: str = "This model supports only delegate-only functionality. Only the query() method should be used."
+    DEFAULT_MESSAGE: str = "This model is running in delegate-only mode; certain functionality is not supported."
 
     def __init__(self, message: str | None = None):
         super().__init__(message or DelegateOnlyException.DEFAULT_MESSAGE)
 
 
 class DelegateOnly(LLM):
-    @override
-    def get_client(self) -> None:
+    def _get_default_api_key(self) -> str:
         raise DelegateOnlyException()
 
+    @override
+    def get_client(self, api_key: str | None = None) -> None:
+        assert self.delegate
+        return self.delegate.get_client()
+
+    def init_delegate(
+        self,
+        config: LLMConfig | None,
+        delegate_config: DelegateConfig,
+        delegate_provider: Literal["openai", "anthropic"],
+        use_completions: bool = True,
+    ) -> None:
+        from model_library.providers.anthropic import AnthropicModel
+        from model_library.providers.openai import OpenAIModel
+
+        match delegate_provider:
+            case "openai":
+                self.delegate = OpenAIModel(
+                    model_name=self.model_name,
+                    provider=self.provider,
+                    config=config,
+                    use_completions=use_completions,
+                    delegate_config=delegate_config,
+                )
+            case "anthropic":
+                self.delegate = AnthropicModel(
+                    model_name=self.model_name,
+                    provider=self.provider,
+                    config=config,
+                    delegate_config=delegate_config,
+                )
+        self._client_registry_key_model_specific = (
+            self.delegate._client_registry_key_model_specific
+        )
+
     def __init__(
         self,
         model_name: str,
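`init_delegate` wires a DelegateOnly model to a concrete OpenAI- or Anthropic-backed delegate using the new `DelegateConfig`. A hedged usage sketch (the subclass, endpoint, and key are invented for illustration; only the `init_delegate` signature comes from this diff):

```python
from pydantic import SecretStr

from model_library.base.base import DelegateConfig, LLMConfig

# SomeDelegateOnlyModel stands in for a concrete DelegateOnly subclass.
model = SomeDelegateOnlyModel(model_name="some-model", provider="some-provider")
model.init_delegate(
    config=LLMConfig(),
    delegate_config=DelegateConfig(
        base_url="https://api.example.com/v1",   # placeholder endpoint
        api_key=SecretStr("sk-example"),         # placeholder secret
    ),
    delegate_provider="openai",  # routes through OpenAIModel (chat completions)
)
```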
@@ -42,6 +77,11 @@ class DelegateOnly(LLM):
         config = config or LLMConfig()
         config.native = False
         super().__init__(model_name, provider, config=config)
+        config.native = True
+
+    def _get_extra_body(self) -> dict[str, Any]:
+        """Build extra body parameters for delegate-specific features."""
+        return {}
 
     @override
     async def _query_impl(
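The empty `_get_extra_body` hook gives provider subclasses a seam for request-body extras without overriding `_query_impl`. A hypothetical override (the class name and flag are invented for illustration):

```python
from typing import Any

from model_library.base.delegate_only import DelegateOnly

class ExampleProviderModel(DelegateOnly):
    def _get_extra_body(self) -> dict[str, Any]:
        # Forwarded into delegate_query(extra_body=...) on every call.
        return {"enable_thinking": True}  # illustrative vendor flag, not a real API field
```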
@@ -53,9 +93,12 @@ class DelegateOnly(LLM):
         **kwargs: object,
     ) -> QueryResult:
         assert self.delegate
-
         return await self.delegate_query(
-            input, tools=tools, query_logger=query_logger, **kwargs
+            input,
+            tools=tools,
+            query_logger=query_logger,
+            extra_body=self._get_extra_body(),
+            **kwargs,
         )
 
     @override
@@ -66,7 +109,8 @@ class DelegateOnly(LLM):
         tools: list[ToolDefinition],
         **kwargs: object,
     ) -> dict[str, Any]:
-        raise DelegateOnlyException()
+        assert self.delegate
+        return await self.delegate.build_body(input, tools=tools, **kwargs)
 
     @override
     async def parse_input(
  async def parse_input(
@@ -74,28 +118,32 @@ class DelegateOnly(LLM):
74
118
  input: Sequence[InputItem],
75
119
  **kwargs: Any,
76
120
  ) -> Any:
77
- raise DelegateOnlyException()
121
+ assert self.delegate
122
+ return await self.delegate.parse_input(input, **kwargs)
78
123
 
79
124
  @override
80
125
  async def parse_image(
81
126
  self,
82
127
  image: FileInput,
83
128
  ) -> Any:
84
- raise DelegateOnlyException()
129
+ assert self.delegate
130
+ return await self.delegate.parse_image(image)
85
131
 
86
132
  @override
87
133
  async def parse_file(
88
134
  self,
89
135
  file: FileInput,
90
136
  ) -> Any:
91
- raise DelegateOnlyException()
137
+ assert self.delegate
138
+ return await self.delegate.parse_file(file)
92
139
 
93
140
  @override
94
141
  async def parse_tools(
95
142
  self,
96
143
  tools: list[ToolDefinition],
97
144
  ) -> Any:
98
- raise DelegateOnlyException()
145
+ assert self.delegate
146
+ return await self.delegate.parse_tools(tools)
99
147
 
100
148
  @override
101
149
  async def upload_file(
@@ -106,3 +154,22 @@ class DelegateOnly(LLM):
         type: Literal["image", "file"] = "file",
     ) -> FileWithId:
         raise DelegateOnlyException()
+
+    @override
+    async def get_rate_limit(self) -> Any:
+        assert self.delegate
+        return await self.delegate.get_rate_limit()
+
+    @override
+    async def count_tokens(
+        self,
+        input: Sequence[InputItem],
+        *,
+        history: Sequence[InputItem] = [],
+        tools: list[ToolDefinition] = [],
+        **kwargs: object,
+    ) -> int:
+        assert self.delegate
+        return await self.delegate.count_tokens(
+            input, history=history, tools=tools, **kwargs
+        )
--- a/model_library/base/output.py
+++ b/model_library/base/output.py
@@ -118,6 +118,48 @@ class QueryResultCost(BaseModel):
     )
 
 
+class RateLimit(BaseModel):
+    """Rate limit information"""
+
+    request_limit: int | None = None
+    request_remaining: int | None = None
+
+    token_limit: int | None = None
+    token_limit_input: int | None = None
+    token_limit_output: int | None = None
+
+    token_remaining: int | None = None
+    token_remaining_input: int | None = None
+    token_remaining_output: int | None = None
+
+    unix_timestamp: float
+    raw: Any
+
+    @computed_field
+    @property
+    def token_limit_total(self) -> int:
+        if self.token_limit:
+            return self.token_limit
+        else:
+            return (self.token_limit_input or 0) + (self.token_limit_output or 0)
+
+    @computed_field
+    @property
+    def token_remaining_total(self) -> int:
+        if self.token_remaining:
+            return self.token_remaining
+        else:
+            return (self.token_remaining_input or 0) + (
+                self.token_remaining_output or 0
+            )
+
+    @override
+    def __repr__(self):
+        attrs = vars(self).copy()
+        attrs.pop("raw", None)
+        return f"{self.__class__.__name__}(\n{pformat(attrs, indent=2, sort_dicts=False)}\n)"
+
+
 class QueryResultMetadata(BaseModel):
     """
     Metadata for a query: token usage and timing.
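The computed totals fall back to summing the split input/output figures when a provider reports no combined number. For example (header values are illustrative):

```python
from model_library.base.output import RateLimit

rl = RateLimit(
    token_limit_input=200_000,
    token_limit_output=50_000,
    token_remaining_input=120_000,
    token_remaining_output=50_000,
    unix_timestamp=1_767_995_881.0,
    raw={},
)
assert rl.token_limit_total == 250_000      # no token_limit -> input + output
assert rl.token_remaining_total == 170_000
```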
@@ -131,6 +173,7 @@ class QueryResultMetadata(BaseModel):
     reasoning_tokens: int | None = None
     cache_read_tokens: int | None = None
     cache_write_tokens: int | None = None
+    extra: dict[str, Any] = {}
 
     @property
     def default_duration_seconds(self) -> float:
--- a/model_library/base/utils.py
+++ b/model_library/base/utils.py
@@ -1,4 +1,6 @@
 import json
+import re
+from datetime import datetime, timedelta
 from typing import Any, Sequence, TypeVar
 
 from pydantic import BaseModel
@@ -77,3 +79,36 @@ def get_pretty_input_types(input: Sequence["InputItem"], verbose: bool = False)
 
     processed_items = [f"  {process_item(item)}" for item in input]
     return "\n" + "\n".join(processed_items) if processed_items else ""
+
+
+TIME_PATTERN = re.compile(r"^(\d+(?:\.\d+)?)([a-zA-Z]+)$")
+UNIT_TO_SECONDS = {
+    "ms": 0.001,
+    "s": 1,
+    "m": 60,
+    "h": 3600,
+}
+
+
+def to_timestamp(input_str: str, server_now: datetime) -> int:
+    """Converts a header string into a server-relative Unix timestamp in ms."""
+    input_str = input_str.strip()
+
+    # ISO Timestamp (e.g. 2026-01-09T21:58:01Z)
+    if "T" in input_str and "-" in input_str:
+        try:
+            dt = datetime.fromisoformat(input_str.replace("Z", "+00:00"))
+            return int(dt.timestamp() * 1000)
+        except ValueError:
+            pass
+
+    # Duration (e.g. 10s, 6ms)
+    match = TIME_PATTERN.match(input_str)
+    if match:
+        value, unit = match.groups()
+        offset_seconds = float(value) * UNIT_TO_SECONDS.get(unit.lower(), 0)
+        # Add duration to the SERVER'S provided date
+        dt = server_now + timedelta(seconds=offset_seconds)
+        return int(dt.timestamp() * 1000)
+
+    raise ValueError(f"Unsupported time format: {input_str}")
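`to_timestamp` accepts either an absolute ISO-8601 value or a relative duration, anchoring durations to the server's clock rather than local time. Both branches, using the function's own example timestamp:

```python
from datetime import datetime, timezone

from model_library.base.utils import to_timestamp

server_now = datetime(2026, 1, 9, 21, 57, 51, tzinfo=timezone.utc)

to_timestamp("2026-01-09T21:58:01Z", server_now)  # 1767995881000 (absolute)
to_timestamp("10s", server_now)                   # 1767995881000 (server_now + 10s)
```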
--- a/model_library/config/alibaba_models.yaml
+++ b/model_library/config/alibaba_models.yaml
@@ -1,17 +1,51 @@
-qwen-models:
-  base-config:
-    company: Alibaba
-    open_source: false
+base-config:
+  company: Alibaba
+  open_source: false
+  supports:
+    temperature: true
+  metadata:
+    available_for_everyone: false
+    available_as_evaluator: false
+  default_parameters:
+    temperature: 0.7
+  properties:
+    reasoning_model: false
 
+qwen-3-vl-models:
+  base-config:
     supports:
-      temperature: true
+      images: true
+
+  alibaba/qwen3-vl-plus-2025-09-23:
+    label: Qwen 3 VL Plus
+    open_source: true
+    description: Qwen 3 VL Plus (2025-09-23)
+    release_date: 2025-09-23
     metadata:
-      available_for_everyone: false
-      available_as_evaluator: false
-    default_parameters:
-      temperature: 0.7
+      deprecated: true
     properties:
+      context_window: 262_144
+      max_tokens: 32_768
+      training_cutoff: ""
       reasoning_model: false
+    costs_per_million_token:
+      input: 0.2
+      output: 1.6
+
+qwen-3-max-models:
+  base-config:
+    supports:
+      tools: true
+      images: false
+
+  alibaba/qwen3-max-2026-01-23:
+    label: Qwen 3 Max Thinking
+    description: Qwen 3 Max with enhanced reasoning capabilities
+    release_date: 2026-01-23
+    properties:
+      context_window: 256_000
+      max_tokens: 32_000
+      reasoning_model: true
 
   alibaba/qwen3-max-preview:
     label: Qwen 3 Max Preview
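The YAML reshuffle moves shared defaults (`supports`, `metadata`, `default_parameters`, `properties`) into a file-level `base-config`, with per-group `base-config` blocks and per-model overrides layered on top. A minimal sketch of that layering, assuming later, more specific layers win on a recursive merge (an inference from the file shape, not a documented loader API):

```python
from typing import Any

def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Overlay override onto base, recursing into nested dicts."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

file_base = {"company": "Alibaba", "properties": {"reasoning_model": False}}
group_base = {"supports": {"tools": True, "images": False}}
model_entry = {"properties": {"reasoning_model": True, "max_tokens": 32_000}}

resolved = deep_merge(deep_merge(file_base, group_base), model_entry)
assert resolved["properties"]["reasoning_model"] is True
```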
@@ -20,15 +54,7 @@ qwen-models:
     properties:
       context_window: 262_144
       max_tokens: 65_536
-      training_cutoff: ""
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
+      reasoning_model: true
 
   alibaba/qwen3-max-2025-09-23:
     label: Qwen 3 Max 2025-09-23
@@ -39,14 +65,6 @@ qwen-models:
       max_tokens: 65_536
       training_cutoff: ""
       reasoning_model: true
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
 
   alibaba/qwen3-max:
     label: Qwen 3 Max
@@ -57,34 +75,3 @@ qwen-models:
       max_tokens: 65_536
       training_cutoff: ""
       reasoning_model: false
-    costs_per_million_token:
-      input: 1.2
-      output: 6
-      cache:
-        read_discount: 0.8
-        write_markup: 1
-      context:
-        threshold: 32_000
-        input: 2.4
-        output: 12
-    supports:
-      images: false
-      tools: true
-    metadata:
-      available_for_everyone: false
-
-  alibaba/qwen3-vl-plus-2025-09-23:
-    label: Qwen 3 VL Plus
-    open_source: true
-    description: Qwen 3 VL Plus (2025-09-23)
-    release_date: 2025-09-23
-    properties:
-      context_window: 262_144
-      max_tokens: 32_768
-      training_cutoff: ""
-      reasoning_model: false
-    costs_per_million_token:
-      input: 0.2
-      output: 1.6
-    supports:
-      images: true