vectorvein 0.1.87__py3-none-any.whl → 0.1.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectorvein/chat_clients/anthropic_client.py +4 -0
- vectorvein/chat_clients/base_client.py +121 -2
- vectorvein/chat_clients/gemini_client.py +9 -523
- vectorvein/chat_clients/openai_compatible_client.py +16 -12
- vectorvein/chat_clients/utils.py +34 -116
- vectorvein/settings/__init__.py +30 -1
- vectorvein/types/defaults.py +30 -6
- vectorvein/types/llm_parameters.py +4 -1
- vectorvein/utilities/rate_limiter.py +312 -0
- {vectorvein-0.1.87.dist-info → vectorvein-0.1.89.dist-info}/METADATA +6 -1
- {vectorvein-0.1.87.dist-info → vectorvein-0.1.89.dist-info}/RECORD +13 -12
- {vectorvein-0.1.87.dist-info → vectorvein-0.1.89.dist-info}/WHEEL +0 -0
- {vectorvein-0.1.87.dist-info → vectorvein-0.1.89.dist-info}/entry_points.txt +0 -0
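The central change in this release is client-side rate limiting for the chat clients: a new utilities/rate_limiter.py module, rate-limit settings, and per-endpoint rpm/tpm overrides, all wired into the request paths shown below. A plausible configuration fragment, assuming the settings schema mirrors the fields read in base_client.py (enabled, backend, redis, diskcache, default_rpm, default_tpm); the actual structure of settings/__init__.py is not shown in this diff:

    # Hypothetical configuration fragment. Field names are inferred from the
    # settings.rate_limit lookups in base_client.py below, not taken from the
    # real settings/__init__.py source.
    rate_limit_settings = {
        "rate_limit": {
            "enabled": True,
            "backend": "memory",        # or "redis" / "diskcache"
            "default_rpm": 60,          # fallback requests per minute per endpoint:model key
            "default_tpm": 1_000_000,   # fallback tokens per minute per endpoint:model key
            # "redis": {"host": "localhost", "port": 6379, "db": 0},
            # "diskcache": {"cache_dir": "./.rate_limit_cache"},
        },
    }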
vectorvein/chat_clients/anthropic_client.py

@@ -434,6 +434,8 @@ class AnthropicChatClient(BaseChatClient):
             else:
                 max_tokens = self.model_setting.context_length - token_counts
 
+        self._acquire_rate_limit(self.endpoint, self.model, messages)
+
         if self.stream:
             stream_response = raw_client.messages.create(
                 model=self.model_id,
@@ -824,6 +826,8 @@ class AsyncAnthropicChatClient(BaseAsyncChatClient):
             else:
                 max_tokens = self.model_setting.context_length - token_counts
 
+        await self._acquire_rate_limit(self.endpoint, self.model, messages)
+
         if self.stream:
             stream_response = await raw_client.messages.create(
                 model=self.model_id,
vectorvein/chat_clients/base_client.py

@@ -1,7 +1,8 @@
-
-# @Date: 2024-07-26 14:48:55
+import time
 import random
+import asyncio
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from functools import cached_property
 from typing import Generator, AsyncGenerator, Any, overload, Literal, Iterable
 
@@ -29,6 +30,8 @@ from ..types.llm_parameters import (
     ChatCompletionDeltaMessage,
     ChatCompletionStreamOptionsParam,
 )
+from ..utilities.rate_limiter import SyncMemoryRateLimiter, SyncRedisRateLimiter, SyncDiskCacheRateLimiter
+from ..utilities.rate_limiter import AsyncMemoryRateLimiter, AsyncRedisRateLimiter, AsyncDiskCacheRateLimiter
 
 
 class BaseChatClient(ABC):
@@ -59,11 +62,65 @@ class BaseChatClient(ABC):
 
         self.backend_settings = settings.get_backend(self.BACKEND_NAME)
 
+        self.rate_limiter = self._init_rate_limiter()
+        self.active_requests = defaultdict(int)
+        self.rpm = None
+        self.tpm = None
+        self.concurrent_requests = None
+
         if endpoint_id:
             self.endpoint_id = endpoint_id
             self.random_endpoint = False
             self.endpoint = settings.get_endpoint(self.endpoint_id)
 
+    def _init_rate_limiter(self):
+        if not settings.rate_limit:
+            return None
+        if not settings.rate_limit.enabled:
+            return None
+
+        if settings.rate_limit.backend == "memory":
+            return SyncMemoryRateLimiter()
+        elif settings.rate_limit.backend == "redis":
+            if not settings.rate_limit.redis:
+                raise ValueError("Redis settings must be provided if Redis backend is selected.")
+            return SyncRedisRateLimiter(
+                host=settings.rate_limit.redis.host,
+                port=settings.rate_limit.redis.port,
+                db=settings.rate_limit.redis.db,
+            )
+        elif settings.rate_limit.backend == "diskcache":
+            if not settings.rate_limit.diskcache:
+                raise ValueError("Diskcache settings must be provided if Diskcache backend is selected.")
+            return SyncDiskCacheRateLimiter(
+                cache_dir=settings.rate_limit.diskcache.cache_dir,
+            )
+        return None
+
+    def _acquire_rate_limit(self, endpoint: EndpointSetting | None, model: str, messages: list):
+        if endpoint is None:
+            return
+
+        key = f"{endpoint.id}:{model}"
+
+        # Get rate limit parameters
+        # Priority: parameters in model.endpoints > parameters in endpoint > default parameters
+        rpm = self.rpm or endpoint.rpm or (settings.rate_limit.default_rpm if settings.rate_limit else 60)
+        tpm = self.tpm or endpoint.tpm or (settings.rate_limit.default_tpm if settings.rate_limit else 1000000)
+
+        while self.rate_limiter:
+            allowed, wait_time = self.rate_limiter.check_limit(key, rpm, tpm, self._estimate_request_tokens(messages))
+            if allowed:
+                break
+            time.sleep(wait_time)
+
+    def _estimate_request_tokens(self, messages: list) -> int:
+        """Roughly estimate the number of tokens in the request"""
+        tokens = 0
+        for message in messages:
+            tokens += int(len(message.get("content", "")) * 0.6)
+        return tokens
+
     def set_model_id_by_endpoint_id(self, endpoint_id: str):
         for endpoint_option in self.backend_settings.models[self.model].endpoints:
             if isinstance(endpoint_option, dict) and endpoint_id == endpoint_option["endpoint_id"]:
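The limiter classes themselves live in the new utilities/rate_limiter.py (+312 lines), which this diff does not show; the base clients only depend on a check_limit(key, rpm, tpm, estimated_tokens) call that returns an (allowed, wait_time) pair. A minimal sketch of that contract with an in-memory 60-second sliding window, purely illustrative and not the actual SyncMemoryRateLimiter implementation:

    import time
    from collections import defaultdict, deque

    class NaiveMemoryRateLimiter:
        """Illustrative only: tracks per-key request timestamps and token usage
        over a 60-second sliding window and reports how long to wait."""

        def __init__(self):
            self.requests = defaultdict(deque)  # key -> deque[(timestamp, tokens)]

        def check_limit(self, key: str, rpm: int, tpm: int, tokens: int) -> tuple[bool, float]:
            now = time.time()
            window = self.requests[key]
            # Drop entries that have fallen out of the 60-second window.
            while window and now - window[0][0] >= 60:
                window.popleft()
            used_requests = len(window)
            used_tokens = sum(t for _, t in window)
            if used_requests < rpm and used_tokens + tokens <= tpm:
                window.append((now, tokens))
                return True, 0.0
            # Otherwise wait until the oldest entry leaves the window.
            wait = (60 - (now - window[0][0])) if window else 1.0
            return False, max(wait, 0.1)

The busy-wait loop in _acquire_rate_limit simply retries this check after sleeping for the suggested wait_time, so the limiter backend decides the pacing.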
@@ -79,6 +136,9 @@ class BaseChatClient(ABC):
             if isinstance(endpoint, dict):
                 self.endpoint_id = endpoint["endpoint_id"]
                 self.model_id = endpoint["model_id"]
+                self.rpm = endpoint.get("rpm", None)
+                self.tpm = endpoint.get("tpm", None)
+                self.concurrent_requests = endpoint.get("concurrent_requests", None)
             else:
                 self.endpoint_id = endpoint
                 self.endpoint = settings.get_endpoint(self.endpoint_id)
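The new rpm / tpm / concurrent_requests values come from dict-style entries in a model's endpoints list, so per-model overrides would be declared roughly like this (a sketch; only the five dict keys are confirmed by the code above, the surrounding names are illustrative):

    # Illustrative model endpoints list; only "endpoint_id", "model_id", "rpm",
    # "tpm" and "concurrent_requests" are confirmed by the code above.
    model_endpoints = [
        "default-anthropic",                 # plain string: no per-endpoint overrides
        {
            "endpoint_id": "high-volume-anthropic",
            "model_id": "claude-3-5-sonnet-20240620",
            "rpm": 120,                  # takes precedence over endpoint and default RPM
            "tpm": 2_000_000,            # takes precedence over endpoint and default TPM
            "concurrent_requests": 8,    # stored on the client; not used in the hunks shown
        },
    ]

This matches the priority comment in _acquire_rate_limit: a value on the model's endpoint entry wins over the endpoint's own rpm/tpm, which in turn wins over settings.rate_limit.default_rpm / default_tpm.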
@@ -236,11 +296,67 @@ class BaseAsyncChatClient(ABC):
 
         self.backend_settings = settings.get_backend(self.BACKEND_NAME)
 
+        self.rate_limiter = self._init_rate_limiter()
+        self.active_requests = defaultdict(int)
+        self.rpm = None
+        self.tpm = None
+        self.concurrent_requests = None
+
         if endpoint_id:
             self.endpoint_id = endpoint_id
             self.random_endpoint = False
             self.endpoint = settings.get_endpoint(self.endpoint_id)
 
+    def _init_rate_limiter(self):
+        if not settings.rate_limit:
+            return None
+        if not settings.rate_limit.enabled:
+            return None
+
+        if settings.rate_limit.backend == "memory":
+            return AsyncMemoryRateLimiter()
+        elif settings.rate_limit.backend == "redis":
+            if not settings.rate_limit.redis:
+                raise ValueError("Redis settings must be provided if Redis backend is selected.")
+            return AsyncRedisRateLimiter(
+                host=settings.rate_limit.redis.host,
+                port=settings.rate_limit.redis.port,
+                db=settings.rate_limit.redis.db,
+            )
+        elif settings.rate_limit.backend == "diskcache":
+            if not settings.rate_limit.diskcache:
+                raise ValueError("Diskcache settings must be provided if Diskcache backend is selected.")
+            return AsyncDiskCacheRateLimiter(
+                cache_dir=settings.rate_limit.diskcache.cache_dir,
+            )
+        return None
+
+    async def _acquire_rate_limit(self, endpoint: EndpointSetting | None, model: str, messages: list):
+        if endpoint is None:
+            return
+
+        key = f"{endpoint.id}:{model}"
+
+        # Get rate limit parameters
+        # Priority: parameters in model.endpoints > parameters in endpoint > default parameters
+        rpm = self.rpm or endpoint.rpm or (settings.rate_limit.default_rpm if settings.rate_limit else 60)
+        tpm = self.tpm or endpoint.tpm or (settings.rate_limit.default_tpm if settings.rate_limit else 1000000)
+
+        while self.rate_limiter:
+            allowed, wait_time = await self.rate_limiter.check_limit(
+                key, rpm, tpm, self._estimate_request_tokens(messages)
+            )
+            if allowed:
+                break
+            await asyncio.sleep(wait_time)
+
+    def _estimate_request_tokens(self, messages: list) -> int:
+        """Roughly estimate the number of tokens in the request"""
+        tokens = 0
+        for message in messages:
+            tokens += int(len(message.get("content", "")) * 0.6)
+        return tokens
+
     def set_model_id_by_endpoint_id(self, endpoint_id: str):
         for endpoint_option in self.backend_settings.models[self.model].endpoints:
             if isinstance(endpoint_option, dict) and endpoint_id == endpoint_option["endpoint_id"]:
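The async variant mirrors the sync one except that throttled requests wait with await asyncio.sleep(wait_time), so waiting does not block the event loop. Note also that _estimate_request_tokens is a character-count heuristic, not a tokenizer: a message whose content is a 1,000-character string counts as int(1000 * 0.6) = 600 tokens against the TPM budget, while a message whose content is not a plain string (for example a list of content parts) contributes only len(content) * 0.6, i.e. almost nothing.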
@@ -256,6 +372,9 @@ class BaseAsyncChatClient(ABC):
             if isinstance(endpoint, dict):
                 self.endpoint_id = endpoint["endpoint_id"]
                 self.model_id = endpoint["model_id"]
+                self.rpm = endpoint.get("rpm", None)
+                self.tpm = endpoint.get("tpm", None)
+                self.concurrent_requests = endpoint.get("concurrent_requests", None)
             else:
                 self.endpoint_id = endpoint
                 self.endpoint = settings.get_endpoint(self.endpoint_id)