vectorvein 0.1.87__py3-none-any.whl → 0.1.89__py3-none-any.whl

@@ -434,6 +434,8 @@ class AnthropicChatClient(BaseChatClient):
         else:
             max_tokens = self.model_setting.context_length - token_counts
 
+        self._acquire_rate_limit(self.endpoint, self.model, messages)
+
         if self.stream:
             stream_response = raw_client.messages.create(
                 model=self.model_id,
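The synchronous client now blocks until the limiter admits the request before calling raw_client.messages.create (the async client below does the same with await). The diff only shows the call sites, but they imply a limiter contract of check_limit(key, rpm, tpm, tokens) -> (allowed, wait_time). A minimal sliding-window sketch of that contract, assuming a 60-second window; the class name and internals here are hypothetical, not vectorvein's actual SyncMemoryRateLimiter:

# Hypothetical sketch of the check_limit contract the clients rely on.
import time
from collections import defaultdict, deque


class SlidingWindowLimiter:
    def __init__(self):
        # Per-key history of (timestamp, tokens) covering the last 60 seconds.
        self.history: dict[str, deque] = defaultdict(deque)

    def check_limit(self, key: str, rpm: int, tpm: int, tokens: int) -> tuple[bool, float]:
        now = time.monotonic()
        window = self.history[key]
        # Drop entries that have aged out of the 60-second window.
        while window and now - window[0][0] > 60:
            window.popleft()
        used_tokens = sum(t for _, t in window)
        if len(window) < rpm and used_tokens + tokens <= tpm:
            window.append((now, tokens))
            return True, 0.0
        # Tell the caller how long until the oldest entry expires.
        wait = 60 - (now - window[0][0]) if window else 1.0
        return False, max(wait, 0.1)

Returning a wait_time instead of sleeping inside the limiter keeps the back-off strategy in the caller, which is what lets the sync client use time.sleep and the async client use asyncio.sleep against the same contract.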
@@ -824,6 +826,8 @@ class AsyncAnthropicChatClient(BaseAsyncChatClient):
         else:
             max_tokens = self.model_setting.context_length - token_counts
 
+        await self._acquire_rate_limit(self.endpoint, self.model, messages)
+
         if self.stream:
             stream_response = await raw_client.messages.create(
                 model=self.model_id,
@@ -1,7 +1,8 @@
-# @Author: Bi Ying
-# @Date: 2024-07-26 14:48:55
+import time
 import random
+import asyncio
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from functools import cached_property
 from typing import Generator, AsyncGenerator, Any, overload, Literal, Iterable
 
@@ -29,6 +30,8 @@ from ..types.llm_parameters import (
     ChatCompletionDeltaMessage,
     ChatCompletionStreamOptionsParam,
 )
+from ..utilities.rate_limiter import SyncMemoryRateLimiter, SyncRedisRateLimiter, SyncDiskCacheRateLimiter
+from ..utilities.rate_limiter import AsyncMemoryRateLimiter, AsyncRedisRateLimiter, AsyncDiskCacheRateLimiter
 
 
 class BaseChatClient(ABC):
@@ -59,11 +62,65 @@ class BaseChatClient(ABC):
 
         self.backend_settings = settings.get_backend(self.BACKEND_NAME)
 
+        self.rate_limiter = self._init_rate_limiter()
+        self.active_requests = defaultdict(int)
+        self.rpm = None
+        self.tpm = None
+        self.concurrent_requests = None
+
         if endpoint_id:
             self.endpoint_id = endpoint_id
             self.random_endpoint = False
             self.endpoint = settings.get_endpoint(self.endpoint_id)
 
+    def _init_rate_limiter(self):
+        if not settings.rate_limit:
+            return None
+        if not settings.rate_limit.enabled:
+            return None
+
+        if settings.rate_limit.backend == "memory":
+            return SyncMemoryRateLimiter()
+        elif settings.rate_limit.backend == "redis":
+            if not settings.rate_limit.redis:
+                raise ValueError("Redis settings must be provided if Redis backend is selected.")
+            return SyncRedisRateLimiter(
+                host=settings.rate_limit.redis.host,
+                port=settings.rate_limit.redis.port,
+                db=settings.rate_limit.redis.db,
+            )
+        elif settings.rate_limit.backend == "diskcache":
+            if not settings.rate_limit.diskcache:
+                raise ValueError("Diskcache settings must be provided if Diskcache backend is selected.")
+            return SyncDiskCacheRateLimiter(
+                cache_dir=settings.rate_limit.diskcache.cache_dir,
+            )
+        return None
+
+    def _acquire_rate_limit(self, endpoint: EndpointSetting | None, model: str, messages: list):
+        if endpoint is None:
+            return
+
+        key = f"{endpoint.id}:{model}"
+
+        # Get rate limit parameters
+        # Priority: parameters in model.endpoints > parameters in endpoint > default parameters
+        rpm = self.rpm or endpoint.rpm or (settings.rate_limit.default_rpm if settings.rate_limit else 60)
+        tpm = self.tpm or endpoint.tpm or (settings.rate_limit.default_tpm if settings.rate_limit else 1000000)
+
+        while self.rate_limiter:
+            allowed, wait_time = self.rate_limiter.check_limit(key, rpm, tpm, self._estimate_request_tokens(messages))
+            if allowed:
+                break
+            time.sleep(wait_time)
+
+    def _estimate_request_tokens(self, messages: list) -> int:
+        """Roughly estimate the number of tokens in the request"""
+        tokens = 0
+        for message in messages:
+            tokens += int(len(message.get("content", "")) * 0.6)
+        return tokens
+
     def set_model_id_by_endpoint_id(self, endpoint_id: str):
         for endpoint_option in self.backend_settings.models[self.model].endpoints:
             if isinstance(endpoint_option, dict) and endpoint_id == endpoint_option["endpoint_id"]:
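_init_rate_limiter picks one of three interchangeable backends from a single settings section, and _acquire_rate_limit resolves rpm/tpm with the priority spelled out in the comment: per-model endpoint overrides first, then the endpoint itself, then the global defaults (60 rpm / 1,000,000 tpm when no rate_limit section exists). Note that token usage is only estimated, at roughly 0.6 tokens per character of message content. A hypothetical settings payload exercising exactly the fields the code reads; the surrounding structure is an assumption, not taken from vectorvein's documentation:

# Hypothetical rate_limit settings; only the field names read by
# _init_rate_limiter and _acquire_rate_limit are grounded in the diff.
rate_limit_config = {
    "enabled": True,
    "backend": "redis",  # one of "memory" | "redis" | "diskcache"
    "redis": {"host": "localhost", "port": 6379, "db": 0},
    "diskcache": {"cache_dir": "/tmp/vectorvein_rate_limit"},
    "default_rpm": 60,  # fallback when neither model endpoint nor endpoint sets rpm
    "default_tpm": 1000000,  # fallback token budget per minute
}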
@@ -79,6 +136,9 @@ class BaseChatClient(ABC):
             if isinstance(endpoint, dict):
                 self.endpoint_id = endpoint["endpoint_id"]
                 self.model_id = endpoint["model_id"]
+                self.rpm = endpoint.get("rpm", None)
+                self.tpm = endpoint.get("tpm", None)
+                self.concurrent_requests = endpoint.get("concurrent_requests", None)
             else:
                 self.endpoint_id = endpoint
             self.endpoint = settings.get_endpoint(self.endpoint_id)
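This is the model-endpoints half of the priority chain: when a model's endpoint entry is a dict rather than a bare endpoint id, its rpm/tpm/concurrent_requests travel with the chosen endpoint. A hypothetical entry, assuming only the keys the code above actually reads:

# Hypothetical per-model endpoint entry; ids and values are illustrative.
endpoint_option = {
    "endpoint_id": "anthropic-primary",
    "model_id": "claude-3-5-sonnet-20240620",
    "rpm": 50,  # overrides endpoint-level rpm and the global default
    "tpm": 400000,  # overrides endpoint-level tpm and the global default
    "concurrent_requests": 8,  # stored here but not yet enforced in this diff
}

Worth noting: concurrent_requests (like the active_requests counter initialized in __init__) is stored but never consulted anywhere in this diff, so it appears to be groundwork for a later version.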
@@ -236,11 +296,67 @@ class BaseAsyncChatClient(ABC):
 
         self.backend_settings = settings.get_backend(self.BACKEND_NAME)
 
+        self.rate_limiter = self._init_rate_limiter()
+        self.active_requests = defaultdict(int)
+        self.rpm = None
+        self.tpm = None
+        self.concurrent_requests = None
+
         if endpoint_id:
             self.endpoint_id = endpoint_id
             self.random_endpoint = False
             self.endpoint = settings.get_endpoint(self.endpoint_id)
 
+    def _init_rate_limiter(self):
+        if not settings.rate_limit:
+            return None
+        if not settings.rate_limit.enabled:
+            return None
+
+        if settings.rate_limit.backend == "memory":
+            return AsyncMemoryRateLimiter()
+        elif settings.rate_limit.backend == "redis":
+            if not settings.rate_limit.redis:
+                raise ValueError("Redis settings must be provided if Redis backend is selected.")
+            return AsyncRedisRateLimiter(
+                host=settings.rate_limit.redis.host,
+                port=settings.rate_limit.redis.port,
+                db=settings.rate_limit.redis.db,
+            )
+        elif settings.rate_limit.backend == "diskcache":
+            if not settings.rate_limit.diskcache:
+                raise ValueError("Diskcache settings must be provided if Diskcache backend is selected.")
+            return AsyncDiskCacheRateLimiter(
+                cache_dir=settings.rate_limit.diskcache.cache_dir,
+            )
+        return None
+
+    async def _acquire_rate_limit(self, endpoint: EndpointSetting | None, model: str, messages: list):
+        if endpoint is None:
+            return
+
+        key = f"{endpoint.id}:{model}"
+
+        # Get rate limit parameters
+        # Priority: parameters in model.endpoints > parameters in endpoint > default parameters
+        rpm = self.rpm or endpoint.rpm or (settings.rate_limit.default_rpm if settings.rate_limit else 60)
+        tpm = self.tpm or endpoint.tpm or (settings.rate_limit.default_tpm if settings.rate_limit else 1000000)
+
+        while self.rate_limiter:
+            allowed, wait_time = await self.rate_limiter.check_limit(
+                key, rpm, tpm, self._estimate_request_tokens(messages)
+            )
+            if allowed:
+                break
+            await asyncio.sleep(wait_time)
+
+    def _estimate_request_tokens(self, messages: list) -> int:
+        """Roughly estimate the number of tokens in the request"""
+        tokens = 0
+        for message in messages:
+            tokens += int(len(message.get("content", "")) * 0.6)
+        return tokens
+
     def set_model_id_by_endpoint_id(self, endpoint_id: str):
         for endpoint_option in self.backend_settings.models[self.model].endpoints:
             if isinstance(endpoint_option, dict) and endpoint_id == endpoint_option["endpoint_id"]:
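The async path mirrors the sync one but awaits both check_limit and the back-off sleep, so a request waiting on the limiter yields the event loop instead of stalling it. A hypothetical usage sketch; the create_completion coroutine name is an assumption for illustration, since the diff only shows the acquisition happening before raw_client.messages.create:

# Hypothetical sketch: concurrent requests through one async client all
# funnel through the same limiter; asyncio.sleep keeps waiters non-blocking.
import asyncio


async def ask(client, prompt: str):
    # create_completion is illustrative, not confirmed by this diff.
    return await client.create_completion(messages=[{"role": "user", "content": prompt}])


async def main(client):
    # All three coroutines share one rate limiter keyed on endpoint:model.
    return await asyncio.gather(*(ask(client, p) for p in ["a", "b", "c"]))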
@@ -256,6 +372,9 @@ class BaseAsyncChatClient(ABC):
             if isinstance(endpoint, dict):
                 self.endpoint_id = endpoint["endpoint_id"]
                 self.model_id = endpoint["model_id"]
+                self.rpm = endpoint.get("rpm", None)
+                self.tpm = endpoint.get("tpm", None)
+                self.concurrent_requests = endpoint.get("concurrent_requests", None)
             else:
                 self.endpoint_id = endpoint
             self.endpoint = settings.get_endpoint(self.endpoint_id)