maxllm-gate 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxllm_gate-0.2.0/.env.example +60 -0
- maxllm_gate-0.2.0/.github/copilot-instructions.md +343 -0
- maxllm_gate-0.2.0/.gitignore +77 -0
- maxllm_gate-0.2.0/Dockerfile +24 -0
- maxllm_gate-0.2.0/LICENSE +21 -0
- maxllm_gate-0.2.0/MAXLLM.py +0 -0
- maxllm_gate-0.2.0/PKG-INFO +771 -0
- maxllm_gate-0.2.0/PYPI_UPLOAD.md +117 -0
- maxllm_gate-0.2.0/README.md +717 -0
- maxllm_gate-0.2.0/config.example.yaml +60 -0
- maxllm_gate-0.2.0/docker-compose.yml +62 -0
- maxllm_gate-0.2.0/examples/basic_usage.py +80 -0
- maxllm_gate-0.2.0/examples/concurrent_requests.py +124 -0
- maxllm_gate-0.2.0/examples/multi_key_config.py +96 -0
- maxllm_gate-0.2.0/examples/priority_requests.py +140 -0
- maxllm_gate-0.2.0/prometheus.yml +9 -0
- maxllm_gate-0.2.0/pyproject.toml +97 -0
- maxllm_gate-0.2.0/requirements.txt +22 -0
- maxllm_gate-0.2.0/scripts/benchmark.py +123 -0
- maxllm_gate-0.2.0/scripts/simulate_load.py +276 -0
- maxllm_gate-0.2.0/src/llm_scheduler/__init__.py +8 -0
- maxllm_gate-0.2.0/src/llm_scheduler/api/__init__.py +6 -0
- maxllm_gate-0.2.0/src/llm_scheduler/api/dependencies.py +10 -0
- maxllm_gate-0.2.0/src/llm_scheduler/api/routes.py +275 -0
- maxllm_gate-0.2.0/src/llm_scheduler/api/schemas.py +135 -0
- maxllm_gate-0.2.0/src/llm_scheduler/config.py +117 -0
- maxllm_gate-0.2.0/src/llm_scheduler/core/__init__.py +8 -0
- maxllm_gate-0.2.0/src/llm_scheduler/core/dispatcher.py +225 -0
- maxllm_gate-0.2.0/src/llm_scheduler/core/queue_manager.py +251 -0
- maxllm_gate-0.2.0/src/llm_scheduler/core/scheduler.py +236 -0
- maxllm_gate-0.2.0/src/llm_scheduler/core/token_estimator.py +201 -0
- maxllm_gate-0.2.0/src/llm_scheduler/main.py +86 -0
- maxllm_gate-0.2.0/src/llm_scheduler/models/__init__.py +6 -0
- maxllm_gate-0.2.0/src/llm_scheduler/models/provider.py +103 -0
- maxllm_gate-0.2.0/src/llm_scheduler/models/request.py +101 -0
- maxllm_gate-0.2.0/src/llm_scheduler/observability/__init__.py +6 -0
- maxllm_gate-0.2.0/src/llm_scheduler/observability/logging.py +65 -0
- maxllm_gate-0.2.0/src/llm_scheduler/observability/metrics.py +92 -0
- maxllm_gate-0.2.0/src/llm_scheduler/rate_limiting/__init__.py +7 -0
- maxllm_gate-0.2.0/src/llm_scheduler/rate_limiting/key_manager.py +252 -0
- maxllm_gate-0.2.0/src/llm_scheduler/rate_limiting/token_bucket.py +152 -0
- maxllm_gate-0.2.0/src/llm_scheduler/rate_limiting/tracker.py +281 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/__init__.py +7 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/base.py +56 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/fallback.py +52 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/least_utilized.py +30 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/round_robin.py +29 -0
- maxllm_gate-0.2.0/src/llm_scheduler/strategies/token_aware.py +46 -0
- maxllm_gate-0.2.0/src/llm_scheduler/utils/__init__.py +6 -0
- maxllm_gate-0.2.0/src/llm_scheduler/utils/retry.py +136 -0
- maxllm_gate-0.2.0/src/llm_scheduler/utils/time_utils.py +115 -0
- maxllm_gate-0.2.0/src/maxllm/__init__.py +77 -0
- maxllm_gate-0.2.0/src/maxllm/client.py +598 -0
- maxllm_gate-0.2.0/src/maxllm/config.py +181 -0
- maxllm_gate-0.2.0/src/maxllm/rate_limiter.py +432 -0
- maxllm_gate-0.2.0/src/maxllm/redis_backend.py +495 -0
- maxllm_gate-0.2.0/src/maxllm/scheduler.py +559 -0
- maxllm_gate-0.2.0/src/maxllm/validation.py +183 -0
- maxllm_gate-0.2.0/tests/__init__.py +1 -0
- maxllm_gate-0.2.0/tests/conftest.py +96 -0
- maxllm_gate-0.2.0/tests/mocks/__init__.py +133 -0
- maxllm_gate-0.2.0/tests/test_api.py +92 -0
- maxllm_gate-0.2.0/tests/test_scheduler.py +136 -0
- maxllm_gate-0.2.0/tests/test_sdk.py +509 -0
- maxllm_gate-0.2.0/tests/test_strategies.py +151 -0
- maxllm_gate-0.2.0/tests/test_token_bucket.py +124 -0
- maxllm_gate-0.2.0/tests/test_token_estimator.py +113 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# LLM Rate Limit Scheduler Configuration
|
|
2
|
+
|
|
3
|
+
# Server settings
|
|
4
|
+
HOST=0.0.0.0
|
|
5
|
+
PORT=8000
|
|
6
|
+
DEBUG=false
|
|
7
|
+
LOG_LEVEL=INFO
|
|
8
|
+
|
|
9
|
+
# API Keys Configuration (JSON format)
|
|
10
|
+
# Format: {"key_id": {"api_key": "...", "provider": "...", "models": ["..."], "tpm_limit": ..., "rpm_limit": ...}}
|
|
11
|
+
API_KEYS_CONFIG='{
|
|
12
|
+
"groq-1": {
|
|
13
|
+
"api_key": "gsk_your_groq_key_1",
|
|
14
|
+
"provider": "groq",
|
|
15
|
+
"models": ["llama-3.1-70b-versatile", "mixtral-8x7b-32768"],
|
|
16
|
+
"tpm_limit": 30000,
|
|
17
|
+
"rpm_limit": 30
|
|
18
|
+
},
|
|
19
|
+
"groq-2": {
|
|
20
|
+
"api_key": "gsk_your_groq_key_2",
|
|
21
|
+
"provider": "groq",
|
|
22
|
+
"models": ["llama-3.1-70b-versatile", "mixtral-8x7b-32768"],
|
|
23
|
+
"tpm_limit": 30000,
|
|
24
|
+
"rpm_limit": 30
|
|
25
|
+
},
|
|
26
|
+
"openrouter-1": {
|
|
27
|
+
"api_key": "sk-or-your_openrouter_key",
|
|
28
|
+
"provider": "openrouter",
|
|
29
|
+
"models": ["anthropic/claude-3-haiku", "meta-llama/llama-3-70b-instruct"],
|
|
30
|
+
"tpm_limit": 100000,
|
|
31
|
+
"rpm_limit": 200
|
|
32
|
+
},
|
|
33
|
+
"openai-1": {
|
|
34
|
+
"api_key": "sk-your_openai_key",
|
|
35
|
+
"provider": "openai",
|
|
36
|
+
"models": ["gpt-4o-mini", "gpt-4o"],
|
|
37
|
+
"tpm_limit": 90000,
|
|
38
|
+
"rpm_limit": 500
|
|
39
|
+
}
|
|
40
|
+
}'
|
|
41
|
+
|
|
42
|
+
# Default scheduling strategy: least_utilized | round_robin | token_aware
|
|
43
|
+
DEFAULT_STRATEGY=least_utilized
|
|
44
|
+
|
|
45
|
+
# Token estimation settings
|
|
46
|
+
DEFAULT_MAX_TOKENS=1024
|
|
47
|
+
TOKEN_ESTIMATION_BUFFER=1.1
|
|
48
|
+
|
|
49
|
+
# Retry settings
|
|
50
|
+
MAX_RETRIES=3
|
|
51
|
+
RETRY_BASE_DELAY=1.0
|
|
52
|
+
RETRY_MAX_DELAY=60.0
|
|
53
|
+
|
|
54
|
+
# Queue settings
|
|
55
|
+
MAX_QUEUE_SIZE=10000
|
|
56
|
+
DEFAULT_PRIORITY=medium
|
|
57
|
+
|
|
58
|
+
# Redis settings (optional, for production)
|
|
59
|
+
# REDIS_URL=redis://localhost:6379/0
|
|
60
|
+
# USE_REDIS_QUEUE=false
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
# MAXLLM Copilot Instructions
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
|
|
5
|
+
MAXLLM is a production-ready LLM client that sits on top of LiteLLM, providing intelligent rate limiting, smart routing, and distributed state support. It manages multiple API keys across providers (OpenAI, Groq, OpenRouter, etc.) to maximize throughput and prevent 429 errors.
|
|
6
|
+
|
|
7
|
+
### Architecture
|
|
8
|
+
|
|
9
|
+
The project has two main components:
|
|
10
|
+
|
|
11
|
+
1. **SDK Client (`src/maxllm/`)** - Simple Python client library for end users
|
|
12
|
+
2. **Scheduler Server (`src/llm_scheduler/`)** - Optional FastAPI gateway with advanced scheduling
|
|
13
|
+
|
|
14
|
+
**Request Flow:**
|
|
15
|
+
```
|
|
16
|
+
User → MAXLLM Client → Scheduler → Rate Limiter → LiteLLM → Provider API
|
|
17
|
+
↓
|
|
18
|
+
Queue Manager (if capacity exhausted)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
**Core Components:**
|
|
22
|
+
- `llm_scheduler/core/scheduler.py` - Main scheduling engine that routes requests
|
|
23
|
+
- `llm_scheduler/rate_limiting/token_bucket.py` - Token bucket algorithm for rate limiting
|
|
24
|
+
- `llm_scheduler/strategies/` - Routing strategies (least_utilized, round_robin, token_aware, balanced)
|
|
25
|
+
- `maxllm/client.py` - User-facing SDK (sync/async)
|
|
26
|
+
- `maxllm/scheduler.py` - SDK's scheduler (simplified version for client use)
|
|
27
|
+
|
|
28
|
+
### Key Concepts
|
|
29
|
+
|
|
30
|
+
**Dual Package Structure:**
|
|
31
|
+
- `maxllm` - The SDK package that users import (`from maxllm import MAXLLM`)
|
|
32
|
+
- `llm_scheduler` - Server/API package for FastAPI gateway mode
|
|
33
|
+
- Both packages are in `src/` and installed together via `pyproject.toml`
|
|
34
|
+
|
|
35
|
+
**Rate Limiting Philosophy:**
|
|
36
|
+
- Never blindly hit 429 errors
|
|
37
|
+
- Estimate tokens BEFORE making requests (using tiktoken)
|
|
38
|
+
- Check ALL available keys before deciding to wait
|
|
39
|
+
- Use token bucket algorithm for TPM/RPM tracking
|
|
40
|
+
- Defer execution when capacity exhausted (queuing instead of failing)
|
|
41
|
+
|
|
42
|
+
**Routing Strategies:**
|
|
43
|
+
- `least_utilized` - Routes to key with most available capacity
|
|
44
|
+
- `round_robin` - Cycles through keys evenly
|
|
45
|
+
- `token_aware` - Prioritizes keys that can handle the request size
|
|
46
|
+
- `balanced` (NEW) - Weighted scoring: utilization (40%), latency (35%), errors (15%), freshness (10%)
|
|
47
|
+
|
|
48
|
+
## Build, Test, and Lint Commands
|
|
49
|
+
|
|
50
|
+
### Installation
|
|
51
|
+
```bash
|
|
52
|
+
# Development setup
|
|
53
|
+
pip install -e ".[dev,all]"
|
|
54
|
+
|
|
55
|
+
# Individual features
|
|
56
|
+
pip install -e ".[server]" # FastAPI server mode
|
|
57
|
+
pip install -e ".[yaml]" # YAML config support
|
|
58
|
+
pip install -e ".[redis]" # Redis backend
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Testing
|
|
62
|
+
```bash
|
|
63
|
+
# Run all tests
|
|
64
|
+
pytest
|
|
65
|
+
|
|
66
|
+
# Run with coverage
|
|
67
|
+
pytest --cov=src --cov-report=html
|
|
68
|
+
|
|
69
|
+
# Run specific test file
|
|
70
|
+
pytest tests/test_sdk.py
|
|
71
|
+
|
|
72
|
+
# Run single test
|
|
73
|
+
pytest tests/test_sdk.py::test_chat_basic
|
|
74
|
+
|
|
75
|
+
# Run async tests only
|
|
76
|
+
pytest -k "asyncio"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Linting
|
|
80
|
+
```bash
|
|
81
|
+
# Run ruff linter
|
|
82
|
+
ruff check src/ tests/
|
|
83
|
+
|
|
84
|
+
# Auto-fix issues
|
|
85
|
+
ruff check --fix src/ tests/
|
|
86
|
+
|
|
87
|
+
# Type checking
|
|
88
|
+
mypy src/
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Running the Server
|
|
92
|
+
```bash
|
|
93
|
+
# Start FastAPI server (requires [server] extras)
|
|
94
|
+
maxllm-server
|
|
95
|
+
|
|
96
|
+
# Or with uvicorn directly
|
|
97
|
+
uvicorn llm_scheduler.main:app --host 0.0.0.0 --port 8000
|
|
98
|
+
|
|
99
|
+
# With Docker
|
|
100
|
+
docker-compose up
|
|
101
|
+
|
|
102
|
+
# Check health
|
|
103
|
+
curl http://localhost:8000/health
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Running Examples
|
|
107
|
+
```bash
|
|
108
|
+
python examples/basic_usage.py
|
|
109
|
+
python examples/concurrent_requests.py
|
|
110
|
+
python examples/multi_key_config.py
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Key Conventions
|
|
114
|
+
|
|
115
|
+
### Config Management
|
|
116
|
+
|
|
117
|
+
**Two config systems coexist:**
|
|
118
|
+
1. SDK Config (`maxllm/config.py`) - Simple YAML/dict for client library
|
|
119
|
+
2. Server Config (`llm_scheduler/config.py`) - Pydantic Settings for FastAPI app
|
|
120
|
+
|
|
121
|
+
Both use similar structure but serve different purposes. Don't confuse them when making changes.
|
|
122
|
+
|
|
123
|
+
**Config Loading Options:**
|
|
124
|
+
```python
|
|
125
|
+
# SDK client
|
|
126
|
+
MAXLLM.from_config("config.yaml") # YAML file
|
|
127
|
+
MAXLLM.from_env() # Environment variables
|
|
128
|
+
MAXLLM(keys=[...]) # Direct dict
|
|
129
|
+
|
|
130
|
+
# Server uses Pydantic Settings
|
|
131
|
+
settings.get_api_keys() # Reads from env vars or config
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Async/Sync Duality
|
|
135
|
+
|
|
136
|
+
The SDK provides both sync (`MAXLLM`) and async (`MAXLLMAsync`) clients. Key patterns:
|
|
137
|
+
|
|
138
|
+
- Async is preferred for production/high-throughput scenarios
|
|
139
|
+
- Sync wrapper uses `asyncio.run()` internally
|
|
140
|
+
- Both share the same core logic in `scheduler.py` and `rate_limiter.py`
|
|
141
|
+
- Tests use `@pytest.mark.asyncio` for async code
|
|
142
|
+
|
|
143
|
+
**Implementation pattern:**
|
|
144
|
+
```python
|
|
145
|
+
# Internal methods are async
|
|
146
|
+
async def _execute_request(...):
|
|
147
|
+
...
|
|
148
|
+
|
|
149
|
+
# Public API provides both
|
|
150
|
+
def chat(self, ...): # Sync wrapper (on the sync client, MAXLLM)
|
|
151
|
+
return asyncio.run(self._execute_request(...))
|
|
152
|
+
|
|
153
|
+
async def chat(self, ...): # Async version (on the async client, MAXLLMAsync)
|
|
154
|
+
return await self._execute_request(...)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Token Estimation
|
|
158
|
+
|
|
159
|
+
Token counting happens BEFORE requests to avoid hitting rate limits:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
# src/llm_scheduler/core/token_estimator.py
|
|
163
|
+
estimated_tokens = token_estimator.estimate(messages, max_tokens)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
- Uses tiktoken for accurate counts
|
|
167
|
+
- Adds buffer (default 10%) for safety margin
|
|
168
|
+
- Cached encoders per model to avoid repeated initialization
|
|
169
|
+
- Estimation errors are conservative (overestimate to be safe)
|
|
170
|
+
|
|
171
|
+
### Strategy Selection
|
|
172
|
+
|
|
173
|
+
Strategies are selected by name in config and resolved via registry:
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
# src/llm_scheduler/strategies/__init__.py
|
|
177
|
+
strategy = StrategyRegistry.get(strategy_name)
|
|
178
|
+
selected_key = strategy.select(candidates, estimated_tokens)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
When adding new strategies:
|
|
182
|
+
1. Create class in `strategies/` that extends `SchedulingStrategy`
|
|
183
|
+
2. Register in `StrategyRegistry`
|
|
184
|
+
3. Add to config validation in `config.py`
|
|
185
|
+
4. Add tests in `test_strategies.py`
|
|
186
|
+
|
|
187
|
+
### Error Handling
|
|
188
|
+
|
|
189
|
+
**Retry Logic:**
|
|
190
|
+
- Transient failures (network, timeout) → automatic retry with exponential backoff
|
|
191
|
+
- Rate limit hits (429) → should never happen (that's the point!)
|
|
192
|
+
- Auth failures (401) → immediate fail, no retry
|
|
193
|
+
- Model not found (404) → immediate fail, no retry
|
|
194
|
+
|
|
195
|
+
**Key Health Tracking:**
|
|
196
|
+
- Each key tracks error rate and latency
|
|
197
|
+
- Strategies can use health metrics for routing decisions
|
|
198
|
+
- Unhealthy keys are automatically deprioritized
|
|
199
|
+
- See `llm_scheduler/rate_limiting/tracker.py`
|
|
200
|
+
|
|
201
|
+
### Testing Patterns
|
|
202
|
+
|
|
203
|
+
**Fixtures in conftest.py:**
|
|
204
|
+
- `mock_config` - Test config with fake keys
|
|
205
|
+
- `key_manager` - Pre-configured KeyManager
|
|
206
|
+
- `scheduler` - Running scheduler instance
|
|
207
|
+
- `sample_messages` - Standard test messages
|
|
208
|
+
|
|
209
|
+
**Mocking LiteLLM:**
|
|
210
|
+
```python
|
|
211
|
+
@patch("litellm.acompletion")
|
|
212
|
+
async def test_something(mock_completion):
|
|
213
|
+
mock_completion.return_value = AsyncMock(...)
|
|
214
|
+
# Test code
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
**Testing Rate Limits:**
|
|
218
|
+
Use `TokenBucket` directly to test token bucket logic without full scheduler overhead.
|
|
219
|
+
|
|
220
|
+
### Redis Backend (Optional)
|
|
221
|
+
|
|
222
|
+
For distributed deployments, rate limit state can be stored in Redis:
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
# src/maxllm/redis_backend.py
|
|
226
|
+
limiter = HybridRateLimiter(
|
|
227
|
+
redis_url="redis://localhost:6379",
|
|
228
|
+
fallback_to_memory=True, # Graceful degradation
|
|
229
|
+
)
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
- Keys stored as `maxllm:ratelimit:{key_id}:tokens`
|
|
233
|
+
- Uses Redis EVAL for atomic token consumption
|
|
234
|
+
- Falls back to in-memory if Redis unavailable
|
|
235
|
+
- Not required for single-instance deployments
|
|
236
|
+
|
|
237
|
+
### Observability
|
|
238
|
+
|
|
239
|
+
**Metrics Available:**
|
|
240
|
+
- `client.capacity()` - Token/request capacity per key
|
|
241
|
+
- `client.latency()` - Latency stats (avg, p50, p99)
|
|
242
|
+
- `client.scores()` - Routing decision scores per key
|
|
243
|
+
|
|
244
|
+
**Prometheus Integration (server mode):**
|
|
245
|
+
- Request counts by model/key
|
|
246
|
+
- Latency histograms
|
|
247
|
+
- Queue depth
|
|
248
|
+
- Rate limit hit rate
|
|
249
|
+
- Available at `/metrics` endpoint
|
|
250
|
+
|
|
251
|
+
### Common Pitfalls
|
|
252
|
+
|
|
253
|
+
1. **Don't confuse the two config systems** - SDK uses `MAXLLMConfig`, server uses Pydantic Settings
|
|
254
|
+
2. **Token estimation is approximate** - Always add buffer, never assume exact count
|
|
255
|
+
3. **Strategies return None if no capacity** - Handle this case (queue or fail)
|
|
256
|
+
4. **Context managers are important** - Use `with MAXLLM.from_config(...)` for graceful shutdown
|
|
257
|
+
5. **Test isolation** - Each test should use fresh scheduler instance (see fixtures)
|
|
258
|
+
6. **Provider-specific quirks** - Some providers need special handling in LiteLLM (check docs)
|
|
259
|
+
|
|
260
|
+
### File Organization
|
|
261
|
+
|
|
262
|
+
```
|
|
263
|
+
src/
|
|
264
|
+
maxllm/ # SDK package (public API)
|
|
265
|
+
client.py # User-facing MAXLLM/MAXLLMAsync classes
|
|
266
|
+
scheduler.py # Client-side scheduler
|
|
267
|
+
config.py # SDK config models
|
|
268
|
+
rate_limiter.py # Rate limiting for SDK
|
|
269
|
+
validation.py # Pydantic request validation
|
|
270
|
+
|
|
271
|
+
llm_scheduler/ # Server package (FastAPI)
|
|
272
|
+
main.py # FastAPI app entry point
|
|
273
|
+
config.py # Server settings (Pydantic)
|
|
274
|
+
api/ # FastAPI routes
|
|
275
|
+
core/ # Core scheduling logic
|
|
276
|
+
scheduler.py # Main scheduler engine
|
|
277
|
+
dispatcher.py # Request dispatcher
|
|
278
|
+
queue_manager.py # Request queuing
|
|
279
|
+
token_estimator.py # Token counting
|
|
280
|
+
rate_limiting/ # Rate limit tracking
|
|
281
|
+
token_bucket.py # Token bucket algorithm
|
|
282
|
+
key_manager.py # API key management
|
|
283
|
+
tracker.py # Rate limit state
|
|
284
|
+
strategies/ # Routing strategies
|
|
285
|
+
observability/ # Logging and metrics
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Environment Variables
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
# For SDK usage
|
|
292
|
+
MAXLLM_KEYS='{"groq-1": {...}, "openai-1": {...}}'
|
|
293
|
+
|
|
294
|
+
# For server mode (see llm_scheduler/config.py)
|
|
295
|
+
HOST=0.0.0.0
|
|
296
|
+
PORT=8000
|
|
297
|
+
LOG_LEVEL=INFO
|
|
298
|
+
DEFAULT_STRATEGY=balanced
|
|
299
|
+
MAX_QUEUE_SIZE=10000
|
|
300
|
+
|
|
301
|
+
# Redis (optional)
|
|
302
|
+
REDIS_URL=redis://localhost:6379
|
|
303
|
+
REDIS_PREFIX=maxllm:
|
|
304
|
+
|
|
305
|
+
# Provider API keys can also be individual env vars
|
|
306
|
+
GROQ_API_KEY=gsk_...
|
|
307
|
+
OPENAI_API_KEY=sk-...
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Making Changes
|
|
311
|
+
|
|
312
|
+
### Adding a New Provider
|
|
313
|
+
|
|
314
|
+
1. LiteLLM already handles most providers - just add to config
|
|
315
|
+
2. Update `config.example.yaml` with example
|
|
316
|
+
3. Add provider-specific rate limits (check their docs)
|
|
317
|
+
4. Test with `examples/basic_usage.py`
|
|
318
|
+
|
|
319
|
+
### Adding a New Strategy
|
|
320
|
+
|
|
321
|
+
1. Create `src/llm_scheduler/strategies/my_strategy.py`
|
|
322
|
+
2. Extend `SchedulingStrategy` base class
|
|
323
|
+
3. Implement `select()` method
|
|
324
|
+
4. Register in `StrategyRegistry` (`strategies/__init__.py`)
|
|
325
|
+
5. Add tests in `tests/test_strategies.py`
|
|
326
|
+
6. Update README.md strategy table
|
|
327
|
+
|
|
328
|
+
### Modifying Rate Limiting
|
|
329
|
+
|
|
330
|
+
Core logic is in `token_bucket.py`. The token bucket algorithm:
|
|
331
|
+
- Refills at constant rate (TPM/RPM converted to tokens per second)
|
|
332
|
+
- Consumes tokens on each request
|
|
333
|
+
- Blocks if insufficient capacity
|
|
334
|
+
|
|
335
|
+
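Be careful changing this - the algorithm is well-tested (see `tests/test_token_bucket.py`), and small changes to the refill or consume arithmetic can silently break rate limiting. Re-run those tests after any change.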
|
|
336
|
+
|
|
337
|
+
### Changing Token Estimation
|
|
338
|
+
|
|
339
|
+
Token estimation is in `core/token_estimator.py`. Uses tiktoken under the hood. If changing:
|
|
340
|
+
- Keep conservative (overestimate is better than underestimate)
|
|
341
|
+
- Cache encoders (they're expensive to create)
|
|
342
|
+
- Test with various message lengths
|
|
343
|
+
- Consider the `token_buffer` multiplier in config (`TOKEN_ESTIMATION_BUFFER` in the server `.env`, e.g. 1.1 for a 10% margin)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.vscode/
|
|
64
|
+
.idea/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
|
|
69
|
+
# OS
|
|
70
|
+
.DS_Store
|
|
71
|
+
Thumbs.db
|
|
72
|
+
|
|
73
|
+
# Project specific
|
|
74
|
+
logs/
|
|
75
|
+
*.log
|
|
76
|
+
.mypy_cache/
|
|
77
|
+
.ruff_cache/
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
FROM python:3.11-slim
|
|
2
|
+
|
|
3
|
+
WORKDIR /app
|
|
4
|
+
|
|
5
|
+
# Install dependencies
|
|
6
|
+
COPY requirements.txt .
|
|
7
|
+
RUN pip install --no-cache-dir -r requirements.txt
|
|
8
|
+
|
|
9
|
+
# Copy source code
|
|
10
|
+
COPY src/ ./src/
|
|
11
|
+
COPY pyproject.toml .
|
|
12
|
+
|
|
13
|
+
# Install package
|
|
14
|
+
RUN pip install -e .
|
|
15
|
+
|
|
16
|
+
# Expose port
|
|
17
|
+
EXPOSE 8000
|
|
18
|
+
|
|
19
|
+
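# Health check (NOTE: requires curl in the image; python:3.11-slim does not ship it, so install curl in an earlier step or this check will always fail)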
|
|
20
|
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
|
21
|
+
CMD curl -f http://localhost:8000/health || exit 1
|
|
22
|
+
|
|
23
|
+
# Run the application
|
|
24
|
+
CMD ["uvicorn", "llm_scheduler.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 LLM Rate Limit Scheduler Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
File without changes
|