flexllm-0.3.3-py3-none-any.whl
This diff shows the contents of a publicly released package version as it appears in its public registry. It is provided for informational purposes only.
- flexllm/__init__.py +224 -0
- flexllm/__main__.py +1096 -0
- flexllm/async_api/__init__.py +9 -0
- flexllm/async_api/concurrent_call.py +100 -0
- flexllm/async_api/concurrent_executor.py +1036 -0
- flexllm/async_api/core.py +373 -0
- flexllm/async_api/interface.py +12 -0
- flexllm/async_api/progress.py +277 -0
- flexllm/base_client.py +988 -0
- flexllm/batch_tools/__init__.py +16 -0
- flexllm/batch_tools/folder_processor.py +317 -0
- flexllm/batch_tools/table_processor.py +363 -0
- flexllm/cache/__init__.py +10 -0
- flexllm/cache/response_cache.py +293 -0
- flexllm/chain_of_thought_client.py +1120 -0
- flexllm/claudeclient.py +402 -0
- flexllm/client_pool.py +698 -0
- flexllm/geminiclient.py +563 -0
- flexllm/llm_client.py +523 -0
- flexllm/llm_parser.py +60 -0
- flexllm/mllm_client.py +559 -0
- flexllm/msg_processors/__init__.py +174 -0
- flexllm/msg_processors/image_processor.py +729 -0
- flexllm/msg_processors/image_processor_helper.py +485 -0
- flexllm/msg_processors/messages_processor.py +341 -0
- flexllm/msg_processors/unified_processor.py +1404 -0
- flexllm/openaiclient.py +256 -0
- flexllm/pricing/__init__.py +104 -0
- flexllm/pricing/data.json +1201 -0
- flexllm/pricing/updater.py +223 -0
- flexllm/provider_router.py +213 -0
- flexllm/token_counter.py +270 -0
- flexllm/utils/__init__.py +1 -0
- flexllm/utils/core.py +41 -0
- flexllm-0.3.3.dist-info/METADATA +573 -0
- flexllm-0.3.3.dist-info/RECORD +39 -0
- flexllm-0.3.3.dist-info/WHEEL +4 -0
- flexllm-0.3.3.dist-info/entry_points.txt +3 -0
- flexllm-0.3.3.dist-info/licenses/LICENSE +201 -0
flexllm/utils/core.py
ADDED
@@ -0,0 +1,41 @@
"""Core utilities for flexllm"""

from functools import wraps
import asyncio
import logging


def async_retry(
    retry_times: int = 3,
    retry_delay: float = 1.0,
    exceptions: tuple = (Exception,),
    logger=None,
):
    """
    Async retry decorator

    Args:
        retry_times: Maximum retry count
        retry_delay: Delay between retries (seconds)
        exceptions: Exception types to retry on
        logger: Logger instance
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            for attempt in range(retry_times):
                try:
                    return await func(*args, **kwargs)
                except exceptions as e:
                    if attempt == retry_times - 1:
                        raise
                    logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                    await asyncio.sleep(retry_delay)
            return await func(*args, **kwargs)

        return wrapper

    return decorator
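For context, a minimal usage sketch of the `async_retry` decorator above (the `fetch_completion` coroutine is hypothetical and not part of the package):

```python
import asyncio

from flexllm.utils.core import async_retry


# Hypothetical coroutine: retried up to 3 times on ConnectionError,
# pausing 0.5 s between attempts, via the decorator defined above.
@async_retry(retry_times=3, retry_delay=0.5, exceptions=(ConnectionError,))
async def fetch_completion(prompt: str) -> str:
    # Replace with a real API call; transient failures would raise ConnectionError.
    return f"echo: {prompt}"


print(asyncio.run(fetch_completion("hello")))
```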
flexllm-0.3.3.dist-info/METADATA
ADDED
@@ -0,0 +1,573 @@
Metadata-Version: 2.4
Name: flexllm
Version: 0.3.3
Summary: High-performance LLM client with batch processing, caching, and checkpoint recovery
Project-URL: Homepage, https://github.com/KenyonY/flexllm
Project-URL: Repository, https://github.com/KenyonY/flexllm
Project-URL: Documentation, https://github.com/KenyonY/flexllm#readme
Project-URL: Issues, https://github.com/KenyonY/flexllm/issues
Author-email: kunyuan <beidongjiedeguang@gmail.com>
License: MIT
License-File: LICENSE
Keywords: anthropic,async,batch,cache,claude,gemini,llm,multimodal,openai
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: aiolimiter>=1.1.0
Requires-Dist: json5>=0.9.0
Requires-Dist: loguru>=0.6.0
Requires-Dist: numpy
Requires-Dist: orjson
Requires-Dist: pillow
Requires-Dist: requests>=2.28.0
Requires-Dist: rich>=12.0.0
Requires-Dist: tqdm>=4.60.0
Provides-Extra: all
Requires-Dist: flaxkv2>=0.1.5; extra == 'all'
Requires-Dist: google-auth>=2.0.0; extra == 'all'
Requires-Dist: opencv-python; extra == 'all'
Requires-Dist: pyyaml>=6.0; extra == 'all'
Requires-Dist: tiktoken>=0.5.0; extra == 'all'
Requires-Dist: typer>=0.9.0; extra == 'all'
Provides-Extra: cache
Requires-Dist: flaxkv2>=0.1.5; extra == 'cache'
Provides-Extra: cli
Requires-Dist: pyyaml>=6.0; extra == 'cli'
Requires-Dist: typer>=0.9.0; extra == 'cli'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: isort>=5.12.0; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.20.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Provides-Extra: image
Requires-Dist: opencv-python; extra == 'image'
Provides-Extra: test
Requires-Dist: flaxkv2>=0.1.5; extra == 'test'
Requires-Dist: opencv-python; extra == 'test'
Requires-Dist: pandas>=1.3.0; extra == 'test'
Requires-Dist: pytest-asyncio>=0.20.0; extra == 'test'
Requires-Dist: pytest>=7.0.0; extra == 'test'
Provides-Extra: token
Requires-Dist: tiktoken>=0.5.0; extra == 'token'
Provides-Extra: vertex
Requires-Dist: google-auth>=2.0.0; extra == 'vertex'
Description-Content-Type: text/markdown

<h1 align="center">flexllm</h1>

<p align="center">
  <strong>Production-grade LLM client with checkpoint recovery, response caching, and multi-provider support</strong>
</p>

<p align="center">
  <a href="https://pypi.org/project/flexllm/">
    <img src="https://img.shields.io/pypi/v/flexllm?color=brightgreen&style=flat-square" alt="PyPI version">
  </a>
  <a href="https://github.com/KenyonY/flexllm/blob/main/LICENSE">
    <img alt="License" src="https://img.shields.io/github/license/KenyonY/flexllm.svg?color=blue&style=flat-square">
  </a>
  <a href="https://pypistats.org/packages/flexllm">
    <img alt="pypi downloads" src="https://img.shields.io/pypi/dm/flexllm?style=flat-square">
  </a>
</p>

---

## Features

| Feature | Description |
|---------|-------------|
| **Checkpoint Recovery** | Batch jobs auto-resume from interruption - process millions of requests without losing progress |
| **Response Caching** | Built-in intelligent caching with TTL and IPC multi-process sharing - avoid duplicate API calls |
| **Multi-Provider** | One interface for OpenAI, Gemini, Claude, and any OpenAI-compatible API (vLLM, Ollama, etc.) |
| **High-Performance Async** | Fine-grained concurrency control, QPS limiting, and streaming batch results |
| **Load Balancing** | Multi-endpoint distribution with automatic failover (round_robin/weighted/random/fallback) |

---

## Core Strengths

### 1. Checkpoint Recovery - Never Lose Progress

Process millions of requests without fear of interruption. When your batch job crashes at 3 AM, just restart it - flexllm picks up exactly where it left off.

```python
# Process 100,000 requests - if interrupted, resume automatically
results = await client.chat_completions_batch(
    messages_list,
    output_jsonl="results.jsonl",  # Progress saved here
)
# Ctrl+C at 50,000? No problem. Re-run and it continues from 50,001.
```

### 2. Response Caching - Save Money, Save Time

Built-in intelligent caching avoids duplicate API calls. Same question? Instant answer from cache.

```python
client = LLMClient(
    model="gpt-4",
    cache=ResponseCacheConfig.with_ttl(3600),  # 1 hour cache
)

# First call: API request (~2s, ~$0.01)
result1 = await client.chat_completions(messages)

# Second call: Cache hit (~0.001s, $0)
result2 = await client.chat_completions(messages)
```

Supports multi-process cache sharing via IPC - perfect for distributed workloads.
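A minimal sketch of that sharing, assuming each worker process builds its own client pointed at the same IPC-backed cache (the `worker` function and its duplicate prompts are illustrative, not part of the package):

```python
import asyncio
import multiprocessing as mp

from flexllm import LLMClient, ResponseCacheConfig


def worker(prompt: str) -> None:
    # Each process constructs its own client; in IPC mode the response cache
    # is shared, so a prompt already answered in one process becomes a cache
    # hit in the others.
    client = LLMClient(
        model="gpt-4",
        base_url="https://api.openai.com/v1",
        api_key="your-api-key",
        cache=ResponseCacheConfig.ipc(ttl=86400),
    )
    messages = [{"role": "user", "content": prompt}]
    print(asyncio.run(client.chat_completions(messages)))


if __name__ == "__main__":
    with mp.Pool(2) as pool:
        pool.map(worker, ["Same question", "Same question"])
```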
### 3. One Interface, All Providers

Write once, run everywhere. Switch between OpenAI, Gemini, Claude, or self-hosted models without changing your code.

```python
# OpenAI
client = LLMClient(provider="openai", base_url="https://api.openai.com/v1", ...)

# Gemini
client = LLMClient(provider="gemini", api_key="...", model="gemini-2.0-flash")

# Claude
client = LLMClient(provider="claude", api_key="...", model="claude-sonnet-4-20250514")

# Self-hosted (vLLM, Ollama, etc.)
client = LLMClient(base_url="http://localhost:8000/v1", model="qwen2.5")

# Same API for all:
result = await client.chat_completions(messages)
```

### 4. High-Performance Async Engine

Maximize throughput with fine-grained concurrency control and QPS limiting.

```python
client = LLMClient(
    concurrency_limit=100,  # 100 concurrent requests
    max_qps=50,             # Rate limit: 50 req/sec
    retry_times=3,          # Auto-retry on failure
)

# Process 10,000 requests with optimal parallelism
results = await client.chat_completions_batch(messages_list, show_progress=True)
```

Streaming results - process results as they complete, don't wait for all:

```python
async for result in client.iter_chat_completions_batch(messages_list):
    process(result)  # Handle each result immediately
```

### 5. Load Balancing & Failover

Distribute workloads across multiple endpoints with automatic failover.

```python
pool = LLMClientPool(
    endpoints=[
        {"base_url": "http://gpu1:8000/v1", "model": "qwen"},
        {"base_url": "http://gpu2:8000/v1", "model": "qwen"},
        {"base_url": "http://gpu3:8000/v1", "model": "qwen"},
    ],
    load_balance="round_robin",  # or "weighted", "random", "fallback"
    fallback=True,               # Auto-switch on failure
)

# Requests automatically distributed across healthy endpoints
results = await pool.chat_completions_batch(messages_list, distribute=True)
```

### 6. Thinking Mode Support

Unified interface for reasoning models - DeepSeek-R1, Qwen3, Claude extended thinking, Gemini thinking.

```python
result = await client.chat_completions(
    messages,
    thinking=True,  # Enable thinking
    return_raw=True,
)

# Unified parsing across all providers
parsed = client.parse_thoughts(result.data)
print("Thinking:", parsed["thought"])
print("Answer:", parsed["answer"])
```

---
## Installation

```bash
pip install flexllm

# With caching support
pip install flexllm[cache]

# With CLI
pip install flexllm[cli]

# All features
pip install flexllm[all]
```

## Quick Start

### Single Request

```python
from flexllm import LLMClient

client = LLMClient(
    model="gpt-4",
    base_url="https://api.openai.com/v1",
    api_key="your-api-key"
)

# Async
response = await client.chat_completions([
    {"role": "user", "content": "Hello!"}
])

# Sync
response = client.chat_completions_sync([
    {"role": "user", "content": "Hello!"}
])
```

### Batch Processing with Checkpoint Recovery

```python
from flexllm import LLMClient

client = LLMClient(
    model="gpt-4",
    base_url="https://api.openai.com/v1",
    api_key="your-api-key",
    concurrency_limit=50,
    max_qps=100,
)

messages_list = [
    [{"role": "user", "content": f"Question {i}"}]
    for i in range(10000)
]

# If interrupted, re-running resumes from where it stopped
results = await client.chat_completions_batch(
    messages_list,
    output_jsonl="results.jsonl",
    show_progress=True,
)
```

### Response Caching

```python
from flexllm import LLMClient, ResponseCacheConfig

client = LLMClient(
    model="gpt-4",
    base_url="https://api.openai.com/v1",
    api_key="your-api-key",
    cache=ResponseCacheConfig.with_ttl(3600),  # 1 hour TTL
)

# Duplicate requests hit cache automatically
result1 = await client.chat_completions(messages)  # API call
result2 = await client.chat_completions(messages)  # Cache hit (instant)

# Multi-process cache sharing (IPC mode - default)
cache = ResponseCacheConfig.ipc(ttl=86400)  # 24h, shared across processes
```

### Streaming Response

```python
async for chunk in client.chat_completions_stream(messages):
    print(chunk, end="", flush=True)
```

### Multi-Modal (Vision)

```python
from flexllm import MllmClient

client = MllmClient(
    base_url="https://api.openai.com/v1",
    api_key="your-api-key",
    model="gpt-4o",
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": "path/to/image.jpg"}}
        ]
    }
]

response = await client.call_llm([messages])
```

### Load Balancing with Failover

```python
from flexllm import LLMClientPool

pool = LLMClientPool(
    endpoints=[
        {"base_url": "http://host1:8000/v1", "api_key": "key1", "model": "qwen"},
        {"base_url": "http://host2:8000/v1", "api_key": "key2", "model": "qwen"},
    ],
    load_balance="round_robin",
    fallback=True,
)

# Single request with automatic failover
result = await pool.chat_completions(messages)

# Batch requests distributed across endpoints
results = await pool.chat_completions_batch(messages_list, distribute=True)
```
### Gemini Client

```python
from flexllm import GeminiClient

# Gemini Developer API
client = GeminiClient(
    model="gemini-2.0-flash",
    api_key="your-gemini-api-key"
)

# With thinking mode
response = await client.chat_completions(
    messages,
    thinking="high",  # False, True, "minimal", "low", "medium", "high"
)

# Vertex AI mode
client = GeminiClient(
    model="gemini-2.0-flash",
    project_id="your-project-id",
    location="us-central1",
    use_vertex_ai=True,
)
```

### Claude Client

```python
from flexllm import LLMClient, ClaudeClient

# Using unified LLMClient (recommended)
client = LLMClient(
    provider="claude",
    api_key="your-anthropic-key",
    model="claude-sonnet-4-20250514",
)

response = await client.chat_completions([
    {"role": "user", "content": "Hello, Claude!"}
])

# With extended thinking
result = await client.chat_completions(
    messages,
    thinking=True,
    return_raw=True,
)
parsed = client.parse_thoughts(result.data)
```

### Function Calling (Tool Use)

```python
from flexllm import LLMClient

client = LLMClient(
    base_url="https://api.openai.com/v1",
    api_key="your-api-key",
    model="gpt-4",
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name"}
                },
                "required": ["location"]
            }
        }
    }
]

result = await client.chat_completions(
    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
    tools=tools,
    return_usage=True,
)

if result.tool_calls:
    for tool_call in result.tool_calls:
        print(f"Function: {tool_call.function['name']}")
        print(f"Arguments: {tool_call.function['arguments']}")
```
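To act on a returned tool call, one possible follow-up step (a sketch only: `get_weather` is a stand-in for your own implementation, and the arguments are assumed to arrive as the usual JSON string printed above):

```python
import json


def get_weather(location: str) -> str:
    # Stand-in implementation for the tool declared in `tools`
    return f"Sunny in {location}"


if result.tool_calls:
    for tool_call in result.tool_calls:
        args = json.loads(tool_call.function["arguments"])  # e.g. {"location": "Tokyo"}
        print(get_weather(**args))
```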
## CLI Usage

```bash
# Quick ask
flexllm ask "What is Python?"
flexllm ask "Explain this" -s "You are a code expert"
echo "long text" | flexllm ask "Summarize"

# Interactive chat
flexllm chat
flexllm chat --model=gpt-4 "Hello"

# Batch processing with checkpoint recovery
flexllm batch input.jsonl -o output.jsonl

# List models
flexllm models       # Remote models
flexllm list_models  # Configured models

# Test connection
flexllm test

# Initialize config
flexllm init
```

### CLI Configuration

Create `~/.flexllm/config.yaml`:

```yaml
default: "gpt-4"

models:
  - id: gpt-4
    name: gpt-4
    provider: openai
    base_url: https://api.openai.com/v1
    api_key: your-api-key

  - id: local
    name: local-ollama
    provider: openai
    base_url: http://localhost:11434/v1
    api_key: EMPTY
```

Or use environment variables:

```bash
export FLEXLLM_BASE_URL="https://api.openai.com/v1"
export FLEXLLM_API_KEY="your-key"
export FLEXLLM_MODEL="gpt-4"
```

## API Reference

### LLMClient

```python
LLMClient(
    provider: str = "auto",        # "auto", "openai", "gemini", "claude"
    model: str,                    # Model name
    base_url: str,                 # API base URL
    api_key: str = "EMPTY",        # API key
    cache: ResponseCacheConfig,    # Cache config
    concurrency_limit: int = 10,   # Max concurrent requests
    max_qps: float = None,         # Max requests per second
    retry_times: int = 3,          # Retry count on failure
    retry_delay: float = 1.0,      # Delay between retries
    timeout: int = 120,            # Request timeout (seconds)
)
```

### Methods

| Method | Description |
|--------|-------------|
| `chat_completions(messages)` | Single async request |
| `chat_completions_sync(messages)` | Single sync request |
| `chat_completions_batch(messages_list)` | Batch async with checkpoint |
| `chat_completions_batch_sync(messages_list)` | Batch sync with checkpoint |
| `iter_chat_completions_batch(messages_list)` | Streaming batch results |
| `chat_completions_stream(messages)` | Token-by-token streaming |
| `parse_thoughts(response_data)` | Parse thinking content |
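For example, the synchronous batch path looks like this (a sketch; it assumes `chat_completions_batch_sync` accepts the same `output_jsonl` / `show_progress` keywords as the async variant shown earlier):

```python
from flexllm import LLMClient

client = LLMClient(
    model="gpt-4",
    base_url="https://api.openai.com/v1",
    api_key="your-api-key",
)

messages_list = [[{"role": "user", "content": f"Question {i}"}] for i in range(100)]

# Blocking call - no event loop required; checkpointing works the same way.
results = client.chat_completions_batch_sync(
    messages_list,
    output_jsonl="results.jsonl",
    show_progress=True,
)
```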
### ResponseCacheConfig

```python
# Shortcuts
ResponseCacheConfig.with_ttl(3600)    # 1 hour TTL
ResponseCacheConfig.persistent()      # Never expire
ResponseCacheConfig.ipc(ttl=86400)    # Multi-process shared (default)
ResponseCacheConfig.local(ttl=86400)  # Single process only

# Full config
ResponseCacheConfig(
    enabled: bool = False,
    ttl: int = 86400,  # Time-to-live in seconds
    cache_dir: str = "~/.cache/flexllm/llm_response",
    use_ipc: bool = True,  # Multi-process cache sharing
)
```

### Token Counting

```python
from flexllm import count_tokens, estimate_cost, estimate_batch_cost

tokens = count_tokens("Hello world", model="gpt-4")
cost = estimate_cost(tokens, model="gpt-4", is_input=True)
total_cost = estimate_batch_cost(messages_list, model="gpt-4")
```

## Architecture

```
LLMClient (Unified entry point)
├── OpenAIClient (OpenAI-compatible APIs)
├── GeminiClient (Google Gemini)
└── ClaudeClient (Anthropic Claude)
    │
    └── LLMClientBase (Abstract base - 4 methods to implement)
        │
        ├── ConcurrentRequester (Async engine with QPS control)
        ├── ResponseCache (FlaxKV2-based caching with IPC)
        └── ImageProcessor (Multi-modal support)

LLMClientPool (Multi-endpoint load balancing)
└── ProviderRouter (round_robin / weighted / random / fallback)
```

## License

Apache 2.0

flexllm-0.3.3.dist-info/RECORD
ADDED
@@ -0,0 +1,39 @@
flexllm/__init__.py,sha256=GMd4DJMgpxUGaLZ55X5pdSJoa-79lZCIBtyPr63IHH4,7160
flexllm/__main__.py,sha256=n63FUI84kJ81A1PQW0Takd9s4yVExGCzyGDDw0gTMzY,40789
flexllm/base_client.py,sha256=_82yHc7QieIr0OMFU5hqjrshnLq1ROIyzdjVdFUTJTs,42204
flexllm/chain_of_thought_client.py,sha256=ohmUaEWtvfIFFrw_h2ADNSkiEVE_qryLr18KtWzU00Y,41351
flexllm/claudeclient.py,sha256=BTmkh5bc0Lgvo90v-XB_bc_vFPaaUhc8hol2nXc12ec,15060
flexllm/client_pool.py,sha256=yZlCO7aopAgKhpHn5tG6OwCzLzMcAFt4ovH1MEJ8-AY,24229
flexllm/geminiclient.py,sha256=2K03uGIpED7ryul7dmftpIq5OuY_ZIIVYHbnIUADu9s,20306
flexllm/llm_client.py,sha256=U0zoMsf1q7VX0CYlB6MGgjqynlTCs8QtqTkIzzStkV4,19002
flexllm/llm_parser.py,sha256=raQnxIuSUNbRvrBLeZnpIIkDCcO2UzThtplGy8Cb1Y8,1832
flexllm/mllm_client.py,sha256=a6C2JJPrA6JW3XN07ZPQ0xVqc2ketMEQWKCK0uupEvw,17251
flexllm/openaiclient.py,sha256=pb8ttzPYMC7pLVMCpRXknzs2-kC-CmM73nUHw5bo7gw,8377
flexllm/provider_router.py,sha256=PElAQ4v6Go21VxTm9LF5_w0Pkt6GcOYJDp-OKRIEFtU,5978
flexllm/token_counter.py,sha256=GHzZwniQo4uaDfXRgljzVvoTfcKzkZTpP4S3VJ2PGXE,8188
flexllm/async_api/__init__.py,sha256=LqybQzdw2JfpdOMMeif9qlTIm68oOg3UseFBC17lcDg,196
flexllm/async_api/concurrent_call.py,sha256=pymX43aQ4HdiGUk5oPheOsWssjnitDz0nLbQ0N7B5tQ,2904
flexllm/async_api/concurrent_executor.py,sha256=-BzI2QH_RkzrCO1_RY0EkOWyAS24HOWncED43SchTq0,33809
flexllm/async_api/core.py,sha256=UqFQ5A-ObJhmRUS8nheB6UNFCCdfhJk5TkRK3goZSzU,13350
flexllm/async_api/interface.py,sha256=s6pDxAjm0r0rfXWYvEnY9qmGl571thm7cKHqPlyxfGU,227
flexllm/async_api/progress.py,sha256=n8Mlf6rpNotEdZ6n6snkUN9JhSorcGCUR8ZGkue9yTs,10537
flexllm/batch_tools/__init__.py,sha256=2hK377iLyFxv5HXc2CK4b6ff_gC5eUKlyv11vhXEhaU,392
flexllm/batch_tools/folder_processor.py,sha256=FjkeGqowRxtvvCYpGYHWGx7esdJ5OOpPMo3cQ9oR2QE,10695
flexllm/batch_tools/table_processor.py,sha256=VRH5kZK3QPcVrc_ZSQSszMPYlw0lzMLk3Yata6Ub9FE,11490
flexllm/cache/__init__.py,sha256=yVRpoQNltmcupnOt5daiFrJ5zv7SV71oOsBkbK390tA,241
flexllm/cache/response_cache.py,sha256=lkOKESR3aAXkAde6s0zL5KsH2l8vmDI6QMiDKyTKv_U,8300
flexllm/msg_processors/__init__.py,sha256=je_TqbV_QeSFHJV-zUq9hhyml7rsnKMZw8gYwzDCqzs,5799
flexllm/msg_processors/image_processor.py,sha256=gg2eJYav1_AhzB1JYCBDP8wxfNHEarKAyhINZsnXC9o,28915
flexllm/msg_processors/image_processor_helper.py,sha256=QIc_MkD2EYPmc1SyfBDani9yfiQ62FhrWCzn-0k9wRI,16067
flexllm/msg_processors/messages_processor.py,sha256=Rzn1lt361gwNctyAhUmiEkZgTogsGXEYE1LSApqGJe4,13199
flexllm/msg_processors/unified_processor.py,sha256=IKRO1rR0gJxir_nPnahVYboqufRPsnSjvtnjkSwLjhc,49103
flexllm/pricing/__init__.py,sha256=4HjLPiBy5WHDaQ0Idp-sVPUt5W_UxZeeq38nJwIKga0,2586
flexllm/pricing/data.json,sha256=QKEmoa6WjC3CpH8nWwxYL0cy5VXS15LI8UDNe2QwqXI,22334
flexllm/pricing/updater.py,sha256=TvKOtBLgc8VWwBY4pwOQlTDZmJ2kEZe1GYxYh_GE8Rs,5837
flexllm/utils/__init__.py,sha256=-BAWBrc4Kc18yrPK_OTOa4JaDyx2OgFcRuIiQe-zYeI,30
flexllm/utils/core.py,sha256=eraE-FMnQ4vYmdjgs82ZdakcHTEn9dXbTHFZ76plnWw,1079
flexllm-0.3.3.dist-info/METADATA,sha256=blNSncn_AILRQwXgTrmAw46NIiv-IDjFfvhf6-DahpM,15697
flexllm-0.3.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
flexllm-0.3.3.dist-info/entry_points.txt,sha256=BWKjRwFpLLBW8GLo6M-B15VYWiefec4kuFFdVtyOSLY,79
flexllm-0.3.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
flexllm-0.3.3.dist-info/RECORD,,