inference-proxy 0.2.0__tar.gz → 0.2.2__tar.gz
This diff shows the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
- inference_proxy-0.2.2/PKG-INFO +288 -0
- inference_proxy-0.2.2/README.md +241 -0
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/__main__.py +5 -5
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/app.py +1 -0
- inference_proxy-0.2.2/lm_proxy/bootstrap.py +81 -0
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/config.py +5 -5
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/core.py +204 -204
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/pyproject.toml +2 -3
- inference_proxy-0.2.0/PKG-INFO +0 -81
- inference_proxy-0.2.0/README.md +0 -34
- inference_proxy-0.2.0/lm_proxy/bootstrap.py +0 -70
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/LICENSE +0 -0
- {inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/__init__.py +0 -0
inference_proxy-0.2.2/PKG-INFO (new file)

@@ -0,0 +1,288 @@
Metadata-Version: 2.3
Name: inference-proxy
Version: 0.2.2
Summary: "Inference Proxy" is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc.
License: MIT License

 Copyright (c) 2025 Vitalii Stepanenko

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
Keywords: llm,large language models,ai,gpt,openai,proxy,http,proxy-server
Author: Vitalii Stepanenko
Author-email: mail@vitalii.in
Maintainer: Vitalii Stepanenko
Maintainer-email: mail@vitalii.in
Requires-Python: >=3.10,<4
Classifier: Intended Audience :: Developers
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: License :: OSI Approved :: MIT License
Requires-Dist: ai-microcore (>=4.4.3,<4.5.0)
Requires-Dist: fastapi (>=0.116.1,<0.117.0)
Requires-Dist: typer (>=0.16.1)
Requires-Dist: uvicorn (>=0.22.0)
Project-URL: Source Code, https://github.com/Nayjest/lm-proxy
Description-Content-Type: text/markdown

<p align="center">
    <img src="https://img.shields.io/github/license/Nayjest/lm-proxy?color=blue" alt="License">
    <a href="https://pypi.org/project/lm-proxy/"><img src="https://img.shields.io/pypi/v/lm-proxy?color=blue" alt="PyPI"></a>
    <a href="https://github.com/Nayjest/lm-proxy/actions/workflows/tests.yml"><img src="https://github.com/Nayjest/lm-proxy/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
    <a href="https://github.com/Nayjest/lm-proxy/actions/workflows/code-style.yml"><img src="https://github.com/Nayjest/lm-proxy/actions/workflows/code-style.yml/badge.svg" alt="Code Style"></a>
</p>

# Inference Proxy

**Inference Proxy** is an OpenAI-compatible HTTP proxy server for various Large Language Models (LLMs) inference.
It provides a unified interface for working with different AI providers through a single API endpoint that follows the OpenAI format.
Stream like OpenAI, authenticate with your own API keys, and keep clients unchanged.
## ✨ Features

- **Provider Agnostic**: Connect to OpenAI, Anthropic, Google AI, local models, and more using a single API
- **Unified Interface**: Access all models through the standard OpenAI API format
- **Dynamic Routing**: Route requests to different LLM providers based on model name patterns
- **Stream Support**: Full streaming support for real-time responses
- **API Key Management**: Configurable API key validation and access control
- **Easy Configuration**: Simple TOML configuration files for setup

## 🚀 Getting Started

### Installation

```bash
pip install inference-proxy
```

### Quick Start

1. Create a `config.toml` file:

```toml
host = "0.0.0.0"
port = 8000

[connections]
[connections.openai]
api_type = "open_ai"
api_base = "https://api.openai.com/v1/"
api_key = "env:OPENAI_API_KEY"

[connections.anthropic]
api_type = "anthropic"
api_key = "env:ANTHROPIC_API_KEY"

[routing]
"gpt*" = "openai.*"
"claude*" = "anthropic.*"
"*" = "openai.gpt-3.5-turbo"

[groups.default]
api_keys = ["YOUR_API_KEY_HERE"]
```

2. Start the server:

```bash
inference-proxy
```

3. Use it with any OpenAI-compatible client:

```python
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY_HERE",
    base_url="http://localhost:8000/v1"
)

completion = client.chat.completions.create(
    model="gpt-5",  # This will be routed to OpenAI based on config
    messages=[{"role": "user", "content": "Hello, world!"}]
)
print(completion.choices[0].message.content)
```

Or use the same endpoint with Claude models:

```python
completion = client.chat.completions.create(
    model="claude-opus-4-1-20250805",  # This will be routed to Anthropic based on config
    messages=[{"role": "user", "content": "Hello, world!"}]
)
```

## 📝 Configuration

Inference Proxy is configured through a TOML file that specifies connections, routing rules, and access control.

### Basic Structure

```toml
host = "0.0.0.0"        # Interface to bind to
port = 8000             # Port to listen on
dev_autoreload = false  # Enable for development

# API key validation function (optional)
check_api_key = "lm_proxy.core.check_api_key"

# LLM Provider Connections
[connections]

[connections.openai]
api_type = "open_ai"
api_base = "https://api.openai.com/v1/"
api_key = "env:OPENAI_API_KEY"

[connections.google]
api_type = "google_ai_studio"
api_key = "env:GOOGLE_API_KEY"

# Routing rules (model_pattern = "connection.model")
[routing]
"gpt*" = "openai.*"           # Route all GPT models to OpenAI
"claude*" = "anthropic.*"     # Route all Claude models to Anthropic
"gemini*" = "google.*"        # Route all Gemini models to Google
"*" = "openai.gpt-3.5-turbo"  # Default fallback

# Access control groups
[groups.default]
api_keys = [
    "KEY1",
    "KEY2"
]
```

### Environment Variables

You can use environment variables in your configuration file by prefixing values with `env:`:

```toml
[connections.openai]
api_key = "env:OPENAI_API_KEY"
```

Load these from a `.env` file or set them in your environment before starting the server.

## 🔌 API Usage

Inference Proxy implements the OpenAI chat completions API endpoint. You can use any OpenAI-compatible client to interact with it.

### Endpoint

```
POST /v1/chat/completions
```

### Request Format

```json
{
  "model": "gpt-3.5-turbo",
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
  ],
  "temperature": 0.7,
  "stream": false
}
```

### Response Format

```json
{
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The capital of France is Paris."
      },
      "finish_reason": "stop"
    }
  ]
}
```

## 🛠️ Advanced Usage

### Custom API Key Validation

You can implement your own API key validation function:

```python
# my_validators.py
def validate_api_key(api_key: str) -> str | None:
    """
    Validate an API key and return the group name if valid.

    Args:
        api_key: The API key to validate

    Returns:
        The name of the group if valid, None otherwise
    """
    if api_key == "secret-key":
        return "admin"
    elif api_key.startswith("user-"):
        return "users"
    return None
```

Then reference it in your config:

```toml
check_api_key = "my_validators.validate_api_key"
```

### Dynamic Model Routing

The routing section allows flexible pattern matching with wildcards:

```toml
[routing]
"gpt-4*" = "openai.gpt-4"            # Route gpt-4 requests to OpenAI GPT-4
"gpt-3.5*" = "openai.gpt-3.5-turbo"  # Route gpt-3.5 requests to OpenAI
"claude*" = "anthropic.*"            # Pass model name as-is to Anthropic
"gemini*" = "google.*"               # Pass model name as-is to Google
"custom*" = "local.llama-7b"         # Map any "custom*" to a specific local model
"*" = "openai.gpt-3.5-turbo"         # Default fallback for unmatched models
```

## 🤝 Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

1. Fork the repository
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request


## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
© 2025 Vitalii Stepanenko
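The README embedded above advertises full streaming support but only shows non-streaming calls. Below is a minimal sketch of a streaming request through the proxy using the same OpenAI Python client, assuming the Quick Start server is running on localhost:8000 and `YOUR_API_KEY_HERE` is listed under `[groups.default]`; it is an illustration, not code from the package.

```python
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY_HERE",
    base_url="http://localhost:8000/v1",
)

# stream=True makes the proxy return Server-Sent Events chunks,
# which the client exposes as an iterator of deltas.
stream = client.chat.completions.create(
    model="gpt-5",  # routed to the "openai" connection by the "gpt*" rule
    messages=[{"role": "user", "content": "Hello, world!"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
print()
```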
inference_proxy-0.2.2/README.md (new file)

@@ -0,0 +1,241 @@
(The added README.md is identical to the Markdown description embedded in the 0.2.2 PKG-INFO above, from the badge block through "© 2025 Vitalii Stepanenko".)
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/__main__.py

@@ -1,5 +1,5 @@
-from .app import cli_app
-
-
-if __name__ == "__main__":
-    cli_app()
+from .app import cli_app
+
+
+if __name__ == "__main__":
+    cli_app()
inference_proxy-0.2.2/lm_proxy/bootstrap.py (new file)

@@ -0,0 +1,81 @@
import sys
import logging
import inspect
from datetime import datetime


import microcore as mc
from microcore import ui
from microcore.configuration import get_bool_from_env
from dotenv import load_dotenv

from .config import Config


def setup_logging(log_level: int = logging.INFO):
    class CustomFormatter(logging.Formatter):
        def format(self, record):
            dt = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
            message, level_name = record.getMessage(), record.levelname
            if record.levelno == logging.WARNING:
                message = mc.ui.yellow(message)
                level_name = mc.ui.yellow(level_name)
            if record.levelno >= logging.ERROR:
                message = mc.ui.red(message)
                level_name = mc.ui.red(level_name)
            return f"{dt} {level_name}: {message}"

    handler = logging.StreamHandler()
    handler.setFormatter(CustomFormatter())
    logging.basicConfig(level=log_level, handlers=[handler])


class Env:
    config: Config
    connections: dict[str, mc.types.LLMAsyncFunctionType]
    debug: bool

    @staticmethod
    def init(config: Config | str, debug: bool = False):
        env.debug = debug

        if isinstance(config, Config):
            env.config = config
        elif isinstance(config, str):
            env.config = Config.load(config)
        else:
            raise ValueError("config must be a string (file path) or Config instance")

        # initialize connections
        env.connections = dict()
        for conn_name, conn_config in env.config.connections.items():
            logging.info(f"Initializing '{conn_name}' LLM proxy connection...")
            try:
                if inspect.iscoroutinefunction(conn_config):
                    env.connections[conn_name] = conn_config
                else:
                    mc.configure(
                        **conn_config,
                        EMBEDDING_DB_TYPE=mc.EmbeddingDbType.NONE
                    )
                    env.connections[conn_name] = mc.env().llm_async_function
            except mc.LLMConfigError as e:
                raise ValueError(f"Error in configuration for connection '{conn_name}': {e}")

        logging.info(f"Done initializing {len(env.connections)} connections.")


env = Env()


def bootstrap(config: str | Config = 'config.toml'):
    load_dotenv('.env', override=True)
    debug = '--debug' in sys.argv or get_bool_from_env('LM_PROXY_DEBUG', False)
    setup_logging(logging.DEBUG if debug else logging.INFO)
    mc.logging.LoggingConfig.OUTPUT_METHOD = logging.info
    logging.info(
        f"Bootstrapping {ui.yellow('lm_proxy')} "
        f"using configuration: {'dynamic' if isinstance(config, Config) else ui.blue(config)} "
        f"{'[DEBUG: ON]' if debug else ''}..."
    )
    Env.init(config, debug=debug)
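Compared to 0.2.0, `bootstrap()` now accepts either a config file path or an in-memory `Config` instance ("dynamic" configuration). Below is a minimal sketch of the programmatic path; the connection, routing, and key values are illustrative assumptions, not taken from the package.

```python
import os

from lm_proxy.bootstrap import bootstrap
from lm_proxy.config import Config

config = Config(
    connections={
        "openai": {
            "api_type": "open_ai",
            "api_base": "https://api.openai.com/v1/",
            "api_key": os.environ["OPENAI_API_KEY"],
        },
    },
    routing={"*": "openai.*"},  # pass every model name through unchanged
    groups={"default": {"api_keys": ["local-dev-key"]}},
)

# Equivalent to bootstrap("config.toml"), but without reading a file;
# Env.init() then builds the async LLM function for each connection.
bootstrap(config)
```

Note that the `env:` prefix is only expanded by `Config.load()` when reading TOML files, so this programmatic sketch reads the environment directly.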
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/config.py

@@ -7,7 +7,7 @@ from typing import Union, Callable
 import tomllib
 import importlib.util
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ConfigDict
 from microcore.utils import resolve_callable
 
 
@@ -24,18 +24,17 @@ class Group(BaseModel):
 
 class Config(BaseModel):
     """Main configuration model matching config.toml structure."""
+    model_config = ConfigDict(extra="forbid")
     enabled: bool = True
     host: str = "0.0.0.0"
     port: int = 8000
     dev_autoreload: bool = False
     connections: dict[str, Union[dict, Callable]]
     routing: dict[str, str] = Field(default_factory=dict)
+    """ model_name_pattern* => connection_name.< model | * >, example: {"gpt-*": "oai.*"} """
     groups: dict[str, Group] = Field(default_factory=dict)
     check_api_key: Union[str, Callable] = Field(default="lm_proxy.core.check_api_key")
 
-    class Config:
-        extra = "forbid"
-
     def __init__(self, **data):
         super().__init__(**data)
         self.check_api_key = resolve_callable(self.check_api_key)

@@ -59,10 +58,11 @@ class Config(BaseModel):
             config_module = importlib.util.module_from_spec(spec)
             spec.loader.exec_module(config_module)
             return config_module.config
-
         elif config_path.endswith(".toml"):
             with open(config_path, "rb") as f:
                 config_data = tomllib.load(f)
+        else:
+            raise ValueError(f"Unsupported configuration file extension: {config_path}")
 
         # Process environment variables in api_key fields
         for conn_name, conn_config in config_data.get("connections", {}).items():
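`Config.load()` above also executes `*.py` configuration files and reads their `config` variable, and a `connections` entry may be an async callable instead of a dict (see the `iscoroutinefunction` check in bootstrap.py). A sketch of what such a Python configuration file could look like; the `echo_connection` below is a made-up stand-in, not part of the package.

```python
# dynamic_config.py, loaded via Config.load("dynamic_config.py")
from lm_proxy.config import Config


async def echo_connection(messages, **llm_params):
    """A toy 'LLM' that repeats the last message back to the client."""
    reply = f"echo: {messages[-1] if messages else ''}"
    callback = llm_params.get("callback")
    if callback is not None:
        await callback(reply)  # emit a single streaming chunk
    return reply


config = Config(
    connections={"echo": echo_connection},  # registered as-is by bootstrap
    routing={"*": "echo.*"},
    groups={"default": {"api_keys": ["local-dev-key"]}},
)
```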
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/core.py

@@ -1,204 +1,204 @@
The whole file is removed and re-added; the two versions are textually identical except for line 158, marked with - / + below. Full 0.2.2 content:

import asyncio
import fnmatch
import json
import logging
import secrets
import time
from typing import List, Optional

import microcore as mc
from fastapi import HTTPException
from pydantic import BaseModel
from starlette.requests import Request
from starlette.responses import JSONResponse, Response, StreamingResponse

from .bootstrap import env
from .config import Config, Group


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[mc.Msg]
    stream: Optional[bool] = None
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    n: Optional[int] = None
    stop: Optional[List[str]] = None
    presence_penalty: Optional[float] = None
    frequency_penalty: Optional[float] = None
    user: Optional[str] = None


def resolve_connection_and_model(config: Config, external_model: str) -> tuple[str, str]:
    for model_match, rule in config.routing.items():
        if fnmatch.fnmatchcase(external_model, model_match):
            connection_name, model_part = rule.split(".", 1)
            if connection_name not in config.connections:
                raise ValueError(
                    f"Routing selected unknown connection '{connection_name}'. "
                    f"Defined connections: {', '.join(config.connections.keys()) or '(none)'}"
                )

            resolved_model = external_model if model_part == "*" else model_part
            return connection_name, resolved_model

    raise ValueError(
        f"No routing rule matched model '{external_model}'. "
        "Add a catch-all rule like \"*\" = \"openai.gpt-3.5-turbo\" if desired."
    )


async def process_stream(async_llm_func, prompt, llm_params):
    queue = asyncio.Queue()
    stream_id = f"chatcmpl-{secrets.token_hex(12)}"
    created = int(time.time())

    async def callback(chunk):
        await queue.put(chunk)

    def make_chunk(delta=None, content=None, finish_reason=None, error=None) -> str:
        if delta is None:
            delta = dict(content=str(content)) if content is not None else dict()
        obj = {
            "id": stream_id,
            "object": "chat.completion.chunk",
            "created": created,
            "choices": [{"index": 0, "delta": delta}],
        }
        if error is not None:
            obj['error'] = {'message': str(error), 'type': type(error).__name__}
            if finish_reason is None:
                finish_reason = 'error'
        if finish_reason is not None:
            obj['choices'][0]['finish_reason'] = finish_reason
        return "data: " + json.dumps(obj) + "\n\n"

    task = asyncio.create_task(
        async_llm_func(prompt, **llm_params, callback=callback)
    )

    try:
        # Initial chunk: role
        yield make_chunk(delta={'role': 'assistant'})

        while not task.done():
            try:
                block = await asyncio.wait_for(queue.get(), timeout=0.1)
                yield make_chunk(content=block)
            except asyncio.TimeoutError:
                continue

        # Drain any remaining
        while not queue.empty():
            block = await queue.get()
            yield make_chunk(content=block)

    finally:
        try:
            await task
        except Exception as e:
            yield make_chunk(error={'message': str(e), 'type': type(e).__name__})

        # Final chunk: finish_reason
        yield make_chunk(finish_reason='stop')
        yield "data: [DONE]\n\n"


def read_api_key(request: Request) -> str:
    """
    Extracts the Bearer token from the Authorization header.
    returns '' if not present.
    """
    auth = request.headers.get("authorization")
    if auth and auth.lower().startswith("bearer "):
        return auth[7:].strip()
    return ""


def check_api_key(api_key: Optional[str]) -> Group:
    for group_name, group in env.config.groups.items():
        if api_key in group.api_keys:
            return group_name


async def chat_completions(request: ChatCompletionRequest, raw_request: Request) -> Response:
    """
    Endpoint for chat completions that mimics OpenAI's API structure.
    Streams the response from the LLM using microcore.
    """
    if not env.config.enabled:
        raise HTTPException(
            status_code=503,
            detail={
                "error": {
                    "message": "The service is disabled.",
                    "type": "service_unavailable",
                    "param": None,
                    "code": "service_disabled",
                }
            },
        )
    api_key = read_api_key(raw_request)
    group: str | bool | None = (env.config.check_api_key)(api_key)
    if not group:
        raise HTTPException(
            status_code=403,
            detail={
                "error": {
                    "message": "Incorrect API key provided: "
                               "your API key is invalid, expired, or revoked.",
                    "type": "invalid_request_error",
                    "param": None,
                    "code": "invalid_api_key",
                }
            },
        )

-    llm_params = request.
+    llm_params = request.model_dump(exclude={'messages'}, exclude_none=True)

    connection, llm_params["model"] = resolve_connection_and_model(
        env.config,
        llm_params.get("model", "default_model")
    )
    logging.debug(
        "Resolved routing for [%s] --> connection: %s, model: %s",
        request.model,
        connection,
        llm_params["model"]
    )

    if not env.config.groups[group].allows_connecting_to(connection):
        raise HTTPException(
            status_code=403,
            detail={
                "error": {
                    "message": f"Your API key does not allow using the '{connection}' connection.",
                    "type": "invalid_request_error",
                    "param": None,
                    "code": "connection_not_allowed",
                }
            },
        )

    async_llm_func = env.connections[connection]

    logging.info("Querying LLM... params: %s", llm_params)
    if request.stream:
        return StreamingResponse(
            process_stream(async_llm_func, request.messages, llm_params),
            media_type="text/event-stream"
        )
    out = await async_llm_func(request.messages, **llm_params)
    logging.info("LLM response: %s", out)
    return JSONResponse(
        {
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": str(out)},
                    "finish_reason": "stop"
                }
            ]
        }
    )
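The changed line builds `llm_params` with Pydantic v2's `model_dump(exclude={'messages'}, exclude_none=True)`: unset options are dropped and the rest is forwarded to the connection as keyword arguments. A small sketch of what it produces for a typical request, using the `ChatCompletionRequest` model shown above; the values are illustrative.

```python
from lm_proxy.core import ChatCompletionRequest

request = ChatCompletionRequest(
    model="gpt-5",
    messages=[{"role": "user", "content": "Hello"}],
    temperature=0.7,
)
llm_params = request.model_dump(exclude={'messages'}, exclude_none=True)
print(llm_params)  # {'model': 'gpt-5', 'temperature': 0.7}
```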
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "inference-proxy"
-version = "0.2.
+version = "0.2.2"
 description = "\"Inference Proxy\" is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc."
 readme = "README.md"
 keywords = ["llm", "large language models", "ai", "gpt", "openai", "proxy", "http", "proxy-server"]

@@ -15,7 +15,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
 ]
 dependencies = [
-    "ai-microcore~=4.3
+    "ai-microcore~=4.4.3",
     "fastapi~=0.116.1",
     "uvicorn>=0.22.0",
     "typer>=0.16.1",

@@ -47,4 +47,3 @@ pytest = "^7.4.3"
 
 [tool.poetry.scripts]
 inference-proxy = "lm_proxy.app:cli_app"
-lm_proxy = "lm_proxy.app:cli_app"
inference_proxy-0.2.0/PKG-INFO
DELETED

@@ -1,81 +0,0 @@
Metadata-Version: 2.3
Name: inference-proxy
Version: 0.2.0
Summary: "Inference Proxy" is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc.
License: MIT License
(MIT license text identical to the license block in the 0.2.2 PKG-INFO above)
Keywords: llm,large language models,ai,gpt,openai,proxy,http,proxy-server
Author: Vitalii Stepanenko
Author-email: mail@vitalii.in
Maintainer: Vitalii Stepanenko
Maintainer-email: mail@vitalii.in
Requires-Python: >=3.10,<4
Classifier: Intended Audience :: Developers
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: License :: OSI Approved :: MIT License
Requires-Dist: ai-microcore (>=4.3.2,<4.4.0)
Requires-Dist: fastapi (>=0.116.1,<0.117.0)
Requires-Dist: typer (>=0.16.1)
Requires-Dist: uvicorn (>=0.22.0)
Project-URL: Source Code, https://github.com/Nayjest/lm-proxy
Description-Content-Type: text/markdown

<p align="right">
    <a href="https://pypi.org/project/lm-proxy/" target="_blank"><img src="https://badge.fury.io/py/lm-proxy.svg" alt="PYPI Release"></a>
    <a href="https://github.com/Nayjest/lm-proxy/actions/workflows/code-style.yml" target="_blank"><img src="https://github.com/Nayjest/lm-proxy/actions/workflows/code-style.yml/badge.svg" alt="Code Style"></a>
    <a href="https://github.com/Nayjest/lm-proxy/actions/workflows/tests.yml" target="_blank"><img src="https://github.com/Nayjest/lm-proxy/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
    <a href="https://github.com/Nayjest/lm-proxy/blob/main/LICENSE" target="_blank"><img src="https://img.shields.io/static/v1?label=license&message=MIT&color=d08aff" alt="License"></a>
</p>

# Inference Proxy

**Inference Proxy** is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc.

**Development Status**: bookmark it and go away, it is still in early development.

## ✨ Features

- @todo


## 🚀 Quickstart
```bash
# Install Inference Proxy via pip
pip install inference-proxy

```

## 🤝 Contributing

We ❤️ contributions! See [CONTRIBUTING.md](CONTRIBUTING.md).

## 📝 License

Licensed under the [MIT License](LICENSE).

© 2022—2025 [Vitalii Stepanenko](mailto:mail@vitaliy.in)
inference_proxy-0.2.0/README.md
DELETED

@@ -1,34 +0,0 @@
(The removed README.md is identical to the Markdown description embedded in the deleted 0.2.0 PKG-INFO above, from the badge block through "© 2022—2025 Vitalii Stepanenko".)
inference_proxy-0.2.0/lm_proxy/bootstrap.py
DELETED

@@ -1,70 +0,0 @@
import sys
import logging
import inspect
from datetime import datetime


import microcore as mc
from microcore import ui
from microcore.configuration import get_bool_from_env
from dotenv import load_dotenv

from .config import Config


def setup_logging(log_level: int = logging.INFO):
    class CustomFormatter(logging.Formatter):
        def format(self, record):
            dt = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
            message, level_name = record.getMessage(), record.levelname
            if record.levelno == logging.WARNING:
                message = mc.ui.yellow(message)
                level_name = mc.ui.yellow(level_name)
            if record.levelno >= logging.ERROR:
                message = mc.ui.red(message)
                level_name = mc.ui.red(level_name)
            return f"{dt} {level_name}: {message}"

    handler = logging.StreamHandler()
    handler.setFormatter(CustomFormatter())
    logging.basicConfig(level=log_level, handlers=[handler])


class Env:
    config: Config
    connections: dict[str, mc.types.LLMAsyncFunctionType]
    debug: bool


env = Env()


def bootstrap(config_file: str = 'config.toml'):
    load_dotenv('.env', override=True)
    env.debug = '--debug' in sys.argv or get_bool_from_env('LM_PROXY_DEBUG', False)
    setup_logging(logging.DEBUG if env.debug else logging.INFO)
    logging.info(
        f"Bootstrapping {ui.yellow('lm_proxy')} "
        f"using configuration: {ui.blue(config_file)} "
        f"{'[DEBUG: ON]' if env.debug else ''}..."
    )

    env.config = Config.load(config_file)
    env.connections = dict()

    for conn_name, conn_config in env.config.connections.items():
        logging.info(f"Initializing '{conn_name}' connection...")
        try:
            if inspect.iscoroutinefunction(conn_config):
                env.connections[conn_name] = conn_config
            else:
                mc.configure(
                    **conn_config,
                    EMBEDDING_DB_TYPE=mc.EmbeddingDbType.NONE
                )
                env.connections[conn_name] = mc.env().llm_async_function
        except mc.LLMConfigError as e:
            raise ValueError(f"Error in configuration for connection '{conn_name}': {e}")

    logging.info(f"Done initializing {len(env.connections)} connections.")
    mc.logging.LoggingConfig.OUTPUT_METHOD = logging.info
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/LICENSE: file without changes
{inference_proxy-0.2.0 → inference_proxy-0.2.2}/lm_proxy/__init__.py: file without changes