inference-proxy 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/PKG-INFO +46 -1
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/README.md +45 -0
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/app.py +4 -2
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/bootstrap.py +7 -6
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/config.py +14 -1
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/core.py +70 -25
- inference_proxy-0.3.0/lm_proxy/loggers/__init__.py +11 -0
- inference_proxy-0.3.0/lm_proxy/loggers/base_logger.py +56 -0
- inference_proxy-0.3.0/lm_proxy/loggers/core.py +53 -0
- inference_proxy-0.3.0/lm_proxy/loggers/log_writers.py +24 -0
- inference_proxy-0.3.0/lm_proxy/utils.py +73 -0
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/pyproject.toml +6 -2
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/LICENSE +0 -0
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/__init__.py +0 -0
- {inference_proxy-0.2.2 → inference_proxy-0.3.0}/lm_proxy/__main__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: inference-proxy
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: "Inference Proxy" is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc.
|
|
5
5
|
License: MIT License
|
|
6
6
|
|
|
@@ -158,6 +158,10 @@ api_key = "env:OPENAI_API_KEY"
|
|
|
158
158
|
api_type = "google_ai_studio"
|
|
159
159
|
api_key = "env:GOOGLE_API_KEY"
|
|
160
160
|
|
|
161
|
+
[connections.anthropic]
|
|
162
|
+
api_type = "anthropic"
|
|
163
|
+
api_key = "env:ANTHROPIC_API_KEY"
|
|
164
|
+
|
|
161
165
|
# Routing rules (model_pattern = "connection.model")
|
|
162
166
|
[routing]
|
|
163
167
|
"gpt*" = "openai.*" # Route all GPT models to OpenAI
|
|
@@ -171,6 +175,25 @@ api_keys = [
|
|
|
171
175
|
"KEY1",
|
|
172
176
|
"KEY2"
|
|
173
177
|
]
|
|
178
|
+
|
|
179
|
+
# optional
|
|
180
|
+
[[loggers]]
|
|
181
|
+
class = 'lm_proxy.loggers.BaseLogger'
|
|
182
|
+
[loggers.log_writer]
|
|
183
|
+
class = 'lm_proxy.loggers.log_writers.JsonLogWriter'
|
|
184
|
+
file_name = 'storage/json.log'
|
|
185
|
+
[loggers.entry_transformer]
|
|
186
|
+
class = 'lm_proxy.loggers.LogEntryTransformer'
|
|
187
|
+
completion_tokens = "response.usage.completion_tokens"
|
|
188
|
+
prompt_tokens = "response.usage.prompt_tokens"
|
|
189
|
+
prompt = "request.messages"
|
|
190
|
+
response = "response"
|
|
191
|
+
group = "group"
|
|
192
|
+
connection = "connection"
|
|
193
|
+
api_key_id = "api_key_id"
|
|
194
|
+
remote_addr = "remote_addr"
|
|
195
|
+
created_at = "created_at"
|
|
196
|
+
duration = "duration"
|
|
174
197
|
```
|
|
175
198
|
|
|
176
199
|
### Environment Variables
|
|
@@ -184,6 +207,28 @@ api_key = "env:OPENAI_API_KEY"
|
|
|
184
207
|
|
|
185
208
|
Load these from a `.env` file or set them in your environment before starting the server.
|
|
186
209
|
|
|
210
|
+
|
|
211
|
+
## 🔑 Proxy API Keys vs. Provider API Keys
|
|
212
|
+
|
|
213
|
+
Inference Proxy utilizes two distinct types of API keys to facilitate secure and efficient request handling.
|
|
214
|
+
|
|
215
|
+
- **Proxy API Key (Virtual API Key, Client API Key):**
|
|
216
|
+
A unique key generated and managed within the Inference Proxy.
|
|
217
|
+
Clients use these keys to authenticate their requests to the proxy's API endpoints.
|
|
218
|
+
Each Client API Key is associated with a specific group, which defines the scope of access and permissions for the client's requests.
|
|
219
|
+
These keys allow users to securely interact with the proxy without direct access to external service credentials.
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
- **Provider API Key (Upstream API Key):**
|
|
224
|
+
A key provided by external LLM inference providers (e.g., OpenAI, Anthropic, Mistral, etc.) and configured within the Inference Proxy.
|
|
225
|
+
The proxy uses these keys to authenticate and forward validated client requests to the respective external services.
|
|
226
|
+
Provider API Keys remain hidden from end users, ensuring secure and transparent communication with provider APIs.
|
|
227
|
+
|
|
228
|
+
This distinction ensures a clear separation of concerns:
|
|
229
|
+
Virtual API Keys manage user authentication and access within the proxy,
|
|
230
|
+
while Upstream API Keys handle secure communication with external providers.
|
|
231
|
+
|
|
187
232
|
## 🔌 API Usage
|
|
188
233
|
|
|
189
234
|
Inference Proxy implements the OpenAI chat completions API endpoint. You can use any OpenAI-compatible client to interact with it.
|
|
@@ -112,6 +112,10 @@ api_key = "env:OPENAI_API_KEY"
|
|
|
112
112
|
api_type = "google_ai_studio"
|
|
113
113
|
api_key = "env:GOOGLE_API_KEY"
|
|
114
114
|
|
|
115
|
+
[connections.anthropic]
|
|
116
|
+
api_type = "anthropic"
|
|
117
|
+
api_key = "env:ANTHROPIC_API_KEY"
|
|
118
|
+
|
|
115
119
|
# Routing rules (model_pattern = "connection.model")
|
|
116
120
|
[routing]
|
|
117
121
|
"gpt*" = "openai.*" # Route all GPT models to OpenAI
|
|
@@ -125,6 +129,25 @@ api_keys = [
|
|
|
125
129
|
"KEY1",
|
|
126
130
|
"KEY2"
|
|
127
131
|
]
|
|
132
|
+
|
|
133
|
+
# optional
|
|
134
|
+
[[loggers]]
|
|
135
|
+
class = 'lm_proxy.loggers.BaseLogger'
|
|
136
|
+
[loggers.log_writer]
|
|
137
|
+
class = 'lm_proxy.loggers.log_writers.JsonLogWriter'
|
|
138
|
+
file_name = 'storage/json.log'
|
|
139
|
+
[loggers.entry_transformer]
|
|
140
|
+
class = 'lm_proxy.loggers.LogEntryTransformer'
|
|
141
|
+
completion_tokens = "response.usage.completion_tokens"
|
|
142
|
+
prompt_tokens = "response.usage.prompt_tokens"
|
|
143
|
+
prompt = "request.messages"
|
|
144
|
+
response = "response"
|
|
145
|
+
group = "group"
|
|
146
|
+
connection = "connection"
|
|
147
|
+
api_key_id = "api_key_id"
|
|
148
|
+
remote_addr = "remote_addr"
|
|
149
|
+
created_at = "created_at"
|
|
150
|
+
duration = "duration"
|
|
128
151
|
```
|
|
129
152
|
|
|
130
153
|
### Environment Variables
|
|
@@ -138,6 +161,28 @@ api_key = "env:OPENAI_API_KEY"
|
|
|
138
161
|
|
|
139
162
|
Load these from a `.env` file or set them in your environment before starting the server.
|
|
140
163
|
|
|
164
|
+
|
|
165
|
+
## 🔑 Proxy API Keys vs. Provider API Keys
|
|
166
|
+
|
|
167
|
+
Inference Proxy utilizes two distinct types of API keys to facilitate secure and efficient request handling.
|
|
168
|
+
|
|
169
|
+
- **Proxy API Key (Virtual API Key, Client API Key):**
|
|
170
|
+
A unique key generated and managed within the Inference Proxy.
|
|
171
|
+
Clients use these keys to authenticate their requests to the proxy's API endpoints.
|
|
172
|
+
Each Client API Key is associated with a specific group, which defines the scope of access and permissions for the client's requests.
|
|
173
|
+
These keys allow users to securely interact with the proxy without direct access to external service credentials.
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
- **Provider API Key (Upstream API Key):**
|
|
178
|
+
A key provided by external LLM inference providers (e.g., OpenAI, Anthropic, Mistral, etc.) and configured within the Inference Proxy.
|
|
179
|
+
The proxy uses these keys to authenticate and forward validated client requests to the respective external services.
|
|
180
|
+
Provider API Keys remain hidden from end users, ensuring secure and transparent communication with provider APIs.
|
|
181
|
+
|
|
182
|
+
This distinction ensures a clear separation of concerns:
|
|
183
|
+
Virtual API Keys manage user authentication and access within the proxy,
|
|
184
|
+
while Upstream API Keys handle secure communication with external providers.
|
|
185
|
+
|
|
141
186
|
## 🔌 API Usage
|
|
142
187
|
|
|
143
188
|
Inference Proxy implements the OpenAI chat completions API endpoint. You can use any OpenAI-compatible client to interact with it.
|
|
@@ -14,7 +14,7 @@ def run_server(
|
|
|
14
14
|
config: str = typer.Option(None, help="Path to the configuration file"),
|
|
15
15
|
debug: bool = typer.Option(False, help="Enable debug mode (more verbose logging)"),
|
|
16
16
|
):
|
|
17
|
-
bootstrap(config or
|
|
17
|
+
bootstrap(config or "config.toml")
|
|
18
18
|
uvicorn.run(
|
|
19
19
|
"lm_proxy.app:web_app",
|
|
20
20
|
host=env.config.host,
|
|
@@ -25,7 +25,9 @@ def run_server(
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def web_app():
|
|
28
|
-
app = FastAPI(
|
|
28
|
+
app = FastAPI(
|
|
29
|
+
title="LM-Proxy", description="OpenAI-compatible proxy server for LLM inference"
|
|
30
|
+
)
|
|
29
31
|
app.add_api_route(
|
|
30
32
|
path="/v1/chat/completions",
|
|
31
33
|
endpoint=chat_completions,
|
|
@@ -55,12 +55,13 @@ class Env:
|
|
|
55
55
|
env.connections[conn_name] = conn_config
|
|
56
56
|
else:
|
|
57
57
|
mc.configure(
|
|
58
|
-
**conn_config,
|
|
59
|
-
EMBEDDING_DB_TYPE=mc.EmbeddingDbType.NONE
|
|
58
|
+
**conn_config, EMBEDDING_DB_TYPE=mc.EmbeddingDbType.NONE
|
|
60
59
|
)
|
|
61
60
|
env.connections[conn_name] = mc.env().llm_async_function
|
|
62
61
|
except mc.LLMConfigError as e:
|
|
63
|
-
raise ValueError(
|
|
62
|
+
raise ValueError(
|
|
63
|
+
f"Error in configuration for connection '{conn_name}': {e}"
|
|
64
|
+
)
|
|
64
65
|
|
|
65
66
|
logging.info(f"Done initializing {len(env.connections)} connections.")
|
|
66
67
|
|
|
@@ -68,9 +69,9 @@ class Env:
|
|
|
68
69
|
env = Env()
|
|
69
70
|
|
|
70
71
|
|
|
71
|
-
def bootstrap(config: str | Config =
|
|
72
|
-
load_dotenv(
|
|
73
|
-
debug =
|
|
72
|
+
def bootstrap(config: str | Config = "config.toml"):
|
|
73
|
+
load_dotenv(".env", override=True)
|
|
74
|
+
debug = "--debug" in sys.argv or get_bool_from_env("LM_PROXY_DEBUG", False)
|
|
74
75
|
setup_logging(logging.DEBUG if debug else logging.INFO)
|
|
75
76
|
mc.logging.LoggingConfig.OUTPUT_METHOD = logging.info
|
|
76
77
|
logging.info(
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Configuration models for LM-Proxy settings.
|
|
3
3
|
This module defines Pydantic models that match the structure of config.toml.
|
|
4
4
|
"""
|
|
5
|
+
|
|
5
6
|
import os
|
|
6
7
|
from typing import Union, Callable
|
|
7
8
|
import tomllib
|
|
@@ -10,6 +11,8 @@ import importlib.util
|
|
|
10
11
|
from pydantic import BaseModel, Field, ConfigDict
|
|
11
12
|
from microcore.utils import resolve_callable
|
|
12
13
|
|
|
14
|
+
from .utils import resolve_instance_or_callable
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
class Group(BaseModel):
|
|
15
18
|
api_keys: list[str] = Field(default_factory=list)
|
|
@@ -24,20 +27,30 @@ class Group(BaseModel):
|
|
|
24
27
|
|
|
25
28
|
class Config(BaseModel):
|
|
26
29
|
"""Main configuration model matching config.toml structure."""
|
|
30
|
+
|
|
27
31
|
model_config = ConfigDict(extra="forbid")
|
|
28
32
|
enabled: bool = True
|
|
29
33
|
host: str = "0.0.0.0"
|
|
30
34
|
port: int = 8000
|
|
31
35
|
dev_autoreload: bool = False
|
|
32
|
-
connections: dict[str, Union[dict, Callable]]
|
|
36
|
+
connections: dict[str, Union[dict, Callable]] = Field(
|
|
37
|
+
..., # Required field (no default)
|
|
38
|
+
description="Dictionary of connection configurations",
|
|
39
|
+
examples=[{"openai": {"api_key": "sk-..."}}],
|
|
40
|
+
)
|
|
33
41
|
routing: dict[str, str] = Field(default_factory=dict)
|
|
34
42
|
""" model_name_pattern* => connection_name.< model | * >, example: {"gpt-*": "oai.*"} """
|
|
35
43
|
groups: dict[str, Group] = Field(default_factory=dict)
|
|
36
44
|
check_api_key: Union[str, Callable] = Field(default="lm_proxy.core.check_api_key")
|
|
45
|
+
loggers: list[Union[str, Callable, dict]] = Field(default_factory=list)
|
|
46
|
+
encryption_key: str = Field(
|
|
47
|
+
default="Eclipse", description="Key for encrypting sensitive data"
|
|
48
|
+
)
|
|
37
49
|
|
|
38
50
|
def __init__(self, **data):
|
|
39
51
|
super().__init__(**data)
|
|
40
52
|
self.check_api_key = resolve_callable(self.check_api_key)
|
|
53
|
+
self.loggers = [resolve_instance_or_callable(logger) for logger in self.loggers]
|
|
41
54
|
if not self.groups:
|
|
42
55
|
# Default group with no restrictions
|
|
43
56
|
self.groups = {"default": Group()}
|
|
@@ -4,16 +4,20 @@ import json
|
|
|
4
4
|
import logging
|
|
5
5
|
import secrets
|
|
6
6
|
import time
|
|
7
|
+
import hashlib
|
|
7
8
|
from typing import List, Optional
|
|
8
9
|
|
|
9
10
|
import microcore as mc
|
|
10
11
|
from fastapi import HTTPException
|
|
12
|
+
from lm_proxy.loggers import LogEntry
|
|
11
13
|
from pydantic import BaseModel
|
|
12
14
|
from starlette.requests import Request
|
|
13
15
|
from starlette.responses import JSONResponse, Response, StreamingResponse
|
|
14
16
|
|
|
15
17
|
from .bootstrap import env
|
|
16
18
|
from .config import Config, Group
|
|
19
|
+
from .loggers import log_non_blocking
|
|
20
|
+
from .utils import get_client_ip
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class ChatCompletionRequest(BaseModel):
|
|
@@ -30,7 +34,9 @@ class ChatCompletionRequest(BaseModel):
|
|
|
30
34
|
user: Optional[str] = None
|
|
31
35
|
|
|
32
36
|
|
|
33
|
-
def resolve_connection_and_model(config: Config, external_model: str) -> tuple[str, str]:
|
|
37
|
+
def resolve_connection_and_model(
|
|
38
|
+
config: Config, external_model: str
|
|
39
|
+
) -> tuple[str, str]:
|
|
34
40
|
for model_match, rule in config.routing.items():
|
|
35
41
|
if fnmatch.fnmatchcase(external_model, model_match):
|
|
36
42
|
connection_name, model_part = rule.split(".", 1)
|
|
@@ -45,11 +51,14 @@ def resolve_connection_and_model(config: Config, external_model: str) -> tuple[s
|
|
|
45
51
|
|
|
46
52
|
raise ValueError(
|
|
47
53
|
f"No routing rule matched model '{external_model}'. "
|
|
48
|
-
|
|
54
|
+
'Add a catch-all rule like "*" = "openai.gpt-3.5-turbo" if desired.'
|
|
49
55
|
)
|
|
50
56
|
|
|
51
57
|
|
|
52
|
-
async def process_stream(async_llm_func, prompt, llm_params):
|
|
58
|
+
async def process_stream(
|
|
59
|
+
async_llm_func, request: ChatCompletionRequest, llm_params, log_entry: LogEntry
|
|
60
|
+
):
|
|
61
|
+
prompt = request.messages
|
|
53
62
|
queue = asyncio.Queue()
|
|
54
63
|
stream_id = f"chatcmpl-{secrets.token_hex(12)}"
|
|
55
64
|
created = int(time.time())
|
|
@@ -67,20 +76,18 @@ async def process_stream(async_llm_func, prompt, llm_params):
|
|
|
67
76
|
"choices": [{"index": 0, "delta": delta}],
|
|
68
77
|
}
|
|
69
78
|
if error is not None:
|
|
70
|
-
obj[
|
|
79
|
+
obj["error"] = {"message": str(error), "type": type(error).__name__}
|
|
71
80
|
if finish_reason is None:
|
|
72
|
-
finish_reason =
|
|
81
|
+
finish_reason = "error"
|
|
73
82
|
if finish_reason is not None:
|
|
74
|
-
obj[
|
|
83
|
+
obj["choices"][0]["finish_reason"] = finish_reason
|
|
75
84
|
return "data: " + json.dumps(obj) + "\n\n"
|
|
76
85
|
|
|
77
|
-
task = asyncio.create_task(
|
|
78
|
-
async_llm_func(prompt, **llm_params, callback=callback)
|
|
79
|
-
)
|
|
86
|
+
task = asyncio.create_task(async_llm_func(prompt, **llm_params, callback=callback))
|
|
80
87
|
|
|
81
88
|
try:
|
|
82
89
|
# Initial chunk: role
|
|
83
|
-
yield make_chunk(delta={
|
|
90
|
+
yield make_chunk(delta={"role": "assistant"})
|
|
84
91
|
|
|
85
92
|
while not task.done():
|
|
86
93
|
try:
|
|
@@ -96,13 +103,16 @@ async def process_stream(async_llm_func, prompt, llm_params):
|
|
|
96
103
|
|
|
97
104
|
finally:
|
|
98
105
|
try:
|
|
99
|
-
await task
|
|
106
|
+
result = await task
|
|
107
|
+
log_entry.response = result
|
|
100
108
|
except Exception as e:
|
|
101
|
-
|
|
109
|
+
log_entry.error = e
|
|
110
|
+
yield make_chunk(error={"message": str(e), "type": type(e).__name__})
|
|
102
111
|
|
|
103
112
|
# Final chunk: finish_reason
|
|
104
|
-
yield make_chunk(finish_reason=
|
|
113
|
+
yield make_chunk(finish_reason="stop")
|
|
105
114
|
yield "data: [DONE]\n\n"
|
|
115
|
+
await log_non_blocking(log_entry)
|
|
106
116
|
|
|
107
117
|
|
|
108
118
|
def read_api_key(request: Request) -> str:
|
|
@@ -116,13 +126,33 @@ def read_api_key(request: Request) -> str:
|
|
|
116
126
|
return ""
|
|
117
127
|
|
|
118
128
|
|
|
119
|
-
def check_api_key(api_key: Optional[str]) -> Group:
|
|
129
|
+
def check_api_key(api_key: Optional[str]) -> Optional[Group]:
|
|
130
|
+
"""
|
|
131
|
+
Validates an Client API key against configured groups and returns the matching group.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
api_key (Optional[str]): The Virtual / Client API key to validate.
|
|
135
|
+
Returns:
|
|
136
|
+
Optional[Group]: The Group object if the API key is valid and found in a group,
|
|
137
|
+
None otherwise.
|
|
138
|
+
"""
|
|
120
139
|
for group_name, group in env.config.groups.items():
|
|
121
140
|
if api_key in group.api_keys:
|
|
122
141
|
return group_name
|
|
142
|
+
return None
|
|
143
|
+
|
|
123
144
|
|
|
145
|
+
def api_key_id(api_key: Optional[str]) -> str | None:
|
|
146
|
+
if not api_key:
|
|
147
|
+
return None
|
|
148
|
+
return hashlib.md5(
|
|
149
|
+
(api_key + env.config.encryption_key).encode("utf-8")
|
|
150
|
+
).hexdigest()
|
|
124
151
|
|
|
125
|
-
|
|
152
|
+
|
|
153
|
+
async def chat_completions(
|
|
154
|
+
request: ChatCompletionRequest, raw_request: Request
|
|
155
|
+
) -> Response:
|
|
126
156
|
"""
|
|
127
157
|
Endpoint for chat completions that mimics OpenAI's API structure.
|
|
128
158
|
Streams the response from the LLM using microcore.
|
|
@@ -141,13 +171,19 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
|
|
|
141
171
|
)
|
|
142
172
|
api_key = read_api_key(raw_request)
|
|
143
173
|
group: str | bool | None = (env.config.check_api_key)(api_key)
|
|
174
|
+
log_entry = LogEntry(
|
|
175
|
+
request=request,
|
|
176
|
+
api_key_id=api_key_id(api_key),
|
|
177
|
+
group=group if isinstance(group, str) else None,
|
|
178
|
+
remote_addr=get_client_ip(raw_request),
|
|
179
|
+
)
|
|
144
180
|
if not group:
|
|
145
181
|
raise HTTPException(
|
|
146
182
|
status_code=403,
|
|
147
183
|
detail={
|
|
148
184
|
"error": {
|
|
149
185
|
"message": "Incorrect API key provided: "
|
|
150
|
-
|
|
186
|
+
"your API key is invalid, expired, or revoked.",
|
|
151
187
|
"type": "invalid_request_error",
|
|
152
188
|
"param": None,
|
|
153
189
|
"code": "invalid_api_key",
|
|
@@ -155,17 +191,17 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
|
|
|
155
191
|
},
|
|
156
192
|
)
|
|
157
193
|
|
|
158
|
-
llm_params = request.model_dump(exclude={
|
|
194
|
+
llm_params = request.model_dump(exclude={"messages"}, exclude_none=True)
|
|
159
195
|
|
|
160
196
|
connection, llm_params["model"] = resolve_connection_and_model(
|
|
161
|
-
env.config,
|
|
162
|
-
llm_params.get("model", "default_model")
|
|
197
|
+
env.config, llm_params.get("model", "default_model")
|
|
163
198
|
)
|
|
199
|
+
log_entry.connection = connection
|
|
164
200
|
logging.debug(
|
|
165
201
|
"Resolved routing for [%s] --> connection: %s, model: %s",
|
|
166
202
|
request.model,
|
|
167
203
|
connection,
|
|
168
|
-
llm_params["model"]
|
|
204
|
+
llm_params["model"],
|
|
169
205
|
)
|
|
170
206
|
|
|
171
207
|
if not env.config.groups[group].allows_connecting_to(connection):
|
|
@@ -186,18 +222,27 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
|
|
|
186
222
|
logging.info("Querying LLM... params: %s", llm_params)
|
|
187
223
|
if request.stream:
|
|
188
224
|
return StreamingResponse(
|
|
189
|
-
process_stream(async_llm_func, request
|
|
190
|
-
media_type="text/event-stream"
|
|
225
|
+
process_stream(async_llm_func, request, llm_params, log_entry),
|
|
226
|
+
media_type="text/event-stream",
|
|
191
227
|
)
|
|
192
|
-
|
|
193
|
-
|
|
228
|
+
|
|
229
|
+
try:
|
|
230
|
+
out = await async_llm_func(request.messages, **llm_params)
|
|
231
|
+
log_entry.response = out
|
|
232
|
+
logging.info("LLM response: %s", out)
|
|
233
|
+
except Exception as e:
|
|
234
|
+
log_entry.error = e
|
|
235
|
+
await log_non_blocking(log_entry)
|
|
236
|
+
raise
|
|
237
|
+
await log_non_blocking(log_entry)
|
|
238
|
+
|
|
194
239
|
return JSONResponse(
|
|
195
240
|
{
|
|
196
241
|
"choices": [
|
|
197
242
|
{
|
|
198
243
|
"index": 0,
|
|
199
244
|
"message": {"role": "assistant", "content": str(out)},
|
|
200
|
-
"finish_reason": "stop"
|
|
245
|
+
"finish_reason": "stop",
|
|
201
246
|
}
|
|
202
247
|
]
|
|
203
248
|
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .base_logger import BaseLogger, LogEntryTransformer
|
|
2
|
+
from .log_writers import JsonLogWriter
|
|
3
|
+
from .core import LogEntry, log_non_blocking
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"BaseLogger",
|
|
7
|
+
"LogEntryTransformer",
|
|
8
|
+
"JsonLogWriter",
|
|
9
|
+
"LogEntry",
|
|
10
|
+
"log_non_blocking",
|
|
11
|
+
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
|
|
4
|
+
from lm_proxy.utils import resolve_instance_or_callable
|
|
5
|
+
|
|
6
|
+
from ..utils import resolve_obj_path
|
|
7
|
+
from .core import LogEntry
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AbstractLogEntryTransformer(abc.ABC):
|
|
11
|
+
@abc.abstractmethod
|
|
12
|
+
def __call__(self, log_entry: LogEntry) -> dict:
|
|
13
|
+
raise NotImplementedError()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LogEntryTransformer(AbstractLogEntryTransformer):
|
|
17
|
+
def __init__(self, **kwargs):
|
|
18
|
+
self.mapping = kwargs
|
|
19
|
+
|
|
20
|
+
def __call__(self, log_entry: LogEntry) -> dict:
|
|
21
|
+
result = {}
|
|
22
|
+
for key, path in self.mapping.items():
|
|
23
|
+
result[key] = resolve_obj_path(log_entry, path)
|
|
24
|
+
return result
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AbstractLogWriter(abc.ABC):
|
|
28
|
+
@abc.abstractmethod
|
|
29
|
+
def __call__(self, logged_data: dict) -> dict:
|
|
30
|
+
raise NotImplementedError()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
|
|
34
|
+
class BaseLogger:
|
|
35
|
+
log_writer: AbstractLogWriter | str | dict
|
|
36
|
+
entry_transformer: AbstractLogEntryTransformer | str | dict = field(default=None)
|
|
37
|
+
|
|
38
|
+
def __post_init__(self):
|
|
39
|
+
self.entry_transformer = resolve_instance_or_callable(
|
|
40
|
+
self.entry_transformer,
|
|
41
|
+
debug_name="logging.<logger>.entry_transformer",
|
|
42
|
+
)
|
|
43
|
+
self.log_writer = resolve_instance_or_callable(
|
|
44
|
+
self.log_writer,
|
|
45
|
+
debug_name="logging.<logger>.log_writer",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
def _transform(self, log_entry: LogEntry) -> dict:
|
|
49
|
+
return (
|
|
50
|
+
self.entry_transformer(log_entry)
|
|
51
|
+
if self.entry_transformer
|
|
52
|
+
else log_entry.to_dict()
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
def __call__(self, log_entry: LogEntry):
|
|
56
|
+
self.log_writer(self._transform(log_entry))
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Optional, TYPE_CHECKING
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
import microcore as mc
|
|
8
|
+
from ..bootstrap import env
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from lm_proxy.core import ChatCompletionRequest, Group
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
|
|
15
|
+
class LogEntry:
|
|
16
|
+
request: "ChatCompletionRequest" = field()
|
|
17
|
+
response: Optional[mc.LLMResponse] = field(default=None)
|
|
18
|
+
error: Optional[Exception] = field(default=None)
|
|
19
|
+
group: "Group" = field(default=None)
|
|
20
|
+
connection: str = field(default=None)
|
|
21
|
+
api_key_id: Optional[str] = field(default=None)
|
|
22
|
+
remote_addr: Optional[str] = field(default=None)
|
|
23
|
+
created_at: Optional[datetime] = field(default_factory=datetime.now)
|
|
24
|
+
duration: Optional[float] = field(default=None)
|
|
25
|
+
|
|
26
|
+
def to_dict(self) -> dict:
|
|
27
|
+
data = self.__dict__.copy()
|
|
28
|
+
if self.request:
|
|
29
|
+
data["request"] = self.request.model_dump(mode="json")
|
|
30
|
+
return data
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def log(log_entry: LogEntry):
|
|
34
|
+
if log_entry.duration is None and log_entry.created_at:
|
|
35
|
+
log_entry.duration = (datetime.now() - log_entry.created_at).total_seconds()
|
|
36
|
+
for handler in env.config.loggers:
|
|
37
|
+
# check if it is async, then run both sync and async loggers in non-blocking way (sync too)
|
|
38
|
+
if asyncio.iscoroutinefunction(handler):
|
|
39
|
+
asyncio.create_task(handler(log_entry))
|
|
40
|
+
else:
|
|
41
|
+
try:
|
|
42
|
+
handler(log_entry)
|
|
43
|
+
except Exception as e:
|
|
44
|
+
logging.error("Error in logger handler: %s", e)
|
|
45
|
+
raise e
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def log_non_blocking(
|
|
49
|
+
log_entry: LogEntry,
|
|
50
|
+
) -> Optional[asyncio.Task]:
|
|
51
|
+
if env.config.loggers:
|
|
52
|
+
task = asyncio.create_task(log(log_entry))
|
|
53
|
+
return task
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from .base_logger import AbstractLogWriter
|
|
6
|
+
from ..utils import CustomJsonEncoder
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class JsonLogWriter(AbstractLogWriter):
|
|
11
|
+
|
|
12
|
+
file_name: str
|
|
13
|
+
|
|
14
|
+
def __post_init__(self):
|
|
15
|
+
dir_path = os.path.dirname(self.file_name)
|
|
16
|
+
if dir_path:
|
|
17
|
+
os.makedirs(dir_path, exist_ok=True)
|
|
18
|
+
# Create the file if it doesn't exist
|
|
19
|
+
with open(self.file_name, "a", encoding="utf-8"):
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
def __call__(self, logged_data: dict):
|
|
23
|
+
with open(self.file_name, "a", encoding="utf-8") as f:
|
|
24
|
+
f.write(json.dumps(logged_data, cls=CustomJsonEncoder) + "\n")
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import inspect
|
|
3
|
+
from typing import Union, Callable
|
|
4
|
+
from datetime import datetime, date, time
|
|
5
|
+
|
|
6
|
+
from microcore.utils import resolve_callable
|
|
7
|
+
from starlette.requests import Request
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def resolve_obj_path(obj, path: str, default=None):
|
|
11
|
+
"""Resolves dotted path supporting both attributes and dict keys."""
|
|
12
|
+
for part in path.split("."):
|
|
13
|
+
try:
|
|
14
|
+
if isinstance(obj, dict):
|
|
15
|
+
obj = obj[part]
|
|
16
|
+
else:
|
|
17
|
+
obj = getattr(obj, part)
|
|
18
|
+
except (AttributeError, KeyError, TypeError):
|
|
19
|
+
return default
|
|
20
|
+
return obj
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def resolve_instance_or_callable(
|
|
24
|
+
item: Union[str, Callable, dict], class_key: str = "class", debug_name: str = None
|
|
25
|
+
) -> Callable:
|
|
26
|
+
if not item:
|
|
27
|
+
return None
|
|
28
|
+
if isinstance(item, dict):
|
|
29
|
+
if class_key not in item:
|
|
30
|
+
raise ValueError(
|
|
31
|
+
f"'{class_key}' key is missing in {debug_name or 'item'} config: {item}"
|
|
32
|
+
)
|
|
33
|
+
class_name = item.pop(class_key)
|
|
34
|
+
constructor = resolve_callable(class_name)
|
|
35
|
+
return constructor(**item)
|
|
36
|
+
if isinstance(item, str):
|
|
37
|
+
fn = resolve_callable(item)
|
|
38
|
+
return fn() if inspect.isclass(fn) else fn
|
|
39
|
+
if callable(item):
|
|
40
|
+
return item() if inspect.isclass(item) else item
|
|
41
|
+
else:
|
|
42
|
+
raise ValueError(f"Invalid {debug_name or 'item'} config: {item}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CustomJsonEncoder(json.JSONEncoder):
|
|
46
|
+
def default(self, obj):
|
|
47
|
+
if isinstance(obj, datetime):
|
|
48
|
+
return obj.isoformat()
|
|
49
|
+
elif isinstance(obj, date):
|
|
50
|
+
return obj.isoformat()
|
|
51
|
+
elif isinstance(obj, time):
|
|
52
|
+
return obj.isoformat()
|
|
53
|
+
elif hasattr(obj, "__dict__"):
|
|
54
|
+
return obj.__dict__
|
|
55
|
+
elif hasattr(obj, "model_dump"):
|
|
56
|
+
return obj.model_dump()
|
|
57
|
+
elif hasattr(obj, "dict"):
|
|
58
|
+
return obj.dict()
|
|
59
|
+
return super().default(obj)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_client_ip(request: Request) -> str:
|
|
63
|
+
# Try different headers in order of preference
|
|
64
|
+
if forwarded_for := request.headers.get("X-Forwarded-For"):
|
|
65
|
+
return forwarded_for.split(",")[0].strip()
|
|
66
|
+
if real_ip := request.headers.get("X-Real-IP"):
|
|
67
|
+
return real_ip
|
|
68
|
+
if forwarded := request.headers.get("Forwarded"):
|
|
69
|
+
# Parse Forwarded header (RFC 7239)
|
|
70
|
+
return forwarded.split("for=")[1].split(";")[0].strip()
|
|
71
|
+
|
|
72
|
+
# Fallback to direct client
|
|
73
|
+
return request.client.host if request.client else "unknown"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "inference-proxy"
|
|
3
|
-
version = "0.2.2"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "\"Inference Proxy\" is OpenAI-compatible http proxy server for inferencing various LLMs capable of working with Google, Anthropic, OpenAI APIs, local PyTorch inference, etc."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
keywords = ["llm", "large language models", "ai", "gpt", "openai", "proxy", "http", "proxy-server"]
|
|
@@ -43,7 +43,11 @@ package-mode = true
|
|
|
43
43
|
packages = [{ include = "lm_proxy"}]
|
|
44
44
|
|
|
45
45
|
[tool.poetry.group.test.dependencies]
|
|
46
|
-
pytest = "
|
|
46
|
+
pytest = "~=8.4.2"
|
|
47
|
+
pytest-asyncio = "~=1.2.0"
|
|
47
48
|
|
|
48
49
|
[tool.poetry.scripts]
|
|
49
50
|
inference-proxy = "lm_proxy.app:cli_app"
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
asyncio_mode = "auto"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|