tb-router-embed 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tb_router_embed-1.0.1/.env.example +12 -0
- tb_router_embed-1.0.1/.github/workflows/build.yml +27 -0
- tb_router_embed-1.0.1/.github/workflows/publish.yml +30 -0
- tb_router_embed-1.0.1/.gitignore +23 -0
- tb_router_embed-1.0.1/LICENSE +21 -0
- tb_router_embed-1.0.1/PKG-INFO +115 -0
- tb_router_embed-1.0.1/README.md +94 -0
- tb_router_embed-1.0.1/pyproject.toml +36 -0
- tb_router_embed-1.0.1/requirements-publish.txt +3 -0
- tb_router_embed-1.0.1/requirements.txt +5 -0
- tb_router_embed-1.0.1/src/router_embed/__init__.py +3 -0
- tb_router_embed-1.0.1/src/router_embed/config.py +71 -0
- tb_router_embed-1.0.1/src/router_embed/main.py +91 -0
- tb_router_embed-1.0.1/src/router_embed/router.py +81 -0
- tb_router_embed-1.0.1/tests/__init__.py +0 -0
- tb_router_embed-1.0.1/tests/test_router.py +44 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Comma-separated vLLM backend URLs (host:port or full URL)
|
|
2
|
+
# Examples: localhost:8001 (same machine) or 192.168.86.173:8001,192.168.86.176:8001
|
|
3
|
+
EMBEDDING_BACKENDS=192.168.86.173:8001,192.168.86.176:8001
|
|
4
|
+
|
|
5
|
+
# Routing strategy: failover (default) or round_robin
|
|
6
|
+
ROUTER_STRATEGY=failover
|
|
7
|
+
|
|
8
|
+
# Port for the router (do not use 8001; reserved for vLLM backends)
|
|
9
|
+
ROUTER_PORT=8011
|
|
10
|
+
|
|
11
|
+
# Max concurrent requests to backends; excess requests wait in queue
|
|
12
|
+
ROUTER_MAX_CONCURRENT=20
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Build
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Set up Python
|
|
16
|
+
uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.11"
|
|
19
|
+
|
|
20
|
+
- name: Install
|
|
21
|
+
run: pip install -e ".[dev]"
|
|
22
|
+
|
|
23
|
+
- name: Test
|
|
24
|
+
run: pytest tests/ -v
|
|
25
|
+
|
|
26
|
+
- name: Build package
|
|
27
|
+
run: pip install build && python -m build
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
deploy:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Set up Python
|
|
15
|
+
uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.11"
|
|
18
|
+
|
|
19
|
+
- name: Install build dependencies
|
|
20
|
+
run: |
|
|
21
|
+
python -m pip install --upgrade pip
|
|
22
|
+
pip install build twine
|
|
23
|
+
|
|
24
|
+
- name: Build distribution
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
29
|
+
with:
|
|
30
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Build
|
|
2
|
+
build/
|
|
3
|
+
dist/
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
|
|
7
|
+
# Python
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.py[cod]
|
|
10
|
+
*$py.class
|
|
11
|
+
.pytest_cache/
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# Env (keep .env.example)
|
|
17
|
+
.env
|
|
18
|
+
.env.local
|
|
19
|
+
|
|
20
|
+
# IDE
|
|
21
|
+
.idea/
|
|
22
|
+
.vscode/
|
|
23
|
+
*.swp
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Layers
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tb-router-embed
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: HTTP proxy router for vLLM embedding API with round-robin and failover
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: embedding,load-balancer,rag,router,vllm
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Requires-Dist: fastapi>=0.100.0
|
|
14
|
+
Requires-Dist: httpx>=0.20.0
|
|
15
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
16
|
+
Requires-Dist: uvicorn[standard]>=0.20.0
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# router-embed
|
|
23
|
+
|
|
24
|
+
HTTP proxy router for vLLM embedding API. Routes `/v1/embeddings` and `/v1/models` requests to multiple vLLM backends with configurable round-robin or failover strategies.
|
|
25
|
+
|
|
26
|
+
## Setup
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python3.11 -m venv venv
|
|
30
|
+
source venv/bin/activate
|
|
31
|
+
pip install -r requirements.txt
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Run
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
cp .env.example .env # edit EMBEDDING_BACKENDS for your vLLM servers
|
|
39
|
+
source venv/bin/activate
|
|
40
|
+
python -m router_embed.main
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or `router-embed` (after activating venv). If `router-embed` isn't found, use `./venv/bin/python -m router_embed.main` with no activation.
|
|
44
|
+
|
|
45
|
+
Configure clients (ingest/retrieve layers): set `EMBEDDING_URL=http://<router-host>:8011` instead of pointing directly at a vLLM server.
|
|
46
|
+
|
|
47
|
+
## Deploy (simple)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# From the project directory
|
|
51
|
+
source venv/bin/activate
|
|
52
|
+
cp .env.example .env # edit EMBEDDING_BACKENDS
|
|
53
|
+
nohup python -m router_embed.main &
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Or run in foreground (logs to terminal):
|
|
57
|
+
```bash
|
|
58
|
+
source venv/bin/activate && python -m router_embed.main
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Configuration
|
|
62
|
+
|
|
63
|
+
| Variable | Default | Description |
|
|
64
|
+
|----------|---------|-------------|
|
|
65
|
+
| `EMBEDDING_BACKENDS` | `192.168.86.173:8001,192.168.86.176:8001` | Comma-separated backend URLs |
|
|
66
|
+
| `ROUTER_STRATEGY` | `failover` | `failover` or `round_robin` |
|
|
67
|
+
| `ROUTER_PORT` | `8011` | Port for the router (do not use 8001) |
|
|
68
|
+
| `ROUTER_MAX_CONCURRENT` | `20` | Max concurrent requests; excess wait in queue |
|
|
69
|
+
|
|
70
|
+
## API
|
|
71
|
+
|
|
72
|
+
| Endpoint | Method | Description |
|
|
73
|
+
|----------|--------|-------------|
|
|
74
|
+
| `/v1/embeddings` | POST | Forward to backend (OpenAI-compatible) |
|
|
75
|
+
| `/v1/models` | GET | Forward to backend |
|
|
76
|
+
| `/health` | GET | Health check |
|
|
77
|
+
|
|
78
|
+
Examples:
|
|
79
|
+
```bash
|
|
80
|
+
# Embeddings
|
|
81
|
+
curl -X POST http://localhost:8011/v1/embeddings \
|
|
82
|
+
-H "Content-Type: application/json" \
|
|
83
|
+
-d '{"model": "BAAI/bge-m3", "input": "hello world"}'
|
|
84
|
+
|
|
85
|
+
# Models
|
|
86
|
+
curl http://localhost:8011/v1/models
|
|
87
|
+
|
|
88
|
+
# Health
|
|
89
|
+
curl http://localhost:8011/health
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Troubleshooting
|
|
93
|
+
|
|
94
|
+
**"address already in use" (port 8011)** — Another process is using the port. Stop it: `lsof -ti:8011 | xargs kill` (macOS/Linux). Or set `ROUTER_PORT=8012` in `.env` to use a different port.
|
|
95
|
+
|
|
96
|
+
**"All backends unavailable: All connection attempts failed"** — The router could not reach any vLLM backend. Ensure:
|
|
97
|
+
1. vLLM is running on your backend hosts (default: 192.168.86.173:8001, 192.168.86.176:8001)
|
|
98
|
+
2. Your machine can reach those IPs (same network, no firewall blocking)
|
|
99
|
+
3. Or set `EMBEDDING_BACKENDS` in `.env` to your vLLM URLs, e.g. `localhost:8001` if vLLM runs on the same machine
|
|
100
|
+
|
|
101
|
+
## Development
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install -e ".[dev]"
|
|
105
|
+
pytest tests/ -v
|
|
106
|
+
# or: make test
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Publish to PyPI
|
|
110
|
+
|
|
111
|
+
One-shot via GitHub Actions (see [pypi-hello-world](https://github.com/taixingbi/pypi-hello-world)):
|
|
112
|
+
|
|
113
|
+
1. Add `PYPI_API_TOKEN` as a repo secret (Settings → Secrets → Actions). Get token at [pypi.org/manage/account/token/](https://pypi.org/manage/account/token/).
|
|
114
|
+
2. Push to `main` or run **Actions → Publish to PyPI → Run workflow** manually.
|
|
115
|
+
3. The workflow builds and uploads to PyPI.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# router-embed
|
|
2
|
+
|
|
3
|
+
HTTP proxy router for vLLM embedding API. Routes `/v1/embeddings` and `/v1/models` requests to multiple vLLM backends with configurable round-robin or failover strategies.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
python3.11 -m venv venv
|
|
9
|
+
source venv/bin/activate
|
|
10
|
+
pip install -r requirements.txt
|
|
11
|
+
pip install -e .
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Run
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
cp .env.example .env # edit EMBEDDING_BACKENDS for your vLLM servers
|
|
18
|
+
source venv/bin/activate
|
|
19
|
+
python -m router_embed.main
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Alternatively, run the `router-embed` console script (after activating the venv). If `router-embed` isn't found, run `./venv/bin/python -m router_embed.main` instead; this does not require activating the venv.
|
|
23
|
+
|
|
24
|
+
Configure clients (ingest/retrieve layers): set `EMBEDDING_URL=http://<router-host>:8011` instead of pointing directly at a vLLM server.
|
|
25
|
+
|
|
26
|
+
## Deploy (simple)
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# From the project directory
|
|
30
|
+
source venv/bin/activate
|
|
31
|
+
cp .env.example .env # edit EMBEDDING_BACKENDS
|
|
32
|
+
nohup python -m router_embed.main &
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or run in foreground (logs to terminal):
|
|
36
|
+
```bash
|
|
37
|
+
source venv/bin/activate && python -m router_embed.main
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration
|
|
41
|
+
|
|
42
|
+
| Variable | Default | Description |
|
|
43
|
+
|----------|---------|-------------|
|
|
44
|
+
| `EMBEDDING_BACKENDS` | `192.168.86.173:8001,192.168.86.176:8001` | Comma-separated backend URLs |
|
|
45
|
+
| `ROUTER_STRATEGY` | `failover` | `failover` or `round_robin` |
|
|
46
|
+
| `ROUTER_PORT` | `8011` | Port for the router (do not use 8001) |
|
|
47
|
+
| `ROUTER_MAX_CONCURRENT` | `20` | Max concurrent requests; excess wait in queue |
|
|
48
|
+
|
|
49
|
+
## API
|
|
50
|
+
|
|
51
|
+
| Endpoint | Method | Description |
|
|
52
|
+
|----------|--------|-------------|
|
|
53
|
+
| `/v1/embeddings` | POST | Forward to backend (OpenAI-compatible) |
|
|
54
|
+
| `/v1/models` | GET | Forward to backend |
|
|
55
|
+
| `/health` | GET | Health check |
|
|
56
|
+
|
|
57
|
+
Examples:
|
|
58
|
+
```bash
|
|
59
|
+
# Embeddings
|
|
60
|
+
curl -X POST http://localhost:8011/v1/embeddings \
|
|
61
|
+
-H "Content-Type: application/json" \
|
|
62
|
+
-d '{"model": "BAAI/bge-m3", "input": "hello world"}'
|
|
63
|
+
|
|
64
|
+
# Models
|
|
65
|
+
curl http://localhost:8011/v1/models
|
|
66
|
+
|
|
67
|
+
# Health
|
|
68
|
+
curl http://localhost:8011/health
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Troubleshooting
|
|
72
|
+
|
|
73
|
+
**"address already in use" (port 8011)** — Another process is using the port. Stop it: `lsof -ti:8011 | xargs kill` (macOS/Linux). Or set `ROUTER_PORT=8012` in `.env` to use a different port.
|
|
74
|
+
|
|
75
|
+
**"All backends unavailable: All connection attempts failed"** — The router could not reach any vLLM backend. Ensure:
|
|
76
|
+
1. vLLM is running on your backend hosts (default: 192.168.86.173:8001, 192.168.86.176:8001)
|
|
77
|
+
2. Your machine can reach those IPs (same network, no firewall blocking)
|
|
78
|
+
3. Or set `EMBEDDING_BACKENDS` in `.env` to your vLLM URLs, e.g. `localhost:8001` if vLLM runs on the same machine
|
|
79
|
+
|
|
80
|
+
## Development
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install -e ".[dev]"
|
|
84
|
+
pytest tests/ -v
|
|
85
|
+
# or: make test
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Publish to PyPI
|
|
89
|
+
|
|
90
|
+
One-shot via GitHub Actions (see [pypi-hello-world](https://github.com/taixingbi/pypi-hello-world)):
|
|
91
|
+
|
|
92
|
+
1. Add `PYPI_API_TOKEN` as a repo secret (Settings → Secrets → Actions). Get token at [pypi.org/manage/account/token/](https://pypi.org/manage/account/token/).
|
|
93
|
+
2. Bump `version` in `pyproject.toml` (PyPI rejects re-uploads of an existing version), then push to `main` or run **Actions → Publish to PyPI → Run workflow** manually.
|
|
94
|
+
3. The workflow builds and uploads to PyPI.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.0"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tb-router-embed"
|
|
7
|
+
version = "1.0.1"
|
|
8
|
+
description = "HTTP proxy router for vLLM embedding API with round-robin and failover"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
keywords = ["rag", "embedding", "vllm", "router", "load-balancer"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"httpx>=0.20.0",
|
|
21
|
+
"uvicorn[standard]>=0.20.0",
|
|
22
|
+
"fastapi>=0.100.0",
|
|
23
|
+
"python-dotenv>=1.0.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
dev = [
|
|
28
|
+
"pytest>=7.0",
|
|
29
|
+
"pytest-asyncio>=0.21.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
router-embed = "router_embed.main:run"
|
|
34
|
+
|
|
35
|
+
[tool.hatch.build.targets.wheel]
|
|
36
|
+
packages = ["src/router_embed"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration for vLLM embedding router.
|
|
3
|
+
Loads .env if present; values can be overridden by environment variables or configure().
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
9
|
+
load_dotenv()
|
|
10
|
+
|
|
11
|
+
_overrides: dict = {}
|
|
12
|
+
|
|
13
|
+
DEFAULT_BACKENDS = "192.168.86.173:8001,192.168.86.176:8001"
|
|
14
|
+
DEFAULT_STRATEGY = "failover"
|
|
15
|
+
DEFAULT_PORT = 8011
|
|
16
|
+
DEFAULT_MAX_CONCURRENT = 20
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def configure(
    backends: str | None = None,
    strategy: str | None = None,
    port: int | None = None,
    max_concurrent: int | None = None,
    **kwargs,
) -> None:
    """Record in-process configuration overrides.

    Every argument that is not ``None`` is stored in the module-level
    override mapping under its own name; extra keyword arguments are stored
    the same way. Arguments left as ``None`` keep whatever value was
    recorded before.
    """
    requested = {
        "backends": backends,
        "strategy": strategy,
        "port": port,
        "max_concurrent": max_concurrent,
        **kwargs,
    }
    # Only explicit (non-None) values replace existing overrides.
    _overrides.update(
        {key: value for key, value in requested.items() if value is not None}
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_backends() -> list[str]:
    """Return the configured backend base URLs.

    The comma-separated source string is taken from the ``backends``
    override when it is set and non-empty, otherwise from the
    ``EMBEDDING_BACKENDS`` environment variable, otherwise from
    ``DEFAULT_BACKENDS``. Blank entries are dropped, entries without an
    ``http://`` or ``https://`` prefix get ``http://`` prepended, and
    trailing slashes are removed.
    """
    configured = _overrides.get("backends")
    if not configured:
        configured = os.getenv("EMBEDDING_BACKENDS", DEFAULT_BACKENDS)

    entries = (piece.strip() for piece in configured.split(","))
    return [
        (entry if entry.startswith(("http://", "https://")) else f"http://{entry}").rstrip("/")
        for entry in entries
        if entry
    ]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_strategy() -> str:
    """Return the routing strategy name.

    Precedence: ``strategy`` override, then the ``ROUTER_STRATEGY``
    environment variable, then ``DEFAULT_STRATEGY``. The value is returned
    as-is; it is not validated here.
    """
    chosen = _overrides.get("strategy")
    if chosen:
        return chosen
    return os.getenv("ROUTER_STRATEGY", DEFAULT_STRATEGY)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_port() -> int:
    """Return the router's listening port as an int.

    Precedence: ``port`` override, then the ``ROUTER_PORT`` environment
    variable, then ``DEFAULT_PORT``. Do not use 8001 (reserved for vLLM
    backends).
    """
    override = _overrides.get("port")
    if override:
        return int(override)
    return int(os.getenv("ROUTER_PORT", str(DEFAULT_PORT)))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_max_concurrent() -> int:
    """Return the maximum number of concurrent backend requests as an int.

    Precedence: ``max_concurrent`` override, then the
    ``ROUTER_MAX_CONCURRENT`` environment variable, then
    ``DEFAULT_MAX_CONCURRENT``.
    """
    override = _overrides.get("max_concurrent")
    if override:
        return int(override)
    return int(os.getenv("ROUTER_MAX_CONCURRENT", str(DEFAULT_MAX_CONCURRENT)))
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""FastAPI app: proxy /v1/embeddings and /v1/models to vLLM backends."""
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
if __name__ == "__main__":
|
|
6
|
+
_src = Path(__file__).resolve().parent.parent
|
|
7
|
+
if str(_src) not in sys.path:
|
|
8
|
+
sys.path.insert(0, str(_src))
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
from contextlib import asynccontextmanager
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
from fastapi import FastAPI, Request
|
|
15
|
+
from fastapi.responses import JSONResponse, Response
|
|
16
|
+
|
|
17
|
+
from router_embed.config import get_backends, get_max_concurrent, get_port
|
|
18
|
+
from router_embed.router import proxy_request
|
|
19
|
+
|
|
20
|
+
_http_client: httpx.AsyncClient | None = None
|
|
21
|
+
_queue: asyncio.Semaphore | None = None
|
|
22
|
+
|
|
23
|
+
_SKIP_HEADERS = frozenset({"host", "content-length"})
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _err(code: int, msg: str) -> JSONResponse:
    """Build a JSON error response with status ``code`` and body ``{"error": msg}``."""
    payload = {"error": msg}
    return JSONResponse(content=payload, status_code=code)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the shared HTTP client and concurrency semaphore for the app.

    On startup, creates the module-level ``httpx.AsyncClient`` and an
    ``asyncio.Semaphore`` sized by ``get_max_concurrent()``. On shutdown,
    closes the client and resets both globals to ``None``.
    """
    global _http_client, _queue
    # Build the semaphore first: if the configured size is invalid this
    # raises before a client exists, so nothing is left open.
    queue = asyncio.Semaphore(get_max_concurrent())
    client = httpx.AsyncClient(timeout=60.0)
    _http_client, _queue = client, queue
    try:
        yield
    finally:
        # Always release the connection pool, even if the lifespan body
        # exits with an exception.
        _http_client = None
        _queue = None
        await client.aclose()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
app = FastAPI(title="vLLM Embedding Router", lifespan=lifespan)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@app.get("/health")
async def health():
    """Liveness probe: report that the router process is serving requests."""
    payload = {"status": "ok"}
    return payload
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Backend response headers that must not be copied onto the relayed response.
# httpx hands back an already-decoded body, so the backend's content-encoding
# and content-length no longer describe the bytes we send; the remaining
# entries are per-connection headers that belong to the backend hop only.
_SKIP_RESPONSE_HEADERS = frozenset(
    {
        "content-encoding",
        "content-length",
        "transfer-encoding",
        "connection",
        "keep-alive",
    }
)


@app.api_route("/v1/embeddings", methods=["POST"])
@app.api_route("/v1/models", methods=["GET"])
async def proxy(request: Request):
    """Forward /v1/embeddings and /v1/models to backend(s).

    Returns the backend's status code and body. Responds with a 503 JSON
    error when the router has not been initialized, when no backends are
    configured, or when ``proxy_request`` raises a timeout or connection
    error.
    """
    if _http_client is None or _queue is None:
        return _err(503, "Router not initialized")

    backends = get_backends()
    if not backends:
        return _err(503, "No backends configured. Set EMBEDDING_BACKENDS.")

    headers = {k: v for k, v in request.headers.items() if k.lower() not in _SKIP_HEADERS}
    # Read the body before taking a slot so a slow upload does not occupy
    # one of the limited backend-concurrency permits.
    body = await request.body()

    try:
        async with _queue:
            resp, _ = await proxy_request(
                _http_client,
                request.method,
                request.url.path,
                content=body,
                headers=headers,
            )
    except (httpx.TimeoutException, httpx.ConnectError) as e:
        return _err(503, f"All backends unavailable: {e!s}")

    # Drop headers that describe the backend's wire encoding; the response
    # object recomputes content-length for the body actually sent.
    response_headers = {
        k: v for k, v in resp.headers.items() if k.lower() not in _SKIP_RESPONSE_HEADERS
    }
    return Response(
        content=resp.content,
        status_code=resp.status_code,
        headers=response_headers,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def run():
    """CLI entry point: serve the app with uvicorn on all interfaces.

    The port comes from ``get_port()``; auto-reload is disabled.
    """
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": get_port(),
        "reload": False,
    }
    uvicorn.run("router_embed.main:app", **server_options)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
if __name__ == "__main__":
|
|
91
|
+
run()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Backend selection and proxy logic for vLLM embedding router."""
|
|
2
|
+
import itertools
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
|
|
6
|
+
from router_embed.config import get_backends, get_strategy
|
|
7
|
+
|
|
8
|
+
REQUEST_TIMEOUT = 60.0
|
|
9
|
+
_ROUND_ROBIN_INDEX: itertools.cycle | None = None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Number of backends the cached round-robin cycle was built for.
_ROUND_ROBIN_SIZE = 0


def _get_round_robin_index() -> itertools.cycle:
    """Return the shared iterator of round-robin backend indices.

    The iterator cycles over ``range(len(get_backends()))``. It is created
    on first use and rebuilt whenever the number of configured backends
    differs from the count it was built for, so it never yields an index
    outside the current backend list.

    Raises:
        ValueError: if no backends are configured.
    """
    global _ROUND_ROBIN_INDEX, _ROUND_ROBIN_SIZE
    backends = get_backends()
    if not backends:
        raise ValueError("No backends configured. Set EMBEDDING_BACKENDS.")
    if _ROUND_ROBIN_INDEX is None or _ROUND_ROBIN_SIZE != len(backends):
        # The backend list changed size since the cycle was created (or
        # this is the first call); start a fresh cycle for the new size.
        _ROUND_ROBIN_INDEX = itertools.cycle(range(len(backends)))
        _ROUND_ROBIN_SIZE = len(backends)
    return _ROUND_ROBIN_INDEX
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _select_backend() -> str:
    """Pick one backend base URL according to the configured strategy.

    ``failover`` always yields the first configured backend;
    ``round_robin`` yields the backend at the next index from the shared
    round-robin iterator.

    Raises:
        ValueError: if no backends are configured or the strategy name is
            not recognized.
    """
    pool = get_backends()
    if not pool:
        raise ValueError("No backends configured. Set EMBEDDING_BACKENDS.")

    strategy = get_strategy()
    if strategy == "failover":
        return pool[0]
    if strategy == "round_robin":
        position = next(_get_round_robin_index())
        return pool[position]
    raise ValueError(f"Unknown ROUTER_STRATEGY: {strategy}. Use round_robin or failover.")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _is_retryable(status_code: int) -> bool:
|
|
38
|
+
return status_code == 408 or status_code >= 500
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def proxy_request(
    client: httpx.AsyncClient,
    method: str,
    path: str,
    content: bytes | None = None,
    headers: dict | None = None,
) -> tuple[httpx.Response, str]:
    """Forward a request to the configured backend(s).

    With the ``failover`` strategy every configured backend is tried in
    order; a backend is skipped when it times out, refuses the connection,
    or answers with a retryable status (408 or >= 500). With any other
    strategy a single backend chosen by ``_select_backend()`` is tried.

    Args:
        client: Shared async HTTP client used to send the request.
        method: HTTP method to forward.
        path: Request path appended to the backend base URL.
        content: Raw request body, if any.
        headers: Request headers to forward, if any.

    Returns:
        ``(response, backend_url)``, where ``backend_url`` is the base URL
        of the backend that produced ``response``. If every backend failed
        but at least one returned a retryable status, the most recent such
        response is returned together with the backend it came from.

    Raises:
        ValueError: if no backends are configured (or, via
            ``_select_backend()``, the strategy is unknown).
        httpx.TimeoutException, httpx.ConnectError: the last transport
            error, when no backend produced any response.
        RuntimeError: if there were no candidates to try.
    """
    backends = get_backends()
    if not backends:
        raise ValueError("No backends configured. Set EMBEDDING_BACKENDS.")

    strategy = get_strategy()
    candidates = backends if strategy == "failover" else [_select_backend()]

    last_error: Exception | None = None
    # Keep the response together with the backend that produced it, so the
    # returned URL is correct even when a later candidate fails to connect.
    last_failed: tuple[httpx.Response, str] | None = None

    for base_url in candidates:
        url = f"{base_url}{path}"
        try:
            resp = await client.request(
                method, url, content=content, headers=headers, timeout=REQUEST_TIMEOUT
            )
        except (httpx.TimeoutException, httpx.ConnectError) as e:
            last_error = e
            continue

        if strategy == "failover" and _is_retryable(resp.status_code):
            last_failed = (resp, base_url)
            continue
        return resp, base_url

    if last_failed is not None:
        return last_failed
    if last_error is not None:
        raise last_error
    raise RuntimeError("No backends available")
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Tests for router_embed."""
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from router_embed.config import configure, get_backends, get_max_concurrent, get_port, get_strategy
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.fixture(autouse=True)
def reset_config(monkeypatch):
    """Give every test a clean configuration.

    Removes the router's environment variables for the duration of the
    test (so values from the shell or a loaded ``.env`` cannot change the
    defaults under test) and clears the override mapping before and after.
    """
    import router_embed.config as cfg

    for name in (
        "EMBEDDING_BACKENDS",
        "ROUTER_STRATEGY",
        "ROUTER_PORT",
        "ROUTER_MAX_CONCURRENT",
    ):
        # monkeypatch restores the original value when the test finishes.
        monkeypatch.delenv(name, raising=False)
    cfg._overrides.clear()
    yield
    # Do not leak overrides set by this test into later tests.
    cfg._overrides.clear()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_get_backends_default():
    """With no override, two backends are returned, in the default order."""
    urls = get_backends()
    assert len(urls) == 2
    first, second = urls[0], urls[1]
    assert "192.168.86.173" in first
    assert "192.168.86.176" in second
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_get_backends_override():
    """A ``backends`` override replaces the default backend list."""
    configure(backends="http://a:8001,http://b:8002")
    expected = ["http://a:8001", "http://b:8002"]
    assert get_backends() == expected
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_get_strategy_default():
    """The default routing strategy is ``failover``."""
    strategy = get_strategy()
    assert strategy == "failover"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_get_port_default():
    """The default router port is 8011."""
    port = get_port()
    assert port == 8011
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_get_max_concurrent_default():
    """The default concurrency limit is 20."""
    limit = get_max_concurrent()
    assert limit == 20
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_health_endpoint():
    """GET /health answers 200 with the ``{"status": "ok"}`` payload."""
    from fastapi.testclient import TestClient
    from router_embed.main import app

    response = TestClient(app).get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
|