vllm-router 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_router-0.1.9/Cargo.toml +111 -0
- vllm_router-0.1.9/MANIFEST.in +5 -0
- vllm_router-0.1.9/PKG-INFO +266 -0
- vllm_router-0.1.9/README.md +244 -0
- vllm_router-0.1.9/build.rs +31 -0
- vllm_router-0.1.9/py_src/vllm_router/__init__.py +9 -0
- vllm_router-0.1.9/py_src/vllm_router/launch_router.py +109 -0
- vllm_router-0.1.9/py_src/vllm_router/mini_lb.py +395 -0
- vllm_router-0.1.9/py_src/vllm_router/router.py +148 -0
- vllm_router-0.1.9/py_src/vllm_router/router_args.py +592 -0
- vllm_router-0.1.9/py_src/vllm_router/version.py +1 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/PKG-INFO +266 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/SOURCES.txt +96 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/dependency_links.txt +1 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/entry_points.txt +2 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/not-zip-safe +1 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/requires.txt +11 -0
- vllm_router-0.1.9/py_src/vllm_router.egg-info/top_level.txt +1 -0
- vllm_router-0.1.9/pyproject.toml +45 -0
- vllm_router-0.1.9/setup.cfg +4 -0
- vllm_router-0.1.9/setup.py +22 -0
- vllm_router-0.1.9/src/config/mod.rs +28 -0
- vllm_router-0.1.9/src/config/types.rs +1308 -0
- vllm_router-0.1.9/src/config/validation.rs +848 -0
- vllm_router-0.1.9/src/core/circuit_breaker.rs +555 -0
- vllm_router-0.1.9/src/core/error.rs +240 -0
- vllm_router-0.1.9/src/core/mod.rs +26 -0
- vllm_router-0.1.9/src/core/retry.rs +409 -0
- vllm_router-0.1.9/src/core/token_bucket.rs +195 -0
- vllm_router-0.1.9/src/core/worker.rs +1947 -0
- vllm_router-0.1.9/src/core/worker_registry.rs +526 -0
- vllm_router-0.1.9/src/data_connector/mod.rs +11 -0
- vllm_router-0.1.9/src/data_connector/response_memory_store.rs +325 -0
- vllm_router-0.1.9/src/data_connector/response_noop_store.rs +53 -0
- vllm_router-0.1.9/src/data_connector/responses.rs +177 -0
- vllm_router-0.1.9/src/grpc/client.rs +254 -0
- vllm_router-0.1.9/src/grpc/mod.rs +8 -0
- vllm_router-0.1.9/src/handler.rs +84 -0
- vllm_router-0.1.9/src/lib.rs +521 -0
- vllm_router-0.1.9/src/logger.rs +56 -0
- vllm_router-0.1.9/src/logging.rs +163 -0
- vllm_router-0.1.9/src/main.rs +761 -0
- vllm_router-0.1.9/src/metrics.rs +1047 -0
- vllm_router-0.1.9/src/middleware.rs +501 -0
- vllm_router-0.1.9/src/policies/cache_aware.rs +554 -0
- vllm_router-0.1.9/src/policies/consistent_hash.rs +807 -0
- vllm_router-0.1.9/src/policies/factory.rs +107 -0
- vllm_router-0.1.9/src/policies/mod.rs +190 -0
- vllm_router-0.1.9/src/policies/power_of_two.rs +202 -0
- vllm_router-0.1.9/src/policies/random.rs +123 -0
- vllm_router-0.1.9/src/policies/registry.rs +334 -0
- vllm_router-0.1.9/src/policies/round_robin.rs +142 -0
- vllm_router-0.1.9/src/proto/vllm_scheduler.proto +389 -0
- vllm_router-0.1.9/src/protocols/mod.rs +6 -0
- vllm_router-0.1.9/src/protocols/spec.rs +2930 -0
- vllm_router-0.1.9/src/protocols/validation.rs +1221 -0
- vllm_router-0.1.9/src/protocols/worker_spec.rs +178 -0
- vllm_router-0.1.9/src/routers/factory.rs +258 -0
- vllm_router-0.1.9/src/routers/grpc/mod.rs +4 -0
- vllm_router-0.1.9/src/routers/grpc/pd_router.rs +345 -0
- vllm_router-0.1.9/src/routers/grpc/router.rs +283 -0
- vllm_router-0.1.9/src/routers/header_utils.rs +53 -0
- vllm_router-0.1.9/src/routers/http/dp_utils.rs +172 -0
- vllm_router-0.1.9/src/routers/http/logprobs_merge.rs +357 -0
- vllm_router-0.1.9/src/routers/http/mod.rs +10 -0
- vllm_router-0.1.9/src/routers/http/openai_router.rs +423 -0
- vllm_router-0.1.9/src/routers/http/pd_router.rs +2667 -0
- vllm_router-0.1.9/src/routers/http/pd_types.rs +81 -0
- vllm_router-0.1.9/src/routers/http/router.rs +1640 -0
- vllm_router-0.1.9/src/routers/http/vllm_pd_router.rs +1407 -0
- vllm_router-0.1.9/src/routers/http/vllm_service_discovery.rs +348 -0
- vllm_router-0.1.9/src/routers/mod.rs +164 -0
- vllm_router-0.1.9/src/routers/router_manager.rs +805 -0
- vllm_router-0.1.9/src/routes/interface.rs +10 -0
- vllm_router-0.1.9/src/routes/mod.rs +7 -0
- vllm_router-0.1.9/src/routes/pool_route.rs +0 -0
- vllm_router-0.1.9/src/routes/prefill_decode_route.rs +0 -0
- vllm_router-0.1.9/src/routes/round_robin_route.rs +36 -0
- vllm_router-0.1.9/src/routes/routing_tree_builder.rs +64 -0
- vllm_router-0.1.9/src/routes/single_server_route.rs +42 -0
- vllm_router-0.1.9/src/server.rs +811 -0
- vllm_router-0.1.9/src/service_discovery.rs +1174 -0
- vllm_router-0.1.9/src/tokenizer/chat_template.rs +182 -0
- vllm_router-0.1.9/src/tokenizer/factory.rs +318 -0
- vllm_router-0.1.9/src/tokenizer/hub.rs +238 -0
- vllm_router-0.1.9/src/tokenizer/huggingface.rs +234 -0
- vllm_router-0.1.9/src/tokenizer/mock.rs +112 -0
- vllm_router-0.1.9/src/tokenizer/mod.rs +123 -0
- vllm_router-0.1.9/src/tokenizer/sequence.rs +238 -0
- vllm_router-0.1.9/src/tokenizer/stop.rs +506 -0
- vllm_router-0.1.9/src/tokenizer/stream.rs +105 -0
- vllm_router-0.1.9/src/tokenizer/tests.rs +143 -0
- vllm_router-0.1.9/src/tokenizer/tiktoken.rs +276 -0
- vllm_router-0.1.9/src/tokenizer/traits.rs +83 -0
- vllm_router-0.1.9/src/tree.rs +1478 -0
- vllm_router-0.1.9/src/types.rs +74 -0
- vllm_router-0.1.9/src/utils/json.rs +336 -0
- vllm_router-0.1.9/src/utils/mod.rs +1 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
[package]
name = "vllm_router_rs"
version = "0.1.9"
edition = "2021"

[features]
# Feature flags gate which side of the generated gRPC code the crate uses.
default = ["grpc-client"]
grpc-client = []
grpc-server = []

[lib]
name = "vllm_router_rs"
# Pure Rust library: Just omit crate-type (defaults to rlib)
# Python/C binding + Rust library: Use ["cdylib", "rlib"]
# "cdylib" is required so the Python package can load this as an extension module.
crate-type = ["cdylib", "rlib"]

[[bin]]
# Standalone router binary; the Python launcher shells out to this.
name = "vllm-router"
path = "src/main.rs"

[dependencies]
# CLI parsing
clap = { version = "4", features = ["derive"] }
# HTTP server stack (axum + tower middleware)
axum = { version = "0.8.4", features = ["macros", "ws", "tracing"] }
tower = { version = "0.5", features = ["full"] }
tower-http = { version = "0.6", features = ["trace", "compression-gzip", "cors", "timeout", "limit", "request-id", "util"] }
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
bytes = "1.8.0"
rand = "0.9.2"
# Outbound HTTP to workers (streaming responses supported)
reqwest = { version = "0.12.8", features = ["stream", "blocking", "json"] }
futures-util = "0.3"
futures = "0.3"
# Python bindings for the extension module build
pyo3 = { version = "0.24", features = ["extension-module"] }
dashmap = "6.1.0"
http = "1.1.0"
tokio = { version = "1.42.0", features = ["full"] }
async-trait = "0.1"
# Structured logging / tracing
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
tracing-log = "0.2"
tracing-appender = "0.2.3"
chrono = "0.4"
# Kubernetes service discovery
kube = { version = "1.1.0", features = ["runtime", "derive"] }
k8s-openapi = { version = "0.25.0", features = ["v1_33"] }
# Prometheus metrics
metrics = "0.24.2"
metrics-exporter-prometheus = "0.17.0"
uuid = { version = "1.10", features = ["v4", "serde"] }
ulid = "1.2.1"
parking_lot = "0.12.4"
thiserror = "2.0.12"
regex = "1.10"
url = "2.5.4"
tokio-stream = { version = "0.1", features = ["sync"] }
anyhow = "1.0"
# Tokenization backends (HuggingFace tokenizers + tiktoken)
tokenizers = { version = "0.22.2" }
tiktoken-rs = { version = "0.7.0" }
minijinja = { version = "2.0" }
rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
hf-hub = { version = "0.4.3", features = ["tokio"] }

# gRPC and Protobuf dependencies
tonic = { version = "0.12", features = ["tls", "gzip", "transport"] }
prost = "0.13"
prost-types = "0.13"
deadpool = { version = "0.12", features = ["managed", "rt_tokio_1"] }
backoff = { version = "0.4", features = ["tokio"] }
strum = { version = "0.26", features = ["derive"] }
once_cell = "1.21.3"
# ZMQ-based worker discovery (NCCL connector path)
zmq = "0.10.0"
rmp-serde = "1.3"

[build-dependencies]
# build.rs compiles src/proto/vllm_scheduler.proto into Rust gRPC stubs.
tonic-build = "0.12"
prost-build = "0.13"

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
tower = { version = "0.5", features = ["util"] }
http-body-util = "0.1"
portpicker = "0.1"
tempfile = "3.8"
lazy_static = "1.4"

[[bench]]
name = "request_processing"
harness = false
path = "benches/request_processing.rs"

[[bench]]
name = "tokenizer_benchmark"
harness = false
path = "benches/tokenizer_benchmark.rs"

[profile.release]
# Thin LTO + single codegen unit: better optimization at some link-time cost.
lto = "thin"
codegen-units = 1

[profile.dev]
opt-level = 0
debug = true
split-debuginfo = "unpacked"
incremental = true


[profile.dev.build-override]
# Optimize build scripts / proc-macros even in dev, since protobuf codegen
# dominates incremental build time otherwise.
opt-level = 3
codegen-units = 1

[profile.dev-opt]
# Lightly optimized dev profile for local performance testing.
inherits = "dev"
opt-level = 1
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vllm-router
|
|
3
|
+
Version: 0.1.9
|
|
4
|
+
Summary: High-performance Rust-based load balancer for VLLM with multiple routing algorithms and prefill-decode disaggregation support
|
|
5
|
+
Author-email: Byron Hsu <byronhsu1230@gmail.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
8
|
+
Classifier: Programming Language :: Rust
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: setproctitle
|
|
13
|
+
Requires-Dist: aiohttp
|
|
14
|
+
Requires-Dist: orjson
|
|
15
|
+
Requires-Dist: uvicorn
|
|
16
|
+
Requires-Dist: fastapi
|
|
17
|
+
Requires-Dist: requests>=2.25.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
22
|
+
|
|
23
|
+
# vLLM Router
|
|
24
|
+
|
|
25
|
+
A high-performance, lightweight request forwarding system for vLLM large-scale deployments, providing advanced load balancing methods and prefill/decode disaggregation support.
|
|
26
|
+
|
|
27
|
+
### Key Features
|
|
28
|
+
|
|
29
|
+
- **Core Architecture**: Request routing framework and async processing patterns
|
|
30
|
+
- **Load Balancing**: Multiple algorithms (cache-aware, power of two, consistent hashing, random, round robin)
|
|
31
|
+
- **Prefill-Decode Disaggregation**: Specialized routing for separated processing phases
|
|
32
|
+
- **Service Discovery**: Kubernetes-native worker management and health monitoring
|
|
33
|
+
- **Enterprise Features**: Circuit breakers, retry logic, metrics collection
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
### Prerequisites
|
|
38
|
+
|
|
39
|
+
**Rust and Cargo:**
|
|
40
|
+
```bash
|
|
41
|
+
# Install rustup (Rust installer and version manager)
|
|
42
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
|
43
|
+
|
|
44
|
+
# Follow the installation prompts, then reload your shell
|
|
45
|
+
source $HOME/.cargo/env
|
|
46
|
+
|
|
47
|
+
# Verify installation
|
|
48
|
+
rustc --version
|
|
49
|
+
cargo --version
|
|
50
|
+
|
|
51
|
+
# Install protobuf compiler (on Ubuntu/Debian)
|
|
52
|
+
sudo apt-get update
|
|
53
|
+
sudo apt-get install -y protobuf-compiler libprotobuf-dev
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Python with pip installed**
|
|
57
|
+
|
|
58
|
+
### Installation & Basic Usage
|
|
59
|
+
|
|
60
|
+
#### Rust Binary
|
|
61
|
+
```bash
|
|
62
|
+
# Build Rust components
|
|
63
|
+
cargo build --release
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
#### Python Package
|
|
67
|
+
```bash
|
|
68
|
+
pip install setuptools-rust wheel build
|
|
69
|
+
python -m build
|
|
70
|
+
pip install dist/*.whl
|
|
71
|
+
|
|
72
|
+
# Rebuild & reinstall in one step during development
|
|
73
|
+
python -m build && pip install --force-reinstall dist/*.whl
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Usage Examples
|
|
77
|
+
|
|
78
|
+
#### Standard Data Parallelism Routing
|
|
79
|
+
```bash
|
|
80
|
+
# Launch router with data parallelism (8 replicas per worker URL)
|
|
81
|
+
# When data-parallel-size > 1, the router automatically creates DP-aware workers
|
|
82
|
+
./target/release/vllm-router \
|
|
83
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
84
|
+
--policy consistent_hash \
|
|
85
|
+
--intra-node-data-parallel-size 8
|
|
86
|
+
|
|
87
|
+
# Alternative: using cargo run
|
|
88
|
+
cargo run --release -- \
|
|
89
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
90
|
+
--policy consistent_hash \
|
|
91
|
+
--intra-node-data-parallel-size 8
|
|
92
|
+
|
|
93
|
+
# Alternative: using python launcher
|
|
94
|
+
vllm-router \
|
|
95
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
96
|
+
--policy consistent_hash \
|
|
97
|
+
--intra-node-data-parallel-size 8
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
#### Prefill-Decode Disaggregation
|
|
101
|
+
```bash
|
|
102
|
+
# When vLLM runs the NIXL connector, prefill/decode URLs are required.
|
|
103
|
+
# See a working example in scripts/llama3.1/ folder.
|
|
104
|
+
cargo run --release -- \
|
|
105
|
+
--policy consistent_hash \
|
|
106
|
+
--vllm-pd-disaggregation \
|
|
107
|
+
--prefill http://127.0.0.1:8081 \
|
|
108
|
+
--prefill http://127.0.0.1:8082 \
|
|
109
|
+
--decode http://127.0.0.1:8083 \
|
|
110
|
+
--decode http://127.0.0.1:8084 \
|
|
111
|
+
--decode http://127.0.0.1:8085 \
|
|
112
|
+
--decode http://127.0.0.1:8086 \
|
|
113
|
+
--host 127.0.0.1 \
|
|
114
|
+
--port 8090 \
|
|
115
|
+
--intra-node-data-parallel-size 1
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# When vLLM runs the NCCL connector, ZMQ based discovery is supported.
|
|
119
|
+
# See a working example in scripts/install.sh
|
|
120
|
+
cargo run --release -- \
|
|
121
|
+
--policy consistent_hash \
|
|
122
|
+
--vllm-pd-disaggregation \
|
|
123
|
+
--vllm-discovery-address 0.0.0.0:30001 \
|
|
124
|
+
--host 0.0.0.0 \
|
|
125
|
+
--port 10001 \
|
|
126
|
+
--prefill-policy consistent_hash \
|
|
127
|
+
--decode-policy consistent_hash
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Configuration
|
|
131
|
+
|
|
132
|
+
### Metrics
|
|
133
|
+
|
|
134
|
+
Prometheus metrics endpoint available at `127.0.0.1:29000` by default.
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Custom metrics configuration
|
|
138
|
+
vllm-router \
|
|
139
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
140
|
+
--prometheus-host 0.0.0.0 \
|
|
141
|
+
--prometheus-port 9000
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Retries and Circuit Breakers
|
|
145
|
+
|
|
146
|
+
#### Retry Configuration
|
|
147
|
+
Retries are enabled by default with exponential backoff and jitter:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
vllm-router \
|
|
151
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
152
|
+
--retry-max-retries 3 \
|
|
153
|
+
--retry-initial-backoff-ms 100 \
|
|
154
|
+
--retry-max-backoff-ms 10000 \
|
|
155
|
+
--retry-backoff-multiplier 2.0 \
|
|
156
|
+
--retry-jitter-factor 0.1
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
#### Circuit Breaker Configuration
|
|
160
|
+
Circuit breakers protect workers and provide automatic recovery:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
vllm-router \
|
|
164
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
165
|
+
--cb-failure-threshold 5 \
|
|
166
|
+
--cb-success-threshold 2 \
|
|
167
|
+
--cb-timeout-duration-secs 30 \
|
|
168
|
+
--cb-window-duration-secs 60
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Circuit Breaker State Machine:**
|
|
172
|
+
- `Closed` → `Open` after N consecutive failures (failure-threshold)
|
|
173
|
+
- `Open` → `HalfOpen` after timeout (timeout-duration-secs)
|
|
174
|
+
- `HalfOpen` → `Closed` after M consecutive successes (success-threshold)
|
|
175
|
+
|
|
176
|
+
**Retry Policy:** Retries on HTTP status codes 408/429/500/502/503/504, with backoff/jitter between attempts.
|
|
177
|
+
|
|
178
|
+
### Request ID Tracking
|
|
179
|
+
|
|
180
|
+
Track requests across distributed systems with configurable headers:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
# Use custom request ID headers
|
|
184
|
+
vllm-router \
|
|
185
|
+
--worker-urls http://localhost:8080 \
|
|
186
|
+
--request-id-headers x-trace-id x-request-id
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Default headers:** `x-request-id`, `x-correlation-id`, `x-trace-id`, `request-id`
|
|
190
|
+
|
|
191
|
+
### Load Balancing Policies
|
|
192
|
+
|
|
193
|
+
The router supports multiple load balancing policies:
|
|
194
|
+
|
|
195
|
+
| Policy | Description | Session Affinity | Use Case |
|
|
196
|
+
|--------|-------------|------------------|----------|
|
|
197
|
+
| `round_robin` | Sequential distribution across workers | No | General purpose, even distribution |
|
|
198
|
+
| `random` | Uniform random selection | No | Simple deployments |
|
|
199
|
+
| `consistent_hash` | Routes same session/user to same worker | Yes | Multi-turn chat, KV cache reuse |
|
|
200
|
+
| `power_of_two` | Picks least loaded of two random workers | No | Load-sensitive workloads |
|
|
201
|
+
| `cache_aware` | Optimizes for prefix cache hits | Yes | Repeated prompts, few-shot |
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Example: Using consistent_hash with HTTP header for session affinity
|
|
205
|
+
curl -X POST http://router:8000/v1/chat/completions \
|
|
206
|
+
-H "X-Session-ID: my-session-123" \
|
|
207
|
+
-H "Content-Type: application/json" \
|
|
208
|
+
-d '{"model": "llama-3", "messages": [{"role": "user", "content": "Hello!"}]}'
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
For detailed configuration options, hash key priorities, and usage examples, see [Load Balancing Documentation](docs/load_balancing/README.md).
|
|
212
|
+
|
|
213
|
+
## Advanced Features
|
|
214
|
+
|
|
215
|
+
### Kubernetes Service Discovery
|
|
216
|
+
|
|
217
|
+
Automatic worker discovery and management in Kubernetes environments.
|
|
218
|
+
|
|
219
|
+
#### Basic Service Discovery
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
vllm-router \
|
|
223
|
+
--service-discovery \
|
|
224
|
+
--selector app=vllm-worker role=inference \
|
|
225
|
+
--service-discovery-namespace default
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Command Line Arguments Reference
|
|
229
|
+
|
|
230
|
+
#### Service Discovery
|
|
231
|
+
- `--service-discovery`: Enable Kubernetes service discovery
|
|
232
|
+
- `--service-discovery-port`: Port for worker URLs (default: 8000)
|
|
233
|
+
- `--service-discovery-namespace`: Kubernetes namespace to watch
|
|
234
|
+
- `--selector`: Label selectors for regular mode (format: `key1=value1 key2=value2`)
|
|
235
|
+
|
|
236
|
+
## Development
|
|
237
|
+
|
|
238
|
+
### Troubleshooting
|
|
239
|
+
|
|
240
|
+
**VSCode Rust Analyzer Issues:**
|
|
241
|
+
Set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml`:
|
|
242
|
+
|
|
243
|
+
```json
|
|
244
|
+
{
|
|
245
|
+
"rust-analyzer.linkedProjects": ["/workspaces/vllm/vllm-router/Cargo.toml"]
|
|
246
|
+
}
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### CI/CD Pipeline
|
|
250
|
+
|
|
251
|
+
The continuous integration pipeline includes comprehensive testing, benchmarking, and publishing:
|
|
252
|
+
|
|
253
|
+
#### Build & Test
|
|
254
|
+
1. **Build Wheels**: Uses `cibuildwheel` for manylinux x86_64 packages
|
|
255
|
+
2. **Build Source Distribution**: Creates source distribution for pip fallback
|
|
256
|
+
3. **Rust HTTP Server Benchmarking**: Performance testing of router overhead
|
|
257
|
+
4. **Basic Inference Testing**: End-to-end validation through the router
|
|
258
|
+
5. **PD Disaggregation Testing**: Benchmark and sanity checks for prefill-decode load balancing
|
|
259
|
+
|
|
260
|
+
#### Publishing
|
|
261
|
+
- **PyPI Publishing**: Wheels and source distributions published when version changes in `pyproject.toml`
|
|
262
|
+
- **Container Images**: Docker images published using `/docker/Dockerfile.router`
|
|
263
|
+
|
|
264
|
+
## Acknowledgement
|
|
265
|
+
|
|
266
|
+
This project is a fork of [SGLang Model Gateway](https://github.com/sgl-project/sglang/tree/main/sgl-model-gateway), and we would like to explicitly acknowledge and thank the original authors for their work. At this stage, our fork includes only minimal changes to preserve the existing interface and ensure compatibility with vLLM. We anticipate further divergence as we pursue the roadmap we have in mind, which is the reason for creating the fork.
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# vLLM Router
|
|
2
|
+
|
|
3
|
+
A high-performance, lightweight request forwarding system for vLLM large-scale deployments, providing advanced load balancing methods and prefill/decode disaggregation support.
|
|
4
|
+
|
|
5
|
+
### Key Features
|
|
6
|
+
|
|
7
|
+
- **Core Architecture**: Request routing framework and async processing patterns
|
|
8
|
+
- **Load Balancing**: Multiple algorithms (cache-aware, power of two, consistent hashing, random, round robin)
|
|
9
|
+
- **Prefill-Decode Disaggregation**: Specialized routing for separated processing phases
|
|
10
|
+
- **Service Discovery**: Kubernetes-native worker management and health monitoring
|
|
11
|
+
- **Enterprise Features**: Circuit breakers, retry logic, metrics collection
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
### Prerequisites
|
|
16
|
+
|
|
17
|
+
**Rust and Cargo:**
|
|
18
|
+
```bash
|
|
19
|
+
# Install rustup (Rust installer and version manager)
|
|
20
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
|
21
|
+
|
|
22
|
+
# Follow the installation prompts, then reload your shell
|
|
23
|
+
source $HOME/.cargo/env
|
|
24
|
+
|
|
25
|
+
# Verify installation
|
|
26
|
+
rustc --version
|
|
27
|
+
cargo --version
|
|
28
|
+
|
|
29
|
+
# Install protobuf compiler (on Ubuntu/Debian)
|
|
30
|
+
sudo apt-get update
|
|
31
|
+
sudo apt-get install -y protobuf-compiler libprotobuf-dev
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Python with pip installed**
|
|
35
|
+
|
|
36
|
+
### Installation & Basic Usage
|
|
37
|
+
|
|
38
|
+
#### Rust Binary
|
|
39
|
+
```bash
|
|
40
|
+
# Build Rust components
|
|
41
|
+
cargo build --release
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
#### Python Package
|
|
45
|
+
```bash
|
|
46
|
+
pip install setuptools-rust wheel build
|
|
47
|
+
python -m build
|
|
48
|
+
pip install dist/*.whl
|
|
49
|
+
|
|
50
|
+
# Rebuild & reinstall in one step during development
|
|
51
|
+
python -m build && pip install --force-reinstall dist/*.whl
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Usage Examples
|
|
55
|
+
|
|
56
|
+
#### Standard Data Parallelism Routing
|
|
57
|
+
```bash
|
|
58
|
+
# Launch router with data parallelism (8 replicas per worker URL)
|
|
59
|
+
# When data-parallel-size > 1, the router automatically creates DP-aware workers
|
|
60
|
+
./target/release/vllm-router \
|
|
61
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
62
|
+
--policy consistent_hash \
|
|
63
|
+
--intra-node-data-parallel-size 8
|
|
64
|
+
|
|
65
|
+
# Alternative: using cargo run
|
|
66
|
+
cargo run --release -- \
|
|
67
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
68
|
+
--policy consistent_hash \
|
|
69
|
+
--intra-node-data-parallel-size 8
|
|
70
|
+
|
|
71
|
+
# Alternative: using python launcher
|
|
72
|
+
vllm-router \
|
|
73
|
+
--worker-urls http://worker1:8000 http://worker2:8000 \
|
|
74
|
+
--policy consistent_hash \
|
|
75
|
+
--intra-node-data-parallel-size 8
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
#### Prefill-Decode Disaggregation
|
|
79
|
+
```bash
|
|
80
|
+
# When vLLM runs the NIXL connector, prefill/decode URLs are required.
|
|
81
|
+
# See a working example in scripts/llama3.1/ folder.
|
|
82
|
+
cargo run --release -- \
|
|
83
|
+
--policy consistent_hash \
|
|
84
|
+
--vllm-pd-disaggregation \
|
|
85
|
+
--prefill http://127.0.0.1:8081 \
|
|
86
|
+
--prefill http://127.0.0.1:8082 \
|
|
87
|
+
--decode http://127.0.0.1:8083 \
|
|
88
|
+
--decode http://127.0.0.1:8084 \
|
|
89
|
+
--decode http://127.0.0.1:8085 \
|
|
90
|
+
--decode http://127.0.0.1:8086 \
|
|
91
|
+
--host 127.0.0.1 \
|
|
92
|
+
--port 8090 \
|
|
93
|
+
--intra-node-data-parallel-size 1
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# When vLLM runs the NCCL connector, ZMQ based discovery is supported.
|
|
97
|
+
# See a working example in scripts/install.sh
|
|
98
|
+
cargo run --release -- \
|
|
99
|
+
--policy consistent_hash \
|
|
100
|
+
--vllm-pd-disaggregation \
|
|
101
|
+
--vllm-discovery-address 0.0.0.0:30001 \
|
|
102
|
+
--host 0.0.0.0 \
|
|
103
|
+
--port 10001 \
|
|
104
|
+
--prefill-policy consistent_hash \
|
|
105
|
+
--decode-policy consistent_hash
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Configuration
|
|
109
|
+
|
|
110
|
+
### Metrics
|
|
111
|
+
|
|
112
|
+
Prometheus metrics endpoint available at `127.0.0.1:29000` by default.
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Custom metrics configuration
|
|
116
|
+
vllm-router \
|
|
117
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
118
|
+
--prometheus-host 0.0.0.0 \
|
|
119
|
+
--prometheus-port 9000
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Retries and Circuit Breakers
|
|
123
|
+
|
|
124
|
+
#### Retry Configuration
|
|
125
|
+
Retries are enabled by default with exponential backoff and jitter:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
vllm-router \
|
|
129
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
130
|
+
--retry-max-retries 3 \
|
|
131
|
+
--retry-initial-backoff-ms 100 \
|
|
132
|
+
--retry-max-backoff-ms 10000 \
|
|
133
|
+
--retry-backoff-multiplier 2.0 \
|
|
134
|
+
--retry-jitter-factor 0.1
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### Circuit Breaker Configuration
|
|
138
|
+
Circuit breakers protect workers and provide automatic recovery:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
vllm-router \
|
|
142
|
+
--worker-urls http://localhost:8080 http://localhost:8081 \
|
|
143
|
+
--cb-failure-threshold 5 \
|
|
144
|
+
--cb-success-threshold 2 \
|
|
145
|
+
--cb-timeout-duration-secs 30 \
|
|
146
|
+
--cb-window-duration-secs 60
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Circuit Breaker State Machine:**
|
|
150
|
+
- `Closed` → `Open` after N consecutive failures (failure-threshold)
|
|
151
|
+
- `Open` → `HalfOpen` after timeout (timeout-duration-secs)
|
|
152
|
+
- `HalfOpen` → `Closed` after M consecutive successes (success-threshold)
|
|
153
|
+
|
|
154
|
+
**Retry Policy:** Retries on HTTP status codes 408/429/500/502/503/504, with backoff/jitter between attempts.
|
|
155
|
+
|
|
156
|
+
### Request ID Tracking
|
|
157
|
+
|
|
158
|
+
Track requests across distributed systems with configurable headers:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
# Use custom request ID headers
|
|
162
|
+
vllm-router \
|
|
163
|
+
--worker-urls http://localhost:8080 \
|
|
164
|
+
--request-id-headers x-trace-id x-request-id
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Default headers:** `x-request-id`, `x-correlation-id`, `x-trace-id`, `request-id`
|
|
168
|
+
|
|
169
|
+
### Load Balancing Policies
|
|
170
|
+
|
|
171
|
+
The router supports multiple load balancing policies:
|
|
172
|
+
|
|
173
|
+
| Policy | Description | Session Affinity | Use Case |
|
|
174
|
+
|--------|-------------|------------------|----------|
|
|
175
|
+
| `round_robin` | Sequential distribution across workers | No | General purpose, even distribution |
|
|
176
|
+
| `random` | Uniform random selection | No | Simple deployments |
|
|
177
|
+
| `consistent_hash` | Routes same session/user to same worker | Yes | Multi-turn chat, KV cache reuse |
|
|
178
|
+
| `power_of_two` | Picks least loaded of two random workers | No | Load-sensitive workloads |
|
|
179
|
+
| `cache_aware` | Optimizes for prefix cache hits | Yes | Repeated prompts, few-shot |
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# Example: Using consistent_hash with HTTP header for session affinity
|
|
183
|
+
curl -X POST http://router:8000/v1/chat/completions \
|
|
184
|
+
-H "X-Session-ID: my-session-123" \
|
|
185
|
+
-H "Content-Type: application/json" \
|
|
186
|
+
-d '{"model": "llama-3", "messages": [{"role": "user", "content": "Hello!"}]}'
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
For detailed configuration options, hash key priorities, and usage examples, see [Load Balancing Documentation](docs/load_balancing/README.md).
|
|
190
|
+
|
|
191
|
+
## Advanced Features
|
|
192
|
+
|
|
193
|
+
### Kubernetes Service Discovery
|
|
194
|
+
|
|
195
|
+
Automatic worker discovery and management in Kubernetes environments.
|
|
196
|
+
|
|
197
|
+
#### Basic Service Discovery
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
vllm-router \
|
|
201
|
+
--service-discovery \
|
|
202
|
+
--selector app=vllm-worker role=inference \
|
|
203
|
+
--service-discovery-namespace default
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Command Line Arguments Reference
|
|
207
|
+
|
|
208
|
+
#### Service Discovery
|
|
209
|
+
- `--service-discovery`: Enable Kubernetes service discovery
|
|
210
|
+
- `--service-discovery-port`: Port for worker URLs (default: 8000)
|
|
211
|
+
- `--service-discovery-namespace`: Kubernetes namespace to watch
|
|
212
|
+
- `--selector`: Label selectors for regular mode (format: `key1=value1 key2=value2`)
|
|
213
|
+
|
|
214
|
+
## Development
|
|
215
|
+
|
|
216
|
+
### Troubleshooting
|
|
217
|
+
|
|
218
|
+
**VSCode Rust Analyzer Issues:**
|
|
219
|
+
Set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml`:
|
|
220
|
+
|
|
221
|
+
```json
|
|
222
|
+
{
|
|
223
|
+
"rust-analyzer.linkedProjects": ["/workspaces/vllm/vllm-router/Cargo.toml"]
|
|
224
|
+
}
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### CI/CD Pipeline
|
|
228
|
+
|
|
229
|
+
The continuous integration pipeline includes comprehensive testing, benchmarking, and publishing:
|
|
230
|
+
|
|
231
|
+
#### Build & Test
|
|
232
|
+
1. **Build Wheels**: Uses `cibuildwheel` for manylinux x86_64 packages
|
|
233
|
+
2. **Build Source Distribution**: Creates source distribution for pip fallback
|
|
234
|
+
3. **Rust HTTP Server Benchmarking**: Performance testing of router overhead
|
|
235
|
+
4. **Basic Inference Testing**: End-to-end validation through the router
|
|
236
|
+
5. **PD Disaggregation Testing**: Benchmark and sanity checks for prefill-decode load balancing
|
|
237
|
+
|
|
238
|
+
#### Publishing
|
|
239
|
+
- **PyPI Publishing**: Wheels and source distributions published when version changes in `pyproject.toml`
|
|
240
|
+
- **Container Images**: Docker images published using `/docker/Dockerfile.router`
|
|
241
|
+
|
|
242
|
+
## Acknowledgement
|
|
243
|
+
|
|
244
|
+
This project is a fork of [SGLang Model Gateway](https://github.com/sgl-project/sglang/tree/main/sgl-model-gateway), and we would like to explicitly acknowledge and thank the original authors for their work. At this stage, our fork includes only minimal changes to preserve the existing interface and ensure compatibility with vLLM. We anticipate further divergence as we pursue the roadmap we have in mind, which is the reason for creating the fork.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
2
|
+
// Only regenerate if the proto file changes
|
|
3
|
+
println!("cargo:rerun-if-changed=src/proto/vllm_scheduler.proto");
|
|
4
|
+
|
|
5
|
+
// Configure protobuf compilation with custom settings
|
|
6
|
+
let config = prost_build::Config::new();
|
|
7
|
+
|
|
8
|
+
// Skip serde for types that use prost_types::Struct
|
|
9
|
+
// These cause conflicts and we don't need serde for all generated types
|
|
10
|
+
|
|
11
|
+
// Configure tonic-build for gRPC code generation
|
|
12
|
+
tonic_build::configure()
|
|
13
|
+
// Generate both client and server code
|
|
14
|
+
.build_server(true)
|
|
15
|
+
.build_client(true)
|
|
16
|
+
// Add a module-level attribute for documentation and clippy warnings
|
|
17
|
+
.server_mod_attribute(
|
|
18
|
+
"vllm.grpc.scheduler",
|
|
19
|
+
"#[allow(unused, clippy::mixed_attributes_style)]",
|
|
20
|
+
)
|
|
21
|
+
.client_mod_attribute(
|
|
22
|
+
"vllm.grpc.scheduler",
|
|
23
|
+
"#[allow(unused, clippy::mixed_attributes_style)]",
|
|
24
|
+
)
|
|
25
|
+
// Compile the proto file with the custom config
|
|
26
|
+
.compile_protos_with_config(config, &["src/proto/vllm_scheduler.proto"], &["src/proto"])?;
|
|
27
|
+
|
|
28
|
+
println!("cargo:warning=Protobuf compilation completed successfully");
|
|
29
|
+
|
|
30
|
+
Ok(())
|
|
31
|
+
}
|