vllm-router 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. vllm_router-0.1.9/Cargo.toml +111 -0
  2. vllm_router-0.1.9/MANIFEST.in +5 -0
  3. vllm_router-0.1.9/PKG-INFO +266 -0
  4. vllm_router-0.1.9/README.md +244 -0
  5. vllm_router-0.1.9/build.rs +31 -0
  6. vllm_router-0.1.9/py_src/vllm_router/__init__.py +9 -0
  7. vllm_router-0.1.9/py_src/vllm_router/launch_router.py +109 -0
  8. vllm_router-0.1.9/py_src/vllm_router/mini_lb.py +395 -0
  9. vllm_router-0.1.9/py_src/vllm_router/router.py +148 -0
  10. vllm_router-0.1.9/py_src/vllm_router/router_args.py +592 -0
  11. vllm_router-0.1.9/py_src/vllm_router/version.py +1 -0
  12. vllm_router-0.1.9/py_src/vllm_router.egg-info/PKG-INFO +266 -0
  13. vllm_router-0.1.9/py_src/vllm_router.egg-info/SOURCES.txt +96 -0
  14. vllm_router-0.1.9/py_src/vllm_router.egg-info/dependency_links.txt +1 -0
  15. vllm_router-0.1.9/py_src/vllm_router.egg-info/entry_points.txt +2 -0
  16. vllm_router-0.1.9/py_src/vllm_router.egg-info/not-zip-safe +1 -0
  17. vllm_router-0.1.9/py_src/vllm_router.egg-info/requires.txt +11 -0
  18. vllm_router-0.1.9/py_src/vllm_router.egg-info/top_level.txt +1 -0
  19. vllm_router-0.1.9/pyproject.toml +45 -0
  20. vllm_router-0.1.9/setup.cfg +4 -0
  21. vllm_router-0.1.9/setup.py +22 -0
  22. vllm_router-0.1.9/src/config/mod.rs +28 -0
  23. vllm_router-0.1.9/src/config/types.rs +1308 -0
  24. vllm_router-0.1.9/src/config/validation.rs +848 -0
  25. vllm_router-0.1.9/src/core/circuit_breaker.rs +555 -0
  26. vllm_router-0.1.9/src/core/error.rs +240 -0
  27. vllm_router-0.1.9/src/core/mod.rs +26 -0
  28. vllm_router-0.1.9/src/core/retry.rs +409 -0
  29. vllm_router-0.1.9/src/core/token_bucket.rs +195 -0
  30. vllm_router-0.1.9/src/core/worker.rs +1947 -0
  31. vllm_router-0.1.9/src/core/worker_registry.rs +526 -0
  32. vllm_router-0.1.9/src/data_connector/mod.rs +11 -0
  33. vllm_router-0.1.9/src/data_connector/response_memory_store.rs +325 -0
  34. vllm_router-0.1.9/src/data_connector/response_noop_store.rs +53 -0
  35. vllm_router-0.1.9/src/data_connector/responses.rs +177 -0
  36. vllm_router-0.1.9/src/grpc/client.rs +254 -0
  37. vllm_router-0.1.9/src/grpc/mod.rs +8 -0
  38. vllm_router-0.1.9/src/handler.rs +84 -0
  39. vllm_router-0.1.9/src/lib.rs +521 -0
  40. vllm_router-0.1.9/src/logger.rs +56 -0
  41. vllm_router-0.1.9/src/logging.rs +163 -0
  42. vllm_router-0.1.9/src/main.rs +761 -0
  43. vllm_router-0.1.9/src/metrics.rs +1047 -0
  44. vllm_router-0.1.9/src/middleware.rs +501 -0
  45. vllm_router-0.1.9/src/policies/cache_aware.rs +554 -0
  46. vllm_router-0.1.9/src/policies/consistent_hash.rs +807 -0
  47. vllm_router-0.1.9/src/policies/factory.rs +107 -0
  48. vllm_router-0.1.9/src/policies/mod.rs +190 -0
  49. vllm_router-0.1.9/src/policies/power_of_two.rs +202 -0
  50. vllm_router-0.1.9/src/policies/random.rs +123 -0
  51. vllm_router-0.1.9/src/policies/registry.rs +334 -0
  52. vllm_router-0.1.9/src/policies/round_robin.rs +142 -0
  53. vllm_router-0.1.9/src/proto/vllm_scheduler.proto +389 -0
  54. vllm_router-0.1.9/src/protocols/mod.rs +6 -0
  55. vllm_router-0.1.9/src/protocols/spec.rs +2930 -0
  56. vllm_router-0.1.9/src/protocols/validation.rs +1221 -0
  57. vllm_router-0.1.9/src/protocols/worker_spec.rs +178 -0
  58. vllm_router-0.1.9/src/routers/factory.rs +258 -0
  59. vllm_router-0.1.9/src/routers/grpc/mod.rs +4 -0
  60. vllm_router-0.1.9/src/routers/grpc/pd_router.rs +345 -0
  61. vllm_router-0.1.9/src/routers/grpc/router.rs +283 -0
  62. vllm_router-0.1.9/src/routers/header_utils.rs +53 -0
  63. vllm_router-0.1.9/src/routers/http/dp_utils.rs +172 -0
  64. vllm_router-0.1.9/src/routers/http/logprobs_merge.rs +357 -0
  65. vllm_router-0.1.9/src/routers/http/mod.rs +10 -0
  66. vllm_router-0.1.9/src/routers/http/openai_router.rs +423 -0
  67. vllm_router-0.1.9/src/routers/http/pd_router.rs +2667 -0
  68. vllm_router-0.1.9/src/routers/http/pd_types.rs +81 -0
  69. vllm_router-0.1.9/src/routers/http/router.rs +1640 -0
  70. vllm_router-0.1.9/src/routers/http/vllm_pd_router.rs +1407 -0
  71. vllm_router-0.1.9/src/routers/http/vllm_service_discovery.rs +348 -0
  72. vllm_router-0.1.9/src/routers/mod.rs +164 -0
  73. vllm_router-0.1.9/src/routers/router_manager.rs +805 -0
  74. vllm_router-0.1.9/src/routes/interface.rs +10 -0
  75. vllm_router-0.1.9/src/routes/mod.rs +7 -0
  76. vllm_router-0.1.9/src/routes/pool_route.rs +0 -0
  77. vllm_router-0.1.9/src/routes/prefill_decode_route.rs +0 -0
  78. vllm_router-0.1.9/src/routes/round_robin_route.rs +36 -0
  79. vllm_router-0.1.9/src/routes/routing_tree_builder.rs +64 -0
  80. vllm_router-0.1.9/src/routes/single_server_route.rs +42 -0
  81. vllm_router-0.1.9/src/server.rs +811 -0
  82. vllm_router-0.1.9/src/service_discovery.rs +1174 -0
  83. vllm_router-0.1.9/src/tokenizer/chat_template.rs +182 -0
  84. vllm_router-0.1.9/src/tokenizer/factory.rs +318 -0
  85. vllm_router-0.1.9/src/tokenizer/hub.rs +238 -0
  86. vllm_router-0.1.9/src/tokenizer/huggingface.rs +234 -0
  87. vllm_router-0.1.9/src/tokenizer/mock.rs +112 -0
  88. vllm_router-0.1.9/src/tokenizer/mod.rs +123 -0
  89. vllm_router-0.1.9/src/tokenizer/sequence.rs +238 -0
  90. vllm_router-0.1.9/src/tokenizer/stop.rs +506 -0
  91. vllm_router-0.1.9/src/tokenizer/stream.rs +105 -0
  92. vllm_router-0.1.9/src/tokenizer/tests.rs +143 -0
  93. vllm_router-0.1.9/src/tokenizer/tiktoken.rs +276 -0
  94. vllm_router-0.1.9/src/tokenizer/traits.rs +83 -0
  95. vllm_router-0.1.9/src/tree.rs +1478 -0
  96. vllm_router-0.1.9/src/types.rs +74 -0
  97. vllm_router-0.1.9/src/utils/json.rs +336 -0
  98. vllm_router-0.1.9/src/utils/mod.rs +1 -0
@@ -0,0 +1,111 @@
1
+ [package]
2
+ name = "vllm_router_rs"
3
+ version = "0.1.9"
4
+ edition = "2021"
5
+
6
+ [features]
7
+ default = ["grpc-client"]
8
+ grpc-client = []
9
+ grpc-server = []
10
+
11
+ [lib]
12
+ name = "vllm_router_rs"
13
+ # Pure Rust library: Just omit crate-type (defaults to rlib)
14
+ # Python/C binding + Rust library: Use ["cdylib", "rlib"]
15
+ crate-type = ["cdylib", "rlib"]
16
+
17
+ [[bin]]
18
+ name = "vllm-router"
19
+ path = "src/main.rs"
20
+
21
+ [dependencies]
22
+ clap = { version = "4", features = ["derive"] }
23
+ axum = { version = "0.8.4", features = ["macros", "ws", "tracing"] }
24
+ tower = { version = "0.5", features = ["full"] }
25
+ tower-http = { version = "0.6", features = ["trace", "compression-gzip", "cors", "timeout", "limit", "request-id", "util"] }
26
+ serde = { version = "1.0", features = ["derive"] }
27
+ serde_json = "1.0"
28
+ bytes = "1.8.0"
29
+ rand = "0.9.2"
30
+ reqwest = { version = "0.12.8", features = ["stream", "blocking", "json"] }
31
+ futures-util = "0.3"
32
+ futures = "0.3"
33
+ pyo3 = { version = "0.24", features = ["extension-module"] }
34
+ dashmap = "6.1.0"
35
+ http = "1.1.0"
36
+ tokio = { version = "1.42.0", features = ["full"] }
37
+ async-trait = "0.1"
38
+ tracing = "0.1"
39
+ tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
40
+ tracing-log = "0.2"
41
+ tracing-appender = "0.2.3"
42
+ chrono = "0.4"
43
+ kube = { version = "1.1.0", features = ["runtime", "derive"] }
44
+ k8s-openapi = { version = "0.25.0", features = ["v1_33"] }
45
+ metrics = "0.24.2"
46
+ metrics-exporter-prometheus = "0.17.0"
47
+ uuid = { version = "1.10", features = ["v4", "serde"] }
48
+ ulid = "1.2.1"
49
+ parking_lot = "0.12.4"
50
+ thiserror = "2.0.12"
51
+ regex = "1.10"
52
+ url = "2.5.4"
53
+ tokio-stream = { version = "0.1", features = ["sync"] }
54
+ anyhow = "1.0"
55
+ tokenizers = { version = "0.22.2" }
56
+ tiktoken-rs = { version = "0.7.0" }
57
+ minijinja = { version = "2.0" }
58
+ rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
59
+ hf-hub = { version = "0.4.3", features = ["tokio"] }
60
+
61
+ # gRPC and Protobuf dependencies
62
+ tonic = { version = "0.12", features = ["tls", "gzip", "transport"] }
63
+ prost = "0.13"
64
+ prost-types = "0.13"
65
+ deadpool = { version = "0.12", features = ["managed", "rt_tokio_1"] }
66
+ backoff = { version = "0.4", features = ["tokio"] }
67
+ strum = { version = "0.26", features = ["derive"] }
68
+ once_cell = "1.21.3"
69
+ zmq = "0.10.0"
70
+ rmp-serde = "1.3"
71
+
72
+ [build-dependencies]
73
+ tonic-build = "0.12"
74
+ prost-build = "0.13"
75
+
76
+ [dev-dependencies]
77
+ criterion = { version = "0.5", features = ["html_reports"] }
78
+ tower = { version = "0.5", features = ["util"] }
79
+ http-body-util = "0.1"
80
+ portpicker = "0.1"
81
+ tempfile = "3.8"
82
+ lazy_static = "1.4"
83
+
84
+ [[bench]]
85
+ name = "request_processing"
86
+ harness = false
87
+ path = "benches/request_processing.rs"
88
+
89
+ [[bench]]
90
+ name = "tokenizer_benchmark"
91
+ harness = false
92
+ path = "benches/tokenizer_benchmark.rs"
93
+
94
+ [profile.release]
95
+ lto = "thin"
96
+ codegen-units = 1
97
+
98
+ [profile.dev]
99
+ opt-level = 0
100
+ debug = true
101
+ split-debuginfo = "unpacked"
102
+ incremental = true
103
+
104
+
105
+ [profile.dev.build-override]
106
+ opt-level = 3
107
+ codegen-units = 1
108
+
109
+ [profile.dev-opt]
110
+ inherits = "dev"
111
+ opt-level = 1
@@ -0,0 +1,5 @@
1
+ # Must include:
2
+ include Cargo.toml # Rust project configuration
3
+ include build.rs # Build script for protobuf generation
4
+ recursive-include src *.rs # Rust source files
5
+ recursive-include src/proto *.proto # Protobuf definitions
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: vllm-router
3
+ Version: 0.1.9
4
+ Summary: High-performance Rust-based load balancer for VLLM with multiple routing algorithms and prefill-decode disaggregation support
5
+ Author-email: Byron Hsu <byronhsu1230@gmail.com>
6
+ License: Apache-2.0
7
+ Classifier: Programming Language :: Python :: Implementation :: CPython
8
+ Classifier: Programming Language :: Rust
9
+ Classifier: Programming Language :: Python :: 3
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: setproctitle
13
+ Requires-Dist: aiohttp
14
+ Requires-Dist: orjson
15
+ Requires-Dist: uvicorn
16
+ Requires-Dist: fastapi
17
+ Requires-Dist: requests>=2.25.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
20
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
21
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
22
+
23
+ # vLLM Router
24
+
25
+ A high-performance and light-weight request forwarding system for vLLM large scale deployments, providing advanced load balancing methods and prefill/decode disaggregation support.
26
+
27
+ ### Key Features
28
+
29
+ - **Core Architecture**: Request routing framework and async processing patterns
30
+ - **Load Balancing**: Multiple algorithms (cache-aware, power of two, consistent hashing, random, round robin)
31
+ - **Prefill-Decode Disaggregation**: Specialized routing for separated processing phases
32
+ - **Service Discovery**: Kubernetes-native worker management and health monitoring
33
+ - **Enterprise Features**: Circuit breakers, retry logic, metrics collection
34
+
35
+ ## Quick Start
36
+
37
+ ### Prerequisites
38
+
39
+ **Rust and Cargo:**
40
+ ```bash
41
+ # Install rustup (Rust installer and version manager)
42
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
43
+
44
+ # Follow the installation prompts, then reload your shell
45
+ source $HOME/.cargo/env
46
+
47
+ # Verify installation
48
+ rustc --version
49
+ cargo --version
50
+
51
+ # Install protobuf compiler (on Ubuntu/Debian)
52
+ sudo apt-get update
53
+ sudo apt-get install -y protobuf-compiler libprotobuf-dev
54
+ ```
55
+
56
+ **Python with pip installed**
57
+
58
+ ### Installation & Basic Usage
59
+
60
+ #### Rust Binary
61
+ ```bash
62
+ # Build Rust components
63
+ cargo build --release
64
+ ```
65
+
66
+ #### Python Package
67
+ ```bash
68
+ pip install setuptools-rust wheel build
69
+ python -m build
70
+ pip install dist/*.whl
71
+
72
+ # Rebuild & reinstall in one step during development
73
+ python -m build && pip install --force-reinstall dist/*.whl
74
+ ```
75
+
76
+ ### Usage Examples
77
+
78
+ #### Standard Data Parallelism Routing
79
+ ```bash
80
+ # Launch router with data parallelism (8 replicas per worker URL)
81
+ # When data-parallel-size > 1, the router automatically creates DP-aware workers
82
+ ./target/release/vllm-router \
83
+ --worker-urls http://worker1:8000 http://worker2:8000 \
84
+ --policy consistent_hash \
85
+ --intra-node-data-parallel-size 8
86
+
87
+ # Alternative: using cargo run
88
+ cargo run --release -- \
89
+ --worker-urls http://worker1:8000 http://worker2:8000 \
90
+ --policy consistent_hash \
91
+ --intra-node-data-parallel-size 8
92
+
93
+ # Alternative: using python launcher
94
+ vllm-router \
95
+ --worker-urls http://worker1:8000 http://worker2:8000 \
96
+ --policy consistent_hash \
97
+ --intra-node-data-parallel-size 8
98
+ ```
99
+
100
+ #### Prefill-Decode Disaggregation
101
+ ```bash
102
+ # When vLLM runs the NIXL connector, prefill/decode URLs are required.
103
+ # See a working example in scripts/llama3.1/ folder.
104
+ cargo run --release -- \
105
+ --policy consistent_hash \
106
+ --vllm-pd-disaggregation \
107
+ --prefill http://127.0.0.1:8081 \
108
+ --prefill http://127.0.0.1:8082 \
109
+ --decode http://127.0.0.1:8083 \
110
+ --decode http://127.0.0.1:8084 \
111
+ --decode http://127.0.0.1:8085 \
112
+ --decode http://127.0.0.1:8086 \
113
+ --host 127.0.0.1 \
114
+ --port 8090 \
115
+     --intra-node-data-parallel-size 1
116
+
117
+
118
+ # When vLLM runs the NCCL connector, ZMQ based discovery is supported.
119
+ # See a working example in scripts/install.sh
120
+ cargo run --release -- \
121
+ --policy consistent_hash \
122
+ --vllm-pd-disaggregation \
123
+ --vllm-discovery-address 0.0.0.0:30001 \
124
+ --host 0.0.0.0 \
125
+ --port 10001 \
126
+ --prefill-policy consistent_hash \
127
+ --decode-policy consistent_hash
128
+ ```
129
+
130
+ ## Configuration
131
+
132
+ ### Metrics
133
+
134
+ The Prometheus metrics endpoint is available at `127.0.0.1:29000` by default.
135
+
136
+ ```bash
137
+ # Custom metrics configuration
138
+ vllm-router \
139
+ --worker-urls http://localhost:8080 http://localhost:8081 \
140
+ --prometheus-host 0.0.0.0 \
141
+ --prometheus-port 9000
142
+ ```
143
+
144
+ ### Retries and Circuit Breakers
145
+
146
+ #### Retry Configuration
147
+ Retries are enabled by default with exponential backoff and jitter:
148
+
149
+ ```bash
150
+ vllm-router \
151
+ --worker-urls http://localhost:8080 http://localhost:8081 \
152
+ --retry-max-retries 3 \
153
+ --retry-initial-backoff-ms 100 \
154
+ --retry-max-backoff-ms 10000 \
155
+ --retry-backoff-multiplier 2.0 \
156
+ --retry-jitter-factor 0.1
157
+ ```
158
+
159
+ #### Circuit Breaker Configuration
160
+ Circuit breakers protect workers and provide automatic recovery:
161
+
162
+ ```bash
163
+ vllm-router \
164
+ --worker-urls http://localhost:8080 http://localhost:8081 \
165
+ --cb-failure-threshold 5 \
166
+ --cb-success-threshold 2 \
167
+ --cb-timeout-duration-secs 30 \
168
+ --cb-window-duration-secs 60
169
+ ```
170
+
171
+ **Circuit Breaker State Machine:**
172
+ - `Closed` → `Open` after N consecutive failures (failure-threshold)
173
+ - `Open` → `HalfOpen` after timeout (timeout-duration-secs)
174
+ - `HalfOpen` → `Closed` after M consecutive successes (success-threshold)
175
+
176
+ **Retry Policy:** Retries on HTTP status codes 408/429/500/502/503/504, with backoff/jitter between attempts.
177
+
178
+ ### Request ID Tracking
179
+
180
+ Track requests across distributed systems with configurable headers:
181
+
182
+ ```bash
183
+ # Use custom request ID headers
184
+ vllm-router \
185
+ --worker-urls http://localhost:8080 \
186
+ --request-id-headers x-trace-id x-request-id
187
+ ```
188
+
189
+ **Default headers:** `x-request-id`, `x-correlation-id`, `x-trace-id`, `request-id`
190
+
191
+ ### Load Balancing Policies
192
+
193
+ The router supports multiple load balancing policies:
194
+
195
+ | Policy | Description | Session Affinity | Use Case |
196
+ |--------|-------------|------------------|----------|
197
+ | `round_robin` | Sequential distribution across workers | No | General purpose, even distribution |
198
+ | `random` | Uniform random selection | No | Simple deployments |
199
+ | `consistent_hash` | Routes same session/user to same worker | Yes | Multi-turn chat, KV cache reuse |
200
+ | `power_of_two` | Picks least loaded of two random workers | No | Load-sensitive workloads |
201
+ | `cache_aware` | Optimizes for prefix cache hits | Yes | Repeated prompts, few-shot |
202
+
203
+ ```bash
204
+ # Example: Using consistent_hash with HTTP header for session affinity
205
+ curl -X POST http://router:8000/v1/chat/completions \
206
+ -H "X-Session-ID: my-session-123" \
207
+ -H "Content-Type: application/json" \
208
+ -d '{"model": "llama-3", "messages": [{"role": "user", "content": "Hello!"}]}'
209
+ ```
210
+
211
+ For detailed configuration options, hash key priorities, and usage examples, see [Load Balancing Documentation](docs/load_balancing/README.md).
212
+
213
+ ## Advanced Features
214
+
215
+ ### Kubernetes Service Discovery
216
+
217
+ Automatic worker discovery and management in Kubernetes environments.
218
+
219
+ #### Basic Service Discovery
220
+
221
+ ```bash
222
+ vllm-router \
223
+ --service-discovery \
224
+ --selector app=vllm-worker role=inference \
225
+ --service-discovery-namespace default
226
+ ```
227
+
228
+ ### Command Line Arguments Reference
229
+
230
+ #### Service Discovery
231
+ - `--service-discovery`: Enable Kubernetes service discovery
232
+ - `--service-discovery-port`: Port for worker URLs (default: 8000)
233
+ - `--service-discovery-namespace`: Kubernetes namespace to watch
234
+ - `--selector`: Label selectors for regular mode (format: `key1=value1 key2=value2`)
235
+
236
+ ## Development
237
+
238
+ ### Troubleshooting
239
+
240
+ **VSCode Rust Analyzer Issues:**
241
+ Set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml`:
242
+
243
+ ```json
244
+ {
245
+ "rust-analyzer.linkedProjects": ["/workspaces/vllm/vllm-router/Cargo.toml"]
246
+ }
247
+ ```
248
+
249
+ ### CI/CD Pipeline
250
+
251
+ The continuous integration pipeline includes comprehensive testing, benchmarking, and publishing:
252
+
253
+ #### Build & Test
254
+ 1. **Build Wheels**: Uses `cibuildwheel` for manylinux x86_64 packages
255
+ 2. **Build Source Distribution**: Creates source distribution for pip fallback
256
+ 3. **Rust HTTP Server Benchmarking**: Performance testing of router overhead
257
+ 4. **Basic Inference Testing**: End-to-end validation through the router
258
+ 5. **PD Disaggregation Testing**: Benchmark and sanity checks for prefill-decode load balancing
259
+
260
+ #### Publishing
261
+ - **PyPI Publishing**: Wheels and source distributions published when version changes in `pyproject.toml`
262
+ - **Container Images**: Docker images published using `/docker/Dockerfile.router`
263
+
264
+ ## Acknowledgement
265
+
266
+ This project is a fork of [SGLang Model Gateway](https://github.com/sgl-project/sglang/tree/main/sgl-model-gateway), and we would like to explicitly acknowledge and thank the original authors for their work. At this stage, our fork includes only minimal changes to preserve the existing interface and ensure compatibility with vLLM. We anticipate further divergence as we pursue the roadmap we have in mind, which is the reason for creating the fork.
@@ -0,0 +1,244 @@
1
+ # vLLM Router
2
+
3
+ A high-performance and light-weight request forwarding system for vLLM large scale deployments, providing advanced load balancing methods and prefill/decode disaggregation support.
4
+
5
+ ### Key Features
6
+
7
+ - **Core Architecture**: Request routing framework and async processing patterns
8
+ - **Load Balancing**: Multiple algorithms (cache-aware, power of two, consistent hashing, random, round robin)
9
+ - **Prefill-Decode Disaggregation**: Specialized routing for separated processing phases
10
+ - **Service Discovery**: Kubernetes-native worker management and health monitoring
11
+ - **Enterprise Features**: Circuit breakers, retry logic, metrics collection
12
+
13
+ ## Quick Start
14
+
15
+ ### Prerequisites
16
+
17
+ **Rust and Cargo:**
18
+ ```bash
19
+ # Install rustup (Rust installer and version manager)
20
+ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
21
+
22
+ # Follow the installation prompts, then reload your shell
23
+ source $HOME/.cargo/env
24
+
25
+ # Verify installation
26
+ rustc --version
27
+ cargo --version
28
+
29
+ # Install protobuf compiler (on Ubuntu/Debian)
30
+ sudo apt-get update
31
+ sudo apt-get install -y protobuf-compiler libprotobuf-dev
32
+ ```
33
+
34
+ **Python with pip installed**
35
+
36
+ ### Installation & Basic Usage
37
+
38
+ #### Rust Binary
39
+ ```bash
40
+ # Build Rust components
41
+ cargo build --release
42
+ ```
43
+
44
+ #### Python Package
45
+ ```bash
46
+ pip install setuptools-rust wheel build
47
+ python -m build
48
+ pip install dist/*.whl
49
+
50
+ # Rebuild & reinstall in one step during development
51
+ python -m build && pip install --force-reinstall dist/*.whl
52
+ ```
53
+
54
+ ### Usage Examples
55
+
56
+ #### Standard Data Parallelism Routing
57
+ ```bash
58
+ # Launch router with data parallelism (8 replicas per worker URL)
59
+ # When data-parallel-size > 1, the router automatically creates DP-aware workers
60
+ ./target/release/vllm-router \
61
+ --worker-urls http://worker1:8000 http://worker2:8000 \
62
+ --policy consistent_hash \
63
+ --intra-node-data-parallel-size 8
64
+
65
+ # Alternative: using cargo run
66
+ cargo run --release -- \
67
+ --worker-urls http://worker1:8000 http://worker2:8000 \
68
+ --policy consistent_hash \
69
+ --intra-node-data-parallel-size 8
70
+
71
+ # Alternative: using python launcher
72
+ vllm-router \
73
+ --worker-urls http://worker1:8000 http://worker2:8000 \
74
+ --policy consistent_hash \
75
+ --intra-node-data-parallel-size 8
76
+ ```
77
+
78
+ #### Prefill-Decode Disaggregation
79
+ ```bash
80
+ # When vLLM runs the NIXL connector, prefill/decode URLs are required.
81
+ # See a working example in scripts/llama3.1/ folder.
82
+ cargo run --release -- \
83
+ --policy consistent_hash \
84
+ --vllm-pd-disaggregation \
85
+ --prefill http://127.0.0.1:8081 \
86
+ --prefill http://127.0.0.1:8082 \
87
+ --decode http://127.0.0.1:8083 \
88
+ --decode http://127.0.0.1:8084 \
89
+ --decode http://127.0.0.1:8085 \
90
+ --decode http://127.0.0.1:8086 \
91
+ --host 127.0.0.1 \
92
+ --port 8090 \
93
+ --intra-node-data-parallel-size 1 \
94
+
95
+
96
+ # When vLLM runs the NCCL connector, ZMQ based discovery is supported.
97
+ # See a working example in scripts/install.sh
98
+ cargo run --release -- \
99
+ --policy consistent_hash \
100
+ --vllm-pd-disaggregation \
101
+ --vllm-discovery-address 0.0.0.0:30001 \
102
+ --host 0.0.0.0 \
103
+ --port 10001 \
104
+ --prefill-policy consistent_hash \
105
+ --decode-policy consistent_hash
106
+ ```
107
+
108
+ ## Configuration
109
+
110
+ ### Metrics
111
+
112
+ Prometheus metrics endpoint available at `127.0.0.1:29000` by default.
113
+
114
+ ```bash
115
+ # Custom metrics configuration
116
+ vllm-router \
117
+ --worker-urls http://localhost:8080 http://localhost:8081 \
118
+ --prometheus-host 0.0.0.0 \
119
+ --prometheus-port 9000
120
+ ```
121
+
122
+ ### Retries and Circuit Breakers
123
+
124
+ #### Retry Configuration
125
+ Retries are enabled by default with exponential backoff and jitter:
126
+
127
+ ```bash
128
+ vllm-router \
129
+ --worker-urls http://localhost:8080 http://localhost:8081 \
130
+ --retry-max-retries 3 \
131
+ --retry-initial-backoff-ms 100 \
132
+ --retry-max-backoff-ms 10000 \
133
+ --retry-backoff-multiplier 2.0 \
134
+ --retry-jitter-factor 0.1
135
+ ```
136
+
137
+ #### Circuit Breaker Configuration
138
+ Circuit breakers protect workers and provide automatic recovery:
139
+
140
+ ```bash
141
+ vllm-router \
142
+ --worker-urls http://localhost:8080 http://localhost:8081 \
143
+ --cb-failure-threshold 5 \
144
+ --cb-success-threshold 2 \
145
+ --cb-timeout-duration-secs 30 \
146
+ --cb-window-duration-secs 60
147
+ ```
148
+
149
+ **Circuit Breaker State Machine:**
150
+ - `Closed` → `Open` after N consecutive failures (failure-threshold)
151
+ - `Open` → `HalfOpen` after timeout (timeout-duration-secs)
152
+ - `HalfOpen` → `Closed` after M consecutive successes (success-threshold)
153
+
154
+ **Retry Policy:** Retries on HTTP status codes 408/429/500/502/503/504, with backoff/jitter between attempts.
155
+
156
+ ### Request ID Tracking
157
+
158
+ Track requests across distributed systems with configurable headers:
159
+
160
+ ```bash
161
+ # Use custom request ID headers
162
+ vllm-router \
163
+ --worker-urls http://localhost:8080 \
164
+ --request-id-headers x-trace-id x-request-id
165
+ ```
166
+
167
+ **Default headers:** `x-request-id`, `x-correlation-id`, `x-trace-id`, `request-id`
168
+
169
+ ### Load Balancing Policies
170
+
171
+ The router supports multiple load balancing policies:
172
+
173
+ | Policy | Description | Session Affinity | Use Case |
174
+ |--------|-------------|------------------|----------|
175
+ | `round_robin` | Sequential distribution across workers | No | General purpose, even distribution |
176
+ | `random` | Uniform random selection | No | Simple deployments |
177
+ | `consistent_hash` | Routes same session/user to same worker | Yes | Multi-turn chat, KV cache reuse |
178
+ | `power_of_two` | Picks least loaded of two random workers | No | Load-sensitive workloads |
179
+ | `cache_aware` | Optimizes for prefix cache hits | Yes | Repeated prompts, few-shot |
180
+
181
+ ```bash
182
+ # Example: Using consistent_hash with HTTP header for session affinity
183
+ curl -X POST http://router:8000/v1/chat/completions \
184
+ -H "X-Session-ID: my-session-123" \
185
+ -H "Content-Type: application/json" \
186
+ -d '{"model": "llama-3", "messages": [{"role": "user", "content": "Hello!"}]}'
187
+ ```
188
+
189
+ For detailed configuration options, hash key priorities, and usage examples, see [Load Balancing Documentation](docs/load_balancing/README.md).
190
+
191
+ ## Advanced Features
192
+
193
+ ### Kubernetes Service Discovery
194
+
195
+ Automatic worker discovery and management in Kubernetes environments.
196
+
197
+ #### Basic Service Discovery
198
+
199
+ ```bash
200
+ vllm-router \
201
+ --service-discovery \
202
+ --selector app=vllm-worker role=inference \
203
+ --service-discovery-namespace default
204
+ ```
205
+
206
+ ### Command Line Arguments Reference
207
+
208
+ #### Service Discovery
209
+ - `--service-discovery`: Enable Kubernetes service discovery
210
+ - `--service-discovery-port`: Port for worker URLs (default: 8000)
211
+ - `--service-discovery-namespace`: Kubernetes namespace to watch
212
+ - `--selector`: Label selectors for regular mode (format: `key1=value1 key2=value2`)
213
+
214
+ ## Development
215
+
216
+ ### Troubleshooting
217
+
218
+ **VSCode Rust Analyzer Issues:**
219
+ Set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml`:
220
+
221
+ ```json
222
+ {
223
+ "rust-analyzer.linkedProjects": ["/workspaces/vllm/vllm-router/Cargo.toml"]
224
+ }
225
+ ```
226
+
227
+ ### CI/CD Pipeline
228
+
229
+ The continuous integration pipeline includes comprehensive testing, benchmarking, and publishing:
230
+
231
+ #### Build & Test
232
+ 1. **Build Wheels**: Uses `cibuildwheel` for manylinux x86_64 packages
233
+ 2. **Build Source Distribution**: Creates source distribution for pip fallback
234
+ 3. **Rust HTTP Server Benchmarking**: Performance testing of router overhead
235
+ 4. **Basic Inference Testing**: End-to-end validation through the router
236
+ 5. **PD Disaggregation Testing**: Benchmark and sanity checks for prefill-decode load balancing
237
+
238
+ #### Publishing
239
+ - **PyPI Publishing**: Wheels and source distributions published when version changes in `pyproject.toml`
240
+ - **Container Images**: Docker images published using `/docker/Dockerfile.router`
241
+
242
+ ## Acknowledgement
243
+
244
+ This project is a fork of [SGLang Model Gateway](https://github.com/sgl-project/sglang/tree/main/sgl-model-gateway), and we would like to explicitly acknowledge and thank the original authors for their work. At this stage, our fork includes only minimal changes to preserve the existing interface and ensure compatibility with vLLM. We anticipate further divergence as we pursue the roadmap we have in mind, which is the reason for creating the fork.
@@ -0,0 +1,31 @@
1
+ fn main() -> Result<(), Box<dyn std::error::Error>> {
2
+ // Only regenerate if the proto file changes
3
+ println!("cargo:rerun-if-changed=src/proto/vllm_scheduler.proto");
4
+
5
+ // Configure protobuf compilation with custom settings
6
+ let config = prost_build::Config::new();
7
+
8
+ // Skip serde for types that use prost_types::Struct
9
+ // These cause conflicts and we don't need serde for all generated types
10
+
11
+ // Configure tonic-build for gRPC code generation
12
+ tonic_build::configure()
13
+ // Generate both client and server code
14
+ .build_server(true)
15
+ .build_client(true)
16
+ // Add a module-level attribute for documentation and clippy warnings
17
+ .server_mod_attribute(
18
+ "vllm.grpc.scheduler",
19
+ "#[allow(unused, clippy::mixed_attributes_style)]",
20
+ )
21
+ .client_mod_attribute(
22
+ "vllm.grpc.scheduler",
23
+ "#[allow(unused, clippy::mixed_attributes_style)]",
24
+ )
25
+ // Compile the proto file with the custom config
26
+ .compile_protos_with_config(config, &["src/proto/vllm_scheduler.proto"], &["src/proto"])?;
27
+
28
+ println!("cargo:warning=Protobuf compilation completed successfully");
29
+
30
+ Ok(())
31
+ }
@@ -0,0 +1,9 @@
1
+ from vllm_router.version import __version__
2
+
3
+ try:
4
+ from vllm_router.router import Router
5
+
6
+ __all__ = ["__version__", "Router"]
7
+ except ImportError:
8
+ # Router is not available if Rust extension is not built
9
+ __all__ = ["__version__"]