vllm-sr 0.1.0b2.dev20260204070724__tar.gz → 0.1.0b2.dev20260204090051__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {vllm_sr-0.1.0b2.dev20260204070724/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260204090051}/PKG-INFO +1 -1
  2. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/merger.py +13 -1
  3. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/models.py +2 -1
  4. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/config.template.yaml +74 -7
  5. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/pyproject.toml +1 -1
  6. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051/vllm_sr.egg-info}/PKG-INFO +1 -1
  7. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/MANIFEST.in +0 -0
  8. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/README.md +0 -0
  9. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/__init__.py +0 -0
  10. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/__init__.py +0 -0
  11. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/config.py +0 -0
  12. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/generate.py +0 -0
  13. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/init.py +0 -0
  14. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/serve.py +0 -0
  15. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/show_config.py +0 -0
  16. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/show_defaults.py +0 -0
  17. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/commands/validate.py +0 -0
  18. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/config_generator.py +0 -0
  19. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/consts.py +0 -0
  20. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/core.py +0 -0
  21. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/defaults.py +0 -0
  22. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/docker_cli.py +0 -0
  23. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/logo.py +0 -0
  24. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/main.py +0 -0
  25. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/parser.py +0 -0
  26. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/envoy.template.yaml +0 -0
  27. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/generate_dashboard.py +0 -0
  28. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/grafana-dashboard.serve.yaml +0 -0
  29. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/grafana-datasource-jaeger.serve.yaml +0 -0
  30. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/grafana-datasource.serve.yaml +0 -0
  31. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/grafana.serve.ini +0 -0
  32. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/llm-router-dashboard.serve.json +0 -0
  33. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/prometheus.serve.yaml +0 -0
  34. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/router-defaults.yaml +0 -0
  35. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/templates/tools_db.json +0 -0
  36. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/utils.py +0 -0
  37. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/cli/validator.py +0 -0
  38. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/requirements.txt +0 -0
  39. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/setup.cfg +0 -0
  40. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/tests/test_plugin_parsing.py +0 -0
  41. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/tests/test_plugin_yaml_generation.py +0 -0
  42. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/vllm_sr.egg-info/SOURCES.txt +0 -0
  43. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/vllm_sr.egg-info/dependency_links.txt +0 -0
  44. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/vllm_sr.egg-info/entry_points.txt +0 -0
  45. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/vllm_sr.egg-info/requires.txt +0 -0
  46. {vllm_sr-0.1.0b2.dev20260204070724 → vllm_sr-0.1.0b2.dev20260204090051}/vllm_sr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vllm-sr
3
- Version: 0.1.0b2.dev20260204070724
3
+ Version: 0.1.0b2.dev20260204090051
4
4
  Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
5
5
  Author: vLLM-SR Team
6
6
  License: Apache-2.0
@@ -154,8 +154,20 @@ def translate_latency_signals(latencies: list) -> list:
154
154
  for signal in latencies:
155
155
  rule = {
156
156
  "name": signal.name,
157
- "max_tpot": signal.max_tpot,
158
157
  }
158
+ # At least one of tpot_percentile or ttft_percentile should be set
159
+ if signal.tpot_percentile is not None and signal.tpot_percentile > 0:
160
+ rule["tpot_percentile"] = signal.tpot_percentile
161
+ if signal.ttft_percentile is not None and signal.ttft_percentile > 0:
162
+ rule["ttft_percentile"] = signal.ttft_percentile
163
+
164
+ # Validate that at least one is set
165
+ if "tpot_percentile" not in rule and "ttft_percentile" not in rule:
166
+ log.warn(
167
+ f"Latency signal '{signal.name}' has neither tpot_percentile nor ttft_percentile set, skipping"
168
+ )
169
+ continue
170
+
159
171
  if signal.description:
160
172
  rule["description"] = signal.description
161
173
  rules.append(rule)
@@ -72,7 +72,8 @@ class Latency(BaseModel):
72
72
  """Latency signal configuration."""
73
73
 
74
74
  name: str
75
- max_tpot: float
75
+ tpot_percentile: Optional[int] = None
76
+ ttft_percentile: Optional[int] = None
76
77
  description: str
77
78
 
78
79
 
@@ -177,14 +177,48 @@ signals:
177
177
  - name: "ja"
178
178
  description: "Japanese language queries"
179
179
 
180
- # latency - Latency-based routing signals (TPOT-based)
180
+ # latency - Latency-based routing signals (TPOT and TTFT percentile-based)
181
+ # Percentile-based rules adapt to each model's actual performance distribution
182
+ # Works with any number of observations (1+): uses average for 1-2, percentile for 3+
183
+ #
184
+ # ⚠️ RECOMMENDATION: Use BOTH tpot_percentile AND ttft_percentile for comprehensive latency evaluation
185
+ # - TPOT: Measures token generation speed (throughput)
186
+ # - TTFT: Measures first token latency (user-perceived latency)
187
+ # - Together: Complete latency picture for optimal routing decisions
188
+ #
189
+ # You CAN use only one if needed for specific use cases:
190
+ # - TPOT only: Batch processing, cost optimization (throughput matters)
191
+ # - TTFT only: Real-time chat (user perception matters)
192
+ # - Both: RECOMMENDED for most applications (comprehensive evaluation)
193
+ #
194
+ # At least one of tpot_percentile or ttft_percentile must be set (validation requirement)
195
+ # When both are set, model must meet BOTH thresholds (AND logic)
181
196
  latency:
182
- - name: "low_latency"
183
- max_tpot: 0.05 # 50ms per token
184
- description: "For real-time chat applications requiring fast responses"
185
- - name: "medium_latency"
186
- max_tpot: 0.15 # 150ms per token
187
- description: "For standard applications with moderate latency tolerance"
197
+ # Example 1: RECOMMENDED - Both TPOT and TTFT percentiles (comprehensive latency evaluation)
198
+ - name: "low_latency_comprehensive"
199
+ tpot_percentile: 10 # 10th percentile for TPOT (top 10% fastest token generation)
200
+ ttft_percentile: 10 # 10th percentile for TTFT (top 10% fastest first token)
201
+ description: "RECOMMENDED: For real-time applications - fast start and fast generation"
202
+
203
+ # Example 2: Different percentiles for different priorities
204
+ - name: "balanced_latency"
205
+ tpot_percentile: 50 # Median TPOT (top 50%)
206
+ ttft_percentile: 10 # Top 10% TTFT (prioritize fast start)
207
+ description: "Prioritize fast start, accept moderate generation speed"
208
+
209
+ # Example 3: TPOT percentile only (use case: batch processing, cost optimization)
210
+ # ⚠️ Note: Only using one metric is allowed but not recommended for most use cases
211
+ - name: "batch_processing_optimized"
212
+ tpot_percentile: 10 # 10th percentile for TPOT (top 10% fastest token generation)
213
+ # ttft_percentile: not set - acceptable for batch processing where throughput matters
214
+ description: "For batch processing where throughput (TPOT) is critical, TTFT less important"
215
+
216
+ # Example 4: TTFT percentile only (use case: real-time chat where first token matters)
217
+ # ⚠️ Note: Only using one metric is allowed but not recommended for most use cases
218
+ - name: "chat_fast_start"
219
+ ttft_percentile: 10 # 10th percentile for TTFT (top 10% fastest first token)
220
+ # tpot_percentile: not set - acceptable for chat apps where user perception matters
221
+ description: "For chat applications where fast first token (TTFT) is critical for UX"
188
222
 
189
223
  # context - Context length signals (Token Count)
190
224
  context:
@@ -431,6 +465,39 @@ decisions:
431
465
  enabled: true
432
466
  similarity_threshold: 0.85
433
467
 
468
+ # Latency-based routing example: Route to models that meet latency requirements
469
+ - name: "low_latency_route"
470
+ description: "Route to models with low latency (fast TPOT and TTFT)"
471
+ priority: 90
472
+ rules:
473
+ operator: "AND"
474
+ conditions:
475
+ - type: "latency"
476
+ name: "low_latency_comprehensive" # Requires both TPOT and TTFT percentiles
477
+ modelRefs:
478
+ - model: "openai/gpt-oss-120b"
479
+ use_reasoning: false
480
+ plugins:
481
+ - type: "system_prompt"
482
+ configuration:
483
+ system_prompt: "Provide fast, concise responses suitable for real-time applications."
484
+
485
+ - name: "fast_start_route"
486
+ description: "Route to models with fast first token (prioritize TTFT for chat apps)"
487
+ priority: 85
488
+ rules:
489
+ operator: "AND"
490
+ conditions:
491
+ - type: "latency"
492
+ name: "chat_fast_start" # Only requires TTFT percentile
493
+ modelRefs:
494
+ - model: "openai/gpt-oss-120b"
495
+ use_reasoning: false
496
+ plugins:
497
+ - type: "system_prompt"
498
+ configuration:
499
+ system_prompt: "Start responding quickly. User is waiting for immediate feedback."
500
+
434
501
  # Size-aware routing example: Try smaller models first, escalate if confidence is low
435
502
  - name: "confidence_route"
436
503
  description: "Cost-efficient routing: try small model first, escalate if needed"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "vllm-sr"
7
- version = "0.1.0.beta.2.dev20260204070724"
7
+ version = "0.1.0.beta.2.dev20260204090051"
8
8
  description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
9
9
  authors = [{name = "vLLM-SR Team"}]
10
10
  readme = "README.md"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vllm-sr
3
- Version: 0.1.0b2.dev20260204070724
3
+ Version: 0.1.0b2.dev20260204090051
4
4
  Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
5
5
  Author: vLLM-SR Team
6
6
  License: Apache-2.0