vllm-sr 0.1.0b2.dev20260203201608__tar.gz → 0.1.0b2.dev20260204071119__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {vllm_sr-0.1.0b2.dev20260203201608/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260204071119}/PKG-INFO +1 -1
  2. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/merger.py +13 -1
  3. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/models.py +48 -2
  4. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/config.template.yaml +127 -7
  5. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/envoy.template.yaml +11 -11
  6. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/router-defaults.yaml +1 -1
  7. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/pyproject.toml +1 -1
  8. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119/vllm_sr.egg-info}/PKG-INFO +1 -1
  9. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/MANIFEST.in +0 -0
  10. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/README.md +0 -0
  11. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/__init__.py +0 -0
  12. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/__init__.py +0 -0
  13. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/config.py +0 -0
  14. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/generate.py +0 -0
  15. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/init.py +0 -0
  16. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/serve.py +0 -0
  17. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/show_config.py +0 -0
  18. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/show_defaults.py +0 -0
  19. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/commands/validate.py +0 -0
  20. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/config_generator.py +0 -0
  21. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/consts.py +0 -0
  22. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/core.py +0 -0
  23. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/defaults.py +0 -0
  24. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/docker_cli.py +0 -0
  25. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/logo.py +0 -0
  26. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/main.py +0 -0
  27. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/parser.py +0 -0
  28. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/generate_dashboard.py +0 -0
  29. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/grafana-dashboard.serve.yaml +0 -0
  30. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/grafana-datasource-jaeger.serve.yaml +0 -0
  31. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/grafana-datasource.serve.yaml +0 -0
  32. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/grafana.serve.ini +0 -0
  33. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/llm-router-dashboard.serve.json +0 -0
  34. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/prometheus.serve.yaml +0 -0
  35. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/templates/tools_db.json +0 -0
  36. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/utils.py +0 -0
  37. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/cli/validator.py +0 -0
  38. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/requirements.txt +0 -0
  39. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/setup.cfg +0 -0
  40. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/tests/test_plugin_parsing.py +0 -0
  41. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/tests/test_plugin_yaml_generation.py +0 -0
  42. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/vllm_sr.egg-info/SOURCES.txt +0 -0
  43. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/vllm_sr.egg-info/dependency_links.txt +0 -0
  44. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/vllm_sr.egg-info/entry_points.txt +0 -0
  45. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/vllm_sr.egg-info/requires.txt +0 -0
  46. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204071119}/vllm_sr.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204071119
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0
cli/merger.py
@@ -154,8 +154,20 @@ def translate_latency_signals(latencies: list) -> list:
     for signal in latencies:
         rule = {
             "name": signal.name,
-            "max_tpot": signal.max_tpot,
         }
+        # At least one of tpot_percentile or ttft_percentile should be set
+        if signal.tpot_percentile is not None and signal.tpot_percentile > 0:
+            rule["tpot_percentile"] = signal.tpot_percentile
+        if signal.ttft_percentile is not None and signal.ttft_percentile > 0:
+            rule["ttft_percentile"] = signal.ttft_percentile
+
+        # Validate that at least one is set
+        if "tpot_percentile" not in rule and "ttft_percentile" not in rule:
+            log.warn(
+                f"Latency signal '{signal.name}' has neither tpot_percentile nor ttft_percentile set, skipping"
+            )
+            continue
+
         if signal.description:
             rule["description"] = signal.description
         rules.append(rule)
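
For readers skimming the diff, the following self-contained sketch mirrors the percentile-based translation added to cli/merger.py above. The Signal dataclass, the logging setup, and the sample values are illustrative stand-ins only; the real function operates on the package's Latency model shown in the next hunk.

    # Illustrative stand-alone sketch of the translation logic added above.
    # "Signal" is a stand-in for the package's Latency model; sample values are hypothetical.
    import logging
    from dataclasses import dataclass
    from typing import Optional

    log = logging.getLogger(__name__)

    @dataclass
    class Signal:
        name: str
        description: str = ""
        tpot_percentile: Optional[int] = None
        ttft_percentile: Optional[int] = None

    def translate_latency_signals(latencies: list) -> list:
        rules = []
        for signal in latencies:
            rule = {"name": signal.name}
            # Copy over whichever percentile thresholds are set (> 0)
            if signal.tpot_percentile is not None and signal.tpot_percentile > 0:
                rule["tpot_percentile"] = signal.tpot_percentile
            if signal.ttft_percentile is not None and signal.ttft_percentile > 0:
                rule["ttft_percentile"] = signal.ttft_percentile
            # Signals with neither threshold are dropped with a warning
            if "tpot_percentile" not in rule and "ttft_percentile" not in rule:
                log.warning("Latency signal '%s' has neither percentile set, skipping", signal.name)
                continue
            if signal.description:
                rule["description"] = signal.description
            rules.append(rule)
        return rules

    print(translate_latency_signals([
        Signal("low_latency_comprehensive", "fast start and fast generation", 10, 10),
        Signal("misconfigured"),  # skipped: no thresholds set
    ]))
    # [{'name': 'low_latency_comprehensive', 'tpot_percentile': 10,
    #   'ttft_percentile': 10, 'description': 'fast start and fast generation'}]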
cli/models.py
@@ -72,7 +72,8 @@ class Latency(BaseModel):
     """Latency signal configuration."""
 
     name: str
-    max_tpot: float
+    tpot_percentile: Optional[int] = None
+    ttft_percentile: Optional[int] = None
     description: str
 
 
@@ -192,18 +193,63 @@ class ConcurrentAlgorithmConfig(BaseModel):
     on_error: Optional[str] = "skip"
 
 
+class ReMoMAlgorithmConfig(BaseModel):
+    """Configuration for ReMoM (Reasoning for Mixture of Models) algorithm.
+
+    This algorithm performs multi-round parallel reasoning with intelligent synthesis.
+    Inspired by PaCoRe (arXiv:2601.05593) but extended to support mixture of models.
+    """
+
+    # Breadth schedule: array of parallel calls per round (e.g., [32, 4] means 32 calls in round 1, 4 in round 2, then 1 final)
+    breadth_schedule: list[int]
+
+    # Model distribution strategy: "weighted", "equal", or "first_only"
+    model_distribution: Optional[str] = "weighted"
+
+    # Temperature for model calls (default: 1.0 for diverse exploration)
+    temperature: Optional[float] = 1.0
+
+    # Whether to include reasoning content in synthesis prompts
+    include_reasoning: Optional[bool] = False
+
+    # Compaction strategy: "full" or "last_n_tokens"
+    compaction_strategy: Optional[str] = "full"
+
+    # Number of tokens to keep when using last_n_tokens compaction
+    compaction_tokens: Optional[int] = 1000
+
+    # Custom synthesis template (uses default if not provided)
+    synthesis_template: Optional[str] = None
+
+    # Maximum concurrent model calls per round
+    max_concurrent: Optional[int] = None
+
+    # Behavior on model call failure: "skip" or "fail"
+    on_error: Optional[str] = "skip"
+
+    # Random seed for shuffling responses (for reproducibility)
+    shuffle_seed: Optional[int] = 42
+
+    # Whether to include intermediate responses in the response body for visualization
+    include_intermediate_responses: Optional[bool] = True
+
+    # Maximum number of responses to keep per round (for memory efficiency)
+    max_responses_per_round: Optional[int] = None
+
+
 class AlgorithmConfig(BaseModel):
     """Algorithm configuration for multi-model decisions.
 
     Specifies how multiple models in a decision should be orchestrated.
     """
 
-    # Algorithm type: "sequential", "confidence", "concurrent"
+    # Algorithm type: "sequential", "confidence", "concurrent", "remom"
     type: str
 
     # Algorithm-specific configurations (only one should be set based on type)
     confidence: Optional[ConfidenceAlgorithmConfig] = None
     concurrent: Optional[ConcurrentAlgorithmConfig] = None
+    remom: Optional[ReMoMAlgorithmConfig] = None
 
 
 class PluginType(str, Enum):
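
As a rough usage sketch (not the package's own code), the standalone Pydantic models below reproduce a subset of the fields added to cli/models.py above and show how a "remom" algorithm block would be parsed from config data; the sample values are hypothetical.

    # Sketch only: a subset of ReMoMAlgorithmConfig / AlgorithmConfig fields from the hunk above,
    # reproduced standalone to show how a "remom" block is parsed.
    from typing import Optional
    from pydantic import BaseModel

    class ReMoMAlgorithmConfig(BaseModel):
        breadth_schedule: list[int]          # parallel calls per round, e.g. [32, 4]
        temperature: Optional[float] = 1.0
        include_reasoning: Optional[bool] = False
        max_concurrent: Optional[int] = None
        on_error: Optional[str] = "skip"
        shuffle_seed: Optional[int] = 42

    class AlgorithmConfig(BaseModel):
        type: str                            # "sequential", "confidence", "concurrent", "remom"
        remom: Optional[ReMoMAlgorithmConfig] = None

    cfg = AlgorithmConfig(
        type="remom",
        remom={"breadth_schedule": [32, 4], "max_concurrent": 4},  # hypothetical values
    )
    print(cfg.remom.breadth_schedule, cfg.remom.on_error)  # [32, 4] skip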
cli/templates/config.template.yaml
@@ -177,14 +177,48 @@ signals:
     - name: "ja"
       description: "Japanese language queries"
 
-  # latency - Latency-based routing signals (TPOT-based)
+  # latency - Latency-based routing signals (TPOT and TTFT percentile-based)
+  # Percentile-based rules adapt to each model's actual performance distribution
+  # Works with any number of observations (1+): uses average for 1-2, percentile for 3+
+  #
+  # ⚠️ RECOMMENDATION: Use BOTH tpot_percentile AND ttft_percentile for comprehensive latency evaluation
+  #   - TPOT: Measures token generation speed (throughput)
+  #   - TTFT: Measures first token latency (user-perceived latency)
+  #   - Together: Complete latency picture for optimal routing decisions
+  #
+  # You CAN use only one if needed for specific use cases:
+  #   - TPOT only: Batch processing, cost optimization (throughput matters)
+  #   - TTFT only: Real-time chat (user perception matters)
+  #   - Both: RECOMMENDED for most applications (comprehensive evaluation)
+  #
+  # At least one of tpot_percentile or ttft_percentile must be set (validation requirement)
+  # When both are set, model must meet BOTH thresholds (AND logic)
   latency:
-    - name: "low_latency"
-      max_tpot: 0.05  # 50ms per token
-      description: "For real-time chat applications requiring fast responses"
-    - name: "medium_latency"
-      max_tpot: 0.15  # 150ms per token
-      description: "For standard applications with moderate latency tolerance"
+    # Example 1: RECOMMENDED - Both TPOT and TTFT percentiles (comprehensive latency evaluation)
+    - name: "low_latency_comprehensive"
+      tpot_percentile: 10  # 10th percentile for TPOT (top 10% fastest token generation)
+      ttft_percentile: 10  # 10th percentile for TTFT (top 10% fastest first token)
+      description: "RECOMMENDED: For real-time applications - fast start and fast generation"
+
+    # Example 2: Different percentiles for different priorities
+    - name: "balanced_latency"
+      tpot_percentile: 50  # Median TPOT (top 50%)
+      ttft_percentile: 10  # Top 10% TTFT (prioritize fast start)
+      description: "Prioritize fast start, accept moderate generation speed"
+
+    # Example 3: TPOT percentile only (use case: batch processing, cost optimization)
+    # ⚠️ Note: Only using one metric is allowed but not recommended for most use cases
+    - name: "batch_processing_optimized"
+      tpot_percentile: 10  # 10th percentile for TPOT (top 10% fastest token generation)
+      # ttft_percentile: not set - acceptable for batch processing where throughput matters
+      description: "For batch processing where throughput (TPOT) is critical, TTFT less important"
+
+    # Example 4: TTFT percentile only (use case: real-time chat where first token matters)
+    # ⚠️ Note: Only using one metric is allowed but not recommended for most use cases
+    - name: "chat_fast_start"
+      ttft_percentile: 10  # 10th percentile for TTFT (top 10% fastest first token)
+      # tpot_percentile: not set - acceptable for chat apps where user perception matters
+      description: "For chat applications where fast first token (TTFT) is critical for UX"
 
   # context - Context length signals (Token Count)
   context:
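
The observation rule stated in the comments above ("uses average for 1-2, percentile for 3+") can be pictured with a short sketch. The helper name and sample latencies below are hypothetical; this illustrates only the documented rule, not the router's actual evaluation code.

    # Illustration of the documented observation rule only; not the router's implementation.
    from statistics import mean, quantiles

    def latency_stat(samples: list[float], percentile: int) -> float:
        if len(samples) < 3:
            return mean(samples)  # 1-2 observations: fall back to the average
        return quantiles(samples, n=100)[percentile - 1]  # 3+ observations: pth percentile

    ttft_seconds = [0.21, 0.18, 0.25, 0.40, 0.19]  # hypothetical per-model observations
    print(round(latency_stat(ttft_seconds, 10), 3))  # fast (10th-percentile) end of the distribution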
@@ -431,6 +465,39 @@ decisions:
           enabled: true
           similarity_threshold: 0.85
 
+  # Latency-based routing example: Route to models that meet latency requirements
+  - name: "low_latency_route"
+    description: "Route to models with low latency (fast TPOT and TTFT)"
+    priority: 90
+    rules:
+      operator: "AND"
+      conditions:
+        - type: "latency"
+          name: "low_latency_comprehensive"  # Requires both TPOT and TTFT percentiles
+    modelRefs:
+      - model: "openai/gpt-oss-120b"
+        use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          system_prompt: "Provide fast, concise responses suitable for real-time applications."
+
+  - name: "fast_start_route"
+    description: "Route to models with fast first token (prioritize TTFT for chat apps)"
+    priority: 85
+    rules:
+      operator: "AND"
+      conditions:
+        - type: "latency"
+          name: "chat_fast_start"  # Only requires TTFT percentile
+    modelRefs:
+      - model: "openai/gpt-oss-120b"
+        use_reasoning: false
+    plugins:
+      - type: "system_prompt"
+        configuration:
+          system_prompt: "Start responding quickly. User is waiting for immediate feedback."
+
   # Size-aware routing example: Try smaller models first, escalate if confidence is low
   - name: "confidence_route"
     description: "Cost-efficient routing: try small model first, escalate if needed"
@@ -483,6 +550,59 @@ decisions:
         configuration:
           enabled: false
 
+  # ReMoM algorithm example: Multi-round parallel reasoning with intelligent synthesis
+  - name: "remom_route"
+    description: "Complex reasoning using ReMoM (Reasoning for Mixture of Models)"
+    priority: 70
+    rules:
+      operator: "AND"
+      conditions:
+        - type: "keyword"
+          name: "looper_keywords"
+    modelRefs:
+      - model: "openai/gpt-oss-120b"
+      - model: "gpt-5.2"
+    algorithm:
+      type: "remom"
+      remom:
+        # Breadth schedule: [32, 4] means 32 parallel calls in round 1, 4 in round 2, then 1 final
+        # Low intensity: [4], Medium: [16], High: [32, 4]
+        breadth_schedule: [4]  # Low intensity for demonstration
+
+        # Model distribution strategy:
+        #   - "weighted": Distribute calls based on model weights (default)
+        #   - "equal": Distribute evenly across all models
+        #   - "first_only": Use only the first model (PaCoRe-compatible)
+        model_distribution: "equal"
+
+        # Temperature for diverse exploration (default: 1.0)
+        temperature: 1.0
+
+        # Include reasoning content from vLLM in synthesis prompts
+        include_reasoning: true
+
+        # Compaction strategy: "full" or "last_n_tokens"
+        compaction_strategy: "last_n_tokens"
+        compaction_tokens: 1000  # Keep last 1000 tokens when compacting
+
+        # Custom synthesis template (optional, uses default if not provided)
+        # synthesis_template: "Your custom template here"
+
+        # Maximum concurrent model calls per round (optional, defaults to all)
+        max_concurrent: 4
+
+        # Behavior on model call failure: "skip" or "fail"
+        on_error: "skip"
+
+        # Random seed for reproducibility
+        shuffle_seed: 42
+
+        # Include intermediate responses for visualization in dashboard
+        include_intermediate_responses: true
+
+        # Maximum responses to keep per round (optional, for memory efficiency)
+        # max_responses_per_round: 10
+
 # LLM - Backend model configuration
 providers:
   # Model configuration
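
To make the breadth_schedule comment above concrete, here is a small sketch of the call plan it implies: each entry is the number of parallel calls for that round, followed by one final synthesis call. The distribution, shuffling, and final-call details are assumptions for illustration, not the router's ReMoM implementation.

    # Sketch of the call plan implied by breadth_schedule; assumptions are noted inline.
    import random
    from itertools import cycle

    def plan_rounds(breadth_schedule, models, distribution="equal", shuffle_seed=42):
        rng = random.Random(shuffle_seed)  # mirrors the shuffle_seed option above
        rounds = []
        for breadth in breadth_schedule:
            if distribution == "first_only":
                calls = [models[0]] * breadth
            else:  # "equal": spread calls evenly across the configured models
                calls = [m for m, _ in zip(cycle(models), range(breadth))]
            rng.shuffle(calls)  # de-correlate response order before synthesis (assumption)
            rounds.append(calls)
        rounds.append([models[0]])  # final single synthesis call (which model handles it is assumed)
        return rounds

    for i, r in enumerate(plan_rounds([32, 4], ["openai/gpt-oss-120b", "gpt-5.2"]), start=1):
        print(f"round {i}: {len(r)} call(s)")
    # round 1: 32 call(s), round 2: 4 call(s), round 3: 1 call(s)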
cli/templates/envoy.template.yaml
@@ -12,7 +12,7 @@ static_resources:
       socket_address:
         address: {{ listener.address }}
         port_value: {{ listener.port }}
-    perConnectionBufferLimitBytes: 52428800
+    perConnectionBufferLimitBytes: 524288000
     filter_chains:
     - filters:
       - name: envoy.filters.network.http_connection_manager
@@ -36,8 +36,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: {{ model.cluster_name }}_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ model.endpoints[0].address }}"
                   {% if model.path_prefix %}
@@ -59,8 +59,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: anthropic_api_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   host_rewrite_literal: "api.anthropic.com"
               {% endfor %}
               # Default route (no x-selected-model header)
@@ -73,7 +73,7 @@ static_resources:
                 {% else %}
                   cluster: vllm_static_cluster
                 {% endif %}
-                  timeout: {{ listener.timeout | default('300s') }}
+                  timeout: {{ listener.timeout | default('600s') }}
                 {% if models %}
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ models[0].endpoints[0].address }}"
@@ -94,13 +94,13 @@ static_resources:
               grpc_service:
                 envoy_grpc:
                   cluster_name: extproc_service
-                timeout: 300s
+                timeout: 600s
               processing_mode:
                 request_header_mode: "SEND"
                 response_header_mode: "SEND"
                 request_body_mode: "BUFFERED"
                 response_body_mode: "BUFFERED"
-              message_timeout: {{ listener.timeout | default('300s') }}
+              message_timeout: {{ listener.timeout | default('600s') }}
           - name: envoy.filters.http.router
             typed_config:
               "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@@ -115,7 +115,7 @@ static_resources:
   clusters:
   # ExtProc service (semantic router)
   - name: extproc_service
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: STATIC
     lb_policy: ROUND_ROBIN
    http2_protocol_options: {}
@@ -150,7 +150,7 @@ static_resources:
   {% for model in models %}
   # Cluster for model: {{ model.name }}
   - name: {{ model.cluster_name }}_cluster
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: {{ model.cluster_type }}
     {% if model.cluster_type == 'LOGICAL_DNS' %}
     dns_lookup_family: V4_ONLY
@@ -189,7 +189,7 @@ static_resources:
   - name: anthropic_api_cluster
     type: LOGICAL_DNS
     dns_lookup_family: V4_ONLY
-    connect_timeout: 60s
+    connect_timeout: 600s
     lb_policy: ROUND_ROBIN
     load_assignment:
       cluster_name: anthropic_api_cluster
cli/templates/router-defaults.yaml
@@ -172,7 +172,7 @@ looper:
   enabled: true  # Enable looper for multi-model decisions
   # Endpoint points to Envoy (same container), which handles load balancing and auth
   # Port should match listener port (default: 8888)
-  endpoint: "http://localhost:8888/v1/chat/completions"
+  endpoint: "http://localhost:8899/v1/chat/completions"
   timeout_seconds: 120  # Timeout in seconds for each model call
   headers: {}  # Optional headers (e.g., {"Authorization": "Bearer xxx"})
 
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-sr"
-version = "0.1.0.beta.2.dev20260203201608"
+version = "0.1.0.beta.2.dev20260204071119"
 description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
 authors = [{name = "vLLM-SR Team"}]
 readme = "README.md"
vllm_sr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204071119
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0