vllm-sr 0.1.0b2.dev20260203201608__tar.gz → 0.1.0b2.dev20260204070724__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (46)
  1. {vllm_sr-0.1.0b2.dev20260203201608/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260204070724}/PKG-INFO +1 -1
  2. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/models.py +46 -1
  3. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/config.template.yaml +53 -0
  4. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/envoy.template.yaml +11 -11
  5. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/router-defaults.yaml +1 -1
  6. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/pyproject.toml +1 -1
  7. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724/vllm_sr.egg-info}/PKG-INFO +1 -1
  8. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/MANIFEST.in +0 -0
  9. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/README.md +0 -0
  10. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/__init__.py +0 -0
  11. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/__init__.py +0 -0
  12. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/config.py +0 -0
  13. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/generate.py +0 -0
  14. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/init.py +0 -0
  15. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/serve.py +0 -0
  16. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/show_config.py +0 -0
  17. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/show_defaults.py +0 -0
  18. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/validate.py +0 -0
  19. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/config_generator.py +0 -0
  20. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/consts.py +0 -0
  21. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/core.py +0 -0
  22. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/defaults.py +0 -0
  23. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/docker_cli.py +0 -0
  24. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/logo.py +0 -0
  25. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/main.py +0 -0
  26. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/merger.py +0 -0
  27. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/parser.py +0 -0
  28. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/generate_dashboard.py +0 -0
  29. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-dashboard.serve.yaml +0 -0
  30. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-datasource-jaeger.serve.yaml +0 -0
  31. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-datasource.serve.yaml +0 -0
  32. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana.serve.ini +0 -0
  33. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/llm-router-dashboard.serve.json +0 -0
  34. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/prometheus.serve.yaml +0 -0
  35. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/tools_db.json +0 -0
  36. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/utils.py +0 -0
  37. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/validator.py +0 -0
  38. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/requirements.txt +0 -0
  39. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/setup.cfg +0 -0
  40. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/tests/test_plugin_parsing.py +0 -0
  41. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/tests/test_plugin_yaml_generation.py +0 -0
  42. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/SOURCES.txt +0 -0
  43. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/dependency_links.txt +0 -0
  44. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/entry_points.txt +0 -0
  45. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/requires.txt +0 -0
  46. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/top_level.txt +0 -0
```diff
--- a/vllm_sr.egg-info/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204070724
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0
```
```diff
--- a/cli/models.py
+++ b/cli/models.py
@@ -192,18 +192,63 @@ class ConcurrentAlgorithmConfig(BaseModel):
     on_error: Optional[str] = "skip"
 
 
+class ReMoMAlgorithmConfig(BaseModel):
+    """Configuration for ReMoM (Reasoning for Mixture of Models) algorithm.
+
+    This algorithm performs multi-round parallel reasoning with intelligent synthesis.
+    Inspired by PaCoRe (arXiv:2601.05593) but extended to support mixture of models.
+    """
+
+    # Breadth schedule: array of parallel calls per round (e.g., [32, 4] means 32 calls in round 1, 4 in round 2, then 1 final)
+    breadth_schedule: list[int]
+
+    # Model distribution strategy: "weighted", "equal", or "first_only"
+    model_distribution: Optional[str] = "weighted"
+
+    # Temperature for model calls (default: 1.0 for diverse exploration)
+    temperature: Optional[float] = 1.0
+
+    # Whether to include reasoning content in synthesis prompts
+    include_reasoning: Optional[bool] = False
+
+    # Compaction strategy: "full" or "last_n_tokens"
+    compaction_strategy: Optional[str] = "full"
+
+    # Number of tokens to keep when using last_n_tokens compaction
+    compaction_tokens: Optional[int] = 1000
+
+    # Custom synthesis template (uses default if not provided)
+    synthesis_template: Optional[str] = None
+
+    # Maximum concurrent model calls per round
+    max_concurrent: Optional[int] = None
+
+    # Behavior on model call failure: "skip" or "fail"
+    on_error: Optional[str] = "skip"
+
+    # Random seed for shuffling responses (for reproducibility)
+    shuffle_seed: Optional[int] = 42
+
+    # Whether to include intermediate responses in the response body for visualization
+    include_intermediate_responses: Optional[bool] = True
+
+    # Maximum number of responses to keep per round (for memory efficiency)
+    max_responses_per_round: Optional[int] = None
+
+
 class AlgorithmConfig(BaseModel):
     """Algorithm configuration for multi-model decisions.
 
     Specifies how multiple models in a decision should be orchestrated.
     """
 
-    # Algorithm type: "sequential", "confidence", "concurrent"
+    # Algorithm type: "sequential", "confidence", "concurrent", "remom"
     type: str
 
     # Algorithm-specific configurations (only one should be set based on type)
     confidence: Optional[ConfidenceAlgorithmConfig] = None
     concurrent: Optional[ConcurrentAlgorithmConfig] = None
+    remom: Optional[ReMoMAlgorithmConfig] = None
 
 
 class PluginType(str, Enum):
```
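For reference, a minimal sketch of how the new model validates under Pydantic v2; the import path follows this package's layout (`cli/models.py`) and the field values are illustrative, not from the package:

```python
# Hedged sketch: validating a "remom" algorithm config with the models above.
# Assumes Pydantic v2 (model_validate); field names are taken from the diff.
from cli.models import AlgorithmConfig

cfg = AlgorithmConfig.model_validate(
    {
        "type": "remom",
        "remom": {
            "breadth_schedule": [32, 4],  # 32 calls in round 1, 4 in round 2
            "model_distribution": "equal",
            "compaction_strategy": "last_n_tokens",
        },
    }
)
assert cfg.remom is not None
assert cfg.remom.temperature == 1.0  # unset fields keep their declared defaults
```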
```diff
--- a/cli/templates/config.template.yaml
+++ b/cli/templates/config.template.yaml
@@ -483,6 +483,59 @@ decisions:
       configuration:
         enabled: false
 
+  # ReMoM algorithm example: Multi-round parallel reasoning with intelligent synthesis
+  - name: "remom_route"
+    description: "Complex reasoning using ReMoM (Reasoning for Mixture of Models)"
+    priority: 70
+    rules:
+      operator: "AND"
+      conditions:
+        - type: "keyword"
+          name: "looper_keywords"
+    modelRefs:
+      - model: "openai/gpt-oss-120b"
+      - model: "gpt-5.2"
+    algorithm:
+      type: "remom"
+      remom:
+        # Breadth schedule: [32, 4] means 32 parallel calls in round 1, 4 in round 2, then 1 final
+        # Low intensity: [4], Medium: [16], High: [32, 4]
+        breadth_schedule: [4]  # Low intensity for demonstration
+
+        # Model distribution strategy:
+        # - "weighted": Distribute calls based on model weights (default)
+        # - "equal": Distribute evenly across all models
+        # - "first_only": Use only the first model (PaCoRe-compatible)
+        model_distribution: "equal"
+
+        # Temperature for diverse exploration (default: 1.0)
+        temperature: 1.0
+
+        # Include reasoning content from vLLM in synthesis prompts
+        include_reasoning: true
+
+        # Compaction strategy: "full" or "last_n_tokens"
+        compaction_strategy: "last_n_tokens"
+        compaction_tokens: 1000  # Keep last 1000 tokens when compacting
+
+        # Custom synthesis template (optional, uses default if not provided)
+        # synthesis_template: "Your custom template here"
+
+        # Maximum concurrent model calls per round (optional, defaults to all)
+        max_concurrent: 4
+
+        # Behavior on model call failure: "skip" or "fail"
+        on_error: "skip"
+
+        # Random seed for reproducibility
+        shuffle_seed: 42
+
+        # Include intermediate responses for visualization in dashboard
+        include_intermediate_responses: true
+
+        # Maximum responses to keep per round (optional, for memory efficiency)
+        # max_responses_per_round: 10
+
 # LLM - Backend model configuration
 providers:
   # Model configuration
```
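The template comments above pin down the call volume a breadth schedule implies: each entry is one round of parallel model calls, followed by a single final synthesis call. A small sketch of that arithmetic (the helper name is ours, not part of the package):

```python
# Illustrative helper: total upstream calls implied by a breadth schedule,
# per the config comments above (one synthesis call after all rounds).
def total_model_calls(breadth_schedule: list[int]) -> int:
    return sum(breadth_schedule) + 1  # +1 for the final synthesis call

assert total_model_calls([4]) == 5       # low intensity
assert total_model_calls([16]) == 17     # medium intensity
assert total_model_calls([32, 4]) == 37  # high intensity
```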
```diff
--- a/cli/templates/envoy.template.yaml
+++ b/cli/templates/envoy.template.yaml
@@ -12,7 +12,7 @@ static_resources:
       socket_address:
         address: {{ listener.address }}
         port_value: {{ listener.port }}
-    perConnectionBufferLimitBytes: 52428800
+    perConnectionBufferLimitBytes: 524288000
     filter_chains:
     - filters:
       - name: envoy.filters.network.http_connection_manager
@@ -36,8 +36,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: {{ model.cluster_name }}_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ model.endpoints[0].address }}"
                   {% if model.path_prefix %}
@@ -59,8 +59,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: anthropic_api_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   host_rewrite_literal: "api.anthropic.com"
               {% endfor %}
               # Default route (no x-selected-model header)
@@ -73,7 +73,7 @@ static_resources:
                 {% else %}
                   cluster: vllm_static_cluster
                 {% endif %}
-                  timeout: {{ listener.timeout | default('300s') }}
+                  timeout: {{ listener.timeout | default('600s') }}
                 {% if models %}
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ models[0].endpoints[0].address }}"
@@ -94,13 +94,13 @@ static_resources:
                 grpc_service:
                   envoy_grpc:
                     cluster_name: extproc_service
-                  timeout: 300s
+                  timeout: 600s
                 processing_mode:
                   request_header_mode: "SEND"
                   response_header_mode: "SEND"
                   request_body_mode: "BUFFERED"
                   response_body_mode: "BUFFERED"
-                message_timeout: {{ listener.timeout | default('300s') }}
+                message_timeout: {{ listener.timeout | default('600s') }}
           - name: envoy.filters.http.router
             typed_config:
               "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@@ -115,7 +115,7 @@ static_resources:
   clusters:
   # ExtProc service (semantic router)
   - name: extproc_service
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: STATIC
     lb_policy: ROUND_ROBIN
     http2_protocol_options: {}
@@ -150,7 +150,7 @@ static_resources:
   {% for model in models %}
   # Cluster for model: {{ model.name }}
   - name: {{ model.cluster_name }}_cluster
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: {{ model.cluster_type }}
     {% if model.cluster_type == 'LOGICAL_DNS' %}
     dns_lookup_family: V4_ONLY
@@ -189,7 +189,7 @@ static_resources:
   - name: anthropic_api_cluster
     type: LOGICAL_DNS
     dns_lookup_family: V4_ONLY
-    connect_timeout: 60s
+    connect_timeout: 600s
     lb_policy: ROUND_ROBIN
     load_assignment:
       cluster_name: anthropic_api_cluster
```
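Net effect of the Envoy template changes: every 300s timeout (route, ext_proc message, cluster connect) doubles to 600s, the Anthropic cluster's 60s connect timeout is raised to the same 600s, and the per-connection buffer limit grows tenfold, from 50 MiB to 500 MiB. A quick check of the byte math:

```python
# Arithmetic check on the buffer-limit bump: both values are exact MiB counts.
MIB = 1024 * 1024
assert 52_428_800 == 50 * MIB    # old perConnectionBufferLimitBytes
assert 524_288_000 == 500 * MIB  # new perConnectionBufferLimitBytes
```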
```diff
--- a/cli/templates/router-defaults.yaml
+++ b/cli/templates/router-defaults.yaml
@@ -172,7 +172,7 @@ looper:
   enabled: true  # Enable looper for multi-model decisions
   # Endpoint points to Envoy (same container), which handles load balancing and auth
   # Port should match listener port (default: 8888)
-  endpoint: "http://localhost:8888/v1/chat/completions"
+  endpoint: "http://localhost:8899/v1/chat/completions"
   timeout_seconds: 120  # Timeout in seconds for each model call
   headers: {}  # Optional headers (e.g., {"Authorization": "Bearer xxx"})
 
```
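The looper endpoint moves from port 8888 to 8899, while the comment above it still cites 8888 as the default listener port. A hypothetical smoke test against the new endpoint (the model name and payload are illustrative, not from the package):

```python
# Hypothetical smoke test of the updated looper endpoint.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8899/v1/chat/completions",
    data=json.dumps({
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": "ping"}],
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# timeout matches the looper's timeout_seconds above
with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```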
```diff
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-sr"
-version = "0.1.0.beta.2.dev20260203201608"
+version = "0.1.0.beta.2.dev20260204070724"
 description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
 authors = [{name = "vLLM-SR Team"}]
 readme = "README.md"
```
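The pyproject version and the PKG-INFO version name the same release: PEP 440 normalizes the `beta` spelling to `b` and strips the extra separators, which a quick check with the `packaging` library confirms:

```python
# PEP 440 normalization: "0.1.0.beta.2.devN" and "0.1.0b2.devN" are equal.
from packaging.version import Version

v = Version("0.1.0.beta.2.dev20260204070724")
assert str(v) == "0.1.0b2.dev20260204070724"
```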
```diff
--- a/PKG-INFO
+++ b/vllm_sr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204070724
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0
```