vllm-sr 0.1.0b2.dev20260203182852__py3-none-any.whl → 0.1.0b2.dev20260204070724__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/models.py +46 -1
- cli/templates/config.template.yaml +53 -0
- cli/templates/envoy.template.yaml +11 -11
- cli/templates/router-defaults.yaml +1 -1
- {vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/METADATA +1 -1
- {vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/RECORD +9 -9
- {vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/WHEEL +0 -0
- {vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/entry_points.txt +0 -0
- {vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/top_level.txt +0 -0
cli/models.py
CHANGED
|
@@ -192,18 +192,63 @@ class ConcurrentAlgorithmConfig(BaseModel):
|
|
|
192
192
|
on_error: Optional[str] = "skip"
|
|
193
193
|
|
|
194
194
|
|
|
195
|
+
class ReMoMAlgorithmConfig(BaseModel):
|
|
196
|
+
"""Configuration for ReMoM (Reasoning for Mixture of Models) algorithm.
|
|
197
|
+
|
|
198
|
+
This algorithm performs multi-round parallel reasoning with intelligent synthesis.
|
|
199
|
+
Inspired by PaCoRe (arXiv:2601.05593) but extended to support mixture of models.
|
|
200
|
+
"""
|
|
201
|
+
|
|
202
|
+
# Breadth schedule: array of parallel calls per round (e.g., [32, 4] means 32 calls in round 1, 4 in round 2, then 1 final)
|
|
203
|
+
breadth_schedule: list[int]
|
|
204
|
+
|
|
205
|
+
# Model distribution strategy: "weighted", "equal", or "first_only"
|
|
206
|
+
model_distribution: Optional[str] = "weighted"
|
|
207
|
+
|
|
208
|
+
# Temperature for model calls (default: 1.0 for diverse exploration)
|
|
209
|
+
temperature: Optional[float] = 1.0
|
|
210
|
+
|
|
211
|
+
# Whether to include reasoning content in synthesis prompts
|
|
212
|
+
include_reasoning: Optional[bool] = False
|
|
213
|
+
|
|
214
|
+
# Compaction strategy: "full" or "last_n_tokens"
|
|
215
|
+
compaction_strategy: Optional[str] = "full"
|
|
216
|
+
|
|
217
|
+
# Number of tokens to keep when using last_n_tokens compaction
|
|
218
|
+
compaction_tokens: Optional[int] = 1000
|
|
219
|
+
|
|
220
|
+
# Custom synthesis template (uses default if not provided)
|
|
221
|
+
synthesis_template: Optional[str] = None
|
|
222
|
+
|
|
223
|
+
# Maximum concurrent model calls per round
|
|
224
|
+
max_concurrent: Optional[int] = None
|
|
225
|
+
|
|
226
|
+
# Behavior on model call failure: "skip" or "fail"
|
|
227
|
+
on_error: Optional[str] = "skip"
|
|
228
|
+
|
|
229
|
+
# Random seed for shuffling responses (for reproducibility)
|
|
230
|
+
shuffle_seed: Optional[int] = 42
|
|
231
|
+
|
|
232
|
+
# Whether to include intermediate responses in the response body for visualization
|
|
233
|
+
include_intermediate_responses: Optional[bool] = True
|
|
234
|
+
|
|
235
|
+
# Maximum number of responses to keep per round (for memory efficiency)
|
|
236
|
+
max_responses_per_round: Optional[int] = None
|
|
237
|
+
|
|
238
|
+
|
|
195
239
|
class AlgorithmConfig(BaseModel):
|
|
196
240
|
"""Algorithm configuration for multi-model decisions.
|
|
197
241
|
|
|
198
242
|
Specifies how multiple models in a decision should be orchestrated.
|
|
199
243
|
"""
|
|
200
244
|
|
|
201
|
-
# Algorithm type: "sequential", "confidence", "concurrent"
|
|
245
|
+
# Algorithm type: "sequential", "confidence", "concurrent", "remom"
|
|
202
246
|
type: str
|
|
203
247
|
|
|
204
248
|
# Algorithm-specific configurations (only one should be set based on type)
|
|
205
249
|
confidence: Optional[ConfidenceAlgorithmConfig] = None
|
|
206
250
|
concurrent: Optional[ConcurrentAlgorithmConfig] = None
|
|
251
|
+
remom: Optional[ReMoMAlgorithmConfig] = None
|
|
207
252
|
|
|
208
253
|
|
|
209
254
|
class PluginType(str, Enum):
|
|
@@ -483,6 +483,59 @@ decisions:
|
|
|
483
483
|
configuration:
|
|
484
484
|
enabled: false
|
|
485
485
|
|
|
486
|
+
# ReMoM algorithm example: Multi-round parallel reasoning with intelligent synthesis
|
|
487
|
+
- name: "remom_route"
|
|
488
|
+
description: "Complex reasoning using ReMoM (Reasoning for Mixture of Models)"
|
|
489
|
+
priority: 70
|
|
490
|
+
rules:
|
|
491
|
+
operator: "AND"
|
|
492
|
+
conditions:
|
|
493
|
+
- type: "keyword"
|
|
494
|
+
name: "looper_keywords"
|
|
495
|
+
modelRefs:
|
|
496
|
+
- model: "openai/gpt-oss-120b"
|
|
497
|
+
- model: "gpt-5.2"
|
|
498
|
+
algorithm:
|
|
499
|
+
type: "remom"
|
|
500
|
+
remom:
|
|
501
|
+
# Breadth schedule: [32, 4] means 32 parallel calls in round 1, 4 in round 2, then 1 final
|
|
502
|
+
# Low intensity: [4], Medium: [16], High: [32, 4]
|
|
503
|
+
breadth_schedule: [4] # Low intensity for demonstration
|
|
504
|
+
|
|
505
|
+
# Model distribution strategy:
|
|
506
|
+
# - "weighted": Distribute calls based on model weights (default)
|
|
507
|
+
# - "equal": Distribute evenly across all models
|
|
508
|
+
# - "first_only": Use only the first model (PaCoRe-compatible)
|
|
509
|
+
model_distribution: "equal"
|
|
510
|
+
|
|
511
|
+
# Temperature for diverse exploration (default: 1.0)
|
|
512
|
+
temperature: 1.0
|
|
513
|
+
|
|
514
|
+
# Include reasoning content from vLLM in synthesis prompts
|
|
515
|
+
include_reasoning: true
|
|
516
|
+
|
|
517
|
+
# Compaction strategy: "full" or "last_n_tokens"
|
|
518
|
+
compaction_strategy: "last_n_tokens"
|
|
519
|
+
compaction_tokens: 1000 # Keep last 1000 tokens when compacting
|
|
520
|
+
|
|
521
|
+
# Custom synthesis template (optional, uses default if not provided)
|
|
522
|
+
# synthesis_template: "Your custom template here"
|
|
523
|
+
|
|
524
|
+
# Maximum concurrent model calls per round (optional, defaults to all)
|
|
525
|
+
max_concurrent: 4
|
|
526
|
+
|
|
527
|
+
# Behavior on model call failure: "skip" or "fail"
|
|
528
|
+
on_error: "skip"
|
|
529
|
+
|
|
530
|
+
# Random seed for reproducibility
|
|
531
|
+
shuffle_seed: 42
|
|
532
|
+
|
|
533
|
+
# Include intermediate responses for visualization in dashboard
|
|
534
|
+
include_intermediate_responses: true
|
|
535
|
+
|
|
536
|
+
# Maximum responses to keep per round (optional, for memory efficiency)
|
|
537
|
+
# max_responses_per_round: 10
|
|
538
|
+
|
|
486
539
|
# LLM - Backend model configuration
|
|
487
540
|
providers:
|
|
488
541
|
# Model configuration
|
|
@@ -12,7 +12,7 @@ static_resources:
|
|
|
12
12
|
socket_address:
|
|
13
13
|
address: {{ listener.address }}
|
|
14
14
|
port_value: {{ listener.port }}
|
|
15
|
-
perConnectionBufferLimitBytes:
|
|
15
|
+
perConnectionBufferLimitBytes: 524288000
|
|
16
16
|
filter_chains:
|
|
17
17
|
- filters:
|
|
18
18
|
- name: envoy.filters.network.http_connection_manager
|
|
@@ -36,8 +36,8 @@ static_resources:
|
|
|
36
36
|
exact: "{{ model.name }}"
|
|
37
37
|
route:
|
|
38
38
|
cluster: {{ model.cluster_name }}_cluster
|
|
39
|
-
timeout: {{ listener.timeout | default('
|
|
40
|
-
idleTimeout:
|
|
39
|
+
timeout: {{ listener.timeout | default('600s') }}
|
|
40
|
+
idleTimeout: 600s
|
|
41
41
|
# Rewrite Host header to match upstream server
|
|
42
42
|
host_rewrite_literal: "{{ model.endpoints[0].address }}"
|
|
43
43
|
{% if model.path_prefix %}
|
|
@@ -59,8 +59,8 @@ static_resources:
|
|
|
59
59
|
exact: "{{ model.name }}"
|
|
60
60
|
route:
|
|
61
61
|
cluster: anthropic_api_cluster
|
|
62
|
-
timeout: {{ listener.timeout | default('
|
|
63
|
-
idleTimeout:
|
|
62
|
+
timeout: {{ listener.timeout | default('600s') }}
|
|
63
|
+
idleTimeout: 600s
|
|
64
64
|
host_rewrite_literal: "api.anthropic.com"
|
|
65
65
|
{% endfor %}
|
|
66
66
|
# Default route (no x-selected-model header)
|
|
@@ -73,7 +73,7 @@ static_resources:
|
|
|
73
73
|
{% else %}
|
|
74
74
|
cluster: vllm_static_cluster
|
|
75
75
|
{% endif %}
|
|
76
|
-
timeout: {{ listener.timeout | default('
|
|
76
|
+
timeout: {{ listener.timeout | default('600s') }}
|
|
77
77
|
{% if models %}
|
|
78
78
|
# Rewrite Host header to match upstream server
|
|
79
79
|
host_rewrite_literal: "{{ models[0].endpoints[0].address }}"
|
|
@@ -94,13 +94,13 @@ static_resources:
|
|
|
94
94
|
grpc_service:
|
|
95
95
|
envoy_grpc:
|
|
96
96
|
cluster_name: extproc_service
|
|
97
|
-
timeout:
|
|
97
|
+
timeout: 600s
|
|
98
98
|
processing_mode:
|
|
99
99
|
request_header_mode: "SEND"
|
|
100
100
|
response_header_mode: "SEND"
|
|
101
101
|
request_body_mode: "BUFFERED"
|
|
102
102
|
response_body_mode: "BUFFERED"
|
|
103
|
-
message_timeout: {{ listener.timeout | default('
|
|
103
|
+
message_timeout: {{ listener.timeout | default('600s') }}
|
|
104
104
|
- name: envoy.filters.http.router
|
|
105
105
|
typed_config:
|
|
106
106
|
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
|
|
@@ -115,7 +115,7 @@ static_resources:
|
|
|
115
115
|
clusters:
|
|
116
116
|
# ExtProc service (semantic router)
|
|
117
117
|
- name: extproc_service
|
|
118
|
-
connect_timeout:
|
|
118
|
+
connect_timeout: 600s
|
|
119
119
|
type: STATIC
|
|
120
120
|
lb_policy: ROUND_ROBIN
|
|
121
121
|
http2_protocol_options: {}
|
|
@@ -150,7 +150,7 @@ static_resources:
|
|
|
150
150
|
{% for model in models %}
|
|
151
151
|
# Cluster for model: {{ model.name }}
|
|
152
152
|
- name: {{ model.cluster_name }}_cluster
|
|
153
|
-
connect_timeout:
|
|
153
|
+
connect_timeout: 600s
|
|
154
154
|
type: {{ model.cluster_type }}
|
|
155
155
|
{% if model.cluster_type == 'LOGICAL_DNS' %}
|
|
156
156
|
dns_lookup_family: V4_ONLY
|
|
@@ -189,7 +189,7 @@ static_resources:
|
|
|
189
189
|
- name: anthropic_api_cluster
|
|
190
190
|
type: LOGICAL_DNS
|
|
191
191
|
dns_lookup_family: V4_ONLY
|
|
192
|
-
connect_timeout:
|
|
192
|
+
connect_timeout: 600s
|
|
193
193
|
lb_policy: ROUND_ROBIN
|
|
194
194
|
load_assignment:
|
|
195
195
|
cluster_name: anthropic_api_cluster
|
|
@@ -172,7 +172,7 @@ looper:
|
|
|
172
172
|
enabled: true # Enable looper for multi-model decisions
|
|
173
173
|
# Endpoint points to Envoy (same container), which handles load balancing and auth
|
|
174
174
|
# Port should match listener port (default: 8888)
|
|
175
|
-
endpoint: "http://localhost:
|
|
175
|
+
endpoint: "http://localhost:8899/v1/chat/completions"
|
|
176
176
|
timeout_seconds: 120 # Timeout in seconds for each model call
|
|
177
177
|
headers: {} # Optional headers (e.g., {"Authorization": "Bearer xxx"})
|
|
178
178
|
|
{vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/RECORD
RENAMED
|
@@ -7,7 +7,7 @@ cli/docker_cli.py,sha256=kj3VvNfUJwcizZQj63mB2yLBBUbvZ9P3MtleIXhTp0M,25230
|
|
|
7
7
|
cli/logo.py,sha256=I0qnCCGyOsHmN6MRgqa_c07MZSArv_6eHcHAL36V0Eg,1512
|
|
8
8
|
cli/main.py,sha256=-PkXZw_z0VtzLbA5BCQlvLTYRuNbAT0CcBSFvjrsmBo,9137
|
|
9
9
|
cli/merger.py,sha256=KMajL8tzdZeVRSdyq6MJt-DNhOQ5K_suMGx3jhl0odQ,16569
|
|
10
|
-
cli/models.py,sha256=
|
|
10
|
+
cli/models.py,sha256=jrHiAqEIsWgu5O4_Y7T1xXLxFWlXuEtdr9RCLg1B0V0,13468
|
|
11
11
|
cli/parser.py,sha256=lFPrROAYyc2cPoMMULV2sPid5IklIvc8zpDekj5Pocw,5653
|
|
12
12
|
cli/utils.py,sha256=CHMwSZC-zxrxCnlZqiyzF2CToCVB5tHku2zyYig8fo0,3787
|
|
13
13
|
cli/validator.py,sha256=G49I8_vyg0HvfI2LHwFlOagTwWAEsIkIDzX5riYoZhA,10651
|
|
@@ -19,8 +19,8 @@ cli/commands/serve.py,sha256=vsK3T4uWx49CPTgy4sc6oM7F5riq0sNbyiT7KbS37Wc,6104
|
|
|
19
19
|
cli/commands/show_config.py,sha256=emkIdH9LKbzMzxna3Lxl-hEG85isfAXY5BlZjP_vaIM,4689
|
|
20
20
|
cli/commands/show_defaults.py,sha256=r95KOHKQWeeNnoFO5EmCUZSPeOKb-5NzCoaZ6CaBcIw,701
|
|
21
21
|
cli/commands/validate.py,sha256=Chm_MfyJMESiOo6IXiroYwMUkm8HCI26OHYYUBkGXGk,2773
|
|
22
|
-
cli/templates/config.template.yaml,sha256=
|
|
23
|
-
cli/templates/envoy.template.yaml,sha256=
|
|
22
|
+
cli/templates/config.template.yaml,sha256=sS7ZJqrujBQgIs8E0vxUYcVofFL_SiGgW8XnNz5lrRE,18541
|
|
23
|
+
cli/templates/envoy.template.yaml,sha256=j43mxQ3NcFMlmbGyrDM0-1rvQkC1N4MpXUC2o7SLT9g,8293
|
|
24
24
|
cli/templates/generate_dashboard.py,sha256=gCui1rPrxUFIeioebyQ_JhL7zbjoEAg8GjUJcWXHcys,24233
|
|
25
25
|
cli/templates/grafana-dashboard.serve.yaml,sha256=o1H5xx60esBzk19qnMQtX4LmL5DswpMvOO7cO09Q3kI,350
|
|
26
26
|
cli/templates/grafana-datasource-jaeger.serve.yaml,sha256=7eigRe8mdyHjbPA7B-ZoJGHrq1chhCPWWsE-K4Dvj30,319
|
|
@@ -28,10 +28,10 @@ cli/templates/grafana-datasource.serve.yaml,sha256=Cxjz1zVWoUdSzbSsS_iJhMHRrmRi6
|
|
|
28
28
|
cli/templates/grafana.serve.ini,sha256=x9bCkzxqm5gC4fKToY2lhNPdWhwAaJGVe5ABMW6Dv-c,1674
|
|
29
29
|
cli/templates/llm-router-dashboard.serve.json,sha256=pwnTjUh7z3_3LnIwtaLXjDWH4aHd2Mc57z0oekgt-Bk,60903
|
|
30
30
|
cli/templates/prometheus.serve.yaml,sha256=MGYq8dlRq_i2m5sogQ--kwTvJpkf44QQoCNoI7oyVT8,270
|
|
31
|
-
cli/templates/router-defaults.yaml,sha256=
|
|
31
|
+
cli/templates/router-defaults.yaml,sha256=a0riw9juttoBr4G7rtUEJA2FY9TgAvK-8mngWGCCIoc,9373
|
|
32
32
|
cli/templates/tools_db.json,sha256=CPqPBkd5nc966m1YEozz06frrmv3Pd5rrkxKkO3rTiA,4537
|
|
33
|
-
vllm_sr-0.1.0b2.
|
|
34
|
-
vllm_sr-0.1.0b2.
|
|
35
|
-
vllm_sr-0.1.0b2.
|
|
36
|
-
vllm_sr-0.1.0b2.
|
|
37
|
-
vllm_sr-0.1.0b2.
|
|
33
|
+
vllm_sr-0.1.0b2.dev20260204070724.dist-info/METADATA,sha256=Klc_-W-wKFC4HErcZy2jqrsSzfQQUSPI5JRB6MFWMCc,7298
|
|
34
|
+
vllm_sr-0.1.0b2.dev20260204070724.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
35
|
+
vllm_sr-0.1.0b2.dev20260204070724.dist-info/entry_points.txt,sha256=WhlBPbLHUpWUsMuUQX9cnvsYMf0ih5i57vvJ1jJNi0k,42
|
|
36
|
+
vllm_sr-0.1.0b2.dev20260204070724.dist-info/top_level.txt,sha256=2ImG917oaVHlm0nP9oJE-Qrgs-fq_fGWgba2H1f8fpE,4
|
|
37
|
+
vllm_sr-0.1.0b2.dev20260204070724.dist-info/RECORD,,
|
{vllm_sr-0.1.0b2.dev20260203182852.dist-info → vllm_sr-0.1.0b2.dev20260204070724.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|