vllm-sr 0.1.0b2.dev20260203201608__tar.gz → 0.1.0b2.dev20260204070724__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (46)
  1. {vllm_sr-0.1.0b2.dev20260203201608/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260204070724}/PKG-INFO +1 -1
  2. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/models.py +46 -1
  3. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/config.template.yaml +53 -0
  4. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/envoy.template.yaml +11 -11
  5. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/router-defaults.yaml +1 -1
  6. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/pyproject.toml +1 -1
  7. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724/vllm_sr.egg-info}/PKG-INFO +1 -1
  8. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/MANIFEST.in +0 -0
  9. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/README.md +0 -0
  10. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/__init__.py +0 -0
  11. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/__init__.py +0 -0
  12. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/config.py +0 -0
  13. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/generate.py +0 -0
  14. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/init.py +0 -0
  15. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/serve.py +0 -0
  16. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/show_config.py +0 -0
  17. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/show_defaults.py +0 -0
  18. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/commands/validate.py +0 -0
  19. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/config_generator.py +0 -0
  20. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/consts.py +0 -0
  21. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/core.py +0 -0
  22. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/defaults.py +0 -0
  23. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/docker_cli.py +0 -0
  24. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/logo.py +0 -0
  25. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/main.py +0 -0
  26. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/merger.py +0 -0
  27. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/parser.py +0 -0
  28. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/generate_dashboard.py +0 -0
  29. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-dashboard.serve.yaml +0 -0
  30. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-datasource-jaeger.serve.yaml +0 -0
  31. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana-datasource.serve.yaml +0 -0
  32. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/grafana.serve.ini +0 -0
  33. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/llm-router-dashboard.serve.json +0 -0
  34. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/prometheus.serve.yaml +0 -0
  35. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/templates/tools_db.json +0 -0
  36. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/utils.py +0 -0
  37. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/cli/validator.py +0 -0
  38. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/requirements.txt +0 -0
  39. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/setup.cfg +0 -0
  40. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/tests/test_plugin_parsing.py +0 -0
  41. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/tests/test_plugin_yaml_generation.py +0 -0
  42. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/SOURCES.txt +0 -0
  43. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/dependency_links.txt +0 -0
  44. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/entry_points.txt +0 -0
  45. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/requires.txt +0 -0
  46. {vllm_sr-0.1.0b2.dev20260203201608 → vllm_sr-0.1.0b2.dev20260204070724}/vllm_sr.egg-info/top_level.txt +0 -0
```diff
--- a/vllm_sr.egg-info/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204070724
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0
```
```diff
--- a/cli/models.py
+++ b/cli/models.py
@@ -192,18 +192,63 @@ class ConcurrentAlgorithmConfig(BaseModel):
     on_error: Optional[str] = "skip"
 
 
+class ReMoMAlgorithmConfig(BaseModel):
+    """Configuration for ReMoM (Reasoning for Mixture of Models) algorithm.
+
+    This algorithm performs multi-round parallel reasoning with intelligent synthesis.
+    Inspired by PaCoRe (arXiv:2601.05593) but extended to support mixture of models.
+    """
+
+    # Breadth schedule: array of parallel calls per round (e.g., [32, 4] means 32 calls in round 1, 4 in round 2, then 1 final)
+    breadth_schedule: list[int]
+
+    # Model distribution strategy: "weighted", "equal", or "first_only"
+    model_distribution: Optional[str] = "weighted"
+
+    # Temperature for model calls (default: 1.0 for diverse exploration)
+    temperature: Optional[float] = 1.0
+
+    # Whether to include reasoning content in synthesis prompts
+    include_reasoning: Optional[bool] = False
+
+    # Compaction strategy: "full" or "last_n_tokens"
+    compaction_strategy: Optional[str] = "full"
+
+    # Number of tokens to keep when using last_n_tokens compaction
+    compaction_tokens: Optional[int] = 1000
+
+    # Custom synthesis template (uses default if not provided)
+    synthesis_template: Optional[str] = None
+
+    # Maximum concurrent model calls per round
+    max_concurrent: Optional[int] = None
+
+    # Behavior on model call failure: "skip" or "fail"
+    on_error: Optional[str] = "skip"
+
+    # Random seed for shuffling responses (for reproducibility)
+    shuffle_seed: Optional[int] = 42
+
+    # Whether to include intermediate responses in the response body for visualization
+    include_intermediate_responses: Optional[bool] = True
+
+    # Maximum number of responses to keep per round (for memory efficiency)
+    max_responses_per_round: Optional[int] = None
+
+
 class AlgorithmConfig(BaseModel):
     """Algorithm configuration for multi-model decisions.
 
     Specifies how multiple models in a decision should be orchestrated.
     """
 
-    # Algorithm type: "sequential", "confidence", "concurrent"
+    # Algorithm type: "sequential", "confidence", "concurrent", "remom"
     type: str
 
     # Algorithm-specific configurations (only one should be set based on type)
     confidence: Optional[ConfidenceAlgorithmConfig] = None
     concurrent: Optional[ConcurrentAlgorithmConfig] = None
+    remom: Optional[ReMoMAlgorithmConfig] = None
 
 
 class PluginType(str, Enum):
```
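For reference, a minimal sketch of how the new model validates under Pydantic v2; the import path follows this package's layout (`cli/models.py`) and the field values are illustrative, not from the package:

```python
# Hedged sketch: validating a "remom" algorithm config with the models above.
# Assumes Pydantic v2 (model_validate); field names are taken from the diff.
from cli.models import AlgorithmConfig

cfg = AlgorithmConfig.model_validate(
    {
        "type": "remom",
        "remom": {
            "breadth_schedule": [32, 4],  # 32 calls in round 1, 4 in round 2
            "model_distribution": "equal",
            "compaction_strategy": "last_n_tokens",
        },
    }
)
assert cfg.remom is not None
assert cfg.remom.temperature == 1.0  # unset fields keep their declared defaults
```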
```diff
--- a/cli/templates/config.template.yaml
+++ b/cli/templates/config.template.yaml
@@ -483,6 +483,59 @@ decisions:
       configuration:
         enabled: false
 
+  # ReMoM algorithm example: Multi-round parallel reasoning with intelligent synthesis
+  - name: "remom_route"
+    description: "Complex reasoning using ReMoM (Reasoning for Mixture of Models)"
+    priority: 70
+    rules:
+      operator: "AND"
+      conditions:
+        - type: "keyword"
+          name: "looper_keywords"
+    modelRefs:
+      - model: "openai/gpt-oss-120b"
+      - model: "gpt-5.2"
+    algorithm:
+      type: "remom"
+      remom:
+        # Breadth schedule: [32, 4] means 32 parallel calls in round 1, 4 in round 2, then 1 final
+        # Low intensity: [4], Medium: [16], High: [32, 4]
+        breadth_schedule: [4]  # Low intensity for demonstration
+
+        # Model distribution strategy:
+        # - "weighted": Distribute calls based on model weights (default)
+        # - "equal": Distribute evenly across all models
+        # - "first_only": Use only the first model (PaCoRe-compatible)
+        model_distribution: "equal"
+
+        # Temperature for diverse exploration (default: 1.0)
+        temperature: 1.0
+
+        # Include reasoning content from vLLM in synthesis prompts
+        include_reasoning: true
+
+        # Compaction strategy: "full" or "last_n_tokens"
+        compaction_strategy: "last_n_tokens"
+        compaction_tokens: 1000  # Keep last 1000 tokens when compacting
+
+        # Custom synthesis template (optional, uses default if not provided)
+        # synthesis_template: "Your custom template here"
+
+        # Maximum concurrent model calls per round (optional, defaults to all)
+        max_concurrent: 4
+
+        # Behavior on model call failure: "skip" or "fail"
+        on_error: "skip"
+
+        # Random seed for reproducibility
+        shuffle_seed: 42
+
+        # Include intermediate responses for visualization in dashboard
+        include_intermediate_responses: true
+
+        # Maximum responses to keep per round (optional, for memory efficiency)
+        # max_responses_per_round: 10
+
 # LLM - Backend model configuration
 providers:
   # Model configuration
```
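The template comments above pin down the call volume a breadth schedule implies: each entry is one round of parallel model calls, followed by a single final synthesis call. A small sketch of that arithmetic (the helper name is ours, not part of the package):

```python
# Illustrative helper: total upstream calls implied by a breadth schedule,
# per the config comments above (one synthesis call after all rounds).
def total_model_calls(breadth_schedule: list[int]) -> int:
    return sum(breadth_schedule) + 1  # +1 for the final synthesis call

assert total_model_calls([4]) == 5       # low intensity
assert total_model_calls([16]) == 17     # medium intensity
assert total_model_calls([32, 4]) == 37  # high intensity
```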
```diff
--- a/cli/templates/envoy.template.yaml
+++ b/cli/templates/envoy.template.yaml
@@ -12,7 +12,7 @@ static_resources:
       socket_address:
         address: {{ listener.address }}
         port_value: {{ listener.port }}
-    perConnectionBufferLimitBytes: 52428800
+    perConnectionBufferLimitBytes: 524288000
     filter_chains:
     - filters:
       - name: envoy.filters.network.http_connection_manager
@@ -36,8 +36,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: {{ model.cluster_name }}_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ model.endpoints[0].address }}"
                   {% if model.path_prefix %}
@@ -59,8 +59,8 @@ static_resources:
                     exact: "{{ model.name }}"
                 route:
                   cluster: anthropic_api_cluster
-                  timeout: {{ listener.timeout | default('300s') }}
-                  idleTimeout: 300s
+                  timeout: {{ listener.timeout | default('600s') }}
+                  idleTimeout: 600s
                   host_rewrite_literal: "api.anthropic.com"
               {% endfor %}
               # Default route (no x-selected-model header)
@@ -73,7 +73,7 @@ static_resources:
                 {% else %}
                   cluster: vllm_static_cluster
                 {% endif %}
-                  timeout: {{ listener.timeout | default('300s') }}
+                  timeout: {{ listener.timeout | default('600s') }}
                 {% if models %}
                   # Rewrite Host header to match upstream server
                   host_rewrite_literal: "{{ models[0].endpoints[0].address }}"
@@ -94,13 +94,13 @@ static_resources:
                 grpc_service:
                   envoy_grpc:
                     cluster_name: extproc_service
-                  timeout: 300s
+                  timeout: 600s
                 processing_mode:
                   request_header_mode: "SEND"
                   response_header_mode: "SEND"
                   request_body_mode: "BUFFERED"
                   response_body_mode: "BUFFERED"
-                message_timeout: {{ listener.timeout | default('300s') }}
+                message_timeout: {{ listener.timeout | default('600s') }}
           - name: envoy.filters.http.router
             typed_config:
               "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
@@ -115,7 +115,7 @@ static_resources:
   clusters:
   # ExtProc service (semantic router)
   - name: extproc_service
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: STATIC
     lb_policy: ROUND_ROBIN
     http2_protocol_options: {}
@@ -150,7 +150,7 @@ static_resources:
   {% for model in models %}
   # Cluster for model: {{ model.name }}
   - name: {{ model.cluster_name }}_cluster
-    connect_timeout: 300s
+    connect_timeout: 600s
     type: {{ model.cluster_type }}
     {% if model.cluster_type == 'LOGICAL_DNS' %}
     dns_lookup_family: V4_ONLY
@@ -189,7 +189,7 @@ static_resources:
   - name: anthropic_api_cluster
     type: LOGICAL_DNS
     dns_lookup_family: V4_ONLY
-    connect_timeout: 60s
+    connect_timeout: 600s
     lb_policy: ROUND_ROBIN
     load_assignment:
       cluster_name: anthropic_api_cluster
```
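Net effect of the Envoy template changes: every 300s timeout (route, ext_proc message, cluster connect) doubles to 600s, the Anthropic cluster's 60s connect timeout is raised to the same 600s, and the per-connection buffer limit grows tenfold, from 50 MiB to 500 MiB. A quick check of the byte math:

```python
# Arithmetic check on the buffer-limit bump: both values are exact MiB counts.
MIB = 1024 * 1024
assert 52_428_800 == 50 * MIB    # old perConnectionBufferLimitBytes
assert 524_288_000 == 500 * MIB  # new perConnectionBufferLimitBytes
```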
```diff
--- a/cli/templates/router-defaults.yaml
+++ b/cli/templates/router-defaults.yaml
@@ -172,7 +172,7 @@ looper:
   enabled: true  # Enable looper for multi-model decisions
   # Endpoint points to Envoy (same container), which handles load balancing and auth
   # Port should match listener port (default: 8888)
-  endpoint: "http://localhost:8888/v1/chat/completions"
+  endpoint: "http://localhost:8899/v1/chat/completions"
   timeout_seconds: 120  # Timeout in seconds for each model call
   headers: {}  # Optional headers (e.g., {"Authorization": "Bearer xxx"})
 
```
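The looper endpoint moves from port 8888 to 8899, while the comment above it still cites 8888 as the default listener port. A hypothetical smoke test against the new endpoint (the model name and payload are illustrative, not from the package):

```python
# Hypothetical smoke test of the updated looper endpoint.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8899/v1/chat/completions",
    data=json.dumps({
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": "ping"}],
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# timeout matches the looper's timeout_seconds above
with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```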
```diff
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-sr"
-version = "0.1.0.beta.2.dev20260203201608"
+version = "0.1.0.beta.2.dev20260204070724"
 description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
 authors = [{name = "vLLM-SR Team"}]
 readme = "README.md"
```
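The pyproject version and the PKG-INFO version name the same release: PEP 440 normalizes the `beta` spelling to `b` and strips the extra separators, which a quick check with the `packaging` library confirms:

```python
# PEP 440 normalization: "0.1.0.beta.2.devN" and "0.1.0b2.devN" are equal.
from packaging.version import Version

v = Version("0.1.0.beta.2.dev20260204070724")
assert str(v) == "0.1.0b2.dev20260204070724"
```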
```diff
--- a/PKG-INFO
+++ b/vllm_sr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260203201608
+Version: 0.1.0b2.dev20260204070724
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0
```