PyPI - vllm-sr - Versions diffs - 0.1.0b2.dev20260126190945__tar.gz → 0.1.0b2.dev20260202073049__tar.gz - Mend

vllm-sr 0.1.0b2.dev20260126190945__tar.gz → 0.1.0b2.dev20260202073049__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{vllm_sr-0.1.0b2.dev20260126190945/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260202073049}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260126190945
+Version: 0.1.0b2.dev20260202073049
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049}/cli/merger.py RENAMED Viewed

@@ -185,6 +185,37 @@ def translate_context_signals(context_rules: list) -> list:
     return rules
+def translate_complexity_signals(complexity_rules: list) -> list:
+    """
+    Translate complexity signals to router format.
+    Args:
+        complexity_rules: List of ComplexityRule objects
+    Returns:
+        list: Router complexity rules
+    """
+    rules = []
+    for signal in complexity_rules:
+        rule = {
+            "name": signal.name,
+            "threshold": signal.threshold,
+            "hard": {"candidates": signal.hard.candidates},
+            "easy": {"candidates": signal.easy.candidates},
+        }
+        if signal.description:
+            rule["description"] = signal.description
+        if signal.composer:
+            rule["composer"] = {
+                "operator": signal.composer.operator,
+                "conditions": [
+                    {"type": c.type, "name": c.name} for c in signal.composer.conditions
+                ],
+            }
+        rules.append(rule)
+    return rules
 def translate_external_models(external_models: list) -> list:
     """
     Translate external models to router format.
@@ -441,6 +472,14 @@ def merge_configs(user_config: UserConfig, defaults: Dict[str, Any]) -> Dict[str
             )
             log.info(f"  Added {len(user_config.signals.context)} context signals")
+        if user_config.signals.complexity and len(user_config.signals.complexity) > 0:
+            merged["complexity_rules"] = translate_complexity_signals(
+                user_config.signals.complexity
+            )
+            log.info(
+                f"  Added {len(user_config.signals.complexity)} complexity signals"
+            )
         # Translate domains to categories
         if user_config.signals.domains:
             merged["categories"] = translate_domains_to_categories(

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049}/cli/models.py RENAMED Viewed

@@ -85,6 +85,28 @@ class ContextRule(BaseModel):
     description: Optional[str] = None
+class ComplexityCandidates(BaseModel):
+    """Complexity candidates configuration."""
+    candidates: List[str]
+class ComplexityRule(BaseModel):
+    """Complexity-based signal configuration using embedding similarity.
+    The composer field allows filtering based on other signals (e.g., only apply
+    code_complexity when domain is "computer_science"). This is evaluated after
+    all signals are computed in parallel, enabling signal dependencies.
+    """
+    name: str
+    threshold: float = 0.1
+    hard: ComplexityCandidates
+    easy: ComplexityCandidates
+    description: Optional[str] = None
+    composer: Optional["Rules"] = None  # Forward reference, defined below
 class Signals(BaseModel):
     """All signal configurations."""
@@ -97,6 +119,7 @@ class Signals(BaseModel):
     language: Optional[List[Language]] = []
     latency: Optional[List[Latency]] = []
     context: Optional[List[ContextRule]] = []
+    complexity: Optional[List[ComplexityRule]] = []
 class Condition(BaseModel):

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049}/cli/templates/config.template.yaml RENAMED Viewed

@@ -186,8 +186,8 @@ signals:
       max_tpot: 0.15  # 150ms per token
       description: "For standard applications with moderate latency tolerance"
-  # context_rules - Context length signals (Token Count)
-  context_rules:
+  # context - Context length signals (Token Count)
+  context:
     - name: "low_token_count"
       min_tokens: "0"
       max_tokens: "1K"
@@ -197,6 +197,60 @@ signals:
       max_tokens: "128K"
       description: "Long requests requiring large context window"
+  # complexity - Complexity signals (Embedding-based difficulty detection)
+  # IMPORTANT: It is strongly recommended to configure a composer for each complexity rule
+  # to filter based on other signals (e.g., domain). This prevents misclassification where
+  # a math question might match code_complexity or vice versa.
+  complexity:
+    - name: "code_complexity"
+      composer:
+        operator: "AND"
+        conditions:
+          - type: "domain"
+            name: "computer science"
+      threshold: 0.1
+      hard:
+        candidates:
+          - "design distributed system"
+          - "implement consensus algorithm"
+          - "optimize for scale"
+          - "architect microservices"
+          - "fix race condition"
+          - "implement garbage collector"
+      easy:
+        candidates:
+          - "print hello world"
+          - "loop through array"
+          - "read file"
+          - "sort list"
+          - "string concatenation"
+          - "simple function"
+      description: "Detects code complexity level"
+    - name: "math_complexity"
+      composer:
+        operator: "AND"
+        conditions:
+          - type: "domain"
+            name: "math"
+      threshold: 0.1
+      hard:
+        candidates:
+          - "prove mathematically"
+          - "derive the equation"
+          - "formal proof"
+          - "solve differential equation"
+          - "prove by induction"
+          - "analyze convergence"
+      easy:
+        candidates:
+          - "what is 2+2"
+          - "simple arithmetic"
+          - "basic calculation"
+          - "count numbers"
+          - "add two numbers"
+          - "multiply values"
+      description: "Detects mathematical complexity level"
 # Decisions - Routing logic
 decisions:
   # Highest priority: Preference-based routing via external LLM

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049}/cli/templates/router-defaults.yaml RENAMED Viewed

@@ -10,6 +10,7 @@
 #   "models/mom-feedback-detector": "llm-semantic-router/feedback-detector"
 #   "models/mom-embedding-pro": "Qwen/Qwen3-Embedding-0.6B"
 #   "models/mom-embedding-flash": "google/embeddinggemma-300m"
+#   "models/mom-embedding-ultra": "llm-semantic-router/mmbert-embed-32k-2d-matryoshka"
 # Response API Configuration
 # Enables OpenAI Response API support with conversation chaining
@@ -19,6 +20,14 @@ response_api:
   ttl_seconds: 86400       # 24 hours
   max_responses: 1000
+# Router Replay Configuration (System-Level)
+# Provides storage backend configuration for router_replay plugin
+# Per-decision settings (max_records, capture settings) are configured via router_replay plugin
+router_replay:
+  store_backend: "memory"  # Options: "memory", "redis", "postgres", "milvus"
+  ttl_seconds: 2592000     # 30 days retention (for persistent backends)
+  async_writes: false      # Enable async writes for better performance
 semantic_cache:
   enabled: true
   backend_type: "memory"  # Options: "memory", "milvus", or "hybrid"
@@ -37,14 +46,9 @@ semantic_cache:
   # backend_config_path: "config/milvus.yaml" # Path to Milvus config
   # Embedding model for semantic similarity matching
-  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
-  # Default: "bert" (fastest, lowest memory)
-  embedding_model: "bert"
-bert_model:
-  model_id: models/mom-embedding-light
-  threshold: 0.6
-  use_cpu: true
+  # If not specified, automatically uses the model configured in embedding_models section
+  # Options: "mmbert" (multilingual, 768-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  # embedding_model: "mmbert"  # Optional: explicitly set if you want to override auto-detection
 tools:
   enabled: true
@@ -125,20 +129,29 @@ feedback_detector:
 #     access_key: ""  # Optional: for Authorization header (Bearer token)
 # Embedding Models Configuration
-# These models provide intelligent embedding generation with automatic routing:
-# - Qwen3-Embedding-0.6B: Up to 32K context, high quality,
-# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
+# This is the UNIFIED configuration for all embedding-related features:
+# - Semantic Cache: Automatically uses the configured model
+# - Tool Selection: Uses the configured model for tool matching
+# - Embedding Signal: Uses the model specified in hnsw_config.model_type
+# - Complexity Signal: Uses the model specified in hnsw_config.model_type
+#
+# Available models:
+# - Qwen3-Embedding-0.6B (Pro): Up to 32K context, high quality, 1024-dim
+# - EmbeddingGemma-300M (Flash): Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
+# - mmBERT-Embed-32K-2D-Matryoshka (Ultra): Up to 32K context, 1800+ languages, 2D Matryoshka (layer early exit + dimension reduction)
 embedding_models:
-  qwen3_model_path: "models/mom-embedding-pro"
+  # qwen3_model_path: "models/mom-embedding-pro"
   # gemma_model_path: "models/mom-embedding-flash"
+  mmbert_model_path: "models/mom-embedding-ultra"
   use_cpu: true  # Set to false for GPU acceleration (requires CUDA)
   # HNSW Configuration
   # Improves performance by preloading candidate embeddings at startup
   # and using HNSW index for O(log n) similarity search
   hnsw_config:
-    model_type: "qwen3"         # Which model to use: "qwen3" (high quality) or "gemma" (fast)
+    model_type: "mmbert"         # Which model to use: "qwen3" (high quality), "gemma" (fast), or "mmbert" (multilingual)
     preload_embeddings: true    # Precompute candidate embeddings at startup
-    target_dimension: 1024       # Embedding dimension
+    target_dimension: 768        # Embedding dimension (1024 for qwen3, 768 for gemma/mmbert)
+    # For mmbert only: target_layer (3/6/11/22) for layer early exit
     enable_soft_matching: true
     min_score_threshold: 0.5

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "vllm-sr"
-version = "0.1.0.beta.2.dev20260126190945"
+version = "0.1.0.beta.2.dev20260202073049"
 description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
 authors = [{name = "vLLM-SR Team"}]
 readme = "README.md"

{vllm_sr-0.1.0b2.dev20260126190945 → vllm_sr-0.1.0b2.dev20260202073049/vllm_sr.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vllm-sr
-Version: 0.1.0b2.dev20260126190945
+Version: 0.1.0b2.dev20260202073049
 Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
 Author: vLLM-SR Team
 License: Apache-2.0