vllm-sr 0.1.0b2.dev20260129021803__tar.gz → 0.1.0b2.dev20260129090916__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {vllm_sr-0.1.0b2.dev20260129021803/vllm_sr.egg-info → vllm_sr-0.1.0b2.dev20260129090916}/PKG-INFO +1 -1
  2. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/router-defaults.yaml +27 -14
  3. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/pyproject.toml +1 -1
  4. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916/vllm_sr.egg-info}/PKG-INFO +1 -1
  5. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/MANIFEST.in +0 -0
  6. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/README.md +0 -0
  7. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/__init__.py +0 -0
  8. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/__init__.py +0 -0
  9. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/config.py +0 -0
  10. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/generate.py +0 -0
  11. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/init.py +0 -0
  12. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/serve.py +0 -0
  13. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/show_config.py +0 -0
  14. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/show_defaults.py +0 -0
  15. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/commands/validate.py +0 -0
  16. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/config_generator.py +0 -0
  17. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/consts.py +0 -0
  18. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/core.py +0 -0
  19. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/defaults.py +0 -0
  20. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/docker_cli.py +0 -0
  21. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/logo.py +0 -0
  22. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/main.py +0 -0
  23. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/merger.py +0 -0
  24. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/models.py +0 -0
  25. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/parser.py +0 -0
  26. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/config.template.yaml +0 -0
  27. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/envoy.template.yaml +0 -0
  28. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/generate_dashboard.py +0 -0
  29. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/grafana-dashboard.serve.yaml +0 -0
  30. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/grafana-datasource-jaeger.serve.yaml +0 -0
  31. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/grafana-datasource.serve.yaml +0 -0
  32. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/grafana.serve.ini +0 -0
  33. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/llm-router-dashboard.serve.json +0 -0
  34. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/prometheus.serve.yaml +0 -0
  35. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/templates/tools_db.json +0 -0
  36. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/utils.py +0 -0
  37. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/cli/validator.py +0 -0
  38. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/requirements.txt +0 -0
  39. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/setup.cfg +0 -0
  40. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/tests/test_plugin_parsing.py +0 -0
  41. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/tests/test_plugin_yaml_generation.py +0 -0
  42. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/vllm_sr.egg-info/SOURCES.txt +0 -0
  43. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/vllm_sr.egg-info/dependency_links.txt +0 -0
  44. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/vllm_sr.egg-info/entry_points.txt +0 -0
  45. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/vllm_sr.egg-info/requires.txt +0 -0
  46. {vllm_sr-0.1.0b2.dev20260129021803 → vllm_sr-0.1.0b2.dev20260129090916}/vllm_sr.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vllm-sr
3
- Version: 0.1.0b2.dev20260129021803
3
+ Version: 0.1.0b2.dev20260129090916
4
4
  Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
5
5
  Author: vLLM-SR Team
6
6
  License: Apache-2.0
@@ -10,6 +10,7 @@
10
10
  # "models/mom-feedback-detector": "llm-semantic-router/feedback-detector"
11
11
  # "models/mom-embedding-pro": "Qwen/Qwen3-Embedding-0.6B"
12
12
  # "models/mom-embedding-flash": "google/embeddinggemma-300m"
13
+ # "models/mom-embedding-ultra": "llm-semantic-router/mmbert-embed-32k-2d-matryoshka"
13
14
 
14
15
  # Response API Configuration
15
16
  # Enables OpenAI Response API support with conversation chaining
@@ -19,6 +20,14 @@ response_api:
19
20
  ttl_seconds: 86400 # 24 hours
20
21
  max_responses: 1000
21
22
 
23
+ # Router Replay Configuration (System-Level)
24
+ # Provides storage backend configuration for router_replay plugin
25
+ # Per-decision settings (max_records, capture settings) are configured via router_replay plugin
26
+ router_replay:
27
+ store_backend: "memory" # Options: "memory", "redis", "postgres", "milvus"
28
+ ttl_seconds: 2592000 # 30 days retention (for persistent backends)
29
+ async_writes: false # Enable async writes for better performance
30
+
22
31
  semantic_cache:
23
32
  enabled: true
24
33
  backend_type: "memory" # Options: "memory", "milvus", or "hybrid"
@@ -37,14 +46,9 @@ semantic_cache:
37
46
  # backend_config_path: "config/milvus.yaml" # Path to Milvus config
38
47
 
39
48
  # Embedding model for semantic similarity matching
40
- # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
41
- # Default: "bert" (fastest, lowest memory)
42
- embedding_model: "bert"
43
-
44
- bert_model:
45
- model_id: models/mom-embedding-light
46
- threshold: 0.6
47
- use_cpu: true
49
+ # If not specified, automatically uses the model configured in embedding_models section
50
+ # Options: "mmbert" (multilingual, 768-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
51
+ # embedding_model: "mmbert" # Optional: explicitly set if you want to override auto-detection
48
52
 
49
53
  tools:
50
54
  enabled: true
@@ -125,20 +129,29 @@ feedback_detector:
125
129
  # access_key: "" # Optional: for Authorization header (Bearer token)
126
130
 
127
131
  # Embedding Models Configuration
128
- # These models provide intelligent embedding generation with automatic routing:
129
- # - Qwen3-Embedding-0.6B: Up to 32K context, high quality,
130
- # - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
132
+ # This is the UNIFIED configuration for all embedding-related features:
133
+ # - Semantic Cache: Automatically uses the configured model
134
+ # - Tool Selection: Uses the configured model for tool matching
135
+ # - Embedding Signal: Uses the model specified in hnsw_config.model_type
136
+ # - Complexity Signal: Uses the model specified in hnsw_config.model_type
137
+ #
138
+ # Available models:
139
+ # - Qwen3-Embedding-0.6B (Pro): Up to 32K context, high quality, 1024-dim
140
+ # - EmbeddingGemma-300M (Flash): Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
141
+ # - mmBERT-Embed-32K-2D-Matryoshka (Ultra): Up to 32K context, 1800+ languages, 2D Matryoshka (layer early exit + dimension reduction)
131
142
  embedding_models:
132
- qwen3_model_path: "models/mom-embedding-pro"
143
+ # qwen3_model_path: "models/mom-embedding-pro"
133
144
  # gemma_model_path: "models/mom-embedding-flash"
145
+ mmbert_model_path: "models/mom-embedding-ultra"
134
146
  use_cpu: true # Set to false for GPU acceleration (requires CUDA)
135
147
  # HNSW Configuration
136
148
  # Improves performance by preloading candidate embeddings at startup
137
149
  # and using HNSW index for O(log n) similarity search
138
150
  hnsw_config:
139
- model_type: "qwen3" # Which model to use: "qwen3" (high quality) or "gemma" (fast)
151
+ model_type: "mmbert" # Which model to use: "qwen3" (high quality), "gemma" (fast), or "mmbert" (multilingual)
140
152
  preload_embeddings: true # Precompute candidate embeddings at startup
141
- target_dimension: 1024 # Embedding dimension
153
+ target_dimension: 768 # Embedding dimension (1024 for qwen3, 768 for gemma/mmbert)
154
+ # For mmbert only: target_layer (3/6/11/22) for layer early exit
142
155
  enable_soft_matching: true
143
156
  min_score_threshold: 0.5
144
157
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "vllm-sr"
7
- version = "0.1.0.beta.2.dev20260129021803"
7
+ version = "0.1.0.beta.2.dev20260129090916"
8
8
  description = "vLLM Semantic Router - Intelligent routing for Mixture-of-Models"
9
9
  authors = [{name = "vLLM-SR Team"}]
10
10
  readme = "README.md"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vllm-sr
3
- Version: 0.1.0b2.dev20260129021803
3
+ Version: 0.1.0b2.dev20260129090916
4
4
  Summary: vLLM Semantic Router - Intelligent routing for Mixture-of-Models
5
5
  Author: vLLM-SR Team
6
6
  License: Apache-2.0