vllm-sr 0.1.0b2.dev20260129021916__py3-none-any.whl → 0.1.0b2.dev20260129200211__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/templates/router-defaults.yaml +25 -22
- {vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/METADATA +1 -1
- {vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/RECORD +6 -6
- {vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/WHEEL +0 -0
- {vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/entry_points.txt +0 -0
- {vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/top_level.txt +0 -0
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
# "models/mom-feedback-detector": "llm-semantic-router/feedback-detector"
|
|
11
11
|
# "models/mom-embedding-pro": "Qwen/Qwen3-Embedding-0.6B"
|
|
12
12
|
# "models/mom-embedding-flash": "google/embeddinggemma-300m"
|
|
13
|
+
# "models/mom-embedding-ultra": "llm-semantic-router/mmbert-embed-32k-2d-matryoshka"
|
|
13
14
|
|
|
14
15
|
# Response API Configuration
|
|
15
16
|
# Enables OpenAI Response API support with conversation chaining
|
|
@@ -19,15 +20,13 @@ response_api:
|
|
|
19
20
|
ttl_seconds: 86400 # 24 hours
|
|
20
21
|
max_responses: 1000
|
|
21
22
|
|
|
22
|
-
# Router Replay Configuration
|
|
23
|
-
#
|
|
23
|
+
# Router Replay Configuration (System-Level)
|
|
24
|
+
# Provides storage backend configuration for router_replay plugin
|
|
25
|
+
# Per-decision settings (max_records, capture settings) are configured via router_replay plugin
|
|
24
26
|
router_replay:
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
capture_request_body: true
|
|
29
|
-
capture_response_body: true
|
|
30
|
-
max_body_bytes: 4096
|
|
27
|
+
store_backend: "memory" # Options: "memory", "redis", "postgres", "milvus"
|
|
28
|
+
ttl_seconds: 2592000 # 30 days retention (for persistent backends)
|
|
29
|
+
async_writes: false # Enable async writes for better performance
|
|
31
30
|
|
|
32
31
|
semantic_cache:
|
|
33
32
|
enabled: true
|
|
@@ -47,14 +46,9 @@ semantic_cache:
|
|
|
47
46
|
# backend_config_path: "config/milvus.yaml" # Path to Milvus config
|
|
48
47
|
|
|
49
48
|
# Embedding model for semantic similarity matching
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
embedding_model: "
|
|
53
|
-
|
|
54
|
-
bert_model:
|
|
55
|
-
model_id: models/mom-embedding-light
|
|
56
|
-
threshold: 0.6
|
|
57
|
-
use_cpu: true
|
|
49
|
+
# If not specified, automatically uses the model configured in embedding_models section
|
|
50
|
+
# Options: "mmbert" (multilingual, 768-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
|
|
51
|
+
# embedding_model: "mmbert" # Optional: explicitly set if you want to override auto-detection
|
|
58
52
|
|
|
59
53
|
tools:
|
|
60
54
|
enabled: true
|
|
@@ -135,20 +129,29 @@ feedback_detector:
|
|
|
135
129
|
# access_key: "" # Optional: for Authorization header (Bearer token)
|
|
136
130
|
|
|
137
131
|
# Embedding Models Configuration
|
|
138
|
-
#
|
|
139
|
-
# -
|
|
140
|
-
# -
|
|
132
|
+
# This is the UNIFIED configuration for all embedding-related features:
|
|
133
|
+
# - Semantic Cache: Automatically uses the configured model
|
|
134
|
+
# - Tool Selection: Uses the configured model for tool matching
|
|
135
|
+
# - Embedding Signal: Uses the model specified in hnsw_config.model_type
|
|
136
|
+
# - Complexity Signal: Uses the model specified in hnsw_config.model_type
|
|
137
|
+
#
|
|
138
|
+
# Available models:
|
|
139
|
+
# - Qwen3-Embedding-0.6B (Pro): Up to 32K context, high quality, 1024-dim
|
|
140
|
+
# - EmbeddingGemma-300M (Flash): Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
|
|
141
|
+
# - mmBERT-Embed-32K-2D-Matryoshka (Ultra): Up to 32K context, 1800+ languages, 2D Matryoshka (layer early exit + dimension reduction)
|
|
141
142
|
embedding_models:
|
|
142
|
-
qwen3_model_path: "models/mom-embedding-pro"
|
|
143
|
+
# qwen3_model_path: "models/mom-embedding-pro"
|
|
143
144
|
# gemma_model_path: "models/mom-embedding-flash"
|
|
145
|
+
mmbert_model_path: "models/mom-embedding-ultra"
|
|
144
146
|
use_cpu: true # Set to false for GPU acceleration (requires CUDA)
|
|
145
147
|
# HNSW Configuration
|
|
146
148
|
# Improves performance by preloading candidate embeddings at startup
|
|
147
149
|
# and using HNSW index for O(log n) similarity search
|
|
148
150
|
hnsw_config:
|
|
149
|
-
model_type: "
|
|
151
|
+
model_type: "mmbert" # Which model to use: "qwen3" (high quality), "gemma" (fast), or "mmbert" (multilingual)
|
|
150
152
|
preload_embeddings: true # Precompute candidate embeddings at startup
|
|
151
|
-
target_dimension:
|
|
153
|
+
target_dimension: 768 # Embedding dimension (1024 for qwen3, 768 for gemma/mmbert)
|
|
154
|
+
# For mmbert only: target_layer (3/6/11/22) for layer early exit
|
|
152
155
|
enable_soft_matching: true
|
|
153
156
|
min_score_threshold: 0.5
|
|
154
157
|
|
{vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/RECORD
RENAMED
|
@@ -28,10 +28,10 @@ cli/templates/grafana-datasource.serve.yaml,sha256=Cxjz1zVWoUdSzbSsS_iJhMHRrmRi6
|
|
|
28
28
|
cli/templates/grafana.serve.ini,sha256=x9bCkzxqm5gC4fKToY2lhNPdWhwAaJGVe5ABMW6Dv-c,1674
|
|
29
29
|
cli/templates/llm-router-dashboard.serve.json,sha256=pwnTjUh7z3_3LnIwtaLXjDWH4aHd2Mc57z0oekgt-Bk,60903
|
|
30
30
|
cli/templates/prometheus.serve.yaml,sha256=MGYq8dlRq_i2m5sogQ--kwTvJpkf44QQoCNoI7oyVT8,270
|
|
31
|
-
cli/templates/router-defaults.yaml,sha256=
|
|
31
|
+
cli/templates/router-defaults.yaml,sha256=crPnhOGAQYMgnIjHJEU8aNtlplau8wjrvGLrjqPsnwY,8647
|
|
32
32
|
cli/templates/tools_db.json,sha256=CPqPBkd5nc966m1YEozz06frrmv3Pd5rrkxKkO3rTiA,4537
|
|
33
|
-
vllm_sr-0.1.0b2.
|
|
34
|
-
vllm_sr-0.1.0b2.
|
|
35
|
-
vllm_sr-0.1.0b2.
|
|
36
|
-
vllm_sr-0.1.0b2.
|
|
37
|
-
vllm_sr-0.1.0b2.
|
|
33
|
+
vllm_sr-0.1.0b2.dev20260129200211.dist-info/METADATA,sha256=_nL5xOg9HQm-cK0BkXeVbwYYAcBJKUOiJxxYSOQt3zY,7173
|
|
34
|
+
vllm_sr-0.1.0b2.dev20260129200211.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
35
|
+
vllm_sr-0.1.0b2.dev20260129200211.dist-info/entry_points.txt,sha256=WhlBPbLHUpWUsMuUQX9cnvsYMf0ih5i57vvJ1jJNi0k,42
|
|
36
|
+
vllm_sr-0.1.0b2.dev20260129200211.dist-info/top_level.txt,sha256=2ImG917oaVHlm0nP9oJE-Qrgs-fq_fGWgba2H1f8fpE,4
|
|
37
|
+
vllm_sr-0.1.0b2.dev20260129200211.dist-info/RECORD,,
|
{vllm_sr-0.1.0b2.dev20260129021916.dist-info → vllm_sr-0.1.0b2.dev20260129200211.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|