sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +9 -7
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mooncake/conn.py +44 -56
- sglang/srt/distributed/parallel_state.py +33 -0
- sglang/srt/entrypoints/engine.py +30 -26
- sglang/srt/entrypoints/openai/serving_chat.py +21 -2
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/qwen3_detector.py +150 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +13 -0
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +187 -12
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +26 -108
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +343 -3
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +87 -53
- sglang/srt/lora/mem_pool.py +81 -33
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +241 -0
- sglang/srt/managers/io_struct.py +41 -29
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +150 -110
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +243 -61
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +11 -3
- sglang/srt/managers/tp_worker.py +14 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +7 -16
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +152 -0
- sglang/srt/mem_cache/hiradix_cache.py +179 -4
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +41 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +5 -6
- sglang/srt/model_executor/forward_batch_info.py +14 -1
- sglang/srt/model_executor/model_runner.py +109 -22
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +191 -171
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +3 -3
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -5
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +56 -18
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +393 -230
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils.py +27 -1
- sglang/test/runners.py +14 -3
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/memory_pool_host.py

@@ -71,11 +71,12 @@ class HostKVCache(abc.ABC):
         requested_bytes = self.size * self.size_per_token
         # preserve at least 10GB for other usage
         ten_gb = 10 * (1024**3)
-        if requested_bytes > host_mem.available - ten_gb:
+        available_bytes = host_mem.available - ten_gb
+        if requested_bytes > available_bytes:
             raise ValueError(
                 f"Not enough host memory available. Requesting "
                 f"{requested_bytes / 1e9:.2f} GB but only have "
-                f"{
+                f"{available_bytes / 1e9:.2f} GB free. Please reduce the "
                 f"size of the hierarchical cache."
             )
         else:
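Editor's note: the guard above reserves 10 GB of host RAM and now names the remainder `available_bytes`, so the error message reports the same figure the check uses. A minimal standalone sketch of the same guard, assuming `host_mem` comes from `psutil.virtual_memory()` (the hunk does not show where it is obtained, and the helper name is hypothetical):

```python
import psutil


def check_host_cache_fit(requested_bytes: int, reserve_gb: int = 10) -> int:
    """Hypothetical helper mirroring the guard above: keep `reserve_gb` of host
    RAM in reserve and fail early if the requested host KV cache does not fit."""
    host_mem = psutil.virtual_memory()
    available_bytes = host_mem.available - reserve_gb * (1024**3)
    if requested_bytes > available_bytes:
        raise ValueError(
            f"Not enough host memory available. Requesting "
            f"{requested_bytes / 1e9:.2f} GB but only have "
            f"{available_bytes / 1e9:.2f} GB free."
        )
    return available_bytes
```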
@@ -98,6 +99,20 @@ class HostKVCache(abc.ABC):
     def init_kv_buffer(self):
         raise NotImplementedError()
 
+    @abc.abstractmethod
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        """
+        Get a flat data page from the host memory pool.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        """
+        Set a flat data page to the host memory pool.
+        """
+        raise NotImplementedError()
+
     @synchronized()
     def clear(self):
         # Initialize memory states and tracking structures.
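Editor's note: these two abstract methods define a per-page transfer interface for host KV pools: expose one page as a flat tensor, and write one back in place. A minimal sketch of a conforming implementation over a toy `[layer, token, head_dim]` buffer (illustrative only; the class name and layout are not from the diff):

```python
import torch


class ToyHostPool:
    """Toy pool with layout [layer_num, num_tokens, head_dim]; real pools differ."""

    def __init__(self, layer_num=2, num_tokens=16, head_dim=8, page_size=4):
        self.page_size = page_size
        self.kv_buffer = torch.zeros(layer_num, num_tokens, head_dim)

    def get_flat_data_page(self, index: int) -> torch.Tensor:
        # Slice one page along the token axis and flatten it for transfer.
        return self.kv_buffer[:, index : index + self.page_size, :].flatten()

    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
        # Restore the page shape before writing it back in place.
        self.kv_buffer[:, index : index + self.page_size, :] = data_page.reshape(
            self.kv_buffer.shape[0], self.page_size, self.kv_buffer.shape[2]
        )
```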
@@ -226,6 +241,19 @@ class MHATokenToKVPoolHost(HostKVCache):
             pin_memory=self.pin_memory,
         )
 
+    # todo, page first memory layout
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        return self.kv_buffer[:, :, index : index + self.page_size, :, :].flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        self.kv_buffer[:, :, index : index + self.page_size, :, :] = data_page.reshape(
+            2,
+            self.layer_num,
+            self.page_size,
+            self.head_num,
+            self.head_dim,
+        )
+
     @property
     def k_buffer(self):
         return self.kv_buffer[0]
@@ -275,3 +303,14 @@ class MLATokenToKVPoolHost(HostKVCache):
             device=self.device,
             pin_memory=self.pin_memory,
         )
+
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        return self.kv_buffer[:, index : index + self.page_size, :, :].flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        self.kv_buffer[:, index : index + self.page_size, :, :] = data_page.reshape(
+            self.layer_num,
+            self.page_size,
+            1,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+        )
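Editor's note: both pool classes implement the same flatten/reshape round trip and differ only in layout: MHA pages flatten from `[2, layer_num, page_size, head_num, head_dim]` (K and V stacked on the first axis), MLA pages from `[layer_num, page_size, 1, kv_lora_rank + qk_rope_head_dim]`. A quick self-check of the MHA round trip with toy sizes, assuming that layout:

```python
import torch

# Toy dimensions standing in for (K/V, layers, tokens-per-page, heads, head_dim).
layer_num, page_size, head_num, head_dim = 2, 4, 8, 64
kv_buffer = torch.randn(2, layer_num, 3 * page_size, head_num, head_dim)

index = page_size  # start of the second page along the token axis
flat = kv_buffer[:, :, index : index + page_size, :, :].flatten()

restored = flat.reshape(2, layer_num, page_size, head_num, head_dim)
assert torch.equal(restored, kv_buffer[:, :, index : index + page_size, :, :])
```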
sglang/srt/mem_cache/radix_cache.py

@@ -55,8 +55,13 @@ class TreeNode:
         self.hit_count = 0
         # indicating the node is loading KV cache from host
         self.loading = False
+        # indicating the node is locked to protect from eviction
+        # incremented when the node is referenced by a storage operation
+        self.host_ref_counter = 0
         # store the host indices of KV cache
         self.host_value: Optional[torch.Tensor] = None
+        # store hash values of each pages
+        self.hash_value: Optional[List[str]] = None
 
         self.id = TreeNode.counter if id is None else id
         TreeNode.counter += 1
@@ -69,6 +74,27 @@ class TreeNode:
     def backuped(self):
         return self.host_value is not None
 
+    @property
+    def backuped_storage(self):
+        return self.hash_value is not None and len(self.hash_value) > 0
+
+    def protect_host(self):
+        """Protect the host value from eviction."""
+        self.host_ref_counter += 1
+
+    def release_host(self):
+        """Release the host value, allowing it to be evicted."""
+        if self.host_ref_counter > 0:
+            self.host_ref_counter -= 1
+        else:
+            raise RuntimeError("Host reference counter is already zero.")
+
+    def get_last_hash_value(self) -> Optional[str]:
+        """Returns the hash value of the last page in this node."""
+        if self.hash_value is None or len(self.hash_value) == 0:
+            return None
+        return self.hash_value[-1]
+
     def __lt__(self, other: "TreeNode"):
         return self.last_access_time < other.last_access_time
 
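Editor's note: `protect_host` / `release_host` form a manual reference count that shields a node's host copy from eviction while a storage operation references it; an evictor would only consider nodes whose `host_ref_counter` is zero. A hedged usage sketch of that pattern (the context-manager wrapper and `write_pages_to_storage` are hypothetical, not part of this diff):

```python
from contextlib import contextmanager


@contextmanager
def host_protected(node):
    """Hypothetical helper: hold node.host_ref_counter for the duration of a
    storage operation so the eviction path skips this node."""
    node.protect_host()
    try:
        yield node
    finally:
        node.release_host()


# Illustrative usage:
# with host_protected(node):
#     write_pages_to_storage(node.host_value, node.hash_value)
```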