sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +84 -22
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +25 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +37 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +68 -14
- sglang/srt/models/deepseek_v2.py +62 -28
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +5 -2
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +57 -6
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/debug_utils.py
DELETED
@@ -1,74 +0,0 @@
-import os
-import time
-from pathlib import Path
-
-import torch
-
-from sglang.srt.utils import get_bool_env_var
-
-
-class _Dumper:
-    """Utility to dump tensors, which can be useful when comparison checking models.
-
-    Example usage:
-    debug_utils.dumper.dump("layer_start_hidden_states", hidden_states, layer_id=self.layer_id)
-    """
-
-    def __init__(self):
-        self._enable = get_bool_env_var("SGLANG_DUMPER_ENABLE", "true")
-        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
-        self._enable_write_file = get_bool_env_var("SGLANG_DUMPER_WRITE_FILE", "1")
-        self._partial_name = str(time.time())
-        self.forward_pass_id = None
-
-    def dump(self, name, value, **kwargs):
-        if not self._enable:
-            return
-
-        from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-        rank = get_tensor_model_parallel_rank()
-        full_kwargs = dict(
-            forward_pass_id=self.forward_pass_id,
-            name=name,
-            **kwargs,
-        )
-        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
-        path = (
-            self._base_dir / f"sglang_dump_{self._partial_name}_{rank}" / full_filename
-        )
-
-        sample_value = self._get_sample_value(name, value)
-
-        print(
-            f"[{rank}, {time.time()}] {path} "
-            f"type={type(value)} "
-            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
-            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
-            f"sample_value={sample_value}"
-        )
-
-        if self._enable_write_file:
-            path.parent.mkdir(parents=True, exist_ok=True)
-            torch.save(value, str(path))
-
-    def _get_sample_value(self, name, value):
-        if value is None:
-            return None
-
-        if isinstance(value, tuple):
-            return [self._get_sample_value(name, x) for x in value]
-
-        if not isinstance(value, torch.Tensor):
-            return None
-
-        if value.numel() < 200:
-            return value
-
-        slices = [
-            slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
-        ]
-        return value[tuple(slices)]
-
-
-dumper = _Dumper()
File without changes
|
File without changes
|
File without changes
|