sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/gpt_oss.py
CHANGED
@@ -58,7 +58,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_utils import dequant_mxfp4
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
-from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -71,6 +71,7 @@ from sglang.srt.utils import (
     add_prefix,
     is_cuda,
     is_flashinfer_available,
+    is_sm100_supported,
     make_layers,
 )

@@ -1028,10 +1029,6 @@ class GptOssForCausalLM(nn.Module):
         )

         params_dict = dict(self.named_parameters())
-        params_checker = {k: False for k, v in params_dict.items()}
-
-        for other_loaded_param_name in other_loaded_param_names:
-            params_checker[other_loaded_param_name] = True

         for name, loaded_weight in weights:
             loaded_weight = _WeightCreator.maybe_materialize(loaded_weight)
@@ -1068,7 +1065,6 @@ class GptOssForCausalLM(nn.Module):
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
-                params_checker[name] = True
                 break
             else:
                 for mapping in expert_params_mapping:
@@ -1091,7 +1087,6 @@ class GptOssForCausalLM(nn.Module):
                         name,
                         shard_id=shard_id,
                     )
-                    params_checker[name] = True
                     break
                 else:
                     if name.endswith(".bias") and name not in params_dict:
@@ -1110,17 +1105,9 @@ class GptOssForCausalLM(nn.Module):
                             param, "weight_loader", default_weight_loader
                         )
                         weight_loader(param, loaded_weight)
-                        params_checker[name] = True
                     else:
                         logger.warning(f"Parameter {name} not found in params_dict")

-        not_loaded_params = [k for k, v in params_checker.items() if not v]
-        if tp_rank == 0:
-            if len(not_loaded_params) > 0:
-                raise Exception(f"Not all parameters loaded: {not_loaded_params}")
-            else:
-                logging.info("All parameters loaded successfully.")
-
     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight
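Taken together, these hunks remove the `params_checker` bookkeeping from `GptOssForCausalLM.load_weights`: 0.5.1.post2 marked every parameter as it was loaded and, on TP rank 0, raised if any registered parameter was never touched, while 0.5.2rc0 drops that audit and keeps only the per-name warning. For reference, a minimal self-contained sketch of the pattern being removed (`params_dict` and `weights` here are placeholder structures, not the sglang API):

```python
# Sketch of the removed "did every parameter get loaded?" audit.
import logging

logging.basicConfig(level=logging.INFO)


def load_weights_with_check(params_dict: dict, weights: dict) -> None:
    """Load weights, then raise if any registered parameter was never touched."""
    params_checker = {name: False for name in params_dict}  # not-yet-loaded flags

    for name, value in weights.items():
        if name in params_dict:
            params_dict[name] = value  # stand-in for param.weight_loader(...)
            params_checker[name] = True
        else:
            logging.warning("Parameter %s not found in params_dict", name)

    not_loaded = [name for name, done in params_checker.items() if not done]
    if not_loaded:
        raise Exception(f"Not all parameters loaded: {not_loaded}")
    logging.info("All parameters loaded successfully.")


load_weights_with_check({"w": None, "bias": None}, {"w": 1, "bias": 2})  # ok
try:
    load_weights_with_check({"w": None, "bias": None}, {"w": 1})
except Exception as exc:
    print(exc)  # Not all parameters loaded: ['bias']
```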
sglang/srt/models/llama_eagle3.py
CHANGED
@@ -185,9 +185,13 @@ class LlamaForCausalLMEagle3(LlamaForCausalLM):
         )
         # Llama 3.2 1B Instruct set tie_word_embeddings to True
         # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        self.load_lm_head_from_target = False
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
+            if config.draft_vocab_size is None:
+                self.load_lm_head_from_target = True
+                config.draft_vocab_size = config.vocab_size
             self.lm_head = ParallelLMHead(
                 config.draft_vocab_size,
                 config.hidden_size,