sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
@@ -58,7 +58,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig
58
58
  from sglang.srt.layers.quantization.fp8_utils import dequant_mxfp4
59
59
  from sglang.srt.layers.radix_attention import RadixAttention
60
60
  from sglang.srt.layers.rotary_embedding import get_rope
61
- from sglang.srt.layers.utils import PPMissingLayer, get_layer_id, is_sm100_supported
61
+ from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
62
62
  from sglang.srt.layers.vocab_parallel_embedding import (
63
63
  ParallelLMHead,
64
64
  VocabParallelEmbedding,
@@ -71,6 +71,7 @@ from sglang.srt.utils import (
71
71
  add_prefix,
72
72
  is_cuda,
73
73
  is_flashinfer_available,
74
+ is_sm100_supported,
74
75
  make_layers,
75
76
  )
76
77
 
@@ -1028,10 +1029,6 @@ class GptOssForCausalLM(nn.Module):
1028
1029
  )
1029
1030
 
1030
1031
  params_dict = dict(self.named_parameters())
1031
- params_checker = {k: False for k, v in params_dict.items()}
1032
-
1033
- for other_loaded_param_name in other_loaded_param_names:
1034
- params_checker[other_loaded_param_name] = True
1035
1032
 
1036
1033
  for name, loaded_weight in weights:
1037
1034
  loaded_weight = _WeightCreator.maybe_materialize(loaded_weight)
@@ -1068,7 +1065,6 @@ class GptOssForCausalLM(nn.Module):
1068
1065
  param = params_dict[name]
1069
1066
  weight_loader = param.weight_loader
1070
1067
  weight_loader(param, loaded_weight, shard_id)
1071
- params_checker[name] = True
1072
1068
  break
1073
1069
  else:
1074
1070
  for mapping in expert_params_mapping:
@@ -1091,7 +1087,6 @@ class GptOssForCausalLM(nn.Module):
1091
1087
  name,
1092
1088
  shard_id=shard_id,
1093
1089
  )
1094
- params_checker[name] = True
1095
1090
  break
1096
1091
  else:
1097
1092
  if name.endswith(".bias") and name not in params_dict:
@@ -1110,17 +1105,9 @@ class GptOssForCausalLM(nn.Module):
1110
1105
  param, "weight_loader", default_weight_loader
1111
1106
  )
1112
1107
  weight_loader(param, loaded_weight)
1113
- params_checker[name] = True
1114
1108
  else:
1115
1109
  logger.warning(f"Parameter {name} not found in params_dict")
1116
1110
 
1117
- not_loaded_params = [k for k, v in params_checker.items() if not v]
1118
- if tp_rank == 0:
1119
- if len(not_loaded_params) > 0:
1120
- raise Exception(f"Not all parameters loaded: {not_loaded_params}")
1121
- else:
1122
- logging.info("All parameters loaded successfully.")
1123
-
1124
1111
  def get_embed_and_head(self):
1125
1112
  return self.model.embed_tokens.weight, self.lm_head.weight
1126
1113
 
@@ -185,9 +185,13 @@ class LlamaForCausalLMEagle3(LlamaForCausalLM):
185
185
  )
186
186
  # Llama 3.2 1B Instruct set tie_word_embeddings to True
187
187
  # Llama 3.1 8B Instruct set tie_word_embeddings to False
188
+ self.load_lm_head_from_target = False
188
189
  if self.config.tie_word_embeddings:
189
190
  self.lm_head = self.model.embed_tokens
190
191
  else:
192
+ if config.draft_vocab_size is None:
193
+ self.load_lm_head_from_target = True
194
+ config.draft_vocab_size = config.vocab_size
191
195
  self.lm_head = ParallelLMHead(
192
196
  config.draft_vocab_size,
193
197
  config.hidden_size,