sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/bench_offline_throughput.py +20 -0
  2. sglang/bench_one_batch.py +3 -0
  3. sglang/srt/configs/__init__.py +8 -0
  4. sglang/srt/configs/model_config.py +4 -0
  5. sglang/srt/configs/step3_vl.py +172 -0
  6. sglang/srt/conversation.py +23 -0
  7. sglang/srt/disaggregation/decode.py +2 -8
  8. sglang/srt/disaggregation/launch_lb.py +5 -20
  9. sglang/srt/disaggregation/mooncake/conn.py +33 -15
  10. sglang/srt/disaggregation/prefill.py +2 -6
  11. sglang/srt/distributed/parallel_state.py +86 -1
  12. sglang/srt/entrypoints/engine.py +14 -18
  13. sglang/srt/entrypoints/http_server.py +10 -2
  14. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  15. sglang/srt/eplb/expert_distribution.py +5 -0
  16. sglang/srt/eplb/expert_location.py +17 -6
  17. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  18. sglang/srt/eplb/expert_location_updater.py +2 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/step3_detector.py +436 -0
  21. sglang/srt/hf_transformers_utils.py +2 -0
  22. sglang/srt/jinja_template_utils.py +4 -1
  23. sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
  24. sglang/srt/layers/attention/utils.py +6 -1
  25. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  26. sglang/srt/layers/moe/ep_moe/layer.py +39 -674
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
  29. sglang/srt/layers/quantization/fp8.py +52 -18
  30. sglang/srt/layers/quantization/unquant.py +0 -8
  31. sglang/srt/layers/quantization/w4afp8.py +1 -0
  32. sglang/srt/layers/quantization/w8a8_int8.py +4 -1
  33. sglang/srt/managers/cache_controller.py +165 -67
  34. sglang/srt/managers/data_parallel_controller.py +2 -0
  35. sglang/srt/managers/io_struct.py +0 -2
  36. sglang/srt/managers/scheduler.py +90 -671
  37. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  38. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  39. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  40. sglang/srt/managers/template_manager.py +62 -19
  41. sglang/srt/managers/tokenizer_manager.py +123 -74
  42. sglang/srt/managers/tp_worker.py +4 -0
  43. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  44. sglang/srt/mem_cache/hicache_storage.py +60 -17
  45. sglang/srt/mem_cache/hiradix_cache.py +36 -8
  46. sglang/srt/mem_cache/memory_pool.py +15 -118
  47. sglang/srt/mem_cache/memory_pool_host.py +418 -29
  48. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  49. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  50. sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
  51. sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
  52. sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
  53. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
  54. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  55. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  56. sglang/srt/model_executor/cuda_graph_runner.py +25 -1
  57. sglang/srt/model_executor/model_runner.py +13 -1
  58. sglang/srt/model_loader/weight_utils.py +2 -0
  59. sglang/srt/models/arcee.py +532 -0
  60. sglang/srt/models/deepseek_v2.py +7 -6
  61. sglang/srt/models/glm4_moe.py +6 -4
  62. sglang/srt/models/granitemoe.py +3 -0
  63. sglang/srt/models/grok.py +3 -0
  64. sglang/srt/models/hunyuan.py +1 -0
  65. sglang/srt/models/llama4.py +3 -0
  66. sglang/srt/models/mixtral.py +3 -0
  67. sglang/srt/models/olmoe.py +3 -0
  68. sglang/srt/models/phimoe.py +1 -0
  69. sglang/srt/models/step3_vl.py +991 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/reasoning_parser.py +2 -1
  73. sglang/srt/server_args.py +49 -18
  74. sglang/srt/speculative/eagle_worker.py +2 -0
  75. sglang/srt/utils.py +1 -0
  76. sglang/test/attention/test_trtllm_mla_backend.py +945 -0
  77. sglang/utils.py +0 -11
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
  80. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
  81. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
  82. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
  83. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
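The list above spans several themes: the new Step3-VL config, model, and processor, a TRT-LLM MLA attention backend, new hierarchical-cache storage backends (Mooncake store, NIXL, HF3FS), the scheduler split into mixins, and a layer_id parameter threaded into the MoE layers of several models (the hunks below). After upgrading, the version bump from sglang/version.py can be checked at runtime; a minimal check, assuming the package re-exports __version__ at the top level as sglang/version.py suggests:

import sglang

# The wheel on the right-hand side of this diff should report the new version string.
print(sglang.__version__)  # expected: 0.4.10.post1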
sglang/srt/models/llama4.py

@@ -87,6 +87,7 @@ class Llama4MoE(nn.Module):
     def __init__(
         self,
         config: Llama4TextConfig,
+        layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -114,6 +115,7 @@ class Llama4MoE(nn.Module):
             num_experts=config.num_local_experts,
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size_moe,
+            layer_id=layer_id,
             reduce_results=False,
             quant_config=quant_config,
             apply_router_weight_on_input=True,
@@ -373,6 +375,7 @@ class Llama4DecoderLayer(nn.Module):
         if is_moe_layer:
             self.feed_forward = Llama4MoE(
                 config=config,
+                layer_id=layer_id,
                 quant_config=quant_config,
                 prefix=add_prefix("feed_forward", prefix),
             )
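The three llama4.py hunks above show the pattern repeated in the files below: the decoder layer, which already knows its own index, passes layer_id into the MoE wrapper, which forwards it to the expert implementation. A stripped-down sketch of that threading with stand-in classes, not sglang's real ones:

class ToyExperts:
    """Stand-in for FusedMoE: just remembers which layer it serves."""

    def __init__(self, num_experts: int, layer_id: int):
        self.num_experts = num_experts
        self.layer_id = layer_id


class ToyMoE:
    """Mirrors the Llama4MoE change: accept layer_id and forward it to the experts."""

    def __init__(self, num_experts: int, layer_id: int):
        self.experts = ToyExperts(num_experts=num_experts, layer_id=layer_id)


class ToyDecoderLayer:
    """Mirrors Llama4DecoderLayer: the layer index originates here."""

    def __init__(self, num_experts: int, layer_id: int):
        self.feed_forward = ToyMoE(num_experts=num_experts, layer_id=layer_id)


layer = ToyDecoderLayer(num_experts=4, layer_id=3)
assert layer.feed_forward.experts.layer_id == 3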
sglang/srt/models/mixtral.py

@@ -69,6 +69,7 @@ class MixtralMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -97,6 +98,7 @@ class MixtralMoE(nn.Module):
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
@@ -226,6 +228,7 @@ class MixtralDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("block_sparse_moe", prefix),
         )
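In mixtral.py the experts are constructed through an alias (MoEImpl in the hunk above), so whichever implementation the alias resolves to has to accept the new keyword. A hedged sketch of that constraint; the two implementation classes here are hypothetical placeholders, not sglang's:

class TritonLikeExperts:
    def __init__(self, num_experts: int, layer_id: int):
        self.layer_id = layer_id


class EPLikeExperts:
    def __init__(self, num_experts: int, layer_id: int):
        self.layer_id = layer_id


def pick_impl(use_expert_parallel: bool):
    # Both candidates accept layer_id, so the call site below never changes.
    return EPLikeExperts if use_expert_parallel else TritonLikeExperts


MoEImpl = pick_impl(use_expert_parallel=False)
experts = MoEImpl(num_experts=8, layer_id=5)
assert experts.layer_id == 5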
sglang/srt/models/olmoe.py

@@ -63,6 +63,7 @@ class OlmoeMoE(nn.Module):
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
+        layer_id: int = 0,
         prefix: str = "",
     ):
         super().__init__()
@@ -89,6 +90,7 @@ class OlmoeMoE(nn.Module):
             reduce_results=True,
             quant_config=quant_config,
             tp_size=tp_size,
+            layer_id=layer_id,
             prefix=add_prefix("experts", prefix),
         )

@@ -224,6 +226,7 @@ class OlmoeDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("mlp", prefix),
         )
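Unlike llama4.py and mixtral.py, olmoe.py gives the new parameter a default (layer_id: int = 0), so a call site that has not been updated still constructs. A tiny illustration with a placeholder class:

class OlmoeStyleMoE:
    def __init__(self, num_experts: int, layer_id: int = 0):
        self.layer_id = layer_id


updated = OlmoeStyleMoE(num_experts=8, layer_id=12)  # new-style call, as in the decoder-layer hunk above
legacy = OlmoeStyleMoE(num_experts=8)                # old-style call still works thanks to the default
assert (updated.layer_id, legacy.layer_id) == (12, 0)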
sglang/srt/models/phimoe.py

@@ -210,6 +210,7 @@ class PhiMoE(nn.Module):
         self.experts = FusedMoE(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=True,
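Across the four models above the effect is the same: the fused expert module ends up knowing which transformer layer it belongs to, which makes it possible to key per-layer information, such as expert-routing statistics, by layer index (the file list also touches sglang/srt/eplb/expert_distribution.py and expert_location.py). The sketch below is not sglang's FusedMoE, only a minimal, self-contained illustration of why the index is worth threading through:

from collections import Counter

import torch


class RoutingStatsMoE:
    """Toy MoE that tags its routing statistics with its layer index."""

    def __init__(self, num_experts: int, top_k: int, layer_id: int):
        self.top_k = top_k
        self.layer_id = layer_id
        self.expert_hits = Counter()

    def record(self, router_logits: torch.Tensor) -> None:
        # Count, for this layer only, how often each expert is selected.
        topk = torch.topk(router_logits, self.top_k, dim=-1).indices
        self.expert_hits.update(topk.flatten().tolist())


moe = RoutingStatsMoE(num_experts=4, top_k=2, layer_id=7)
moe.record(torch.randn(3, 4))  # 3 tokens routed over 4 experts
print(f"layer {moe.layer_id}: {dict(moe.expert_hits)}")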