sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +376 -48
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
12
12
  run_moe_ep_preproess,
13
13
  silu_and_mul_triton_kernel,
14
14
  )
15
- from sglang.srt.layers.moe.topk import select_experts
15
+ from sglang.srt.layers.moe.topk import TopKConfig, select_experts
16
16
  from sglang.test.test_utils import CustomTestCase
17
17
 
18
18
 
@@ -22,35 +22,26 @@ def ep_moe(
22
22
  w1: torch.Tensor,
23
23
  w2: torch.Tensor,
24
24
  router_logits: torch.Tensor,
25
- top_k: int,
26
- renormalize: bool,
25
+ topk_config: TopKConfig,
27
26
  # ep config
28
27
  num_experts: int = 256,
29
28
  fp8_dtype: torch.types = torch.float8_e4m3fn,
30
29
  num_experts_per_partition: int = 128,
31
30
  start_expert_id: int = 0,
32
31
  end_expert_id: int = 127,
33
- use_grouped_topk: bool = False,
34
- num_expert_group: Optional[int] = None,
35
- topk_group: Optional[int] = None,
36
- custom_routing_function: Optional[Callable] = None,
37
32
  use_fp8_w8a8: bool = False,
38
33
  w1_scale_inv: Optional[torch.Tensor] = None,
39
34
  w2_scale_inv: Optional[torch.Tensor] = None,
40
35
  block_shape: Optional[List[int]] = None,
41
36
  ):
42
37
  use_blockwise_fp8 = block_shape is not None
43
- topk_weights, topk_ids, _ = select_experts(
38
+ top_k = topk_config.top_k
39
+ topk_output = select_experts(
44
40
  hidden_states=hidden_states,
45
41
  router_logits=router_logits,
46
- top_k=top_k,
47
- use_grouped_topk=use_grouped_topk,
48
- renormalize=renormalize,
49
- topk_group=topk_group,
50
- num_expert_group=num_expert_group,
51
- # correction_bias=correction_bias, #skip this in test
52
- custom_routing_function=custom_routing_function,
42
+ topk_config=topk_config,
53
43
  )
44
+ topk_weights, topk_ids, _ = topk_output
54
45
 
55
46
  reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(topk_ids, num_experts)
56
47
 
@@ -294,14 +285,18 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
294
285
  start_id = cur_rank * num_experts_per_partition
295
286
  end_id = start_id + num_experts_per_partition - 1
296
287
 
288
+ topk_config = TopKConfig(
289
+ top_k=topk,
290
+ renormalize=False,
291
+ )
292
+
297
293
  with torch.inference_mode():
298
294
  out = ep_moe(
299
295
  hidden_states=a,
300
296
  w1=w1,
301
297
  w2=w2,
302
298
  router_logits=score,
303
- top_k=topk,
304
- renormalize=False,
299
+ topk_config=topk_config,
305
300
  use_fp8_w8a8=True,
306
301
  w1_scale_inv=w1_s,
307
302
  w2_scale_inv=w2_s,
@@ -316,8 +311,7 @@ class TestW8A8BlockFP8EPMoE(CustomTestCase):
316
311
  w1=w1_ref,
317
312
  w2=w2_ref,
318
313
  router_logits=score,
319
- top_k=topk,
320
- renormalize=False,
314
+ topk_config=topk_config,
321
315
  use_fp8_w8a8=False,
322
316
  w1_scale_inv=None,
323
317
  w2_scale_inv=None,
@@ -153,9 +153,8 @@ def run_test(tp_size, batch_size, model_config, check=False):
153
153
  x,
154
154
  w1,
155
155
  w2,
156
- topk_weights,
157
- topk_ids,
158
- inplace=False, # Use False for benchmarking to avoid side effects if run multiple times
156
+ (topk_weights, topk_ids, "dummy"),
157
+ inplace=False,
159
158
  activation="silu", # Assuming SiLU activation common in MoEs
160
159
  use_fp8_w8a8=True,
161
160
  w1_scale=w1_scale,
@@ -221,8 +220,7 @@ def run_test(tp_size, batch_size, model_config, check=False):
221
220
  x,
222
221
  w1, # Original shape
223
222
  w2, # Original shape
224
- topk_weights,
225
- topk_ids,
223
+ (topk_weights, topk_ids, "dummy"),
226
224
  inplace=False, # Important: Use False to get output tensor
227
225
  activation="silu",
228
226
  use_fp8_w8a8=True,
@@ -266,7 +264,7 @@ if __name__ == "__main__":
266
264
  "--batch-sizes",
267
265
  type=int,
268
266
  nargs="+",
269
- default=[1, 4, 8, 16, 32, 64, 128, 256, 512], # Adjusted default
267
+ default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024], # Adjusted default
270
268
  help="List of batch sizes to test",
271
269
  )
272
270
  parser.add_argument("--check", action="store_true", help="Enable check mode")
@@ -6,7 +6,7 @@ import pytest
6
6
  import torch
7
7
 
8
8
  from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
9
- from sglang.srt.layers.moe.topk import select_experts
9
+ from sglang.srt.layers.moe.topk import TopKConfig, select_experts
10
10
 
11
11
 
12
12
  def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
@@ -100,11 +100,12 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
100
100
  s_strides2 = c_strides2
101
101
 
102
102
  score = torch.randn((M, E), dtype=dtype, device=device)
103
- topk_weights, topk_ids, _ = select_experts(
103
+ topk_output = select_experts(
104
104
  hidden_states=a,
105
105
  router_logits=score,
106
- top_k=topk,
106
+ topk_config=TopKConfig(top_k=topk, renormalize=False),
107
107
  )
108
+ topk_weights, topk_ids, _ = topk_output
108
109
  expert_map = torch.arange(E, dtype=torch.int32, device=device)
109
110
  expert_map[local_e:] = E
110
111
 
@@ -9,7 +9,7 @@ from sgl_kernel import scaled_fp4_quant
9
9
  from sglang.srt.layers.activation import SiluAndMul
10
10
  from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
11
11
  from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
12
- from sglang.srt.layers.moe.topk import select_experts
12
+ from sglang.srt.layers.moe.topk import TopKConfig, select_experts
13
13
 
14
14
  if torch.cuda.get_device_capability() < (10, 0):
15
15
  pytest.skip(
@@ -163,11 +163,12 @@ def check_moe(
163
163
 
164
164
  score = torch.randn((m, e), device="cuda", dtype=dtype)
165
165
 
166
- topk_weights, topk_ids, _ = select_experts(
166
+ topk_output = select_experts(
167
167
  hidden_states=a,
168
168
  router_logits=score,
169
- top_k=topk,
169
+ topk_config=TopKConfig(top_k=topk, renormalize=False),
170
170
  )
171
+ topk_weights, topk_ids, _ = topk_output
171
172
 
172
173
  a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
173
174
  a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
sglang/test/test_utils.py CHANGED
@@ -61,6 +61,12 @@ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
61
61
  DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
62
62
  "nvidia/Llama-3.1-8B-Instruct-FP8"
63
63
  )
64
+ DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
65
+ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
66
+
67
+ # W8A8 models
68
+ DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
69
+ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
64
70
 
65
71
  # EAGLE
66
72
  DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
@@ -78,6 +84,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
78
84
  "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
79
85
  )
80
86
  DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
87
+ DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
81
88
 
82
89
  # Nightly tests
83
90
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
sglang/utils.py CHANGED
@@ -5,7 +5,6 @@ import json
5
5
  import logging
6
6
  import os
7
7
  import random
8
- import signal
9
8
  import socket
10
9
  import subprocess
11
10
  import sys
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.0rc2"
1
+ __version__ = "0.5.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.5.0rc2
3
+ Version: 0.5.1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -232,7 +232,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
232
232
  Requires-Dist: msgspec; extra == "runtime-common"
233
233
  Requires-Dist: ninja; extra == "runtime-common"
234
234
  Requires-Dist: openai==1.99.1; extra == "runtime-common"
235
- Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
235
+ Requires-Dist: openai-harmony==0.0.4; extra == "runtime-common"
236
236
  Requires-Dist: orjson; extra == "runtime-common"
237
237
  Requires-Dist: outlines==0.1.11; extra == "runtime-common"
238
238
  Requires-Dist: packaging; extra == "runtime-common"
@@ -240,9 +240,9 @@ Requires-Dist: partial_json_parser; extra == "runtime-common"
240
240
  Requires-Dist: pillow; extra == "runtime-common"
241
241
  Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
242
242
  Requires-Dist: psutil; extra == "runtime-common"
243
+ Requires-Dist: pybase64; extra == "runtime-common"
243
244
  Requires-Dist: pydantic; extra == "runtime-common"
244
245
  Requires-Dist: pynvml; extra == "runtime-common"
245
- Requires-Dist: pybase64; extra == "runtime-common"
246
246
  Requires-Dist: python-multipart; extra == "runtime-common"
247
247
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
248
248
  Requires-Dist: sentencepiece; extra == "runtime-common"
@@ -254,7 +254,7 @@ Requires-Dist: torchao==0.9.0; extra == "runtime-common"
254
254
  Requires-Dist: transformers==4.55.2; extra == "runtime-common"
255
255
  Requires-Dist: uvicorn; extra == "runtime-common"
256
256
  Requires-Dist: uvloop; extra == "runtime-common"
257
- Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
257
+ Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
258
258
  Provides-Extra: srt
259
259
  Requires-Dist: sglang[runtime_common]; extra == "srt"
260
260
  Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
@@ -278,13 +278,12 @@ Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
278
278
  Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
279
279
  Provides-Extra: srt-cpu
280
280
  Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
281
- Requires-Dist: einops; extra == "srt-cpu"
281
+ Provides-Extra: srt-npu
282
+ Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
282
283
  Provides-Extra: srt-xpu
283
284
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
284
285
  Provides-Extra: srt-hpu
285
286
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
286
- Provides-Extra: srt-npu
287
- Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
288
287
  Provides-Extra: openai
289
288
  Requires-Dist: openai==1.99.1; extra == "openai"
290
289
  Requires-Dist: tiktoken; extra == "openai"
@@ -375,6 +374,7 @@ Dynamic: license-file
375
374
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
376
375
 
377
376
  ## News
377
+ - [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
378
378
  - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
379
379
  - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
380
380
  - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).