sglang 0.4.0.post2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. sglang/bench_offline_throughput.py +0 -12
  2. sglang/bench_one_batch.py +0 -12
  3. sglang/bench_serving.py +1 -0
  4. sglang/srt/aio_rwlock.py +100 -0
  5. sglang/srt/configs/model_config.py +8 -1
  6. sglang/srt/layers/attention/flashinfer_backend.py +49 -5
  7. sglang/srt/layers/linear.py +20 -2
  8. sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +14 -39
  9. sglang/srt/layers/moe/fused_moe_native.py +46 -0
  10. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
  11. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +110 -98
  12. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +16 -48
  13. sglang/srt/layers/moe/topk.py +191 -0
  14. sglang/srt/layers/quantization/__init__.py +3 -3
  15. sglang/srt/layers/quantization/fp8.py +169 -32
  16. sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  17. sglang/srt/layers/quantization/fp8_utils.py +90 -1
  18. sglang/srt/layers/torchao_utils.py +11 -15
  19. sglang/srt/managers/schedule_batch.py +16 -10
  20. sglang/srt/managers/scheduler.py +2 -2
  21. sglang/srt/managers/tokenizer_manager.py +86 -76
  22. sglang/srt/mem_cache/memory_pool.py +15 -8
  23. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  24. sglang/srt/model_executor/model_runner.py +6 -0
  25. sglang/srt/models/dbrx.py +1 -1
  26. sglang/srt/models/deepseek.py +1 -1
  27. sglang/srt/models/deepseek_v2.py +67 -18
  28. sglang/srt/models/grok.py +1 -1
  29. sglang/srt/models/mixtral.py +2 -2
  30. sglang/srt/models/olmoe.py +1 -1
  31. sglang/srt/models/qwen2_moe.py +1 -1
  32. sglang/srt/models/xverse_moe.py +1 -1
  33. sglang/srt/openai_api/adapter.py +4 -0
  34. sglang/srt/server.py +1 -0
  35. sglang/srt/utils.py +33 -44
  36. sglang/test/test_block_fp8.py +341 -0
  37. sglang/version.py +1 -1
  38. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/METADATA +3 -3
  39. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/RECORD +44 -40
  40. sglang/srt/layers/fused_moe_patch.py +0 -133
  41. sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
  42. sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
  43. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
  44. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
  45. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,17 @@
1
1
  sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
2
2
  sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
3
3
  sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
4
- sglang/bench_offline_throughput.py,sha256=rgMWDhA1Hai0gKBzxc0dzTWfI8l39Cyw2VOCyMt1YyY,12771
5
- sglang/bench_one_batch.py,sha256=aF0onHeRjy7AYVjsq1IA3rZEhUuYXuslg1fAhuvJ2yo,16120
4
+ sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
5
+ sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
6
6
  sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
7
- sglang/bench_serving.py,sha256=zv_EcbWno79j7WYFL2m6BfCLT6iSOfGV4uwGbDg9KQA,53141
7
+ sglang/bench_serving.py,sha256=3VQatM51v9f55aUQQ5crYMxxKHr1AbThicsWfBy_tjU,53190
8
8
  sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
9
9
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
10
10
  sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
11
11
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
12
12
  sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
13
13
  sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
14
- sglang/version.py,sha256=OUNovuQ1RrdJFYetl0e0U0556H_wiyjhVks9-l-zF94,28
14
+ sglang/version.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
15
15
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
17
17
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -27,18 +27,19 @@ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI
27
27
  sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
28
28
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
29
29
  sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
30
+ sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
30
31
  sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
31
32
  sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
32
33
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
33
34
  sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
34
- sglang/srt/server.py,sha256=tEciMH_U6WIZYPUGDDM0c4BQ16cvgVdA4II-ksPZoMo,34621
35
+ sglang/srt/server.py,sha256=E9YKKXpXv3vPvRy0-cgcy0-5UA-OZz42-32EZWKTicA,34661
35
36
  sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
36
- sglang/srt/utils.py,sha256=WWEcMJHmvlOjiqE9UicT0ZYwa2PUKDZorAk2Y8PPRBI,42039
37
+ sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
37
38
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
38
39
  sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
39
40
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
40
41
  sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
41
- sglang/srt/configs/model_config.py,sha256=OjEeigs5tMNKP-RImJk2NHVFXv-fyQfsGREWMO3rqhM,15839
42
+ sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
42
43
  sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
43
44
  sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
44
45
  sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
@@ -60,35 +61,37 @@ sglang/srt/distributed/device_communicators/shm_broadcast.py,sha256=WVxBd1QfIgRW
60
61
  sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcfpUhBa-_5PvjYxH146ZE-N1cotTzEpPRKlY,1620
61
62
  sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
62
63
  sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
63
- sglang/srt/layers/fused_moe_patch.py,sha256=DMIyrwOON7OSidKZdreL5HzMhP0AD5Ues0xdY-ADOQw,4471
64
64
  sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
65
- sglang/srt/layers/linear.py,sha256=dF2HvqiMbhWlCjvkLFRCcgUFGhG-B0keM_CIpjvgTtg,46154
65
+ sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
66
66
  sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
67
67
  sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
68
68
  sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
69
69
  sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
70
70
  sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
71
- sglang/srt/layers/torchao_utils.py,sha256=07Fe2Csdh1JiQKPGGHWkbq0-a6bV7Cq136ygdtVAhgI,3708
71
+ sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
72
72
  sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
73
73
  sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
74
74
  sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
75
- sglang/srt/layers/attention/flashinfer_backend.py,sha256=NgeigL1WiPOuOry0Gbxv-6HEcERB8Du0mBJgYcTVIAA,24943
75
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
76
76
  sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
77
77
  sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
78
78
  sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
79
79
  sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
80
80
  sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=tZJhzqcf1KKMT8z7_32eVk_D1NHP71c-S3UNxemfAHM,11542
81
81
  sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
82
- sglang/srt/layers/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
- sglang/srt/layers/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
84
- sglang/srt/layers/ep_moe/layer.py,sha256=uMropMhU-MaycoxSLxcfD0jZC_cuL_boRbIu86mbZjY,23034
85
- sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
86
- sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=fLGmkY6imJYjEw9-3-jJthkMcFGMBcu9HCNIuxAzMhE,29625
87
- sglang/srt/layers/fused_moe_triton/layer.py,sha256=eMpbZlP3FAQxbHochis7ybZ-fsNBP0PzKF1PN0Xo7so,21517
88
- sglang/srt/layers/quantization/__init__.py,sha256=FgNy_zNWMWnq3lEGyCSyfLSQtcZtWlq99JilkmEDW7I,4594
82
+ sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
83
+ sglang/srt/layers/moe/topk.py,sha256=YjIiFqMERvkChkwZUqTrL_xaQyzsYsZzVUe4PzAhRZI,6299
84
+ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
+ sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
86
+ sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
87
+ sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
88
+ sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=GVugCKapd3CvgkvPQ_FmQplC12-grv3n1FRkLJc6WhY,30790
89
+ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
90
+ sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
89
91
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
90
- sglang/srt/layers/quantization/fp8.py,sha256=3oIUPaD0PBXQyTKr44I0YJ8XXDdwyoS_-ZA97XdSxXE,24143
91
- sglang/srt/layers/quantization/fp8_utils.py,sha256=eJDLLDu8ZbrbE3BfFIf89JlIMPOP-14DesbeVsajW0Q,1035
92
+ sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
93
+ sglang/srt/layers/quantization/fp8_kernel.py,sha256=v4-7hCQFyuUSZmeJS_5VDCu6a1-EGWXQ088FdPTjO_0,8137
94
+ sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
92
95
  sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
93
96
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
94
97
  sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
@@ -96,23 +99,23 @@ sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqv
96
99
  sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
97
100
  sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
98
101
  sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
99
- sglang/srt/managers/schedule_batch.py,sha256=SAd7sxhoC3Bp8_xd-TEcXEFZBlGZPbn8-wMvBcjU55Q,45607
102
+ sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
100
103
  sglang/srt/managers/schedule_policy.py,sha256=cLNi__smbg02keWgUMfB_nEM3vllocPB0XyG1P5qO7I,15469
101
- sglang/srt/managers/scheduler.py,sha256=QlcVMtrLlNcBOkVISdO556jrK8a4LE4ULskC0oCH2IQ,61776
104
+ sglang/srt/managers/scheduler.py,sha256=3Olw4Yf4Qtn1i4PqK3PT9hkXYGE8nemL2_Xjn8JLxAQ,61819
102
105
  sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
103
- sglang/srt/managers/tokenizer_manager.py,sha256=gnCCdB5XDobOoBKptwv-o0yYqDkMUxL78s0zBno5lM4,31219
106
+ sglang/srt/managers/tokenizer_manager.py,sha256=Vta7Lysvh4rPWqEB00shqAzpGUfv7GdPETDqFCU8RxA,31556
104
107
  sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
105
108
  sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
106
109
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
107
110
  sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
108
111
  sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
109
- sglang/srt/mem_cache/memory_pool.py,sha256=l9_srwXEfIIDF46nxykbHIOo1VSvU5_Ew3H0r5EC7Fo,11072
112
+ sglang/srt/mem_cache/memory_pool.py,sha256=oxk3UtiiFA3_1iIP6eFsk8HIcRI_8Z1-FE2KOWDr-YM,11366
110
113
  sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
111
114
  sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
112
115
  sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
113
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=kZ3nV03MD8EQYQB38u4_88_wyW4unECxAdMVICpPyuk,16241
116
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
114
117
  sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
115
- sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
118
+ sglang/srt/model_executor/model_runner.py,sha256=Bm3NWTS3xmOGXEJnucnJZQldpVOzu-DCEUfaJy_PTU0,30104
116
119
  sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
117
120
  sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
118
121
  sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
@@ -120,9 +123,9 @@ sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVp
120
123
  sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
121
124
  sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,12913
122
125
  sglang/srt/models/commandr.py,sha256=PNXgfOZF84h-rSH0edEECUmEGW8YLb44V75Z_oDhFiA,14223
123
- sglang/srt/models/dbrx.py,sha256=2Wqcf3sv57l4gi2xH8yrb5WSmY-4_kbbf6fhpJ4aKWw,14581
124
- sglang/srt/models/deepseek.py,sha256=BVNICGoLjQoHmR5lc31YrZ6YbxSRTBilHqlLsALr2u8,15693
125
- sglang/srt/models/deepseek_v2.py,sha256=YKSrqagVcSUwCAi-rwIph-Xu12GrNETMNKxgnffWod8,35349
126
+ sglang/srt/models/dbrx.py,sha256=okIpIwdr8Cfrz_thzc1F75XqCUfHhFLvZ1B6BaswKoA,14585
127
+ sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg,15697
128
+ sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
126
129
  sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
127
130
  sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
128
131
  sglang/srt/models/gemma2.py,sha256=41PlW8pMb4rMETdAni_JWDhZeIn_QsTQireAyUjsURA,15848
@@ -130,7 +133,7 @@ sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb
130
133
  sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
131
134
  sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
132
135
  sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,20409
133
- sglang/srt/models/grok.py,sha256=UWvVEYfEoH0jGNFSbXpO66OGW5pzmIHlNKcn9gRZEoQ,15664
136
+ sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
134
137
  sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
135
138
  sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
136
139
  sglang/srt/models/llama.py,sha256=S7nS05hhFGghXu0v-w9RZyBTY6OCEVF5Aaw4GX_E_9g,19929
@@ -142,24 +145,24 @@ sglang/srt/models/llavavid.py,sha256=dYUkKfHoE15vF_VXA_s_ICCTUMSmSgvP181fk8dUi0g
142
145
  sglang/srt/models/minicpm.py,sha256=ws4AqhOfAvYHGd04QuXCZel-Oxy9_vN4p4rTjs9RSz0,13723
143
146
  sglang/srt/models/minicpm3.py,sha256=YIKJDTpwjmpLlv1sNT93k2yZMvGQlI_H87czjf6QYyo,24707
144
147
  sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
145
- sglang/srt/models/mixtral.py,sha256=vi6ssY75kNLy_kJrDru6gJYiAogHjSniaO6aMFd1w4E,14515
148
+ sglang/srt/models/mixtral.py,sha256=L2Gz-Cmih1V75Ks9jmI2a6rUQ1Cl6F2uDgrhDjjDJzs,14523
146
149
  sglang/srt/models/mixtral_quant.py,sha256=uuVO1nWUZJiDhbqZN6gzSMwyfpyZorMuFXHeMCGo7N0,14022
147
150
  sglang/srt/models/mllama.py,sha256=3kX-UqeTSYZL5kPNdkfKEAEv3DpSAW1ArAAoeiXVzIc,37739
148
151
  sglang/srt/models/olmo.py,sha256=OCDMtX1OI83r80mzU4FMC3Tg8cleQ-7C8Tpoe8zgzss,11708
149
152
  sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13442
150
- sglang/srt/models/olmoe.py,sha256=Rw-3YrHWd90MZQFnmcfUQ-3wAaI0PCFKb0DIrCDND3s,15347
153
+ sglang/srt/models/olmoe.py,sha256=LiHVGfRaC5c_BU_vVgtV9uLuDH_SC0dw1kEc61posmI,15351
151
154
  sglang/srt/models/phi3_small.py,sha256=44_my3QmgJ2N7SOkGZzEb62DXBeCVHojfmCWgkk2uCI,14802
152
155
  sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
153
156
  sglang/srt/models/qwen2.py,sha256=be4xgcuqNa9kBdaL7x3PjsnUky6fh5K33c_khAWSi04,12959
154
- sglang/srt/models/qwen2_moe.py,sha256=rYUk_vZW3ftKIIlqPvJZ1K-6oZ_PfGspixh1zm2Y8C8,16538
157
+ sglang/srt/models/qwen2_moe.py,sha256=6xRRJxWWh1M5UFPfvhsCpY477zv-30AeSRJXsvOkgFc,16542
155
158
  sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
156
159
  sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
157
160
  sglang/srt/models/stablelm.py,sha256=iBlIkM7CQmqI25nsujWk0LLCQD7TshzUU8qzZYYrt20,11311
158
161
  sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9RzpR1YZKGg4,18758
159
162
  sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
160
- sglang/srt/models/xverse_moe.py,sha256=AawKEQw--oAl-yzwCjoaZRG7q3rdkyDiam3FS0zjf_c,15537
163
+ sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
161
164
  sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
162
- sglang/srt/openai_api/adapter.py,sha256=dvKq4O3Rhd77ad6iCtPNykgnk9PVJE-E8wHVsBAfCQQ,53927
165
+ sglang/srt/openai_api/adapter.py,sha256=DbLA4-v-QrKJHYDH4fpDSXqmyz_vpcFE-1tnhh60m6o,54057
163
166
  sglang/srt/openai_api/protocol.py,sha256=ecRNNqkhwwKZaIoJlPhtp2VTcHxBJDbNN8lrKS7uBx8,10406
164
167
  sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
165
168
  sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
@@ -180,12 +183,13 @@ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWU
180
183
  sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
181
184
  sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
182
185
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
186
+ sglang/test/test_block_fp8.py,sha256=rhrIun8aW5zq2qvuGRlo7F7aZ_upjVxtQMVlyc2Th_E,11771
183
187
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
184
188
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
185
189
  sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
186
190
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
187
- sglang-0.4.0.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
188
- sglang-0.4.0.post2.dist-info/METADATA,sha256=maHXecD3U1DdhzfU2aBMhN96MQRqCBPsIA1KlO7t7dg,22512
189
- sglang-0.4.0.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
190
- sglang-0.4.0.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
191
- sglang-0.4.0.post2.dist-info/RECORD,,
191
+ sglang-0.4.1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
192
+ sglang-0.4.1.dist-info/METADATA,sha256=RlVEQtwr_CCGTs83vNPwWXQukutbFfBz9xBPlXSl6qc,22523
193
+ sglang-0.4.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
194
+ sglang-0.4.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
195
+ sglang-0.4.1.dist-info/RECORD,,
@@ -1,133 +0,0 @@
1
- """
2
- Torch-native implementation for FusedMoE. This is used for torch.compile.
3
- It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
4
- """
5
-
6
- from typing import Callable, Optional
7
-
8
- import torch
9
- from torch.nn import functional as F
10
-
11
-
12
- def fused_topk_native(
13
- hidden_states: torch.Tensor,
14
- gating_output: torch.Tensor,
15
- topk: int,
16
- renormalize: bool,
17
- ):
18
- assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
19
- M, _ = hidden_states.shape
20
- topk_weights = torch.empty(
21
- M, topk, dtype=torch.float32, device=hidden_states.device
22
- )
23
- topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
24
- topk_weights = F.softmax(gating_output.float(), dim=-1)
25
- topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
26
- if renormalize:
27
- topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
28
- return topk_weights, topk_ids
29
-
30
-
31
- # This is used by the Deepseek-V2 model
32
- def grouped_topk(
33
- hidden_states: torch.Tensor,
34
- gating_output: torch.Tensor,
35
- topk: int,
36
- renormalize: bool,
37
- num_expert_group: int = 0,
38
- topk_group: int = 0,
39
- ):
40
-
41
- assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
42
-
43
- scores = torch.softmax(gating_output, dim=-1)
44
- num_token = scores.shape[0]
45
- group_scores = (
46
- scores.view(num_token, num_expert_group, -1).max(dim=-1).values
47
- ) # [n, n_group]
48
- group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
49
- 1
50
- ] # [n, top_k_group]
51
- group_mask = torch.zeros_like(group_scores) # [n, n_group]
52
- group_mask.scatter_(1, group_idx, 1) # [n, n_group]
53
- score_mask = (
54
- group_mask.unsqueeze(-1)
55
- .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
56
- .reshape(num_token, -1)
57
- ) # [n, e]
58
- tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
59
- topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
60
-
61
- if renormalize:
62
- topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
63
- return topk_weights, topk_ids
64
-
65
-
66
- def select_experts_native(
67
- hidden_states: torch.Tensor,
68
- router_logits: torch.Tensor,
69
- top_k: int,
70
- use_grouped_topk: bool,
71
- renormalize: bool,
72
- topk_group: Optional[int] = None,
73
- num_expert_group: Optional[int] = None,
74
- ):
75
- # DeekSeekv2 uses grouped_top_k
76
- if use_grouped_topk:
77
- assert topk_group is not None
78
- assert num_expert_group is not None
79
- topk_weights, topk_ids = grouped_topk(
80
- hidden_states=hidden_states,
81
- gating_output=router_logits,
82
- topk=top_k,
83
- renormalize=renormalize,
84
- num_expert_group=num_expert_group,
85
- topk_group=topk_group,
86
- )
87
- else:
88
- topk_weights, topk_ids = fused_topk_native(
89
- hidden_states=hidden_states,
90
- gating_output=router_logits,
91
- topk=top_k,
92
- renormalize=renormalize,
93
- )
94
- return topk_weights, topk_ids
95
-
96
-
97
- def fused_moe_forward_native(
98
- layer: torch.nn.Module,
99
- x: torch.Tensor,
100
- use_grouped_topk: bool,
101
- top_k: int,
102
- router_logits: torch.Tensor,
103
- renormalize: bool,
104
- topk_group: Optional[int] = None,
105
- num_expert_group: Optional[int] = None,
106
- custom_routing_function: Optional[Callable] = None,
107
- ) -> torch.Tensor:
108
-
109
- if use_grouped_topk:
110
- assert num_expert_group is not None and topk_group is not None
111
- topk_weights, topk_ids = grouped_topk(
112
- x,
113
- router_logits,
114
- top_k,
115
- renormalize,
116
- num_expert_group,
117
- topk_group,
118
- )
119
- elif custom_routing_function is None:
120
- topk_weights, topk_ids = fused_topk_native(x, router_logits, top_k, renormalize)
121
- else:
122
- topk_weights, topk_ids = custom_routing_function(
123
- x, router_logits, top_k, renormalize
124
- )
125
-
126
- w13_weights = layer.w13_weight[topk_ids]
127
- w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
128
- w2_weights = layer.w2_weight[topk_ids]
129
- x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
130
- x1 = F.silu(x1)
131
- x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
132
- expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
133
- return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
File without changes
File without changes