sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +1 -0
- sglang/bench_serving.py +9 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +4 -1
- sglang/srt/layers/attention/flashinfer_backend.py +51 -5
- sglang/srt/layers/attention/triton_backend.py +16 -25
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +5 -50
- sglang/srt/layers/quantization/fp8.py +221 -36
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/radix_attention.py +8 -1
- sglang/srt/layers/sampler.py +27 -5
- sglang/srt/layers/torchao_utils.py +31 -0
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +54 -34
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +171 -136
- sglang/srt/managers/tokenizer_manager.py +184 -133
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -11
- sglang/srt/model_executor/model_runner.py +28 -14
- sglang/srt/model_parallel.py +66 -5
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +73 -9
- sglang/srt/models/llama.py +22 -0
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +8 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +19 -9
- sglang/srt/utils.py +40 -54
- sglang/test/test_block_fp8.py +341 -0
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,19 @@
|
|
1
1
|
sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
|
2
2
|
sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
|
3
3
|
sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
|
4
|
-
sglang/bench_offline_throughput.py,sha256=
|
5
|
-
sglang/bench_one_batch.py,sha256=
|
4
|
+
sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
|
5
|
+
sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
|
6
6
|
sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
|
7
|
-
sglang/bench_serving.py,sha256=
|
8
|
-
sglang/check_env.py,sha256=
|
7
|
+
sglang/bench_serving.py,sha256=3VQatM51v9f55aUQQ5crYMxxKHr1AbThicsWfBy_tjU,53190
|
8
|
+
sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
|
9
9
|
sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
|
10
10
|
sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
|
11
11
|
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
|
-
sglang/
|
13
|
-
sglang/
|
12
|
+
sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
|
13
|
+
sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
|
14
|
+
sglang/version.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
14
15
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
sglang/lang/chat_template.py,sha256=
|
16
|
+
sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
|
16
17
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
17
18
|
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
18
19
|
sglang/lang/interpreter.py,sha256=SBjejhLhTKzNM0HbjtTg5r17WPJ64WFSk6lcM_SCWKs,30717
|
@@ -23,27 +24,28 @@ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtx
|
|
23
24
|
sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
|
24
25
|
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
25
26
|
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
26
|
-
sglang/lang/backend/runtime_endpoint.py,sha256=
|
27
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
|
27
28
|
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
28
29
|
sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
|
30
|
+
sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
|
29
31
|
sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
|
30
32
|
sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
|
31
33
|
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
32
|
-
sglang/srt/model_parallel.py,sha256=
|
33
|
-
sglang/srt/server.py,sha256=
|
34
|
-
sglang/srt/server_args.py,sha256=
|
35
|
-
sglang/srt/utils.py,sha256=
|
34
|
+
sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
|
35
|
+
sglang/srt/server.py,sha256=E9YKKXpXv3vPvRy0-cgcy0-5UA-OZz42-32EZWKTicA,34661
|
36
|
+
sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
|
37
|
+
sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
|
36
38
|
sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
|
37
39
|
sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
|
38
40
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
39
41
|
sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
|
40
|
-
sglang/srt/configs/model_config.py,sha256=
|
42
|
+
sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
|
41
43
|
sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
|
42
44
|
sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
|
43
45
|
sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
|
44
46
|
sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
|
45
47
|
sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
|
46
|
-
sglang/srt/constrained/xgrammar_backend.py,sha256=
|
48
|
+
sglang/srt/constrained/xgrammar_backend.py,sha256=4It9_GqU4UZFhxIw_7hkzpXaMPUtksk6Xfe0Agsfw7A,4620
|
47
49
|
sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
|
48
50
|
sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
|
49
51
|
sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
|
@@ -59,59 +61,61 @@ sglang/srt/distributed/device_communicators/shm_broadcast.py,sha256=WVxBd1QfIgRW
|
|
59
61
|
sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcfpUhBa-_5PvjYxH146ZE-N1cotTzEpPRKlY,1620
|
60
62
|
sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
|
61
63
|
sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
|
62
|
-
sglang/srt/layers/fused_moe_patch.py,sha256=DMIyrwOON7OSidKZdreL5HzMhP0AD5Ues0xdY-ADOQw,4471
|
63
64
|
sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
|
64
|
-
sglang/srt/layers/linear.py,sha256=
|
65
|
-
sglang/srt/layers/logits_processor.py,sha256=
|
65
|
+
sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
|
66
|
+
sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
|
66
67
|
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
67
|
-
sglang/srt/layers/radix_attention.py,sha256=
|
68
|
+
sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
|
68
69
|
sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
|
69
|
-
sglang/srt/layers/sampler.py,sha256=
|
70
|
-
sglang/srt/layers/torchao_utils.py,sha256=
|
70
|
+
sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
|
71
|
+
sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
|
71
72
|
sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
|
72
73
|
sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
|
73
74
|
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
|
74
|
-
sglang/srt/layers/attention/flashinfer_backend.py,sha256=
|
75
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
|
75
76
|
sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
|
76
|
-
sglang/srt/layers/attention/triton_backend.py,sha256
|
77
|
-
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=
|
77
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
|
78
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
|
78
79
|
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
79
80
|
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=tZJhzqcf1KKMT8z7_32eVk_D1NHP71c-S3UNxemfAHM,11542
|
80
81
|
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
|
81
|
-
sglang/srt/layers/
|
82
|
-
sglang/srt/layers/
|
83
|
-
sglang/srt/layers/ep_moe/
|
84
|
-
sglang/srt/layers/
|
85
|
-
sglang/srt/layers/
|
86
|
-
sglang/srt/layers/fused_moe_triton/
|
87
|
-
sglang/srt/layers/
|
82
|
+
sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
|
83
|
+
sglang/srt/layers/moe/topk.py,sha256=YjIiFqMERvkChkwZUqTrL_xaQyzsYsZzVUe4PzAhRZI,6299
|
84
|
+
sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
85
|
+
sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
|
86
|
+
sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
|
87
|
+
sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
|
88
|
+
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=GVugCKapd3CvgkvPQ_FmQplC12-grv3n1FRkLJc6WhY,30790
|
89
|
+
sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
|
90
|
+
sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
|
88
91
|
sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
|
89
|
-
sglang/srt/layers/quantization/fp8.py,sha256=
|
90
|
-
sglang/srt/layers/quantization/
|
92
|
+
sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
|
93
|
+
sglang/srt/layers/quantization/fp8_kernel.py,sha256=v4-7hCQFyuUSZmeJS_5VDCu6a1-EGWXQ088FdPTjO_0,8137
|
94
|
+
sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
|
91
95
|
sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
|
92
96
|
sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
|
93
97
|
sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
|
94
98
|
sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
|
95
|
-
sglang/srt/managers/detokenizer_manager.py,sha256=
|
99
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
|
96
100
|
sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
|
97
|
-
sglang/srt/managers/io_struct.py,sha256=
|
98
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
99
|
-
sglang/srt/managers/schedule_policy.py,sha256=
|
100
|
-
sglang/srt/managers/scheduler.py,sha256=
|
101
|
+
sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
|
102
|
+
sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
|
103
|
+
sglang/srt/managers/schedule_policy.py,sha256=cLNi__smbg02keWgUMfB_nEM3vllocPB0XyG1P5qO7I,15469
|
104
|
+
sglang/srt/managers/scheduler.py,sha256=3Olw4Yf4Qtn1i4PqK3PT9hkXYGE8nemL2_Xjn8JLxAQ,61819
|
101
105
|
sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
|
102
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=
|
106
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=Vta7Lysvh4rPWqEB00shqAzpGUfv7GdPETDqFCU8RxA,31556
|
103
107
|
sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
|
104
108
|
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
|
105
|
-
sglang/srt/mem_cache/base_prefix_cache.py,sha256=
|
106
|
-
sglang/srt/mem_cache/chunk_cache.py,sha256=
|
109
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
|
110
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
|
107
111
|
sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
|
108
|
-
sglang/srt/mem_cache/memory_pool.py,sha256=
|
109
|
-
sglang/srt/mem_cache/radix_cache.py,sha256=
|
112
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=oxk3UtiiFA3_1iIP6eFsk8HIcRI_8Z1-FE2KOWDr-YM,11366
|
113
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
|
110
114
|
sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
|
111
115
|
sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
|
112
|
-
sglang/srt/model_executor/cuda_graph_runner.py,sha256=
|
116
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
|
113
117
|
sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
|
114
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
118
|
+
sglang/srt/model_executor/model_runner.py,sha256=Bm3NWTS3xmOGXEJnucnJZQldpVOzu-DCEUfaJy_PTU0,30104
|
115
119
|
sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
|
116
120
|
sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
|
117
121
|
sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
|
@@ -119,46 +123,47 @@ sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVp
|
|
119
123
|
sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
|
120
124
|
sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,12913
|
121
125
|
sglang/srt/models/commandr.py,sha256=PNXgfOZF84h-rSH0edEECUmEGW8YLb44V75Z_oDhFiA,14223
|
122
|
-
sglang/srt/models/dbrx.py,sha256=
|
123
|
-
sglang/srt/models/deepseek.py,sha256=
|
124
|
-
sglang/srt/models/deepseek_v2.py,sha256
|
126
|
+
sglang/srt/models/dbrx.py,sha256=okIpIwdr8Cfrz_thzc1F75XqCUfHhFLvZ1B6BaswKoA,14585
|
127
|
+
sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg,15697
|
128
|
+
sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
|
125
129
|
sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
|
126
130
|
sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
|
127
|
-
sglang/srt/models/gemma2.py,sha256=
|
128
|
-
sglang/srt/models/gemma2_reward.py,sha256=
|
131
|
+
sglang/srt/models/gemma2.py,sha256=41PlW8pMb4rMETdAni_JWDhZeIn_QsTQireAyUjsURA,15848
|
132
|
+
sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
|
129
133
|
sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
|
130
134
|
sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
|
131
|
-
sglang/srt/models/
|
135
|
+
sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,20409
|
136
|
+
sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
|
132
137
|
sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
|
133
138
|
sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
|
134
|
-
sglang/srt/models/llama.py,sha256=
|
135
|
-
sglang/srt/models/llama_classification.py,sha256=
|
139
|
+
sglang/srt/models/llama.py,sha256=S7nS05hhFGghXu0v-w9RZyBTY6OCEVF5Aaw4GX_E_9g,19929
|
140
|
+
sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
|
136
141
|
sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
|
137
|
-
sglang/srt/models/llama_reward.py,sha256=
|
138
|
-
sglang/srt/models/llava.py,sha256=
|
142
|
+
sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
|
143
|
+
sglang/srt/models/llava.py,sha256=xrkg8sht8tBOID7427IEZtHL-KKWfEivDe2NqGjTSAs,26373
|
139
144
|
sglang/srt/models/llavavid.py,sha256=dYUkKfHoE15vF_VXA_s_ICCTUMSmSgvP181fk8dUi0g,12185
|
140
145
|
sglang/srt/models/minicpm.py,sha256=ws4AqhOfAvYHGd04QuXCZel-Oxy9_vN4p4rTjs9RSz0,13723
|
141
146
|
sglang/srt/models/minicpm3.py,sha256=YIKJDTpwjmpLlv1sNT93k2yZMvGQlI_H87czjf6QYyo,24707
|
142
147
|
sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
|
143
|
-
sglang/srt/models/mixtral.py,sha256=
|
148
|
+
sglang/srt/models/mixtral.py,sha256=L2Gz-Cmih1V75Ks9jmI2a6rUQ1Cl6F2uDgrhDjjDJzs,14523
|
144
149
|
sglang/srt/models/mixtral_quant.py,sha256=uuVO1nWUZJiDhbqZN6gzSMwyfpyZorMuFXHeMCGo7N0,14022
|
145
150
|
sglang/srt/models/mllama.py,sha256=3kX-UqeTSYZL5kPNdkfKEAEv3DpSAW1ArAAoeiXVzIc,37739
|
146
151
|
sglang/srt/models/olmo.py,sha256=OCDMtX1OI83r80mzU4FMC3Tg8cleQ-7C8Tpoe8zgzss,11708
|
147
152
|
sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13442
|
148
|
-
sglang/srt/models/olmoe.py,sha256=
|
153
|
+
sglang/srt/models/olmoe.py,sha256=LiHVGfRaC5c_BU_vVgtV9uLuDH_SC0dw1kEc61posmI,15351
|
149
154
|
sglang/srt/models/phi3_small.py,sha256=44_my3QmgJ2N7SOkGZzEb62DXBeCVHojfmCWgkk2uCI,14802
|
150
155
|
sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
|
151
|
-
sglang/srt/models/qwen2.py,sha256=
|
152
|
-
sglang/srt/models/qwen2_moe.py,sha256=
|
156
|
+
sglang/srt/models/qwen2.py,sha256=be4xgcuqNa9kBdaL7x3PjsnUky6fh5K33c_khAWSi04,12959
|
157
|
+
sglang/srt/models/qwen2_moe.py,sha256=6xRRJxWWh1M5UFPfvhsCpY477zv-30AeSRJXsvOkgFc,16542
|
153
158
|
sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
|
154
159
|
sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
|
155
160
|
sglang/srt/models/stablelm.py,sha256=iBlIkM7CQmqI25nsujWk0LLCQD7TshzUU8qzZYYrt20,11311
|
156
161
|
sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9RzpR1YZKGg4,18758
|
157
162
|
sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
|
158
|
-
sglang/srt/models/xverse_moe.py,sha256=
|
163
|
+
sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
|
159
164
|
sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
|
160
|
-
sglang/srt/openai_api/adapter.py,sha256=
|
161
|
-
sglang/srt/openai_api/protocol.py,sha256=
|
165
|
+
sglang/srt/openai_api/adapter.py,sha256=DbLA4-v-QrKJHYDH4fpDSXqmyz_vpcFE-1tnhh60m6o,54057
|
166
|
+
sglang/srt/openai_api/protocol.py,sha256=ecRNNqkhwwKZaIoJlPhtp2VTcHxBJDbNN8lrKS7uBx8,10406
|
162
167
|
sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
|
163
168
|
sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
|
164
169
|
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
@@ -178,12 +183,13 @@ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWU
|
|
178
183
|
sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
|
179
184
|
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
180
185
|
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
186
|
+
sglang/test/test_block_fp8.py,sha256=rhrIun8aW5zq2qvuGRlo7F7aZ_upjVxtQMVlyc2Th_E,11771
|
181
187
|
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
182
188
|
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
183
|
-
sglang/test/test_utils.py,sha256=
|
189
|
+
sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
|
184
190
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
185
|
-
sglang-0.4.
|
186
|
-
sglang-0.4.
|
187
|
-
sglang-0.4.
|
188
|
-
sglang-0.4.
|
189
|
-
sglang-0.4.
|
191
|
+
sglang-0.4.1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
192
|
+
sglang-0.4.1.dist-info/METADATA,sha256=RlVEQtwr_CCGTs83vNPwWXQukutbFfBz9xBPlXSl6qc,22523
|
193
|
+
sglang-0.4.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
194
|
+
sglang-0.4.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
195
|
+
sglang-0.4.1.dist-info/RECORD,,
|
@@ -1,133 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Torch-native implementation for FusedMoE. This is used for torch.compile.
|
3
|
-
It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
|
4
|
-
"""
|
5
|
-
|
6
|
-
from typing import Callable, Optional
|
7
|
-
|
8
|
-
import torch
|
9
|
-
from torch.nn import functional as F
|
10
|
-
|
11
|
-
|
12
|
-
def fused_topk_native(
|
13
|
-
hidden_states: torch.Tensor,
|
14
|
-
gating_output: torch.Tensor,
|
15
|
-
topk: int,
|
16
|
-
renormalize: bool,
|
17
|
-
):
|
18
|
-
assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
|
19
|
-
M, _ = hidden_states.shape
|
20
|
-
topk_weights = torch.empty(
|
21
|
-
M, topk, dtype=torch.float32, device=hidden_states.device
|
22
|
-
)
|
23
|
-
topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
|
24
|
-
topk_weights = F.softmax(gating_output.float(), dim=-1)
|
25
|
-
topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
|
26
|
-
if renormalize:
|
27
|
-
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
|
28
|
-
return topk_weights, topk_ids
|
29
|
-
|
30
|
-
|
31
|
-
# This is used by the Deepseek-V2 model
|
32
|
-
def grouped_topk(
|
33
|
-
hidden_states: torch.Tensor,
|
34
|
-
gating_output: torch.Tensor,
|
35
|
-
topk: int,
|
36
|
-
renormalize: bool,
|
37
|
-
num_expert_group: int = 0,
|
38
|
-
topk_group: int = 0,
|
39
|
-
):
|
40
|
-
|
41
|
-
assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
|
42
|
-
|
43
|
-
scores = torch.softmax(gating_output, dim=-1)
|
44
|
-
num_token = scores.shape[0]
|
45
|
-
group_scores = (
|
46
|
-
scores.view(num_token, num_expert_group, -1).max(dim=-1).values
|
47
|
-
) # [n, n_group]
|
48
|
-
group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
|
49
|
-
1
|
50
|
-
] # [n, top_k_group]
|
51
|
-
group_mask = torch.zeros_like(group_scores) # [n, n_group]
|
52
|
-
group_mask.scatter_(1, group_idx, 1) # [n, n_group]
|
53
|
-
score_mask = (
|
54
|
-
group_mask.unsqueeze(-1)
|
55
|
-
.expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
|
56
|
-
.reshape(num_token, -1)
|
57
|
-
) # [n, e]
|
58
|
-
tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
|
59
|
-
topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
|
60
|
-
|
61
|
-
if renormalize:
|
62
|
-
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
|
63
|
-
return topk_weights, topk_ids
|
64
|
-
|
65
|
-
|
66
|
-
def select_experts_native(
|
67
|
-
hidden_states: torch.Tensor,
|
68
|
-
router_logits: torch.Tensor,
|
69
|
-
top_k: int,
|
70
|
-
use_grouped_topk: bool,
|
71
|
-
renormalize: bool,
|
72
|
-
topk_group: Optional[int] = None,
|
73
|
-
num_expert_group: Optional[int] = None,
|
74
|
-
):
|
75
|
-
# DeekSeekv2 uses grouped_top_k
|
76
|
-
if use_grouped_topk:
|
77
|
-
assert topk_group is not None
|
78
|
-
assert num_expert_group is not None
|
79
|
-
topk_weights, topk_ids = grouped_topk(
|
80
|
-
hidden_states=hidden_states,
|
81
|
-
gating_output=router_logits,
|
82
|
-
topk=top_k,
|
83
|
-
renormalize=renormalize,
|
84
|
-
num_expert_group=num_expert_group,
|
85
|
-
topk_group=topk_group,
|
86
|
-
)
|
87
|
-
else:
|
88
|
-
topk_weights, topk_ids = fused_topk_native(
|
89
|
-
hidden_states=hidden_states,
|
90
|
-
gating_output=router_logits,
|
91
|
-
topk=top_k,
|
92
|
-
renormalize=renormalize,
|
93
|
-
)
|
94
|
-
return topk_weights, topk_ids
|
95
|
-
|
96
|
-
|
97
|
-
def fused_moe_forward_native(
|
98
|
-
layer: torch.nn.Module,
|
99
|
-
x: torch.Tensor,
|
100
|
-
use_grouped_topk: bool,
|
101
|
-
top_k: int,
|
102
|
-
router_logits: torch.Tensor,
|
103
|
-
renormalize: bool,
|
104
|
-
topk_group: Optional[int] = None,
|
105
|
-
num_expert_group: Optional[int] = None,
|
106
|
-
custom_routing_function: Optional[Callable] = None,
|
107
|
-
) -> torch.Tensor:
|
108
|
-
|
109
|
-
if use_grouped_topk:
|
110
|
-
assert num_expert_group is not None and topk_group is not None
|
111
|
-
topk_weights, topk_ids = grouped_topk(
|
112
|
-
x,
|
113
|
-
router_logits,
|
114
|
-
top_k,
|
115
|
-
renormalize,
|
116
|
-
num_expert_group,
|
117
|
-
topk_group,
|
118
|
-
)
|
119
|
-
elif custom_routing_function is None:
|
120
|
-
topk_weights, topk_ids = fused_topk_native(x, router_logits, top_k, renormalize)
|
121
|
-
else:
|
122
|
-
topk_weights, topk_ids = custom_routing_function(
|
123
|
-
x, router_logits, top_k, renormalize
|
124
|
-
)
|
125
|
-
|
126
|
-
w13_weights = layer.w13_weight[topk_ids]
|
127
|
-
w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
|
128
|
-
w2_weights = layer.w2_weight[topk_ids]
|
129
|
-
x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
|
130
|
-
x1 = F.silu(x1)
|
131
|
-
x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
|
132
|
-
expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
|
133
|
-
return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|