PyPI - sglang - Versions diffs - 0.4.2.post2__py3-none-any.whl → 0.4.2.post3__py3-none-any.whl - Mend

sglang 0.4.2.post2-py3-none-any.whl → 0.4.2.post3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

sglang/srt/speculative/eagle_utils.py CHANGED Viewed

@@ -177,29 +177,21 @@ class EagleVerifyInput:
         spec_steps: int,
         num_verify_token: int,
     ):
-        score_list = torch.cat(score_list, dim=1).flatten(
-            1
-        )  # b, n, topk; n= 1 + (num_steps-1) * self.topk
-        ss_token_list = torch.cat(
-            token_list, dim=1
-        )  # b, (self.topk + (num_steps-1) * self.topk)
-        top_scores = torch.topk(score_list, num_verify_token - 1, dim=-1)
-        top_scores_index = top_scores.indices
-        top_scores_index = torch.sort(top_scores_index).values
-        draft_tokens = torch.gather(ss_token_list, index=top_scores_index, dim=1)
-        draft_tokens = torch.cat((verified_id.unsqueeze(1), draft_tokens), dim=1)
-        parent_list = torch.cat(parents_list[:-1], dim=1)
-        tree_mask, position, retrive_index, retrive_cum_len = build_tree_kernel(
-            parent_list,
-            top_scores_index,
-            seq_lens,
-            seq_lens_sum,
-            topk,
-            spec_steps,
-            num_verify_token,
+        tree_mask, position, retrive_index, retrive_cum_len, draft_tokens = (
+            build_tree_kernel(
+                verified_id,
+                score_list,
+                token_list,
+                parents_list,
+                seq_lens,
+                seq_lens_sum,
+                topk,
+                spec_steps,
+                num_verify_token,
+            )
         )
         return cls(
-            draft_tokens.flatten(),
+            draft_tokens,
             tree_mask,
             position,
             retrive_index,
@@ -258,39 +250,77 @@ class EagleVerifyInput:
         return kv_indices, cum_kv_seq_len, qo_indptr, self.custom_mask
     def verify(self, batch: ScheduleBatch, logits_output: torch.Tensor) -> torch.Tensor:
-        predict = torch.argmax(logits_output.next_token_logits, dim=-1)
-        predict = torch.cat(
-            [predict, torch.full([1], -1, dtype=torch.long, device="cuda")], dim=-1
-        )
         draft_token = torch.cat(
-            [self.draft_token, torch.full([1], -1, dtype=torch.long, device="cuda")],
+            [self.draft_token, torch.full([1], -1, dtype=torch.int32, device="cuda")],
             dim=-1,
         )
-        target_predict = predict[self.retrive_index]
         candidates = draft_token[self.retrive_index]
-        # logits = logits_output.next_token_logits[self.retrive_index]
-        # target_predict = torch.argmax(logits[:, :-1], dim=-1)
-        accept_mask = candidates[:, 1:] == target_predict[:, :-1]
-        accept_mask = (torch.cumprod(accept_mask, dim=1)).sum(dim=1)
-        bs = self.retrive_cum_len.numel() - 1
-        max_draft_len = self.retrive_index.shape[-1]
-        accept_index = torch.full(
-            (bs, max_draft_len), -1, dtype=torch.long, device="cuda"
-        )
-        accept_length = torch.empty((bs,), dtype=torch.int, device="cuda")
-        extract_index = torch.full((bs * 2,), 0, dtype=torch.int, device="cuda")
-        eagle_verify_retrive[(bs,)](
-            self.retrive_index.contiguous(),
-            accept_mask.contiguous(),
-            self.retrive_cum_len,
-            accept_index,
-            accept_length,
-            extract_index,
-            max_draft_len,
-            self.draft_token_num,
-            triton.next_power_of_2(max_draft_len),
-        )
+        if batch.sampling_info.is_all_greedy:
+            # temp == 0
+            bs = self.retrive_cum_len.numel() - 1
+            predict = torch.argmax(logits_output.next_token_logits, dim=-1)
+            predict = torch.cat(
+                [predict, torch.full([1], -1, dtype=torch.int32, device="cuda")], dim=-1
+            )
+            target_predict = predict[self.retrive_index]
+            # logits = logits_output.next_token_logits[self.retrive_index]
+            # target_predict = torch.argmax(logits[:, :-1], dim=-1)
+            accept_mask = candidates[:, 1:] == target_predict[:, :-1]
+            accept_mask = (torch.cumprod(accept_mask, dim=1)).sum(dim=1)
+            max_draft_len = self.retrive_index.shape[-1]
+            accept_index = torch.full(
+                (bs, max_draft_len), -1, dtype=torch.int32, device="cuda"
+            )
+            accept_length = torch.empty((bs,), dtype=torch.int, device="cuda")
+            extract_index = torch.full((bs * 2,), 0, dtype=torch.int, device="cuda")
+            eagle_verify_retrive[(bs,)](
+                self.retrive_index.contiguous(),
+                accept_mask.contiguous(),
+                self.retrive_cum_len,
+                accept_index,
+                accept_length,
+                extract_index,
+                max_draft_len,
+                self.draft_token_num,
+                triton.next_power_of_2(max_draft_len),
+            )
+        else:
+            # temp > 0
+            bs = self.retrive_index.shape[0]
+            predict_shape = list(logits_output.next_token_logits.shape)[:-1]
+            predict_shape[-1] += 1
+            target_logits = logits_output.next_token_logits[self.retrive_index]
+            predict = torch.full(predict_shape, -1, dtype=torch.int32, device="cuda")
+            accept_index = torch.full(
+                (bs, self.spec_steps + 1), -1, dtype=torch.int32, device="cuda"
+            )
+            accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda")
+            expanded_temperature = batch.sampling_info.temperatures.unsqueeze(1)
+            target_probs = F.softmax(target_logits / expanded_temperature, dim=-1)
+            draft_probs = torch.full_like(
+                target_probs, 0, dtype=torch.float32, device="cuda"
+            )
+            coins = torch.rand_like(candidates, dtype=torch.float32, device="cuda")
+            tree_speculative_sampling_target_only(
+                predicts=predict,  # mutable
+                accept_index=accept_index,  # mutable
+                accept_token_num=accept_length,  # mutable
+                candidates=candidates.to(torch.int32),
+                retrive_index=self.retrive_index.to(torch.int32),
+                retrive_next_token=self.retrive_next_token.to(torch.int32),
+                retrive_next_sibling=self.retrive_next_sibling.to(torch.int32),
+                uniform_samples=coins,
+                target_probs=target_probs,
+                draft_probs=draft_probs,
+                threshold_single=global_server_args_dict[
+                    "speculative_accept_threshold_single"
+                ],
+                threshold_acc=global_server_args_dict[
+                    "speculative_accept_threshold_acc"
+                ],
+                deterministic=True,
+            )
         new_accept_index = []
         unfinished_index = []

sglang/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.4.2.post2"
1	+ __version__ = "0.4.2.post3"

{sglang-0.4.2.post2.dist-info → sglang-0.4.2.post3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post2
+Version: 0.4.2.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -239,7 +239,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3.post2; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"

{sglang-0.4.2.post2.dist-info → sglang-0.4.2.post3.dist-info}/RECORD RENAMED Viewed

@@ -10,7 +10,7 @@ sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
 sglang/launch_server.py,sha256=mDXfwha8LHpWQJekcCosR98QhCQsbmilsBlI5jAIgg0,420
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=7HpOrPBhMivWH719m7Dy1rjrAXOAsnqelpwNBBbvjqs,13319
-sglang/version.py,sha256=64ujEkLGOA9yAnhsrnI5zBOk5lJIP4Z-b7gpnc9vbUo,28
+sglang/version.py,sha256=08dwZ-8Pb-Ir0QXBY3R8hBlzHyVuy4icqVMBMJri3oM,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=v4SyYViPHX3i3XT46F7vlARn4UaSiP3PBpTGtzO6uRY,17006
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -29,7 +29,7 @@ sglang/srt/_custom_ops.py,sha256=7jL5BTcoS8PmR56y2Qsa3q8emI-tmrJuV4hLTwLVFBE,504
 sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
 sglang/srt/conversation.py,sha256=USUoYiJf5DdHz7Ouclu30k3QSxMiem4WgZrA148MpSA,21695
 sglang/srt/custom_op.py,sha256=M5oqlgh32vAVeStFCruydTUfi_blGFJihVTnQBEOvwo,1134
-sglang/srt/function_call_parser.py,sha256=HMqCCd-YQeyADV_gTCduF9gmw2k3bRAkoJYcFsK3w3c,19230
+sglang/srt/function_call_parser.py,sha256=YmagXt1BIuTbeiWmSleZwJFCFR5r5EFqVQqKnJDYXiE,19568
 sglang/srt/hf_transformers_utils.py,sha256=_24uqCkZ4dvS9Uc5p2cCzX0Q8ShUzrh_Hp6mvg7hxHY,7729
 sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
 sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
@@ -76,14 +76,14 @@ sglang/srt/layers/sampler.py,sha256=FIkh6sh91Fh5R8QJ6x66bJ8Y-xl5EfT4XVPXGXJ1l7I,
 sglang/srt/layers/torchao_utils.py,sha256=Ws24FdRBSkTpyeyA6bQrdDm-W5wfDxKvSIPUSahyMfA,4063
 sglang/srt/layers/vocab_parallel_embedding.py,sha256=txcjkuSDa6gZwESKj8X-HSLhAnMmDXL0FmFWY9SKqik,22155
 sglang/srt/layers/attention/__init__.py,sha256=KlQ0fl-o9v_NxBDhNZ4dPW2uQ2HeJjLm-0MTMWgaa28,2980
-sglang/srt/layers/attention/double_sparsity_backend.py,sha256=QEDF8tQKMkh-nbt4jHKHZhhgHuV0Fla_BPzzoo9JfT4,9231
+sglang/srt/layers/attention/double_sparsity_backend.py,sha256=4mVyFPfZxPTwkQHGNCfI_4hQ8CbsWXJfxz-IQW77gAc,9143
 sglang/srt/layers/attention/flashinfer_backend.py,sha256=9BJEAQ5IcSMGvPfa6_D3cP9Gbo2XQ5GHBnF7cw2Rsng,42933
 sglang/srt/layers/attention/torch_native_backend.py,sha256=KrcAqTLVZLtwgOmB0xhwUUsX32M-5LYZpNxaRNT4VuA,9252
-sglang/srt/layers/attention/triton_backend.py,sha256=PZU496wPzyRopA600riR5sxz-gIz8u9TYz0MzmMuX5Y,8858
+sglang/srt/layers/attention/triton_backend.py,sha256=mtLs768rhtCF_BVAV_rmYac0U4R1_HHc-9ic4JratsY,10100
 sglang/srt/layers/attention/vision.py,sha256=zLjKmzUlkgq1RFcP3b4EPArOAKovoaDLgYfM5SyB2wM,13181
 sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=tcUAdacBWTpZmro7vZeRPasfwRWFlCR4bxfGpFOYgZ8,17831
-sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
-sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
+sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=ztLWKeW-260EiIw3kCAbtUTUHHxAICz2mVxZJFes4oI,31167
+sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=R6QgrcBf6XuLzQ1jamrILNypaPi3ynkMPTfjae0d3JA,12695
 sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=Y66gZ37u0GKMPtI8n5MbO6uOxRuGEmKIG0IPbJTOqAM,6213
 sglang/srt/layers/moe/fused_moe_native.py,sha256=OEWpM93X5tJG4-rwz5qmdpTzEUR73zun29YRV3bZglY,4269
 sglang/srt/layers/moe/topk.py,sha256=6A4W1ztlV2dQvkXcPJvFvAg0QEhE58Q7eE7iw8N36J4,7230
@@ -91,7 +91,7 @@ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
 sglang/srt/layers/moe/ep_moe/layer.py,sha256=aS8t1XUvlTnO9IQaxGjW5bOXP4FrJDXzymEIvlIDMro,22603
 sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
-sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=_uUn0EKf_trPyKP4jfCytKKgnC3ziUM_X9L_PbaXhbE,37241
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=tWV490Ao5vIasPDBBY9ktuAZdWlONnnv3uPCifcTfpI,37241
 sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=-49WRpq9OtRZocQjW-YNcB_ruK09nIJqGHKNa8CJsws,22691
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=iNGsE2ZeVnQEnN4A8UJ9Jv0d3hbRF2MJ9oBgjup5Szk,2737
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=JJN0hryyLr5Zv3dSS7C8cPFhAwTT6XxUVnBGMZvV6JA,2752
@@ -196,7 +196,7 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=-49WRpq9OtRZocQjW-YNcB_ru
 sglang/srt/layers/quantization/__init__.py,sha256=_Sba1KQnmZNKGDKM1MfBs2T3uDqOHfeW6IHO2mTUvfs,4471
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
 sglang/srt/layers/quantization/fp8.py,sha256=ibttPVCUsCQ0LXy7FUb8wnzqGcGZQXQLqwCB4a2fai4,35160
-sglang/srt/layers/quantization/fp8_kernel.py,sha256=gm_mDHeBswggsaNMEo0niayx9DS2mGJkLkrnnOYQnlU,15709
+sglang/srt/layers/quantization/fp8_kernel.py,sha256=qlXXT9WO3TKxZv9r0rAdcDjO_jZYobtKnvTxmHDrfoo,16304
 sglang/srt/layers/quantization/fp8_utils.py,sha256=7v-RNwuYXa-gPO3msRDB0Z3uajOQMYd2Cj0NMoq1hg4,4148
 sglang/srt/layers/quantization/int8_kernel.py,sha256=t_BLVf8XjOyn7S3Lu3B4hXvw8DvTg4Anco7TNadL58U,1436
 sglang/srt/layers/quantization/modelopt_quant.py,sha256=_VdVz77dTP-IczPeFrdH6Ttro2D26BZvMlZkCKWj_5o,6200
@@ -325,7 +325,7 @@ sglang/srt/models/granite.py,sha256=3HqQXJlfoKd11w1NCpTYmiPO9HlkA1jJqoAmuTzHuU0,
 sglang/srt/models/grok.py,sha256=NXC0I5_wXmlQ0-gMWgiT-X9ebzOsrTJGcltAXkY6064,18030
 sglang/srt/models/internlm2.py,sha256=INGGwSCYKoZRAokXJC78RKKde2fgHn9P4JG-N37Pfn0,12124
 sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
-sglang/srt/models/llama.py,sha256=YKtLpxgk_OmPRpBJSr1BCTWC6IILFzZtakKnWYYHKw0,22040
+sglang/srt/models/llama.py,sha256=hGBUo-70o0vLT6BI-v32qEv_g8Vr7ItEpqxAt1Mf9-0,22248
 sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
 sglang/srt/models/llama_eagle.py,sha256=88DzR54DKBIKJ1h-bkIa8mc1qJnlkdZ1eGYY3c5mpBY,4442
 sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
@@ -365,9 +365,9 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD6
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=l1DyU8kC8n_F4Z6Jd8mZKfF23buuLZ5dWuVfyqDWkUI,2968
-sglang/srt/speculative/build_eagle_tree.py,sha256=GlHDIbaZInhf1LYuWVmkVCvPcd6sKDxeXafLTai7Zpw,9892
+sglang/srt/speculative/build_eagle_tree.py,sha256=zWthboIgzPzSOXcGxDpDv0rBOQP55HYGrBKGqm2gWF0,20732
 sglang/srt/speculative/eagle_draft_cuda_graph_runner.py,sha256=5ZCy6ndPA2p95xDgo2kXWD3zCtVaq4q5X0HBpAbB3Xs,7929
-sglang/srt/speculative/eagle_utils.py,sha256=DUANfRddoQ4MqNuQW3Uz21qWRaetYf4prVcljZR0tyI,22945
+sglang/srt/speculative/eagle_utils.py,sha256=BV89f2CTp9H0pSvJfK13WYvTL7LW3BtcplQfLngKihg,24451
 sglang/srt/speculative/eagle_worker.py,sha256=4oROLwUBJIwEHNHNEfvsy74DqLQLVc4KfjdR-MrB1OM,12038
 sglang/srt/speculative/spec_info.py,sha256=D7A27UU1iOwIBEjXTgAxZ7jdftbTiVlMCvK8GmYr2zg,488
 sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
@@ -386,8 +386,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
 sglang/test/test_programs.py,sha256=aUV9Ex_B714ph7ytv6W3J7sdGDKC6lGIhUy95Yg6AHQ,18878
 sglang/test/test_utils.py,sha256=BU6lAX3bu3TNQZqVC9UPnyq3I7iV5kigHQKJx7UNlOQ,26192
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
-sglang-0.4.2.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
-sglang-0.4.2.post2.dist-info/METADATA,sha256=-Cl1_HFESAZxOXkBnaqDshP2M93b_4FWYGVh-1Yyw3s,23763
-sglang-0.4.2.post2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-sglang-0.4.2.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.4.2.post2.dist-info/RECORD,,
+sglang-0.4.2.post3.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.2.post3.dist-info/METADATA,sha256=eVi6WuPieNGNX7TzNcBd8JolIIPQphw6609pqdALCUQ,23763
+sglang-0.4.2.post3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+sglang-0.4.2.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.2.post3.dist-info/RECORD,,

{sglang-0.4.2.post2.dist-info → sglang-0.4.2.post3.dist-info}/LICENSE RENAMED Viewed

File without changes

{sglang-0.4.2.post2.dist-info → sglang-0.4.2.post3.dist-info}/WHEEL RENAMED Viewed

File without changes

{sglang-0.4.2.post2.dist-info → sglang-0.4.2.post3.dist-info}/top_level.txt RENAMED Viewed

File without changes

sglang 0.4.2.post2__py3-none-any.whl → 0.4.2.post3__py3-none-any.whl

sglang 0.4.2.post2-py3-none-any.whl → 0.4.2.post3-py3-none-any.whl