sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/layers/activation.py +10 -5
  5. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  6. sglang/srt/layers/attention/triton_backend.py +71 -7
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  8. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  9. sglang/srt/layers/attention/vision.py +243 -40
  10. sglang/srt/layers/layernorm.py +1 -5
  11. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  22. sglang/srt/layers/moe/topk.py +4 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/fp8.py +7 -0
  46. sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. sglang/srt/layers/rotary_embedding.py +29 -15
  48. sglang/srt/layers/sampler.py +9 -6
  49. sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang/srt/lora/backend/triton_backend.py +61 -0
  53. sglang/srt/lora/lora.py +127 -112
  54. sglang/srt/lora/lora_manager.py +50 -18
  55. sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. sglang/srt/managers/image_processor.py +77 -38
  60. sglang/srt/managers/scheduler.py +17 -3
  61. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  62. sglang/srt/mem_cache/chunk_cache.py +3 -0
  63. sglang/srt/mem_cache/radix_cache.py +30 -1
  64. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  65. sglang/srt/model_executor/forward_batch_info.py +58 -59
  66. sglang/srt/model_executor/model_runner.py +2 -2
  67. sglang/srt/models/minicpmv.py +129 -76
  68. sglang/srt/models/mllama.py +16 -56
  69. sglang/srt/models/qwen2.py +4 -1
  70. sglang/srt/models/qwen2_vl.py +19 -9
  71. sglang/srt/server_args.py +19 -2
  72. sglang/srt/speculative/build_eagle_tree.py +4 -2
  73. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  74. sglang/srt/speculative/eagle_utils.py +361 -372
  75. sglang/srt/speculative/eagle_worker.py +177 -45
  76. sglang/srt/utils.py +7 -2
  77. sglang/test/runners.py +2 -0
  78. sglang/utils.py +42 -0
  79. sglang/version.py +1 -1
  80. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
  81. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
  82. sglang/srt/layers/custom_op_util.py +0 -25
  83. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
  84. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
  85. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -1,3 +1,5 @@
+ import logging
+ import time
  from typing import List, Optional, Union
 
  import torch
@@ -12,8 +14,18 @@ from sglang.srt.model_executor.forward_batch_info import (
  )
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
- from sglang.srt.utils import rank0_print
+ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
+     EAGLEDraftCudaGraphRunner,
+ )
+ from sglang.srt.speculative.eagle_utils import (
+     EagleDraftInput,
+     EagleVerifyInput,
+     assign_draft_cache_locs,
+     fast_topk,
+     select_top_k_tokens,
+ )
+
+ logger = logging.getLogger(__name__)
 
 
  class EAGLEWorker(TpModelWorker):
@@ -40,41 +52,47 @@ class EAGLEWorker(TpModelWorker):
              is_draft_worker=True,
          )
          self.target_worker = target_worker
-         self.server_args = server_args
          self.finish_extend_len = []
 
+         # Parse arguments
+         self.topk = server_args.speculative_eagle_topk
+         self.speculative_num_steps = server_args.speculative_num_steps
+         self.server_args = server_args
+
          # Share the embedding and lm_head
          embed, head = self.target_worker.model_runner.model.get_embed_and_head()
          self.model_runner.model.set_embed_and_head(embed, head)
          self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
-         self.model_runner.init_cuda_graphs()
 
-     def forward_draft_decode(self, batch: ScheduleBatch):
-         batch.spec_info.prepare_for_decode(batch)
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-         model_worker_batch = batch.get_model_worker_batch()
-         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-         logits_output = self.model_runner.forward(forward_batch)
-         self.capture_for_decode(logits_output, forward_batch)
+         # Create multi-step attn backends and cuda graph runners
+         from sglang.srt.layers.attention.flashinfer_backend import (
+             FlashInferMultiStepDraftBackend,
+         )
 
-     def forward_draft_extend(self, batch: ScheduleBatch):
-         self._set_mem_pool(batch, self.model_runner)
-         batch.spec_info.prepare_for_extend(batch)
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-         model_worker_batch = batch.get_model_worker_batch()
-         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-         logits_output = self.model_runner.forward(forward_batch)
-         self.capture_for_decode(logits_output, forward_batch)
-         self._set_mem_pool(batch, self.target_worker.model_runner)
+         self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+             self.model_runner,
+             self.topk,
+             self.speculative_num_steps,
+         )
+         self.model_runner.draft_attn_backend = self.draft_attn_backend
+         self.init_cuda_graphs()
+
+     def init_cuda_graphs(self):
+         """Capture cuda graphs."""
+         self.cuda_graph_runner = None
+
+         if self.server_args.disable_cuda_graph:
+             return
+
+         tic = time.time()
+         logger.info("Capture cuda graph begin. This can take up to several minutes.")
+         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
+         logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s")
 
      def forward_batch_speculative_generation(self, batch: ScheduleBatch):
          if batch.forward_mode.is_decode():
              # Draft
-             self._set_mem_pool(batch, self.model_runner)
-             for i in range(self.server_args.speculative_num_steps):
-                 self.forward_draft_decode(batch)
-             batch.spec_info.clear_draft_cache(batch)
-             self._set_mem_pool(batch, self.target_worker.model_runner)
+             spec_info: EagleVerifyInput = self.draft(batch)
 
              # Verify
              (
@@ -84,8 +102,7 @@ class EAGLEWorker(TpModelWorker):
                  self.finish_extend_len,
                  accept_length_cpu,
                  model_worker_batch,
-             ) = self.verify(batch)
-             next_draft_input.load_server_args(self.server_args)
+             ) = self.verify(batch, spec_info)
              batch.spec_info = next_draft_input
              # if it is None, means all requsets are finished
             if batch.spec_info.verified_id is not None:
@@ -107,39 +124,156 @@ class EAGLEWorker(TpModelWorker):
              )
 
              # Forward with the draft model.
-             spec_info = EAGLEDraftInput()
-             spec_info.load_server_args(self.server_args)
-             spec_info.hidden_states = logits_output.hidden_states
-             spec_info.verified_id = next_token_ids
-             batch.spec_info = spec_info
+             batch.spec_info = EagleDraftInput(
+                 hidden_states=logits_output.hidden_states,
+                 verified_id=next_token_ids,
+             )
              self.forward_draft_extend(batch)
              return logits_output, next_token_ids, model_worker_batch, 0
 
-     def verify(self, batch: ScheduleBatch):
-         verify_input = batch.spec_info.prepare_for_verify(batch)
-         verify_input.prepare_for_verify(batch)
+     def draft(self, batch: ScheduleBatch):
+         self._set_mem_pool(batch, self.model_runner)
+
+         # Parse args
+         num_seqs = batch.batch_size()
+         spec_info = batch.spec_info
+
+         # Allocate cache locations
+         out_cache_loc = batch.alloc_token_slots(
+             num_seqs * self.topk * self.speculative_num_steps
+         )
+         assign_draft_cache_locs[(num_seqs,)](
+             batch.req_pool_indices,
+             batch.req_to_token_pool.req_to_token,
+             batch.seq_lens,
+             out_cache_loc,
+             batch.req_to_token_pool.req_to_token.shape[1],
+             self.topk,
+             self.speculative_num_steps,
+         )
+
+         batch.out_cache_loc = out_cache_loc
+         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+         spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+
+         # Get forward batch
+         spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
+             forward_batch
+         )
+
+         if can_cuda_graph:
+             score_list, token_list, parents_list = self.cuda_graph_runner.replay(
+                 forward_batch
+             )
+         else:
+             # Initialize attention backend
+             self.draft_attn_backend.init_forward_metadata(forward_batch)
+
+             # Run forward steps
+             score_list, token_list, parents_list = self.draft_forward(forward_batch)
+
+         ret = EagleVerifyInput.create(
+             spec_info.verified_id,
+             score_list,
+             token_list,
+             parents_list,
+             batch.seq_lens,
+             batch.seq_lens_sum,
+             self.topk,
+             self.speculative_num_steps,
+             self.server_args.speculative_num_draft_tokens,
+         )
+
+         # Free cache locations
+         batch.token_to_kv_pool.free(out_cache_loc)
+         self._set_mem_pool(batch, self.target_worker.model_runner)
+         return ret
+
+     def draft_forward(self, forward_batch: ForwardBatch):
+         # Parse args
+         spec_info = forward_batch.spec_info
+         out_cache_loc = forward_batch.out_cache_loc
+         topk_p, topk_index, hidden_states = (
+             spec_info.topk_p,
+             spec_info.topk_index,
+             spec_info.hidden_states,
+         )
+
+         # Return values
+         score_list: List[torch.Tensor] = []
+         token_list: List[torch.Tensor] = []
+         parents_list: List[torch.Tensor] = []
+
+         # Forward multiple steps
+         scores = None
+         for i in range(self.speculative_num_steps):
+             input_ids, hidden_states, scores, tree_info = select_top_k_tokens(
+                 i, topk_p, topk_index, hidden_states, scores, self.topk
+             )
+             score_list.append(tree_info[0])
+             token_list.append(tree_info[1])
+             parents_list.append(tree_info[2])
+
+             # Set inputs
+             forward_batch.input_ids = input_ids
+             forward_batch.out_cache_loc = out_cache_loc[
+                 forward_batch.batch_size
+                 * self.topk
+                 * i : forward_batch.batch_size
+                 * self.topk
+                 * (i + 1)
+             ]
+             forward_batch.positions.add_(1)
+             forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
+             spec_info.hidden_states = hidden_states
+
+             # Run forward
+             logits_output = self.model_runner.model.forward(
+                 forward_batch.input_ids, forward_batch.positions, forward_batch
+             )
+             probs = torch.softmax(logits_output.next_token_logits, dim=-1)
+             topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+             hidden_states = logits_output.hidden_states
+
+         return score_list, token_list, parents_list
+
+     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
+         spec_info.prepare_for_verify(batch)
          batch.forward_mode = ForwardMode.TARGET_VERIFY
-         batch.spec_info = verify_input
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+         batch.spec_info = spec_info
          model_worker_batch = batch.get_model_worker_batch()
          logits_output, _ = self.target_worker.forward_batch_generation(
              model_worker_batch, skip_sample=True
          )
-         verify_input.hidden_states = logits_output.hidden_states
-         res = verify_input.verify(batch, logits_output)
+         spec_info.hidden_states = logits_output.hidden_states
+         res = spec_info.verify(batch, logits_output)
          batch.forward_mode = ForwardMode.DECODE
          return res + (model_worker_batch,)
 
+     def forward_draft_extend(self, batch: ScheduleBatch):
+         self._set_mem_pool(batch, self.model_runner)
+         batch.spec_info.prepare_for_extend(batch)
+         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         logits_output = self.model_runner.forward(forward_batch)
+         self.capture_for_decode(logits_output, forward_batch)
+         self._set_mem_pool(batch, self.target_worker.model_runner)
+
      def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
          batch.token_to_kv_pool = runner.token_to_kv_pool
          batch.req_to_token_pool = runner.req_to_token_pool
 
      def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
          seq_lens_backup = batch.seq_lens
+         req_pool_indices_backup = batch.req_pool_indices
 
          self._set_mem_pool(batch, self.model_runner)
          batch.forward_mode = ForwardMode.DRAFT_EXTEND
-         batch.spec_info.prepare_extend_after_decode(batch)
+         batch.spec_info.prepare_extend_after_decode(batch, self.speculative_num_steps)
          batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
          model_worker_batch = batch.get_model_worker_batch()
          forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
@@ -151,17 +285,15 @@ class EAGLEWorker(TpModelWorker):
          # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
          batch.forward_mode = ForwardMode.DECODE
          batch.seq_lens = seq_lens_backup
+         batch.req_pool_indices = req_pool_indices_backup
 
      def capture_for_decode(
          self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
      ):
-         sample_output = torch.softmax(
-             logits_output.next_token_logits, dim=-1
-         )  # TODO(kavioyu): Support more sampling methods
+         probs = torch.softmax(logits_output.next_token_logits, dim=-1)
          spec_info = forward_batch.spec_info
-         spec_info.sample_output = sample_output
+         spec_info.topk_p, spec_info.topk_index = fast_topk(probs, self.topk, dim=-1)
          spec_info.hidden_states = logits_output.hidden_states
-         spec_info.prev_mode = forward_batch.forward_mode
 
      # Don't support prefix share now.
      def finish_request(self, reqs: Union[Req, List[Req]]):
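
Note: the reworked draft path above allocates every draft KV-cache slot for a decode round up front (`num_seqs * topk * speculative_num_steps` slots in `draft()`), and `draft_forward()` then consumes one contiguous `num_seqs * topk` slice per step. A small sketch of that slot arithmetic, with made-up sizes (the numbers are illustrative, not from this diff):

```python
# Illustrative only: hypothetical sizes for the slicing done in draft_forward().
num_seqs = 2   # decode batch size
topk = 4       # server_args.speculative_eagle_topk
num_steps = 3  # server_args.speculative_num_steps

total = num_seqs * topk * num_steps  # 24 slots allocated once per round
for i in range(num_steps):
    start = num_seqs * topk * i
    end = num_seqs * topk * (i + 1)
    print(f"step {i} writes KV cache into out_cache_loc[{start}:{end}]")
# step 0 writes KV cache into out_cache_loc[0:8]
# step 1 writes KV cache into out_cache_loc[8:16]
# step 2 writes KV cache into out_cache_loc[16:24]
```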
sglang/srt/utils.py CHANGED
@@ -444,8 +444,6 @@ def load_image(image_file: Union[str, bytes]):
      else:
          raise ValueError(f"Invalid image: {image}")
 
-     # if image_size is None:
-     #     image_size = image.size
      return image, image_size
 
 
@@ -1048,6 +1046,13 @@ def get_device_name(device_id: int = 0) -> str:
          return torch.hpu.get_device_name(device_id)
 
 
+ def get_device_core_count(device_id: int = 0) -> int:
+     if hasattr(torch, "cuda") and torch.cuda.is_available():
+         return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+     return 0
+
+
  def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
      major, minor = None, None
      if hasattr(torch, "cuda") and torch.cuda.is_available():
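
Note: the new `get_device_core_count()` helper reports the CUDA multiprocessor (SM) count and falls back to 0 when no GPU is present, so it doubles as a cheap capability probe. A minimal usage sketch (the printed value depends on the GPU; an A100, for example, reports 108 SMs):

```python
from sglang.srt.utils import get_device_core_count

# Returns 0 on CPU-only hosts, so a truthiness check also works as a GPU probe.
sm_count = get_device_core_count(device_id=0)
if sm_count:
    print(f"device 0 has {sm_count} streaming multiprocessors")
else:
    print("no CUDA device available")
```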
sglang/test/runners.py CHANGED
@@ -272,6 +272,7 @@ class SRTRunner:
          port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
          lora_paths: List[str] = None,
          max_loras_per_batch: int = 4,
+         lora_backend: str = "triton",
          disable_cuda_graph: bool = False,
          disable_radix_cache: bool = False,
      ):
@@ -287,6 +288,7 @@ class SRTRunner:
              is_embedding=not self.is_generation,
              lora_paths=lora_paths,
              max_loras_per_batch=max_loras_per_batch,
+             lora_backend=lora_backend,
              disable_cuda_graph=disable_cuda_graph,
              disable_radix_cache=disable_radix_cache,
          )
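
Note: `SRTRunner` now threads a `lora_backend` argument through to the engine, matching the new `sglang/srt/lora/backend/` package (triton and flashinfer implementations) in the file list above. A hedged sketch of selecting it in a test; the model and adapter paths are placeholders, and constructor arguments other than those visible in the hunk are assumed from the existing `SRTRunner` signature:

```python
import torch
from sglang.test.runners import SRTRunner

# Placeholder paths; "triton" is the new default backend, "flashinfer"
# is the other one shipped in this release.
runner = SRTRunner(
    "meta-llama/Llama-2-7b-hf",       # model_path (placeholder)
    torch_dtype=torch.float16,
    model_type="generation",
    lora_paths=["/path/to/adapter"],  # placeholder adapter
    max_loras_per_batch=4,
    lora_backend="triton",
)
```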
sglang/utils.py CHANGED
@@ -373,3 +373,45 @@ class TypeBasedDispatcher:
          if isinstance(obj, ty):
              return fn(obj)
          raise ValueError(f"Invalid object: {obj}")
+
+
+ def trim_overlap(existing_text, new_chunk):
+     """
+     Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
+     and removes that overlap from the start of 'new_chunk'.
+     """
+     max_overlap = 0
+     max_possible = min(len(existing_text), len(new_chunk))
+     for i in range(max_possible, 0, -1):
+         if existing_text.endswith(new_chunk[:i]):
+             max_overlap = i
+             break
+     return new_chunk[max_overlap:]
+
+
+ def stream_and_merge(llm, prompt, sampling_params):
+     """
+     1) Streams the text,
+     2) Removes chunk overlaps,
+     3) Returns the merged text.
+     """
+     final_text = ""
+     for chunk in llm.generate(prompt, sampling_params, stream=True):
+         chunk_text = chunk["text"]
+         cleaned_chunk = trim_overlap(final_text, chunk_text)
+         final_text += cleaned_chunk
+     return final_text
+
+
+ async def async_stream_and_merge(llm, prompt, sampling_params):
+     """
+     Streams tokens asynchronously, removes chunk overlaps,
+     and yields the cleaned chunk in real time for printing.
+     """
+     final_text = ""
+     generator = await llm.async_generate(prompt, sampling_params, stream=True)
+     async for chunk in generator:
+         chunk_text = chunk["text"]
+         cleaned_chunk = trim_overlap(final_text, chunk_text)
+         final_text += cleaned_chunk
+         yield cleaned_chunk  # yield the non-overlapping portion
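
Note: these helpers guard against streamed chunks that re-send text already emitted: `trim_overlap("Hello, wor", "world!")` returns `"ld!"`, so the duplicated prefix is dropped before concatenation. A minimal offline-engine sketch (the model path is a placeholder):

```python
import sglang as sgl
from sglang.utils import stream_and_merge

llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
sampling_params = {"temperature": 0.8, "max_new_tokens": 64}

# Streams chunks, trims overlap between consecutive chunks, and returns
# the merged completion as a single string.
print(stream_and_merge(llm, "The capital of France is", sampling_params))
llm.shutdown()
```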
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.2"
+ __version__ = "0.4.2.post2"
{sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: sglang
- Version: 0.4.2
+ Version: 0.4.2.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                          Version 2.0, January 2004
@@ -225,7 +225,6 @@ Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
- Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
  Requires-Dist: pillow; extra == "runtime-common"
  Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
@@ -240,21 +239,27 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm==0.6.4.post1; extra == "srt"
- Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
- Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
+ Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
+ Requires-Dist: outlines==0.1.11; extra == "srt-hip"
+ Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-xpu"
  Provides-Extra: srt-hpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-hpu"
  Provides-Extra: srt-cpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
  Requires-Dist: torch; extra == "srt-cpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -333,7 +338,7 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
  ## News
- - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -372,7 +377,11 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+ The project is supported by (alphabetically): AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS CORP, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+
+ ## Contact Us
+
+ For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
  ## Acknowledgment and Citation
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.