sglang 0.4.1.post2__py3-none-any.whl → 0.4.1.post4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (173)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/srt/layers/attention/__init__.py +14 -5
  3. sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  4. sglang/srt/layers/attention/flashinfer_backend.py +211 -81
  5. sglang/srt/layers/attention/torch_native_backend.py +1 -38
  6. sglang/srt/layers/attention/triton_backend.py +20 -11
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  8. sglang/srt/layers/logits_processor.py +167 -212
  9. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  11. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  81. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +178 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  94. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +175 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
  101. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
  102. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  103. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  104. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  105. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  106. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  107. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  108. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  109. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  112. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  113. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  114. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  115. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  116. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  117. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  118. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  119. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  120. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  121. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  122. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  123. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  124. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  125. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  126. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  127. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  128. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  129. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  130. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  131. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  132. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  133. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  134. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  135. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  136. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  137. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  138. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  139. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  140. sglang/srt/layers/quantization/fp8.py +2 -2
  141. sglang/srt/layers/sampler.py +57 -21
  142. sglang/srt/layers/torchao_utils.py +17 -3
  143. sglang/srt/managers/detokenizer_manager.py +2 -0
  144. sglang/srt/managers/io_struct.py +12 -3
  145. sglang/srt/managers/schedule_batch.py +26 -2
  146. sglang/srt/managers/schedule_policy.py +159 -90
  147. sglang/srt/managers/scheduler.py +71 -27
  148. sglang/srt/managers/tokenizer_manager.py +29 -20
  149. sglang/srt/managers/tp_worker.py +16 -4
  150. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  151. sglang/srt/model_executor/cuda_graph_runner.py +118 -73
  152. sglang/srt/model_executor/forward_batch_info.py +33 -8
  153. sglang/srt/model_executor/model_runner.py +63 -61
  154. sglang/srt/models/deepseek_v2.py +34 -7
  155. sglang/srt/models/grok.py +97 -26
  156. sglang/srt/openai_api/adapter.py +0 -17
  157. sglang/srt/openai_api/protocol.py +3 -3
  158. sglang/srt/sampling/sampling_batch_info.py +21 -0
  159. sglang/srt/sampling/sampling_params.py +9 -1
  160. sglang/srt/server.py +9 -5
  161. sglang/srt/server_args.py +109 -51
  162. sglang/srt/speculative/build_eagle_tree.py +347 -0
  163. sglang/srt/speculative/eagle_utils.py +618 -0
  164. sglang/srt/speculative/eagle_worker.py +170 -0
  165. sglang/srt/speculative/spec_info.py +5 -0
  166. sglang/srt/utils.py +15 -2
  167. sglang/version.py +1 -1
  168. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/METADATA +9 -8
  169. sglang-0.4.1.post4.dist-info/RECORD +329 -0
  170. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/WHEEL +1 -1
  171. sglang-0.4.1.post2.dist-info/RECORD +0 -197
  172. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/LICENSE +0 -0
  173. {sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py ADDED
@@ -0,0 +1,170 @@
+ from typing import List, Optional, Union
+
+ import torch
+
+ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+ from sglang.srt.managers.tp_worker import TpModelWorker
+ from sglang.srt.model_executor.forward_batch_info import (
+     CaptureHiddenMode,
+     ForwardBatch,
+     ForwardMode,
+ )
+ from sglang.srt.model_executor.model_runner import ModelRunner
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
+
+
+ class EAGLEWorker(TpModelWorker):
+
+     def __init__(
+         self,
+         server_args: ServerArgs,
+         gpu_id: int,
+         tp_rank: int,
+         dp_rank: Optional[int],
+         nccl_port: int,
+         target_worker: TpModelWorker,
+     ):
+         # Do not capture the cuda graph in `super().__init__()`;
+         # we will capture it later.
+         backup_disable_cuda_graph = server_args.disable_cuda_graph
+         server_args.disable_cuda_graph = True
+         super().__init__(
+             gpu_id=gpu_id,
+             tp_rank=tp_rank,
+             server_args=server_args,
+             nccl_port=nccl_port,
+             dp_rank=dp_rank,
+             is_draft_worker=True,
+         )
+         self.target_worker = target_worker
+         self.server_args = server_args
+
+         # Share the embedding and lm_head with the target model.
+         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+         self.model_runner.model.set_embed_and_head(embed, head)
+         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+         self.model_runner.init_cuda_graphs()
+
+     def forward_draft_decode(self, batch: ScheduleBatch):
+         batch.spec_info.prepare_for_decode(batch)
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         logits_output = self.model_runner.forward(forward_batch)
+         self.capture_for_decode(logits_output, forward_batch)
+
+     def forward_draft_extend(self, batch: ScheduleBatch):
+         self._swap_mem_pool(batch, self.model_runner)
+         batch.spec_info.prepare_for_extend(batch)
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         logits_output = self.model_runner.forward(forward_batch)
+         self.capture_for_decode(logits_output, forward_batch)
+         self._swap_mem_pool(batch, self.target_worker.model_runner)
+
+     def forward_batch_speculative_generation(self, batch: ScheduleBatch):
+         if batch.forward_mode.is_decode():
+             prev_spec_info = batch.spec_info
+             self._swap_mem_pool(batch, self.model_runner)
+             for i in range(self.server_args.speculative_num_steps):
+                 self.forward_draft_decode(batch)
+             batch.spec_info.clear_draft_cache(batch)
+             self._swap_mem_pool(batch, self.target_worker.model_runner)
+             (
+                 next_draft_input,
+                 logits_output,
+                 verified_id,
+                 self.finish_extend_len,
+                 model_worker_batch,
+             ) = self.verify(batch)
+             next_draft_input.init(self.server_args)
+             batch.spec_info = next_draft_input
+             # If it is None, all requests are finished
+             if batch.spec_info.verified_id is not None:
+                 self.forward_extend_after_decode(batch)
+             batch.spec_info = prev_spec_info
+             return logits_output, verified_id, model_worker_batch, next_draft_input
+
+         else:
+             spec_info = EAGLEDraftInput()
+             spec_info.init(self.server_args)
+             model_worker_batch = batch.get_model_worker_batch()
+             model_worker_batch.spec_info = spec_info
+             spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+             logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                 model_worker_batch
+             )
+             model_worker_batch.spec_info.verified_id = next_token_ids
+             model_worker_batch.spec_info.hidden_states = logits_output.hidden_states
+             batch.spec_info = spec_info
+             self.forward_draft_extend(batch)
+             batch.spec_info = None
+             return logits_output, next_token_ids, model_worker_batch, spec_info
+
+     def verify(self, batch: ScheduleBatch):
+         verify_input = batch.spec_info.prepare_for_verify(batch)
+         batch.forward_mode = ForwardMode.TARGET_VERIFY
+         verify_input.prepare_for_verify(batch)
+         batch.spec_info = verify_input
+         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+         model_worker_batch = batch.get_model_worker_batch()
+         logits_output, _ = self.target_worker.forward_batch_generation(
+             model_worker_batch, skip_sample=True
+         )
+         verify_input.hidden_states = logits_output.hidden_states
+         res = verify_input.verify(batch, logits_output)
+         batch.forward_mode = ForwardMode.DECODE
+         return res + (model_worker_batch,)
+
+     def _swap_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
+         batch.token_to_kv_pool = runner.token_to_kv_pool
+         batch.req_to_token_pool = runner.req_to_token_pool
+
+     def forward_extend_after_decode(self, batch: ScheduleBatch):
+         self._swap_mem_pool(batch, self.model_runner)
+         batch.forward_mode = ForwardMode.DRAFT_EXTEND
+         if batch.spec_info.has_finished:
+             index = batch.spec_info.unfinished_index
+             seq_lens = batch.seq_lens
+             batch.seq_lens = batch.seq_lens[index]
+         batch.spec_info.prepare_extend_after_decode(batch)
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         forward_batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         logits_output = self.model_runner.forward(forward_batch)
+         batch.spec_info.hidden_states = logits_output.hidden_states
+         self.capture_for_decode(logits_output, forward_batch)
+         batch.forward_mode = ForwardMode.DECODE
+         if batch.spec_info.has_finished:
+             batch.seq_lens = seq_lens
+         self._swap_mem_pool(batch, self.target_worker.model_runner)
+
+     def capture_for_decode(self, logits_output, forward_batch):
+         if isinstance(logits_output, LogitsProcessorOutput):
+             logits = logits_output.next_token_logits
+             sample_output = torch.softmax(
+                 logits, dim=-1
+             )  # TODO: Support more sampling methods @kavioyu
+             forward_batch.spec_info.capture_for_decode(
+                 sample_output, logits_output.hidden_states, forward_batch.forward_mode
+             )
+
+     # Prefix sharing is not supported yet.
+     def finish_request(self, reqs: Union[Req, List[Req]]):
+         if not isinstance(reqs, List):
+             reqs = [reqs]
+         for req in reqs:
+             req_len = (
+                 len(req.origin_input_ids)
+                 + len(req.output_ids)
+                 - self.finish_extend_len[req.rid]
+                 - 1
+             )
+             kv_indices = self.model_runner.req_to_token_pool.req_to_token[
+                 req.req_pool_idx
+             ][:req_len]
+             self.model_runner.token_to_kv_pool.free(kv_indices)
+             self.model_runner.req_to_token_pool.free(req.req_pool_idx)
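To make the control flow above concrete: each decode round runs the cheap draft model `speculative_num_steps` times (`forward_draft_decode`), then the target model checks the proposed tokens in `verify` and commits the longest accepted prefix plus one corrected token. The following is a minimal, self-contained sketch of that draft-then-verify idea; `draft_next` and `target_next` are toy stand-ins rather than the sglang API, and the real system scores all drafted positions in one batched target forward instead of one call per token.

import random

random.seed(0)
VOCAB = list(range(100))

def draft_next(ctx):
    # Toy stand-in for the small EAGLE draft model (deterministic).
    return (sum(ctx) * 31 + len(ctx)) % 100

def target_next(ctx):
    # Toy stand-in for the large target model; it agrees with the
    # draft most of the time, like a well-trained draft head.
    return draft_next(ctx) if random.random() < 0.8 else random.choice(VOCAB)

def speculative_step(ctx, num_steps=4):
    # Draft phase: roll the draft model forward num_steps tokens.
    drafted = []
    for _ in range(num_steps):
        drafted.append(draft_next(ctx + drafted))
    # Verify phase: accept the longest prefix the target agrees with;
    # on the first disagreement, keep the target's correction instead.
    accepted = []
    for tok in drafted:
        t = target_next(ctx + accepted)
        if t != tok:
            accepted.append(t)
            break
        accepted.append(tok)
    else:
        # All drafts accepted: the target still yields one bonus token.
        accepted.append(target_next(ctx + accepted))
    return accepted

ctx = [1, 2, 3]
for _ in range(3):
    out = speculative_step(ctx)
    print(f"accepted {len(out)} token(s): {out}")
    ctx += out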
sglang/srt/speculative/spec_info.py CHANGED
@@ -2,8 +2,12 @@ from enum import IntEnum, auto
 
 
  class SpeculativeAlgorithm(IntEnum):
+     NONE = auto()
      EAGLE = auto()
 
+     def is_none(self):
+         return self == SpeculativeAlgorithm.NONE
+
      def is_eagle(self):
          return self == SpeculativeAlgorithm.EAGLE
 
@@ -11,6 +15,7 @@ class SpeculativeAlgorithm(IntEnum):
      def from_string(name: str):
          name_map = {
              "EAGLE": SpeculativeAlgorithm.EAGLE,
+             None: SpeculativeAlgorithm.NONE,
          }
          return name_map[name]
 
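The new `NONE` member gives call sites a uniform way to represent "speculative decoding disabled": `from_string(None)`, presumably the default when no speculative algorithm is configured, now resolves to a real enum value instead of raising a `KeyError`. A quick usage sketch based only on the code shown above:

from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

# An unset setting maps to NONE, so callers can branch uniformly.
algo = SpeculativeAlgorithm.from_string(None)
assert algo.is_none() and not algo.is_eagle()

assert SpeculativeAlgorithm.from_string("EAGLE").is_eagle()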
sglang/srt/utils.py CHANGED
@@ -15,6 +15,7 @@
 
  import base64
  import dataclasses
+ import io
  import ipaddress
  import itertools
  import json
@@ -34,6 +35,7 @@ import warnings
  from functools import lru_cache
  from importlib.metadata import PackageNotFoundError, version
  from io import BytesIO
+ from multiprocessing.reduction import ForkingPickler
  from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
 
  import numpy as np
@@ -60,7 +62,6 @@ from triton.runtime.cache import (
 
  logger = logging.getLogger(__name__)
 
-
  show_time_cost = False
  time_infos = {}
 
@@ -1206,7 +1207,6 @@ def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) ->
      # https://github.com/pytorch/pytorch/blob/
      # c1cd946818442aca8c7f812b16d187ce1586c3bc/
      # torch/cuda/__init__.py#L831C1-L831C17
-     import torch.cuda
      import torch.version
 
      if not torch.cuda._is_compiled():
@@ -1335,3 +1335,16 @@ def parse_tool_response(text, tools, **kwargs):
          for call_info in call_info_list
      ]
      return text, call_info_list
+
+
+ class MultiprocessingSerializer:
+     @staticmethod
+     def serialize(obj):
+         buf = io.BytesIO()
+         ForkingPickler(buf).dump(obj)
+         buf.seek(0)
+         return buf.read()
+
+     @staticmethod
+     def deserialize(data):
+         return ForkingPickler.loads(data)
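The new `MultiprocessingSerializer` wraps `multiprocessing.reduction.ForkingPickler`, the pickler that `torch.multiprocessing` registers its tensor reducers with, which is presumably why it is preferred here over plain `pickle` for handing objects between worker processes. A minimal round-trip, assuming only the code shown above:

from sglang.srt.utils import MultiprocessingSerializer

# Serialize to bytes and restore; plain Python objects round-trip as-is.
payload = {"rid": "req-0", "tokens": [1, 2, 3]}
blob = MultiprocessingSerializer.serialize(payload)  # -> bytes
assert MultiprocessingSerializer.deserialize(blob) == payload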
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.1.post2"
+ __version__ = "0.4.1.post4"
{sglang-0.4.1.post2.dist-info → sglang-0.4.1.post4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.1.post2
+ Version: 0.4.1.post4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                 Version 2.0, January 2004
@@ -243,11 +243,11 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
- Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
+ Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
  Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
  | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
  | [**Documentation**](https://sgl-project.github.io/)
- | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
  | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
@@ -347,9 +347,10 @@ The core features include:
 
  ## Getting Started
  - [Install SGLang](https://sgl-project.github.io/start/install.html)
- - [Send requests](https://sgl-project.github.io/start/send_request.html)
- - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
- - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
+ - [Quick Start](https://sgl-project.github.io/start/send_request.html)
+ - [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+ - [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+ - [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
 
  ## Benchmark and Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
@@ -361,5 +362,5 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
  ## Acknowledgment and Citation
- We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.