sglang 0.3.3__tar.gz → 0.3.3.post1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO +13 -6
- {sglang-0.3.3 → sglang-0.3.3.post1}/README.md +12 -5
- {sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml +1 -1
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py +3 -3
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py +11 -2
- sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py +177 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py +7 -2
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py +6 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py +46 -5
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py +9 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/model_runner.py +40 -35
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/adapter.py +5 -3
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_batch_info.py +19 -7
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server.py +55 -20
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server_args.py +14 -11
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/utils.py +26 -11
- sglang-0.3.3.post1/sglang/version.py +1 -0
- {sglang-0.3.3 → sglang-0.3.3.post1/sglang.egg-info}/PKG-INFO +13 -6
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/SOURCES.txt +1 -0
- sglang-0.3.3/sglang/version.py +0 -1
- {sglang-0.3.3 → sglang-0.3.3.post1}/LICENSE +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/setup.cfg +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/api.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_serving.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/check_env.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/global_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/runners.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3
+Version: 0.3.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -257,8 +257,8 @@ Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
 
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,10 +270,9 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
 - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
@@ -324,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -848,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+  <a href="#sglangtop" target="_blank">
+    <bold>Back To Top </bold>
+  </a>
+</p>
{sglang-0.3.3 → sglang-0.3.3.post1}/README.md

@@ -1,5 +1,5 @@
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,10 +11,9 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
 - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
@@ -65,7 +64,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -589,3 +588,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+  <a href="#sglangtop" target="_blank">
+    <bold>Back To Top </bold>
+  </a>
+</p>
{sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.3"
+version = "0.3.3.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py

@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
        gpu_id=tp_rank,
        tp_rank=tp_rank,
        tp_size=server_args.tp_size,
-       nccl_port=port_args.
+       nccl_port=port_args.nccl_port,
        server_args=server_args,
    )
    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
    return reqs
 
 
+@torch.inference_mode()
 def extend(reqs, model_runner):
    batch = ScheduleBatch.init_new(
        reqs=reqs,
@@ -235,6 +236,7 @@ def extend(reqs, model_runner):
    return next_token_ids, logits_output.next_token_logits, batch
 
 
+@torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
    batch.prepare_for_decode(input_token_ids)
    model_worker_batch = batch.get_model_worker_batch()
@@ -244,7 +246,6 @@ def decode(input_token_ids, batch, model_runner):
    return next_token_ids, logits_output.next_token_logits
 
 
-@torch.inference_mode()
 def correctness_test(
    server_args,
    port_args,
@@ -287,7 +288,6 @@ def correctness_test(
        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
-@torch.inference_mode()
 def latency_test_run_once(
    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
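The bench_latency.py change above moves `@torch.inference_mode()` off the outer test drivers (`correctness_test`, `latency_test_run_once`) and onto the per-step `extend` and `decode` helpers, so only the forward passes run under inference mode. A minimal sketch of what the decorator buys (names here are illustrative, not sglang APIs):

```python
import torch

@torch.inference_mode()
def decode_step(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Under inference mode, autograd records nothing and tensor version
    # tracking is skipped, trimming per-call overhead in tight decode loops.
    return x @ w

x, w = torch.randn(4, 8), torch.randn(8, 8)
y = decode_step(x, w)
assert y.is_inference() and not y.requires_grad
```

Scoping the decorator to the innermost forward helpers keeps non-forward work (tokenization, timing, printing) outside inference mode.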
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py

@@ -70,6 +70,9 @@ class Conversation:
     sep2: str = None
     # Stop criteria (the default one is EOS token)
     stop_str: Union[str, List[str]] = None
+    # The string that represents an image token in the prompt
+    image_token: str = "<image>"
+
     image_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
 
@@ -334,6 +337,7 @@ class Conversation:
             sep=self.sep,
             sep2=self.sep2,
             stop_str=self.stop_str,
+            image_token=self.image_token,
         )
 
     def dict(self):
@@ -381,6 +385,7 @@ def generate_chat_conv(
         stop_str=conv.stop_str,
         image_data=[],
         modalities=[],
+        image_token=conv.image_token,
     )
 
     if isinstance(request.messages, str):
@@ -412,9 +417,13 @@ def generate_chat_conv(
                     num_image_url += 1
                     conv.modalities.append(content.modalities)
             if num_image_url > 1:
-                image_token = "<image>"
+                image_token = conv.image_token
             else:
-                image_token = "<image>\n"
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
             for content in message.content:
                 if content.type == "text":
                     if num_image_url > 16:
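The conversation.py change above replaces a hardcoded image placeholder with a per-template `image_token` field, with a special case for `qwen2-vl`. A self-contained sketch of the resulting selection logic (the `Template` class is a stand-in for sglang's `Conversation`, not its real definition):

```python
from dataclasses import dataclass

@dataclass
class Template:
    name: str
    image_token: str = "<image>"  # default placeholder, overridable per template

def pick_image_token(t: Template, num_image_url: int) -> str:
    # Mirrors the diff: multiple images use the bare token; a single image
    # gets a trailing newline unless the template is "qwen2-vl".
    if num_image_url > 1:
        return t.image_token
    return t.image_token + "\n" if t.name != "qwen2-vl" else t.image_token

assert pick_image_token(Template("llava"), 1) == "<image>\n"
assert pick_image_token(Template("qwen2-vl"), 1) == "<image>"
assert pick_image_token(Template("llava"), 3) == "<image>"
```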
sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py (new file)

@@ -0,0 +1,177 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import logging
+import multiprocessing as mp
+from enum import Enum, auto
+
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    kill_parent_process,
+    suppress_other_loggers,
+)
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    SHORTEST_QUEUE = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(self, server_args, port_args) -> None:
+        # Parse args
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        self.recv_from_tokenizer = self.context.socket(zmq.PULL)
+        self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Start data parallel workers
+        base_gpu_id = 0
+        self.workers = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+            send_to = self.launch_tensor_parallel_group(
+                server_args,
+                tmp_port_args,
+                base_gpu_id,
+                dp_rank,
+            )
+
+            self.workers.append(send_to)
+            base_gpu_id += server_args.tp_size
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        # Launch tensor parallel scheduler processes
+        scheduler_procs = []
+        scheduler_pipe_readers = []
+        tp_size_per_node = server_args.tp_size // server_args.nnodes
+        tp_rank_range = range(
+            tp_size_per_node * server_args.node_rank,
+            tp_size_per_node * (server_args.node_rank + 1),
+        )
+        for tp_rank in tp_rank_range:
+            reader, writer = mp.Pipe(duplex=False)
+            gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+            proc = mp.Process(
+                target=run_scheduler_process,
+                args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+            scheduler_pipe_readers.append(reader)
+
+        send_to = self.context.socket(zmq.PUSH)
+        send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Wait for model to finish loading
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_pipe_readers[i].recv()
+
+        return send_to
+
+    def round_robin_scheduler(self, req):
+        self.workers[self.round_robin_counter].send_pyobj(req)
+        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+
+    def shortest_queue_scheduler(self, input_requests):
+        raise NotImplementedError()
+
+    def event_loop(self):
+        while True:
+            while True:
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+
+                if isinstance(
+                    recv_req,
+                    (
+                        TokenizedGenerateReqInput,
+                        TokenizedEmbeddingReqInput,
+                        TokenizedRewardReqInput,
+                    ),
+                ):
+                    self.dispatching(recv_req)
+                else:
+                    # Send other control messages to all workers
+                    for worker in self.workers:
+                        worker.queue.put(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+):
+    configure_logger(server_args)
+    suppress_other_loggers()
+
+    try:
+        controller = DataParallelController(server_args, port_args)
+        pipe_writer.send("ready")
+        controller.event_loop()
+    except Exception:
+        msg = get_exception_traceback()
+        logger.error(msg)
+        kill_parent_process()
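The new data_parallel_controller.py above fans tokenized requests out to `dp_size` scheduler groups, cycling a counter over ZeroMQ PUSH sockets. A minimal sketch of the round-robin dispatch policy, with the sockets replaced by plain lists (illustrative only):

```python
class RoundRobinDispatcher:
    def __init__(self, num_workers: int):
        self.queues = [[] for _ in range(num_workers)]  # stand-ins for PUSH sockets
        self.counter = 0

    def dispatch(self, req):
        # Each tokenized request goes to the next worker in cyclic order.
        self.queues[self.counter].append(req)
        self.counter = (self.counter + 1) % len(self.queues)

d = RoundRobinDispatcher(num_workers=2)
for i in range(5):
    d.dispatch(f"req-{i}")
assert [len(q) for q in d.queues] == [3, 2]
```

`SHORTEST_QUEUE` is declared in the enum but still raises `NotImplementedError`, so `ROUND_ROBIN` is the only working policy in this release.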
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py

@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 
 import uuid
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -119,8 +120,7 @@ class GenerateReqInput:
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-
-                self.image_data = self.image_data * num
+                pass
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
@@ -344,3 +344,8 @@ class UpdateWeightReqOutput:
 class AbortReq:
     # The request id
     rid: str
+
+
+class ProfileReq(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
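The io_struct.py fix above changes how a batched request's `image_data` is normalized: a scalar is still broadcast to the batch size, but an explicit list is now left untouched instead of being replicated `num` times. A sketch of the before/after behavior:

```python
def normalize_image_data(image_data, num):
    if image_data is None:
        return [None] * num
    if not isinstance(image_data, list):
        return [image_data] * num  # broadcast a single value
    return image_data              # already a list: keep as-is ("pass" in the diff)

assert normalize_image_data(None, 2) == [None, None]
assert normalize_image_data("a.png", 2) == ["a.png", "a.png"]
# Previously this branch returned ["a.png", "b.png", "a.png", "b.png"]:
assert normalize_image_data(["a.png", "b.png"], 2) == ["a.png", "b.png"]
```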
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py

@@ -423,6 +423,9 @@ class ScheduleBatch:
     # Stream
     has_stream: bool = False
 
+    # device
+    device: str = "cuda"
+
     # Has regex
     has_regex: bool = False
 
@@ -439,6 +442,7 @@ class ScheduleBatch:
             tree_cache=tree_cache,
             return_logprob=return_logprob,
             has_stream=has_stream,
+            device=req_to_token_pool.device,
             has_regex=has_regex,
         )
 
@@ -806,6 +810,8 @@ class ScheduleBatch:
             self.sampling_info.regex_fsm_states = [
                 req.regex_fsm_state for req in self.reqs
             ]
+        else:
+            self.sampling_info.regex_fsms = None
 
         return ModelWorkerBatch(
             forward_mode=self.forward_mode,
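The schedule_batch.py change above records the device of the request-to-token pool on the batch instead of assuming `"cuda"`. A minimal sketch of the pattern (simplified stand-in types, not the real `ScheduleBatch`):

```python
from dataclasses import dataclass

@dataclass
class TokenPool:
    device: str = "cuda"

@dataclass
class Batch:
    device: str = "cuda"

    @classmethod
    def init_new(cls, pool: TokenPool) -> "Batch":
        # Inherit the pool's device so later tensor allocations follow it.
        return cls(device=pool.device)

assert Batch.init_new(TokenPool(device="cpu")).device == "cpu"
```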
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py

@@ -37,6 +37,7 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchTokenIDOut,
     FlushCacheReq,
+    ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
     TokenizedRewardReqInput,
@@ -141,7 +142,7 @@ class Scheduler:
             gpu_id=gpu_id,
             tp_rank=tp_rank,
             server_args=server_args,
-            nccl_port=port_args.
+            nccl_port=port_args.nccl_port,
         )
         self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group
 
@@ -229,6 +230,22 @@ class Scheduler:
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
         self.batch_is_full = False
 
+        if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
+            self.profiler = None
+        else:
+            self.torch_profiler_trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                self.torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                with_stack=True,
+            )
+
     @torch.inference_mode()
     def event_loop(self):
         while True:
@@ -271,6 +288,11 @@ class Scheduler:
             elif isinstance(recv_req, UpdateWeightReqInput):
                 success, message = self.update_weights(recv_req)
                 self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
+            elif isinstance(recv_req, ProfileReq):
+                if recv_req == ProfileReq.START_PROFILE:
+                    self.start_profile()
+                else:
+                    self.stop_profile()
             else:
                 raise ValueError(f"Invalid request: {recv_req}")
 
@@ -433,6 +455,9 @@ class Scheduler:
             result = self.run_batch(batch)
             self.process_batch_result(batch, result)
 
+            if self.running_batch.is_empty():
+                self.running_batch = None
+
             if self.running_batch is None:
                 break
 
@@ -772,9 +797,6 @@ class Scheduler:
         if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
             self.print_decode_stats()
 
-        if self.running_batch.is_empty():
-            self.running_batch = None
-
     def add_logprob_return_values(
         self,
         i: int,
@@ -1000,15 +1022,34 @@ class Scheduler:
             logger.error(message)
         return success, message
 
+    def start_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.start()
+
+    def stop_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.stop()
+        self.profiler.export_chrome_trace(
+            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+        )
+        logger.info("Profiler is done")
+
 
 def run_scheduler_process(
     server_args: ServerArgs,
     port_args: PortArgs,
     gpu_id: int,
     tp_rank: int,
+    dp_rank: Optional[int],
     pipe_writer,
 ):
-
+    if dp_rank is None:
+        configure_logger(server_args, prefix=f" TP{tp_rank}")
+    else:
+        configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
+
     suppress_other_loggers()
 
     try:
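The scheduler.py additions above gate a `torch.profiler` instance behind the `SGLANG_TORCH_PROFILER_DIR` environment variable and expose `start_profile`/`stop_profile` hooks driven by `ProfileReq`. A standalone sketch of the same start/stop/export pattern (the matmul workload and fallback directory are placeholders):

```python
import os
import time

import torch

trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp/traces")
os.makedirs(trace_dir, exist_ok=True)

activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    # The diff enables CUDA unconditionally; guarding keeps the sketch portable.
    activities.append(torch.profiler.ProfilerActivity.CUDA)

profiler = torch.profiler.profile(activities=activities, with_stack=True)

profiler.start()
torch.matmul(torch.randn(256, 256), torch.randn(256, 256))  # profiled work
profiler.stop()

# Chrome-trace output, viewable in chrome://tracing or Perfetto.
profiler.export_chrome_trace(os.path.join(trace_dir, f"{time.time()}.trace.json.gz"))
```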
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py

@@ -46,6 +46,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    ProfileReq,
     RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
@@ -512,6 +513,14 @@
         req = AbortReq(rid)
         self.send_to_scheduler.send_pyobj(req)
 
+    def start_profile(self):
+        req = ProfileReq.START_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
+    def stop_profile(self):
+        req = ProfileReq.STOP_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
     ):
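The tokenizer_manager.py hooks above forward `ProfileReq` values to the scheduler over the same ZeroMQ channel used for requests. A minimal sketch of the underlying `send_pyobj`/`recv_pyobj` pattern (TCP loopback here stands in for the ipc:// endpoints sglang uses):

```python
import zmq

ctx = zmq.Context()
pull = ctx.socket(zmq.PULL)
port = pull.bind_to_random_port("tcp://127.0.0.1")
push = ctx.socket(zmq.PUSH)
push.connect(f"tcp://127.0.0.1:{port}")

push.send_pyobj({"cmd": "START_PROFILE"})  # pickles any Python object
print(pull.recv_pyobj())                   # -> {'cmd': 'START_PROFILE'}
```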