sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/bench_serving.py +18 -1
- sglang/lang/interpreter.py +71 -1
- sglang/lang/ir.py +2 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/chatglm.py +78 -0
- sglang/srt/configs/dbrx.py +279 -0
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/hf_transformers_utils.py +9 -14
- sglang/srt/layers/attention/__init__.py +22 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
- sglang/srt/layers/attention/flashinfer_backend.py +215 -83
- sglang/srt/layers/attention/torch_native_backend.py +1 -38
- sglang/srt/layers/attention/triton_backend.py +20 -11
- sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
- sglang/srt/layers/linear.py +159 -55
- sglang/srt/layers/logits_processor.py +170 -215
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
- sglang/srt/layers/parameter.py +431 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/fp8.py +3 -3
- sglang/srt/layers/quantization/modelopt_quant.py +174 -0
- sglang/srt/layers/sampler.py +57 -21
- sglang/srt/layers/torchao_utils.py +17 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -1
- sglang/srt/managers/cache_controller.py +307 -0
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +1 -2
- sglang/srt/managers/schedule_batch.py +33 -3
- sglang/srt/managers/schedule_policy.py +159 -90
- sglang/srt/managers/scheduler.py +68 -28
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +27 -21
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/memory_pool.py +206 -1
- sglang/srt/metrics/collector.py +22 -30
- sglang/srt/model_executor/cuda_graph_runner.py +129 -77
- sglang/srt/model_executor/forward_batch_info.py +51 -21
- sglang/srt/model_executor/model_runner.py +72 -64
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek_v2.py +34 -7
- sglang/srt/models/grok.py +109 -29
- sglang/srt/models/llama.py +9 -2
- sglang/srt/openai_api/adapter.py +0 -17
- sglang/srt/openai_api/protocol.py +3 -3
- sglang/srt/sampling/sampling_batch_info.py +22 -0
- sglang/srt/sampling/sampling_params.py +9 -1
- sglang/srt/server.py +20 -13
- sglang/srt/server_args.py +120 -58
- sglang/srt/speculative/build_eagle_tree.py +347 -0
- sglang/srt/speculative/eagle_utils.py +626 -0
- sglang/srt/speculative/eagle_worker.py +184 -0
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/utils.py +47 -7
- sglang/test/test_programs.py +23 -1
- sglang/test/test_utils.py +36 -7
- sglang/version.py +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +12 -12
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +86 -57
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
ADDED
@@ -0,0 +1,184 @@
+from typing import List, Optional, Union
+
+import torch
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
+
+
+class EAGLEWorker(TpModelWorker):
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+        target_worker: TpModelWorker,
+    ):
+        # Do not capture cuda graph in `super().__init__()`
+        # We will capture it later
+        backup_disable_cuda_graph = server_args.disable_cuda_graph
+        server_args.disable_cuda_graph = True
+        super().__init__(
+            gpu_id=gpu_id,
+            tp_rank=tp_rank,
+            server_args=server_args,
+            nccl_port=nccl_port,
+            dp_rank=dp_rank,
+            is_draft_worker=True,
+        )
+        self.target_worker = target_worker
+        self.server_args = server_args
+
+        # Share the embedding and lm_head
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+        self.model_runner.model.set_embed_and_head(embed, head)
+        self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+        self.model_runner.init_cuda_graphs()
+
+    def forward_draft_decode(self, batch: ScheduleBatch):
+        batch.spec_info.prepare_for_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+
+    def forward_draft_extend(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.spec_info.prepare_for_extend(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def forward_batch_speculative_generation(self, batch: ScheduleBatch):
+        if batch.forward_mode.is_decode():
+            # Draft
+            self._set_mem_pool(batch, self.model_runner)
+            for i in range(self.server_args.speculative_num_steps):
+                self.forward_draft_decode(batch)
+            batch.spec_info.clear_draft_cache(batch)
+            self._set_mem_pool(batch, self.target_worker.model_runner)
+
+            # Verify
+            (
+                next_draft_input,
+                logits_output,
+                verified_id,
+                self.finish_extend_len,
+                accept_length_cpu,
+                model_worker_batch,
+            ) = self.verify(batch)
+            next_draft_input.load_server_args(self.server_args)
+            batch.spec_info = next_draft_input
+            # if it is None, it means all requests are finished
+            if batch.spec_info.verified_id is not None:
+                self.forward_draft_extend_after_decode(batch)
+            return (
+                logits_output,
+                verified_id,
+                model_worker_batch,
+                sum(accept_length_cpu),
+            )
+
+        else:
+            # Forward with the target model and get hidden states.
+            # We need the full hidden states to prefill the KV cache of the draft model.
+            model_worker_batch = batch.get_model_worker_batch()
+            model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
+            )
+
+            # Forward with the draft model.
+            spec_info = EAGLEDraftInput()
+            spec_info.load_server_args(self.server_args)
+            spec_info.hidden_states = logits_output.hidden_states
+            spec_info.verified_id = next_token_ids
+            batch.spec_info = spec_info
+            self.forward_draft_extend(batch)
+            return logits_output, next_token_ids, model_worker_batch, 0
+
+    def verify(self, batch: ScheduleBatch):
+        verify_input = batch.spec_info.prepare_for_verify(batch)
+        verify_input.prepare_for_verify(batch)
+        batch.forward_mode = ForwardMode.TARGET_VERIFY
+        batch.spec_info = verify_input
+        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+        model_worker_batch = batch.get_model_worker_batch()
+        logits_output, _ = self.target_worker.forward_batch_generation(
+            model_worker_batch, skip_sample=True
+        )
+        verify_input.hidden_states = logits_output.hidden_states
+        res = verify_input.verify(batch, logits_output)
+        batch.forward_mode = ForwardMode.DECODE
+        return res + (model_worker_batch,)
+
+    def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
+        batch.token_to_kv_pool = runner.token_to_kv_pool
+        batch.req_to_token_pool = runner.req_to_token_pool
+
+    def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.forward_mode = ForwardMode.DRAFT_EXTEND
+        if batch.spec_info.has_finished:
+            index = batch.spec_info.unfinished_index
+            seq_lens = batch.seq_lens
+            batch.seq_lens = batch.seq_lens[index]
+
+        batch.spec_info.prepare_extend_after_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+
+        batch.spec_info.hidden_states = logits_output.hidden_states
+        self.capture_for_decode(logits_output, forward_batch)
+        batch.forward_mode = ForwardMode.DECODE
+        if batch.spec_info.has_finished:
+            batch.seq_lens = seq_lens
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def capture_for_decode(
+        self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
+    ):
+        sample_output = torch.softmax(
+            logits_output.next_token_logits, dim=-1
+        )  # TODO(kavioyu): Support more sampling methods
+        spec_info = forward_batch.spec_info
+        spec_info.sample_output = sample_output
+        spec_info.hidden_states = logits_output.hidden_states
+        spec_info.prev_mode = forward_batch.forward_mode
+
+    # Don't support prefix share now.
+    def finish_request(self, reqs: Union[Req, List[Req]]):
+        if not isinstance(reqs, List):
+            reqs = [reqs]
+        for req in reqs:
+            if req.rid not in self.finish_extend_len:
+                continue
+            req_len = (
+                len(req.origin_input_ids)
+                + len(req.output_ids)
+                - self.finish_extend_len[req.rid]
+                - 1
+            )
+            kv_indices = self.model_runner.req_to_token_pool.req_to_token[
+                req.req_pool_idx
+            ][:req_len]
+            self.model_runner.token_to_kv_pool.free(kv_indices)
+            self.model_runner.req_to_token_pool.free(req.req_pool_idx)
sglang/srt/speculative/spec_info.py
CHANGED
@@ -2,8 +2,12 @@ from enum import IntEnum, auto


 class SpeculativeAlgorithm(IntEnum):
+    NONE = auto()
     EAGLE = auto()

+    def is_none(self):
+        return self == SpeculativeAlgorithm.NONE
+
     def is_eagle(self):
         return self == SpeculativeAlgorithm.EAGLE

@@ -11,6 +15,7 @@ class SpeculativeAlgorithm(IntEnum):
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
+            None: SpeculativeAlgorithm.NONE,
         }
         return name_map[name]

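A quick illustration (an assumption, not from the diff) of what the new NONE member enables: server code can pass a missing speculative-algorithm setting straight to from_string() and branch on is_none().

from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

algo = SpeculativeAlgorithm.from_string(None)  # None now maps to NONE
assert algo.is_none() and not algo.is_eagle()

algo = SpeculativeAlgorithm.from_string("EAGLE")
assert algo.is_eagle()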
sglang/srt/utils.py
CHANGED
@@ -15,6 +15,7 @@

 import base64
 import dataclasses
+import io
 import ipaddress
 import itertools
 import json
@@ -34,6 +35,7 @@ import warnings
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
+from multiprocessing.reduction import ForkingPickler
 from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union

 import numpy as np
@@ -60,7 +62,6 @@ from triton.runtime.cache import (

 logger = logging.getLogger(__name__)

-
 show_time_cost = False
 time_infos = {}

@@ -334,6 +335,8 @@ def is_port_available(port):
         return True
     except socket.error:
         return False
+    except OverflowError:
+        return False


 def decode_video_base64(video_base64):
@@ -708,13 +711,14 @@ def broadcast_pyobj(
     data: List[Any],
     rank: int,
     dist_group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
 ):
     """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""

     if rank == 0:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long)
-            dist.broadcast(tensor_size, src=
+            dist.broadcast(tensor_size, src=src, group=dist_group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
@@ -723,19 +727,19 @@ def broadcast_pyobj(
             )
             tensor_size = torch.tensor([size], dtype=torch.long)

-            dist.broadcast(tensor_size, src=
-            dist.broadcast(tensor_data, src=
+            dist.broadcast(tensor_size, src=src, group=dist_group)
+            dist.broadcast(tensor_data, src=src, group=dist_group)
         return data
     else:
         tensor_size = torch.tensor([0], dtype=torch.long)
-        dist.broadcast(tensor_size, src=
+        dist.broadcast(tensor_size, src=src, group=dist_group)
         size = tensor_size.item()

         if size == 0:
             return []

         tensor_data = torch.empty(size, dtype=torch.uint8)
-        dist.broadcast(tensor_data, src=
+        dist.broadcast(tensor_data, src=src, group=dist_group)

         serialized_data = bytes(tensor_data.cpu().numpy())
         data = pickle.loads(serialized_data)
@@ -1206,7 +1210,6 @@ def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) ->
     # https://github.com/pytorch/pytorch/blob/
     # c1cd946818442aca8c7f812b16d187ce1586c3bc/
     # torch/cuda/__init__.py#L831C1-L831C17
-    import torch.cuda
     import torch.version

     if not torch.cuda._is_compiled():
@@ -1335,3 +1338,40 @@ def parse_tool_response(text, tools, **kwargs):
         for call_info in call_info_list
     ]
     return text, call_info_list
+
+
+class MultiprocessingSerializer:
+    @staticmethod
+    def serialize(obj):
+        buf = io.BytesIO()
+        ForkingPickler(buf).dump(obj)
+        buf.seek(0)
+        return buf.read()
+
+    @staticmethod
+    def deserialize(data):
+        return ForkingPickler.loads(data)
+
+
+def debug_timing(func):
+    # todo: replace with a more organized instrumentation
+    def wrapper(*args, **kwargs):
+        if logger.isEnabledFor(logging.DEBUG):
+            tic = torch.cuda.Event(enable_timing=True)
+            toc = torch.cuda.Event(enable_timing=True)
+            tic.record()
+            result = func(*args, **kwargs)
+            toc.record()
+            torch.cuda.synchronize()  # Ensure all CUDA operations are complete
+            elapsed = tic.elapsed_time(toc)
+            indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
+            num_tokens = len(indices) if indices is not None else 0
+            throughput = num_tokens / elapsed * 1000 if elapsed > 0 else 0
+            logger.debug(
+                f"Transfer time: {elapsed} ms, throughput: {throughput} tokens/s"
+            )
+            return result
+        else:
+            return func(*args, **kwargs)
+
+    return wrapper
sglang/test/test_programs.py
CHANGED
@@ -509,13 +509,35 @@ def test_hellaswag_select():
         temperature=0,
         num_threads=64,
         progress_bar=True,
+        generator_style=False,
     )
-    preds = [
+    preds = []
+    for i, ret in enumerate(rets):
+        preds.append(choices[i].index(ret["answer"]))
     latency = time.time() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

+    # Test generator style of run_batch
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+        generator_style=True,
+    )
+    preds_gen = []
+    for i, ret in enumerate(rets):
+        preds_gen.append(choices[i].index(ret["answer"]))
+    latency_gen = time.time() - tic
+
+    # Compute accuracy
+    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    assert np.abs(accuracy_gen - accuracy) < 0.01
+    assert np.abs(latency_gen - latency) < 1
+
     return accuracy, latency


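The test above exercises a new generator_style flag on run_batch. Below is a hedged sketch of the call pattern; the program, questions, and endpoint URL are illustrative, only the flag itself comes from this release.

import sglang as sgl


@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", max_tokens=32)


# Assumed: a local sglang server is already running on port 30000.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
arguments = [{"question": q} for q in ("What is 2 + 2?", "Name a prime number.")]

# Default (generator_style=False): run_batch returns a list once every program finishes.
states = qa.run_batch(arguments, temperature=0, progress_bar=True)

# generator_style=True: results are yielded one at a time as programs complete,
# in argument order (which is what the index-based accuracy check above relies on).
for state in qa.run_batch(arguments, temperature=0, generator_style=True):
    print(state["answer"])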
sglang/test/test_utils.py
CHANGED
@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
@@ -532,6 +532,8 @@ def run_bench_serving(
     request_rate,
     other_server_args,
     dataset_name="random",
+    dataset_path="",
+    tokenizer=None,
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
@@ -553,9 +555,9 @@ def run_bench_serving(
         host=None,
         port=None,
         dataset_name=dataset_name,
-        dataset_path=
+        dataset_path=dataset_path,
         model=None,
-        tokenizer=
+        tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
         random_input_len=random_input_len,
@@ -657,16 +659,16 @@ STDERR_FILENAME = "stderr.txt"
 STDOUT_FILENAME = "stdout.txt"


-def read_output(output_lines):
+def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
     """Print the output in real time with another thread."""
-    while not os.path.exists(
+    while not os.path.exists(filename):
         time.sleep(1)

     pt = 0
     while pt >= 0:
-        if pt > 0 and not os.path.exists(
+        if pt > 0 and not os.path.exists(filename):
             break
-        lines = open(
+        lines = open(filename).readlines()
         for line in lines[pt:]:
             print(line, end="", flush=True)
             output_lines.append(line)
@@ -747,6 +749,33 @@ def run_and_check_memory_leak(
     assert has_abort


+def run_command_and_capture_output(command, env: Optional[dict] = None):
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = subprocess.Popen(
+        command, stdout=stdout, stderr=stderr, env=env, text=True
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines, STDOUT_FILENAME))
+    t.start()
+
+    # Join the process
+    process.wait()
+
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
+    t.join()
+
+    return output_lines
+
+
 def run_mmlu_test(
     disable_radix_cache=False,
     enable_mixed_chunk=False,
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post3"
+__version__ = "0.4.1.post5"
{sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post3
+Version: 0.4.1.post5
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -239,15 +239,15 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
+Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
-Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist: vllm==0.6.3.
+Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
 | [**Documentation**](https://sgl-project.github.io/)
-| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
 | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

@@ -347,12 +347,13 @@ The core features include:

 ## Getting Started
 - [Install SGLang](https://sgl-project.github.io/start/install.html)
-- [
-- [Backend
-- [Frontend
+- [Quick Start](https://sgl-project.github.io/start/send_request.html)
+- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)

 ## Benchmark and Performance
-Learn more in
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)

 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -361,5 +362,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

 ## Acknowledgment and Citation
-We learned
-Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.