PyPI - sglang - Versions diffs - 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz - Mend

sglang 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (140) hide show

{sglang-0.3.1.post2/sglang.egg-info → sglang-0.3.1.post3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post2
+Version: 0.3.1.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -499,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
+- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

{sglang-0.3.1.post2 → sglang-0.3.1.post3}/README.md RENAMED Viewed

@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -241,6 +241,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
+- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)

{sglang-0.3.1.post2 → sglang-0.3.1.post3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.3.1.post2"
+version = "0.3.1.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"

{sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_latency.py RENAMED Viewed

@@ -64,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process, suppress_other_loggers
+from sglang.srt.utils import (
+    configure_logger,
+    kill_child_process,
+    suppress_other_loggers,
+)
 @dataclasses.dataclass
@@ -341,6 +346,8 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+    _set_envs_and_config(server_args)
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
     # Load the model

{sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/activation.py RENAMED Viewed

@@ -31,8 +31,9 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.utils import set_weight_attrs
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import set_weight_attrs
 logger = logging.getLogger(__name__)

{sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/attention_backend.py RENAMED Viewed

@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):
         self.decode_attention_fwd = decode_attention_fwd
         self.extend_attention_fwd = extend_attention_fwd
-        self.num_head = model_runner.model_config.num_attention_heads
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )
         if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
             self.reduce_dtype = torch.float32

sglang 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz

sglang 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz