sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/logits_processor.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Logits processing."""
 
 import dataclasses
@@ -62,21 +60,21 @@ class LogitsMetadata:
 
     @classmethod
     def from_forward_batch(cls, forward_batch: ForwardBatch):
+        extend_logprob_pruned_lens_cpu = None
+
         if forward_batch.return_logprob:
             return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
+            if forward_batch.forward_mode.is_extend():
+                extend_logprob_pruned_lens_cpu = [
+                    extend_len - start_len
+                    for extend_len, start_len in zip(
+                        forward_batch.extend_seq_lens_cpu,
+                        forward_batch.extend_logprob_start_lens_cpu,
+                    )
+                ]
         else:
             return_top_logprob = False
 
-        if forward_batch.forward_mode.is_extend():
-            extend_logprob_pruned_lens_cpu = [
-                extend_len - start_len
-                for extend_len, start_len in zip(
-                    forward_batch.extend_seq_lens,
-                    forward_batch.extend_logprob_start_lens_cpu,
-                )
-            ]
-        else:
-            extend_logprob_pruned_lens_cpu = None
         return cls(
             forward_mode=forward_batch.forward_mode,
             top_logprobs_nums=forward_batch.top_logprobs_nums,
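Note on the logits_processor.py hunks above: the pruned-length computation now runs only when logprobs are requested, and it iterates over extend_seq_lens_cpu (a host-side copy) instead of the device tensor extend_seq_lens, so the list comprehension no longer reads GPU data element by element. A minimal sketch of the same computation, with made-up inputs standing in for the ForwardBatch fields:

# Hypothetical stand-ins for forward_batch.extend_seq_lens_cpu and
# forward_batch.extend_logprob_start_lens_cpu.
extend_seq_lens_cpu = [12, 7, 30]           # tokens extended per request
extend_logprob_start_lens_cpu = [4, 0, 10]  # prefix tokens skipped for logprobs

extend_logprob_pruned_lens_cpu = [
    extend_len - start_len
    for extend_len, start_len in zip(
        extend_seq_lens_cpu, extend_logprob_start_lens_cpu
    )
]
assert extend_logprob_pruned_lens_cpu == [8, 7, 20]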
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -1,18 +1,19 @@
 # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
 
-from typing import Dict, Type
+from typing import Callable, Dict, Optional, Type
 
+import torch
 from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
 from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
 from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
 from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
 from vllm.model_executor.layers.quantization.gguf import GGUFConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
@@ -30,8 +31,6 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "tpu_int8": Int8TpuConfig,
     "fp8": Fp8Config,
     "fbgemm_fp8": FBGEMMFp8Config,
-    # The order of gptq methods is important for config.py iteration over
-    # override_quantization_method(..)
     "marlin": MarlinConfig,
     "gguf": GGUFConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
@@ -47,20 +46,68 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in QUANTIZATION_METHODS:
-        raise ValueError(
+        raise ValueError(
+            f"Invalid quantization method: {quantization}. "
+            f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
+        )
     return QUANTIZATION_METHODS[quantization]
 
 
-
-
-
-
-
+def fp8_moe_apply(
+    self,
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    renormalize: bool,
+    use_grouped_topk: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+    custom_routing_function: Optional[Callable] = None,
+) -> torch.Tensor:
+    """Enhanced apply method for FP8 MoE."""
+    from sglang.srt.layers.fused_moe_triton import FusedMoE
+    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts
+
+    # Expert selection
+    topk_weights, topk_ids = FusedMoE.select_experts(
+        hidden_states=x,
+        router_logits=router_logits,
+        use_grouped_topk=use_grouped_topk,
+        top_k=top_k,
+        renormalize=renormalize,
+        topk_group=topk_group,
+        num_expert_group=num_expert_group,
+        custom_routing_function=custom_routing_function,
+    )
+
+    # Expert fusion with FP8 quantization
+    return fused_experts(
+        x,
+        layer.w13_weight,
+        layer.w2_weight,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=True,
+        use_fp8_w8a8=True,
+        w1_scale=layer.w13_weight_scale,
+        w2_scale=layer.w2_weight_scale,
+        a1_scale=layer.w13_input_scale,
+        a2_scale=layer.w2_input_scale,
+    )
+
+
+def fp8_get_quant_method(self, layer, prefix):
+    """Enhanced get_quant_method for FP8 config."""
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+    from vllm.model_executor.layers.quantization.utils.quant_utils import (
+        is_layer_skipped,
+    )
+
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE
+    from sglang.srt.layers.linear import UnquantizedLinearMethod
 
-"""
-def fp8_get_quant_method(
-    self, layer: torch.nn.Module, prefix: str
-) -> Optional["QuantizeMethodBase"]:
     if isinstance(layer, LinearBase):
         if is_layer_skipped(prefix, self.ignored_layers):
             return UnquantizedLinearMethod()
@@ -70,5 +117,18 @@ def fp8_get_quant_method(
     return None
 
 
-
-"""
+def apply_monkey_patches():
+    """Apply all monkey patches in one place."""
+    setattr(Fp8MoEMethod, "apply", fp8_moe_apply)
+    setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
+
+
+# Apply patches when module is imported
+apply_monkey_patches()
+
+
+__all__ = [
+    "QuantizationConfig",
+    "get_quantization_config",
+    "QUANTIZATION_METHODS",
+]
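Note on the quantization/__init__.py change above: a previously commented-out override is now live code. At import time the module rebinds vLLM's Fp8MoEMethod.apply and Fp8Config.get_quant_method to sglang's Triton-based implementations via setattr. A generic sketch of that monkey-patching pattern (the class and function names here are illustrative, not from either package):

class Upstream:
    def apply(self) -> str:
        return "original"

def patched_apply(self) -> str:
    # A plain function assigned onto the class becomes a bound method,
    # so existing call sites pick up the new behavior transparently.
    return "patched"

def apply_monkey_patches() -> None:
    setattr(Upstream, "apply", patched_apply)

apply_monkey_patches()  # run once at import time, as the module above does
assert Upstream().apply() == "patched"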
sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Radix attention."""
 
 from torch import nn
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -1,16 +1,16 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """MRotaryEmbedding"""
 from typing import Any, Dict, List, Optional, Tuple, Union
 
sglang/srt/layers/sampler.py
CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Union
 
 import torch
@@ -8,7 +7,7 @@ from torch import nn
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import crash_on_warnings, is_flashinfer_available
 
 if is_flashinfer_available():
     from flashinfer.sampling import (
@@ -19,17 +18,13 @@ if is_flashinfer_available():
     )
 
 
-# Crash on warning if we are running CI tests
-crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"
-
-
 logger = logging.getLogger(__name__)
 
 
 class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
-        self.use_nan_detectioin =
+        self.use_nan_detectioin = global_server_args_dict["enable_nan_detection"]
 
     def forward(
         self,
@@ -46,7 +41,8 @@ class Sampler(nn.Module):
             logits = torch.where(
                 torch.isnan(logits), torch.full_like(logits, -1e5), logits
             )
-
+            if crash_on_warnings():
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
 
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
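Note on the sampler.py change above: the module-level SGLANG_IS_IN_CI check is replaced by the shared crash_on_warnings() helper, the NaN-detection switch now comes from global_server_args_dict, and a NaN in the logits raises instead of passing silently when warnings are fatal. A standalone sketch of the masking step (the logits tensor is made up):

import torch

logits = torch.tensor([[0.5, float("nan"), 1.2]])
if torch.any(torch.isnan(logits)):
    # Replace NaNs with a large negative value so they lose in argmax/softmax.
    logits = torch.where(torch.isnan(logits), torch.full_like(logits, -1e5), logits)
assert not torch.isnan(logits).any()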
sglang/srt/lora/lora.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
 # and "Punica: Multi-Tenant LoRA Serving"
sglang/srt/lora/lora_config.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import os
sglang/srt/lora/lora_manager.py
CHANGED
@@ -1,22 +1,20 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
 # and "Punica: Multi-Tenant LoRA Serving"
 
-
 import logging
 import re
 
@@ -146,9 +144,9 @@ class LoRAManager:
             }
         else:
             logger.warning(
-
-
-
+                "WARNING: get_module_name() is not defined, "
+                "which is used to map config module name to model implementation module name."
+                "Use the default one, but please check if it is correct for your model."
             )
         self.target_modules = {
             get_module_name(module) for module in self.origin_target_modules
@@ -194,9 +192,9 @@ class LoRAManager:
             hidden_dim_A, _ = self.base_model.get_hidden_dim(module_A)
         else:
             logger.warning(
-
-
-
+                "WARNING: get_hidden_dim() is not defined, "
+                "which is used to get the hidden dim for different lora modules"
+                "Use the default one, but please check if it is correct for your model."
             )
             hidden_dim_A, _ = get_hidden_dim(module_A, self.base_hf_config)
             c = self.loras[-1].get_stacked_multiply(module_A)
@@ -218,9 +216,9 @@ class LoRAManager:
             _, hidden_dim_B = self.base_model.get_hidden_dim(module_B)
         else:
             logger.warning(
-
-
-
+                "WARNING: get_hidden_dim() is not defined, "
+                "which is used to get the hidden dim for different lora modules"
+                "Use the default one, but please check if it is correct for your model."
            )
             _, hidden_dim_B = get_hidden_dim(module_B, self.base_hf_config)
             c = self.loras[-1].get_stacked_multiply(module_B)
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -1,22 +1,21 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """A controller that dispatches requests to multiple data parallel workers."""
 
 import logging
 import multiprocessing as mp
+import threading
 from enum import Enum, auto
 
 import zmq
@@ -28,6 +27,7 @@ from sglang.srt.managers.io_struct import (
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
+    bind_port,
     configure_logger,
     get_zmq_socket,
     kill_parent_process,
@@ -80,20 +80,62 @@ class DataParallelController:
 
         # Start data parallel workers
         base_gpu_id = 0
-        self.workers = []
+        self.workers = [None] * server_args.dp_size
+
+        threads = []
+        sockets = []
         for dp_rank in range(server_args.dp_size):
             tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
             tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
 
-
-
-
-
-
+            if server_args.enable_dp_attention:
+                # Data parallelism resues the tensor parallelism group,
+                # so all dp ranks should use the same nccl port.
+                tmp_port_args.nccl_port = port_args.nccl_port
+            else:
+                # This port is checked free in PortArgs.init_new.
+                # We hold it first so that the next dp worker gets a different port
+                sockets.append(bind_port(tmp_port_args.nccl_port))
+
+            # Create a thread for each worker
+            thread = threading.Thread(
+                target=self.launch_worker_func,
+                args=(server_args, tmp_port_args, base_gpu_id, dp_rank),
             )
+            threads.append(thread)
+            base_gpu_id += 1 if server_args.enable_dp_attention else server_args.tp_size
+
+        # Free all sockets before starting the threads to launch TP workers
+        for sock in sockets:
+            sock.close()
+
+        # Start all threads
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+
+    def launch_worker_func(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
 
-
-
+        launch_func_ = (
+            self.launch_tensor_parallel_process
+            if server_args.enable_dp_attention
+            else self.launch_tensor_parallel_group
+        )
+        self.workers[dp_rank] = launch_func_(
+            server_args,
+            port_args,
+            base_gpu_id,
+            dp_rank,
+        )
 
     def launch_tensor_parallel_group(
         self,
@@ -112,7 +154,7 @@ class DataParallelController:
         )
         for tp_rank in tp_rank_range:
             reader, writer = mp.Pipe(duplex=False)
-            gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+            gpu_id = server_args.base_gpu_id + base_gpu_id + tp_rank % tp_size_per_node
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
@@ -125,9 +167,36 @@ class DataParallelController:
             self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
         )
 
-        # Wait for model to finish loading
+        # Wait for model to finish loading and get max token nums
+        scheduler_info = []
         for i in range(len(scheduler_pipe_readers)):
-            scheduler_pipe_readers[i].recv()
+            scheduler_info.append(scheduler_pipe_readers[i].recv())
+
+        self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
+
+        return send_to
+
+    def launch_tensor_parallel_process(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        reader, writer = mp.Pipe(duplex=False)
+        gpu_id = base_gpu_id
+        tp_rank = dp_rank
+        proc = mp.Process(
+            target=run_scheduler_process,
+            args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+        )
+        proc.start()
+        send_to = get_zmq_socket(
+            self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
+        )
+
+        scheduler_info = reader.recv()
+        self.max_total_num_tokens = scheduler_info["max_total_num_tokens"]
 
         return send_to
 
@@ -170,7 +239,9 @@ def run_data_parallel_controller_process(
 
     try:
         controller = DataParallelController(server_args, port_args)
-        pipe_writer.send(
+        pipe_writer.send(
+            {"status": "ready", "max_total_num_tokens": controller.max_total_num_tokens}
+        )
        controller.event_loop()
     except Exception:
         msg = get_exception_traceback()