PyPI - sglang - Versions diffs - 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl - Mend

sglang 0.3.6py3-none-any.whl → 0.3.6.post1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

sglang/__init__.py +2 -2
sglang/api.py +2 -2
sglang/bench_one_batch.py +2 -4
sglang/bench_serving.py +75 -26
sglang/lang/backend/base_backend.py +1 -1
sglang/lang/backend/runtime_endpoint.py +2 -2
sglang/srt/configs/model_config.py +13 -14
sglang/srt/constrained/__init__.py +13 -14
sglang/srt/constrained/base_grammar_backend.py +13 -15
sglang/srt/constrained/outlines_backend.py +13 -15
sglang/srt/constrained/outlines_jump_forward.py +13 -15
sglang/srt/constrained/xgrammar_backend.py +38 -57
sglang/srt/conversation.py +13 -15
sglang/srt/hf_transformers_utils.py +13 -15
sglang/srt/layers/activation.py +13 -13
sglang/srt/layers/attention/flashinfer_backend.py +13 -6
sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
sglang/srt/layers/custom_op_util.py +13 -14
sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
sglang/srt/layers/fused_moe_triton/layer.py +633 -0
sglang/srt/layers/layernorm.py +13 -15
sglang/srt/layers/logits_processor.py +13 -15
sglang/srt/layers/quantization/__init__.py +77 -17
sglang/srt/layers/radix_attention.py +13 -15
sglang/srt/layers/rotary_embedding.py +13 -13
sglang/srt/lora/lora.py +13 -14
sglang/srt/lora/lora_config.py +13 -14
sglang/srt/lora/lora_manager.py +22 -24
sglang/srt/managers/data_parallel_controller.py +25 -19
sglang/srt/managers/detokenizer_manager.py +13 -16
sglang/srt/managers/io_struct.py +43 -28
sglang/srt/managers/schedule_batch.py +55 -26
sglang/srt/managers/schedule_policy.py +13 -15
sglang/srt/managers/scheduler.py +89 -70
sglang/srt/managers/session_controller.py +14 -15
sglang/srt/managers/tokenizer_manager.py +29 -22
sglang/srt/managers/tp_worker.py +13 -15
sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
sglang/srt/metrics/collector.py +13 -15
sglang/srt/metrics/func_timer.py +13 -15
sglang/srt/mm_utils.py +13 -14
sglang/srt/model_executor/cuda_graph_runner.py +20 -19
sglang/srt/model_executor/forward_batch_info.py +19 -17
sglang/srt/model_executor/model_runner.py +42 -30
sglang/srt/models/chatglm.py +15 -16
sglang/srt/models/commandr.py +15 -16
sglang/srt/models/dbrx.py +15 -16
sglang/srt/models/deepseek.py +15 -15
sglang/srt/models/deepseek_v2.py +15 -15
sglang/srt/models/exaone.py +14 -15
sglang/srt/models/gemma.py +14 -14
sglang/srt/models/gemma2.py +24 -19
sglang/srt/models/gemma2_reward.py +13 -14
sglang/srt/models/gpt_bigcode.py +14 -14
sglang/srt/models/grok.py +15 -15
sglang/srt/models/internlm2.py +13 -15
sglang/srt/models/internlm2_reward.py +13 -14
sglang/srt/models/llama.py +21 -21
sglang/srt/models/llama_classification.py +13 -14
sglang/srt/models/llama_reward.py +13 -14
sglang/srt/models/llava.py +13 -15
sglang/srt/models/llavavid.py +13 -15
sglang/srt/models/minicpm.py +13 -15
sglang/srt/models/minicpm3.py +13 -15
sglang/srt/models/mistral.py +13 -15
sglang/srt/models/mixtral.py +15 -15
sglang/srt/models/mixtral_quant.py +14 -14
sglang/srt/models/olmo.py +21 -19
sglang/srt/models/olmoe.py +23 -20
sglang/srt/models/qwen.py +14 -14
sglang/srt/models/qwen2.py +22 -19
sglang/srt/models/qwen2_moe.py +17 -18
sglang/srt/models/stablelm.py +18 -16
sglang/srt/models/torch_native_llama.py +15 -17
sglang/srt/models/xverse.py +13 -14
sglang/srt/models/xverse_moe.py +15 -16
sglang/srt/models/yivl.py +13 -15
sglang/srt/openai_api/adapter.py +13 -15
sglang/srt/openai_api/protocol.py +13 -15
sglang/srt/sampling/sampling_batch_info.py +4 -1
sglang/srt/sampling/sampling_params.py +13 -15
sglang/srt/server.py +59 -34
sglang/srt/server_args.py +22 -22
sglang/srt/utils.py +196 -17
sglang/test/few_shot_gsm8k.py +8 -4
sglang/test/runners.py +13 -14
sglang/test/test_utils.py +1 -1
sglang/version.py +1 -1
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
sglang-0.3.6.post1.dist-info/RECORD +164 -0
sglang/srt/layers/fused_moe/__init__.py +0 -1
sglang-0.3.6.dist-info/RECORD +0 -161
/sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/quantization/__init__.py CHANGED Viewed

@@ -1,18 +1,19 @@
 # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
-from typing import Dict, Type
+from typing import Callable, Dict, Optional, Type
+import torch
 from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
 from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
 from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
 from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
 from vllm.model_executor.layers.quantization.gguf import GGUFConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
@@ -30,8 +31,6 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "tpu_int8": Int8TpuConfig,
     "fp8": Fp8Config,
     "fbgemm_fp8": FBGEMMFp8Config,
-    # The order of gptq methods is important for config.py iteration over
-    # override_quantization_method(..)
     "marlin": MarlinConfig,
     "gguf": GGUFConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
@@ -47,20 +46,68 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in QUANTIZATION_METHODS:
-        raise ValueError(f"Invalid quantization method: {quantization}")
+        raise ValueError(
+            f"Invalid quantization method: {quantization}. "
+            f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
+        )
     return QUANTIZATION_METHODS[quantization]
-__all__ = [
-    "QuantizationConfig",
-    "get_quantization_config",
-    "QUANTIZATION_METHODS",
-]
+def fp8_moe_apply(
+    self,
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    renormalize: bool,
+    use_grouped_topk: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+    custom_routing_function: Optional[Callable] = None,
+) -> torch.Tensor:
+    """Enhanced apply method for FP8 MoE."""
+    from sglang.srt.layers.fused_moe_triton import FusedMoE
+    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts
+    # Expert selection
+    topk_weights, topk_ids = FusedMoE.select_experts(
+        hidden_states=x,
+        router_logits=router_logits,
+        use_grouped_topk=use_grouped_topk,
+        top_k=top_k,
+        renormalize=renormalize,
+        topk_group=topk_group,
+        num_expert_group=num_expert_group,
+        custom_routing_function=custom_routing_function,
+    )
+    # Expert fusion with FP8 quantization
+    return fused_experts(
+        x,
+        layer.w13_weight,
+        layer.w2_weight,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=True,
+        use_fp8_w8a8=True,
+        w1_scale=layer.w13_weight_scale,
+        w2_scale=layer.w2_weight_scale,
+        a1_scale=layer.w13_input_scale,
+        a2_scale=layer.w2_input_scale,
+    )
+def fp8_get_quant_method(self, layer, prefix):
+    """Enhanced get_quant_method for FP8 config."""
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+    from vllm.model_executor.layers.quantization.utils.quant_utils import (
+        is_layer_skipped,
+    )
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE
+    from sglang.srt.layers.linear import UnquantizedLinearMethod
-"""
-def fp8_get_quant_method(
-    self, layer: torch.nn.Module, prefix: str
-) -> Optional["QuantizeMethodBase"]:
     if isinstance(layer, LinearBase):
         if is_layer_skipped(prefix, self.ignored_layers):
             return UnquantizedLinearMethod()
@@ -70,5 +117,18 @@ def fp8_get_quant_method(
     return None
-setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
-"""
+def apply_monkey_patches():
+    """Apply all monkey patches in one place."""
+    setattr(Fp8MoEMethod, "apply", fp8_moe_apply)
+    setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
+# Apply patches when module is imported
+apply_monkey_patches()
+__all__ = [
+    "QuantizationConfig",
+    "get_quantization_config",
+    "QUANTIZATION_METHODS",
+]

sglang/srt/layers/radix_attention.py CHANGED Viewed

@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Radix attention."""
 from torch import nn

sglang/srt/layers/rotary_embedding.py CHANGED Viewed

@@ -1,16 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """MRotaryEmbedding"""
 from typing import Any, Dict, List, Optional, Tuple, Union

sglang/srt/lora/lora.py CHANGED Viewed

@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 # Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
 # and "Punica: Multi-Tenant LoRA Serving"

sglang/srt/lora/lora_config.py CHANGED Viewed

@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 import json
 import os

sglang/srt/lora/lora_manager.py CHANGED Viewed

@@ -1,22 +1,20 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 # Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
 # and "Punica: Multi-Tenant LoRA Serving"
 import logging
 import re
@@ -146,9 +144,9 @@ class LoRAManager:
             }
         else:
             logger.warning(
-                f"WARNING: get_module_name() is not defined, "
-                f"which is used to map config module name to model implementation module name."
-                f"Use the default one, but please check if it is correct for your model."
+                "WARNING: get_module_name() is not defined, "
+                "which is used to map config module name to model implementation module name."
+                "Use the default one, but please check if it is correct for your model."
             )
             self.target_modules = {
                 get_module_name(module) for module in self.origin_target_modules
@@ -194,9 +192,9 @@ class LoRAManager:
                 hidden_dim_A, _ = self.base_model.get_hidden_dim(module_A)
             else:
                 logger.warning(
-                    f"WARNING: get_hidden_dim() is not defined, "
-                    f"which is used to get the hidden dim for different lora modules"
-                    f"Use the default one, but please check if it is correct for your model."
+                    "WARNING: get_hidden_dim() is not defined, "
+                    "which is used to get the hidden dim for different lora modules"
+                    "Use the default one, but please check if it is correct for your model."
                 )
                 hidden_dim_A, _ = get_hidden_dim(module_A, self.base_hf_config)
             c = self.loras[-1].get_stacked_multiply(module_A)
@@ -218,9 +216,9 @@ class LoRAManager:
                 _, hidden_dim_B = self.base_model.get_hidden_dim(module_B)
             else:
                 logger.warning(
-                    f"WARNING: get_hidden_dim() is not defined, "
-                    f"which is used to get the hidden dim for different lora modules"
-                    f"Use the default one, but please check if it is correct for your model."
+                    "WARNING: get_hidden_dim() is not defined, "
+                    "which is used to get the hidden dim for different lora modules"
+                    "Use the default one, but please check if it is correct for your model."
                 )
                 _, hidden_dim_B = get_hidden_dim(module_B, self.base_hf_config)
             c = self.loras[-1].get_stacked_multiply(module_B)

sglang/srt/managers/data_parallel_controller.py CHANGED Viewed

@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """A controller that dispatches requests to multiple data parallel workers."""
 import logging
@@ -169,9 +167,12 @@ class DataParallelController:
             self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
         )
-        # Wait for model to finish loading
+        # Wait for model to finish loading and get max token nums
+        scheduler_info = []
         for i in range(len(scheduler_pipe_readers)):
-            scheduler_pipe_readers[i].recv()
+            scheduler_info.append(scheduler_pipe_readers[i].recv())
+        self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
         return send_to
@@ -193,7 +194,10 @@ class DataParallelController:
         send_to = get_zmq_socket(
             self.context, zmq.PUSH, port_args.scheduler_input_ipc_name
         )
-        reader.recv()
+        scheduler_info = reader.recv()
+        self.max_total_num_tokens = scheduler_info["max_total_num_tokens"]
         return send_to
     def round_robin_scheduler(self, req):
@@ -235,7 +239,9 @@ def run_data_parallel_controller_process(
     try:
         controller = DataParallelController(server_args, port_args)
-        pipe_writer.send("ready")
+        pipe_writer.send(
+            {"status": "ready", "max_total_num_tokens": controller.max_total_num_tokens}
+        )
         controller.event_loop()
     except Exception:
         msg = get_exception_traceback()

sglang/srt/managers/detokenizer_manager.py CHANGED Viewed

@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """DetokenizerManager is a process that detokenizes the token ids."""
 import dataclasses
@@ -175,7 +173,6 @@ class DetokenizerManager:
                     output_strs=output_strs,
                     meta_info=recv_obj.meta_info,
                     finished_reason=recv_obj.finished_reason,
-                    session_ids=recv_obj.session_ids,
                 )
             )

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
@@ -21,7 +19,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -31,8 +29,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
 class GenerateReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[str], str]] = None
-    # The token ids for text; one can either specify text or input_ids.
+    # The token ids for text; one can specify either text or input_ids
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The embeddings for input_ids; one can specify either text or input_ids or input_embeds.
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
     # The image input. It can be a file name, a url, or base64 encoded string.
     # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
@@ -57,14 +57,21 @@ class GenerateReqInput:
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     # Session id info for continual prompting
-    session_id: Optional[Union[List[str], str]] = None
-    session_rid: Optional[Union[List[str], str]] = None
+    session: Optional[
+        Union[List[Tuple[str, Optional[str]]], Tuple[str, Optional[str]]]
+    ] = None
     def normalize_batch_and_arguments(self):
-        if (self.text is None and self.input_ids is None) or (
-            self.text is not None and self.input_ids is not None
+        if (
+            self.text is None and self.input_ids is None and self.input_embeds is None
+        ) or (
+            self.text is not None
+            and self.input_ids is not None
+            and self.input_embeds is not None
         ):
-            raise ValueError("Either text or input_ids should be provided.")
+            raise ValueError(
+                "Either text, input_ids or input_embeds should be provided."
+            )
         # Derive the batch size
         if self.text is not None:
@@ -74,13 +81,21 @@ class GenerateReqInput:
             else:
                 self.is_single = False
                 self.batch_size = len(self.text)
-        else:
+            self.input_embeds = None
+        elif self.input_ids is not None:
             if isinstance(self.input_ids[0], int):
                 self.is_single = True
                 self.batch_size = 1
             else:
                 self.is_single = False
                 self.batch_size = len(self.input_ids)
+            self.input_embeds = None
+        else:
+            if isinstance(self.input_embeds[0][0], float):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.batch_size = len(self.input_embeds)
         # Handle parallel sampling
         # When parallel sampling is used, we always treat the input as a batch.
@@ -203,9 +218,11 @@ class TokenizedGenerateReqInput:
     # LoRA related
     lora_path: Optional[str] = None  # None means just use the base model
+    # The input embeds
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
     # Session id info for continual prompting
-    session_id: Optional[int] = None
+    session_id: Optional[str] = None
     session_rid: Optional[str] = None
@@ -219,6 +236,8 @@ class EmbeddingReqInput:
     rid: Optional[Union[List[str], str]] = None
     # Dummy sampling params for compatibility
     sampling_params: Union[List[Dict], Dict] = None
+    # Dummy input embeds for compatibility
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
     def normalize_batch_and_arguments(self):
         if (self.text is None and self.input_ids is None) or (
@@ -301,8 +320,6 @@ class BatchTokenIDOut:
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
     no_stop_trim: List[bool]
-    # The updated session unique id
-    session_ids: List[str]
 @dataclass
@@ -315,8 +332,6 @@ class BatchStrOut:
     meta_info: List[Dict]
     # The finish reason
     finished_reason: List[BaseFinishReason]
-    # The update session unique id
-    session_ids: List[str]
 @dataclass

sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

sglang 0.3.6py3-none-any.whl → 0.3.6.post1py3-none-any.whl