sglang 0.3.5.post1__tar.gz → 0.3.5.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/PKG-INFO +2 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/pyproject.toml +2 -2
- sglang-0.3.5.post2/sglang/bench_offline_throughput.py +309 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/bench_serving.py +44 -30
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/base_grammar_backend.py +4 -3
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/outlines_backend.py +24 -24
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/xgrammar_backend.py +40 -4
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/patch.py +4 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/detokenizer_manager.py +0 -14
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/scheduler.py +6 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/model_executor/model_runner.py +4 -1
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/openai_api/adapter.py +5 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/openai_api/protocol.py +29 -26
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/server.py +2 -1
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/server_args.py +24 -3
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/utils.py +33 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/test_utils.py +4 -4
- sglang-0.3.5.post2/sglang/version.py +1 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang.egg-info/PKG-INFO +2 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang.egg-info/SOURCES.txt +1 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang.egg-info/requires.txt +1 -1
- sglang-0.3.5.post1/sglang/version.py +0 -1
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/LICENSE +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/README.md +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/setup.cfg +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/api.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/check_env.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/launch_server.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llama.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_params.py +2 -2
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/runners.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/utils.py +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post1
+Version: 0.3.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -233,7 +233,7 @@ Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post1"
+version = "0.3.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -19,7 +19,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
 "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
 "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-"outlines>=0.0.44", "modelscope"]
+"outlines>=0.0.44,<0.1.0", "modelscope"]
 srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
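The only dependency change in this release is the new upper bound on outlines. A minimal sketch of what the new specifier accepts and rejects, using the third-party `packaging` library (already listed in `runtime_common`); the printed values are assumptions based on the pin shown above:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=0.0.44,<0.1.0")  # the new outlines pin from pyproject.toml

print(Version("0.0.46") in spec)  # True: existing 0.0.x releases still satisfy it
print(Version("0.1.0") in spec)   # False: the 0.1.x line is now excluded
```

The same constraint is mirrored in the PKG-INFO metadata above.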
sglang-0.3.5.post2/sglang/bench_offline_throughput.py (new file)

@@ -0,0 +1,309 @@
+"""
+Benchmark the throughput of using the offline LLM engine.
+This script does not launch a server.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+# Usage
+## Sharegpt dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+
+## Random dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
+
+## Shared prefix dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
+
+## Sharegpt dataset on runtime backend
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+"""
+
+import argparse
+import dataclasses
+import json
+import logging
+import random
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+from sglang.api import Engine
+from sglang.bench_serving import (
+    get_dataset,
+    get_tokenizer,
+    sample_random_requests,
+    set_ulimit,
+)
+from sglang.srt.server import Runtime
+from sglang.srt.server_args import ServerArgs
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    backend: str = "engine"
+    result_filename: str = ""
+    dataset_name: str = "sharegpt"
+    dataset_path: str = ""
+    num_prompts: int = 1000
+    sharegpt_output_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
+    random_range_ratio: float = 0.0
+    gen_num_groups: int = 64
+    gen_prompts_per_group: int = 16
+    gen_system_prompt_len: int = 2048
+    gen_question_len: int = 128
+    gen_output_len: int = 256
+    disable_ignore_eos: bool = False
+    seed: int = 1
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default="sharegpt",
+            choices=["sharegpt", "random", "generated-shared-prefix"],
+            help="Name of the dataset to benchmark on.",
+        )
+        parser.add_argument(
+            "--dataset-path", type=str, default="", help="Path to the dataset."
+        )
+        parser.add_argument(
+            "--num-prompts",
+            type=int,
+            default=BenchArgs.num_prompts,
+            help="Number of prompts to process. Default is 1000.",
+        )
+        parser.add_argument(
+            "--sharegpt-output-len",
+            type=int,
+            default=BenchArgs.sharegpt_output_len,
+            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+        )
+        parser.add_argument(
+            "--random-input-len",
+            type=int,
+            default=BenchArgs.random_input_len,
+            help="Number of input tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-output-len",
+            type=int,
+            default=BenchArgs.random_output_len,
+            help="Number of output tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-range-ratio",
+            type=float,
+            default=BenchArgs.random_range_ratio,
+            help="Range of sampled ratio of input/output length, "
+            "used only for random dataset.",
+        )
+        parser.add_argument(
+            "--gen-num-groups",
+            type=int,
+            default=BenchArgs.gen_num_groups,
+            help="Number of groups with shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-prompts-per-group",
+            type=int,
+            default=BenchArgs.gen_prompts_per_group,
+            help="Number of prompts per group of shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-system-prompt-len",
+            type=int,
+            default=BenchArgs.gen_system_prompt_len,
+            help="System prompt length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-question-len",
+            type=int,
+            default=BenchArgs.gen_question_len,
+            help="Question length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-output-len",
+            type=int,
+            default=BenchArgs.gen_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
+        parser.add_argument(
+            "--disable-ignore-eos",
+            type=bool,
+            default=BenchArgs.disable_ignore_eos,
+            help="Disable ignore EOS token",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def throughput_test_once(
+    backend_name: str,
+    backend,
+    reqs: List[Tuple[str, int, int]],
+    ignore_eos: bool,
+):
+    measurement_results = {
+        "backend": backend_name,
+        "successful_requests": len(reqs),
+        "total_latency": -1,
+        "total_input_tokens": sum(r[1] for r in reqs),
+        "total_output_tokens": -1,
+        "request_throughput": -1,
+        "input_throughput": -1,
+        "output_throughput": -1,
+        "total_throughput": -1,
+    }
+
+    prompt = [r[0] for r in reqs]
+    sampling_params = [
+        {
+            "temperature": 0,
+            "max_new_tokens": r[2],
+            "ignore_eos": ignore_eos,
+        }
+        for r in reqs
+    ]
+
+    st = time.perf_counter()
+    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
+
+    if backend_name == "runtime":
+        gen_out = json.loads(gen_out)
+
+    measurement_results["total_latency"] = latency
+    measurement_results["total_output_tokens"] = sum(
+        o["meta_info"]["completion_tokens"] for o in gen_out
+    )
+    measurement_results["request_throughput"] = (
+        measurement_results["successful_requests"] / latency
+    )
+    measurement_results["input_throughput"] = (
+        measurement_results["total_input_tokens"] / latency
+    )
+    measurement_results["output_throughput"] = (
+        measurement_results["total_output_tokens"] / latency
+    )
+    measurement_results["total_throughput"] = (
+        measurement_results["total_input_tokens"]
+        + measurement_results["total_output_tokens"]
+    ) / latency
+
+    return measurement_results
+
+
+def throughput_test(
+    server_args: ServerArgs,
+    bench_args: BenchArgs,
+):
+    if bench_args.backend == "engine":
+        backend = Engine(**dataclasses.asdict(server_args))
+        if not backend:
+            raise ValueError("Please provide valid engine arguments")
+    elif bench_args.backend == "runtime":
+        backend = Runtime(**dataclasses.asdict(server_args))
+    else:
+        raise ValueError('Please set backend to either "engine" or "runtime"')
+
+    tokenizer_id = server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    # Set global environmnets
+    set_ulimit()
+    random.seed(bench_args.seed)
+    np.random.seed(bench_args.seed)
+
+    # Read dataset
+    input_requests = get_dataset(bench_args, tokenizer)
+
+    warmup_requests = sample_random_requests(
+        input_len=20,
+        output_len=4,
+        num_prompts=2,
+        range_ratio=0.8,
+        tokenizer=tokenizer,
+        dataset_path=bench_args.dataset_path,
+    )
+
+    # Warm up
+    throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=warmup_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    result = throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=input_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    if bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    print(
+        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+    )
+    print("{:<40} {:<10}".format("Backend:", result["backend"]))
+    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+    print(
+        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", result["request_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", result["input_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", result["output_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", result["total_throughput"]
+        )
+    )
+    print("=" * 50)
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    throughput_test(server_args, bench_args)
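The new file is mainly a CLI entry point (see the usage lines in its docstring), but `throughput_test` can also be driven from Python. A hedged sketch under the assumptions that the `srt` extra is installed, a GPU is available, and the placeholder model path can be loaded; the result keys come from `throughput_test_once` above:

```python
from sglang.bench_offline_throughput import BenchArgs, throughput_test
from sglang.srt.server_args import ServerArgs

# The model path is a placeholder; substitute any model sglang can serve.
server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
bench_args = BenchArgs(backend="engine", dataset_name="random", num_prompts=100)

result = throughput_test(server_args, bench_args)
print(result["total_throughput"])  # tok/s, input and output tokens combined
```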
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/bench_serving.py

@@ -421,6 +421,37 @@ def get_tokenizer(
     )
 
 
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
+
+
 ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
@@ -443,6 +474,8 @@ class BenchmarkMetrics:
     input_throughput: float
     output_throughput: float
     output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -590,7 +623,6 @@ def sample_random_requests(
         (data["conversations"][0]["value"], data["conversations"][1]["value"])
         for data in dataset
     ]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
@@ -764,6 +796,9 @@ def calculate_metrics(
         input_throughput=total_input / dur_s,
         output_throughput=sum(output_lens) / dur_s,
         output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -881,6 +916,11 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1098,35 +1138,7 @@ def run_benchmark(args_: argparse.Namespace):
 
     tokenizer = get_tokenizer(tokenizer_id)
 
-    if args.dataset_name == "sharegpt":
-        assert args.random_input_len is None and args.random_output_len is None
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "random":
-        assert args.random_input_len is not None and args.random_output_len is not None
-        input_requests = sample_random_requests(
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            dataset_path=args.dataset_path,
-        )
-    elif args.dataset_name == "generated-shared-prefix":
-        input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.gen_num_groups,
-            prompts_per_group=args.gen_prompts_per_group,
-            system_prompt_len=args.gen_system_prompt_len,
-            question_len=args.gen_question_len,
-            output_len=args.gen_output_len,
-            tokenizer=tokenizer,
-        )
-    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = get_dataset(args, tokenizer)
 
     if not args.multi:
         return asyncio.run(
@@ -1229,10 +1241,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-input-len",
         type=int,
+        default=1024,
         help="Number of input tokens per request, used only for random dataset.",
     )
     parser.add_argument(
         "--random-output-len",
+        default=1024,
         type=int,
         help="Number of output tokens per request, used only for random dataset.",
     )
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/base_grammar_backend.py

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""The baseclass of
+"""The baseclass of a backend for grammar-guided constrained decoding."""
 
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
@@ -52,7 +52,7 @@ class BaseGrammarBackend:
         else:
             entry.value = self.init_value_impl(key)
             entry.event.set()
-        return entry.value.copy()
+        return entry.value.copy() if entry.value else None
 
     def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
         raise NotImplementedError()
@@ -62,7 +62,8 @@ class BaseGrammarBackend:
         entry = self.cache.get(key)
         if not entry or not entry.event.is_set():
             return None
-        return self.cache[key].value.copy()
+        val = self.cache[key].value
+        return val.copy() if val else None
 
     def get_future_value(self, key: Tuple[str, str]) -> Future:
         return self.executor.submit(self.init_value, key)
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/outlines_backend.py

@@ -19,9 +19,12 @@ import json
 import logging
 from typing import Dict, List, Optional, Tuple, Union
 
+import interegular
 import torch
 from outlines.fsm.guide import RegexGuide
+from outlines.fsm.json_schema import build_regex_from_schema
 from outlines.models.transformers import TransformerTokenizer
+from pydantic import BaseModel
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
@@ -32,26 +35,6 @@ from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
 logger = logging.getLogger(__name__)
 
 
-try:
-    from outlines.fsm.json_schema import build_regex_from_object
-except ImportError:
-    # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
-    # which only accepts string schema as input.
-    from outlines.fsm.json_schema import build_regex_from_schema
-    from pydantic import BaseModel
-
-    def build_regex_from_object(
-        object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
-    ):
-        if isinstance(object, type(BaseModel)):
-            schema = json.dumps(object.model_json_schema())
-        elif isinstance(object, Dict):
-            schema = json.dumps(object)
-        else:
-            schema = object
-        return build_regex_from_schema(schema, whitespace_pattern)
-
-
 class OutlinesGrammar(BaseGrammarObject):
     def __init__(
         self,
@@ -147,19 +130,36 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                     key_string,
                     whitespace_pattern=self.whitespace_pattern,
                 )
-            except NotImplementedError as e:
+            except (NotImplementedError, json.decoder.JSONDecodeError) as e:
                 logger.warning(
-                    f"
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
-                return None
+                return None
         elif key_type == "regex":
             regex = key_string
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
-        guide = RegexGuide(regex, self.outlines_tokenizer)
+        try:
+            guide = RegexGuide(regex, self.outlines_tokenizer)
+        except interegular.patterns.InvalidSyntax as e:
+            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
+            return None
+
         if self.allow_jump_forward:
             jump_forward_map = OutlinesJumpForwardMap(regex)
         else:
             jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
+
+
+def build_regex_from_object(
+    object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+):
+    if isinstance(object, type(BaseModel)):
+        schema = json.dumps(object.model_json_schema())
+    elif isinstance(object, Dict):
+        schema = json.dumps(object)
+    else:
+        schema = object
+    return build_regex_from_schema(schema, whitespace_pattern)
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/constrained/xgrammar_backend.py

@@ -15,16 +15,29 @@ limitations under the License.
 
 """Constrained decoding with xgrammar backend."""
 
+import logging
 from typing import List, Tuple
 
 import torch
-
+
+try:
+    from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
+
+    import_error = None
+except ImportError as e:
+    CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
+        ImportError
+    )
+    import_error = e
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
 
+logger = logging.getLogger(__name__)
+
+
 MAX_ROLLBACK_TOKENS = 10
 
 
@@ -91,15 +104,37 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         vocab_size: int,
     ):
         super().__init__()
+
+        if import_error:
+            logger.warning(
+                f"Ignore import error for the grammar backend: {import_error}"
+            )
+            self.grammar_cache = None
+            return
+
         self.grammar_cache = CachedGrammarCompiler(tokenizer_or_vocab=tokenizer)
         self.vocab_size = vocab_size
 
     def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
+        if import_error:
+            raise import_error
+
         key_type, key_string = key
         if key_type == "json":
-
+            try:
+                ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(
+                    key_string
+                )
+            except RuntimeError as e:
+                logging.warning(
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
+                )
+                return None
         elif key_type == "regex":
-
+            logger.warning(
+                "regex hasn't been supported by xgrammar yet. This is skipped."
+            )
+            return None
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
@@ -111,4 +146,5 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
     def reset(self):
-        self.grammar_cache
+        if self.grammar_cache:
+            self.grammar_cache.clear()
{sglang-0.3.5.post1 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/patch.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Callable, Optional
 
 import torch
 from torch.nn import functional as F
@@ -98,7 +98,9 @@ def fused_moe_forward_native(
     renormalize: bool,
     topk_group: Optional[int] = None,
    num_expert_group: Optional[int] = None,
+    custom_routing_function: Optional[Callable] = None,
 ) -> torch.Tensor:
+    assert custom_routing_function is None
     topk_weights, topk_ids = select_experts_native(
         hidden_states=x,
         router_logits=router_logits,
@@ -114,4 +116,4 @@
     x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
-    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))