sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +337 -0
- sglang/bench_one_batch.py +474 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +115 -31
- sglang/check_env.py +3 -6
- sglang/srt/constrained/base_grammar_backend.py +4 -3
- sglang/srt/constrained/outlines_backend.py +39 -26
- sglang/srt/constrained/xgrammar_backend.py +58 -14
- sglang/srt/layers/activation.py +3 -0
- sglang/srt/layers/attention/flashinfer_backend.py +93 -48
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/custom_op_util.py +26 -0
- sglang/srt/layers/fused_moe/fused_moe.py +11 -4
- sglang/srt/layers/fused_moe/patch.py +4 -2
- sglang/srt/layers/layernorm.py +4 -0
- sglang/srt/layers/logits_processor.py +10 -10
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/managers/data_parallel_controller.py +74 -9
- sglang/srt/managers/detokenizer_manager.py +1 -14
- sglang/srt/managers/io_struct.py +27 -0
- sglang/srt/managers/schedule_batch.py +104 -38
- sglang/srt/managers/schedule_policy.py +5 -1
- sglang/srt/managers/scheduler.py +210 -56
- sglang/srt/managers/session_controller.py +62 -0
- sglang/srt/managers/tokenizer_manager.py +38 -0
- sglang/srt/managers/tp_worker.py +12 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
- sglang/srt/model_executor/cuda_graph_runner.py +43 -6
- sglang/srt/model_executor/forward_batch_info.py +109 -15
- sglang/srt/model_executor/model_runner.py +102 -43
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/deepseek_v2.py +147 -44
- sglang/srt/models/gemma2.py +9 -8
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/torch_native_llama.py +94 -78
- sglang/srt/openai_api/adapter.py +11 -4
- sglang/srt/openai_api/protocol.py +30 -27
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +58 -57
- sglang/srt/sampling/sampling_params.py +3 -3
- sglang/srt/server.py +29 -2
- sglang/srt/server_args.py +97 -60
- sglang/srt/utils.py +103 -51
- sglang/test/runners.py +25 -6
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +33 -22
- sglang/version.py +1 -1
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/METADATA +43 -43
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/RECORD +62 -56
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/WHEEL +1 -1
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -15,6 +15,7 @@ import argparse
 import asyncio
 import json
 import os
+import pickle
 import random
 import resource
 import sys
@@ -387,6 +388,24 @@ async def async_request_gserver(
     raise NotImplementedError()
 
 
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
         import huggingface_hub.constants
@@ -421,6 +440,37 @@ def get_tokenizer(
     )
 
 
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
+
+
 ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
@@ -443,6 +493,8 @@ class BenchmarkMetrics:
     input_throughput: float
     output_throughput: float
     output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -590,7 +642,6 @@ def sample_random_requests(
         (data["conversations"][0]["value"], data["conversations"][1]["value"])
         for data in dataset
     ]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
@@ -650,6 +701,11 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
+    if args.generated_input_path and os.path.exists(args.generated_input_path):
+        print(f"\nloading generated input data from {args.generated_input_path}")
+        with open(args.generated_input_path, "rb") as f:
+            return pickle.load(f)
+
     """Generate benchmark requests with shared system prompts using random tokens."""
     # Generate system prompts for each group
     system_prompts = []
@@ -663,6 +719,9 @@ def sample_generated_shared_prefix_requests(
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
+    # Shuffle questions
+    random.shuffle(questions)
+
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
@@ -691,6 +750,11 @@ def sample_generated_shared_prefix_requests(
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
     )
+    if args.generated_input_save_path:
+        print(f"Saving generated input data to {args.generated_input_save_path}")
+        os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
+        with open(args.generated_input_save_path, "wb") as f:
+            pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -764,6 +828,9 @@ def calculate_metrics(
         input_throughput=total_input / dur_s,
         output_throughput=sum(output_lens) / dur_s,
         output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -787,12 +854,14 @@ def calculate_metrics(
 async def benchmark(
     backend: str,
     api_url: str,
+    base_url: str,
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
+    profile: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -820,6 +889,14 @@ async def benchmark(
 
     time.sleep(1.5)
 
+    if profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=base_url + "/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
@@ -841,6 +918,12 @@ async def benchmark(
     )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    if profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
     if pbar is not None:
         pbar.close()
 
@@ -881,6 +964,11 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1060,6 +1148,9 @@ def run_benchmark(args_: argparse.Namespace):
         if args.base_url
        else f"http://{args.host}:{args.port}/v1/models/model:predict"
    )
+    base_url = (
+        f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
+    )
 
     # Get model name
     if args.model is None:
@@ -1098,47 +1189,21 @@ def run_benchmark(args_: argparse.Namespace):
 
     tokenizer = get_tokenizer(tokenizer_id)
 
-
-        assert args.random_input_len is None and args.random_output_len is None
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "random":
-        assert args.random_input_len is not None and args.random_output_len is not None
-        input_requests = sample_random_requests(
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            dataset_path=args.dataset_path,
-        )
-    elif args.dataset_name == "generated-shared-prefix":
-        input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.gen_num_groups,
-            prompts_per_group=args.gen_prompts_per_group,
-            system_prompt_len=args.gen_system_prompt_len,
-            question_len=args.gen_question_len,
-            output_len=args.gen_output_len,
-            tokenizer=tokenizer,
-        )
-    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = get_dataset(args, tokenizer)
 
     if not args.multi:
         return asyncio.run(
             benchmark(
                 backend=backend,
                 api_url=api_url,
+                base_url=base_url,
                 model_id=model_id,
                 tokenizer=tokenizer,
                 input_requests=input_requests,
                 request_rate=args.request_rate,
                 disable_tqdm=args.disable_tqdm,
                 extra_request_body=extra_request_body,
+                profile=args.profile,
             )
         )
     else:
@@ -1150,12 +1215,14 @@ def run_benchmark(args_: argparse.Namespace):
                 benchmark(
                     backend=backend,
                     api_url=api_url,
+                    base_url=base_url,
                     model_id=model_id,
                     tokenizer=tokenizer,
                     input_requests=input_requests,
                     request_rate=rate,
                     disable_tqdm=args.disable_tqdm,
                     extra_request_body=extra_request_body,
+                    profile=args.profile,
                 )
             )
 
@@ -1229,10 +1296,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-input-len",
         type=int,
+        default=1024,
         help="Number of input tokens per request, used only for random dataset.",
     )
     parser.add_argument(
         "--random-output-len",
+        default=1024,
         type=int,
         help="Number of output tokens per request, used only for random dataset.",
     )
@@ -1317,6 +1386,21 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-
+    parser.add_argument(
+        "--generated-input-save-path",
+        type=str,
+        help="Path to save generated input data",
+    )
+    parser.add_argument(
+        "--generated-input-path",
+        type=str,
+        help="Path to load previously generated input data",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
sglang/check_env.py
CHANGED
@@ -15,24 +15,21 @@ PACKAGE_LIST = [
|
|
15
15
|
"flashinfer",
|
16
16
|
"triton",
|
17
17
|
"transformers",
|
18
|
-
"
|
19
|
-
"tqdm",
|
18
|
+
"torchao",
|
20
19
|
"numpy",
|
21
20
|
"aiohttp",
|
22
21
|
"fastapi",
|
23
22
|
"hf_transfer",
|
24
23
|
"huggingface_hub",
|
25
24
|
"interegular",
|
26
|
-
"packaging",
|
27
|
-
"PIL",
|
28
25
|
"psutil",
|
29
26
|
"pydantic",
|
27
|
+
"multipart",
|
28
|
+
"zmq",
|
30
29
|
"uvicorn",
|
31
30
|
"uvloop",
|
32
|
-
"zmq",
|
33
31
|
"vllm",
|
34
32
|
"outlines",
|
35
|
-
"multipart",
|
36
33
|
"openai",
|
37
34
|
"tiktoken",
|
38
35
|
"anthropic",
|
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""The baseclass of
+"""The baseclass of a backend for grammar-guided constrained decoding."""
 
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
@@ -52,7 +52,7 @@ class BaseGrammarBackend:
         else:
             entry.value = self.init_value_impl(key)
             entry.event.set()
-        return entry.value.copy()
+        return entry.value.copy() if entry.value else None
 
     def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
         raise NotImplementedError()
@@ -62,7 +62,8 @@ class BaseGrammarBackend:
         entry = self.cache.get(key)
         if not entry or not entry.event.is_set():
             return None
-
+        val = self.cache[key].value
+        return val.copy() if val else None
 
     def get_future_value(self, key: Tuple[str, str]) -> Future:
         return self.executor.submit(self.init_value, key)
sglang/srt/constrained/outlines_backend.py
CHANGED
@@ -19,9 +19,12 @@ import json
 import logging
 from typing import Dict, List, Optional, Tuple, Union
 
+import interegular
 import torch
 from outlines.fsm.guide import RegexGuide
+from outlines.fsm.json_schema import build_regex_from_schema
 from outlines.models.transformers import TransformerTokenizer
+from pydantic import BaseModel
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
@@ -32,26 +35,6 @@ from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
 logger = logging.getLogger(__name__)
 
 
-try:
-    from outlines.fsm.json_schema import build_regex_from_object
-except ImportError:
-    # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
-    # which only accepts string schema as input.
-    from outlines.fsm.json_schema import build_regex_from_schema
-    from pydantic import BaseModel
-
-    def build_regex_from_object(
-        object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
-    ):
-        if isinstance(object, type(BaseModel)):
-            schema = json.dumps(object.model_json_schema())
-        elif isinstance(object, Dict):
-            schema = json.dumps(object)
-        else:
-            schema = object
-        return build_regex_from_schema(schema, whitespace_pattern)
-
-
 class OutlinesGrammar(BaseGrammarObject):
     def __init__(
         self,
@@ -98,9 +81,22 @@ class OutlinesGrammar(BaseGrammarObject):
     ):
         self.state = next_state
 
-    def
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        tokens = torch.tensor(
+            self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
+        ).to(vocab_mask.device, non_blocking=True)
+        vocab_mask = vocab_mask[idx]
         vocab_mask.fill_(1)
-        vocab_mask
+        vocab_mask.scatter_(0, tokens, torch.zeros_like(tokens, dtype=torch.bool))
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor):
+        logits.masked_fill_(vocab_mask, float("-inf"))
 
     def copy(self):
         return OutlinesGrammar(self.guide, self.jump_forward_map)
@@ -147,19 +143,36 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                    key_string,
                    whitespace_pattern=self.whitespace_pattern,
                )
-            except NotImplementedError as e:
+            except (NotImplementedError, json.decoder.JSONDecodeError) as e:
                 logger.warning(
-                    f"
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
-                return None
+                return None
         elif key_type == "regex":
             regex = key_string
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
-
+        try:
+            guide = RegexGuide(regex, self.outlines_tokenizer)
+        except interegular.patterns.InvalidSyntax as e:
+            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
+            return None
+
         if self.allow_jump_forward:
             jump_forward_map = OutlinesJumpForwardMap(regex)
         else:
             jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
+
+
+def build_regex_from_object(
+    object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+):
+    if isinstance(object, type(BaseModel)):
+        schema = json.dumps(object.model_json_schema())
+    elif isinstance(object, Dict):
+        schema = json.dumps(object)
+    else:
+        schema = object
+    return build_regex_from_schema(schema, whitespace_pattern)
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -15,16 +15,34 @@ limitations under the License.
 
 """Constrained decoding with xgrammar backend."""
 
+import logging
 from typing import List, Tuple
 
 import torch
-
+
+try:
+    from xgrammar import (
+        CachedGrammarCompiler,
+        CompiledGrammar,
+        GrammarMatcher,
+        TokenizerInfo,
+    )
+
+    import_error = None
+except ImportError as e:
+    CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
+        ImportError
+    )
+    import_error = e
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
 
+logger = logging.getLogger(__name__)
+
+
 MAX_ROLLBACK_TOKENS = 10
 
 
@@ -67,19 +85,23 @@ class XGrammarGrammar(BaseGrammarObject):
         for i in range(k, len(new_output_ids)):
             assert self.matcher.accept_token(new_output_ids[i])
 
-    def
-
-
-
-
-
-
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return self.matcher.allocate_token_bitmask(vocab_size, batch_size)
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        self.matcher.fill_next_token_bitmask(vocab_mask, idx)
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        GrammarMatcher.apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
         matcher = GrammarMatcher(
             self.ctx,
             max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-
+            vocab_size=self.vocab_size,
         )
         return XGrammarGrammar(matcher, self.vocab_size, self.ctx)
 
@@ -91,24 +113,46 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         vocab_size: int,
     ):
         super().__init__()
-
+
+        if import_error:
+            logger.warning(
+                f"Ignore import error for the grammar backend: {import_error}"
+            )
+            self.grammar_cache = None
+            return
+
+        tokenizer_info = TokenizerInfo.from_huggingface(tokenizer)
+        self.grammar_cache = CachedGrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
 
     def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
+        if import_error:
+            raise import_error
+
         key_type, key_string = key
         if key_type == "json":
-
+            try:
+                ctx = self.grammar_cache.compile_json_schema_grammar(schema=key_string)
+            except RuntimeError as e:
+                logging.warning(
+                    f"Skip invalid json_schema: json_schema={key_string}, {e=}"
+                )
+                return None
         elif key_type == "regex":
-
+            logger.warning(
+                "regex hasn't been supported by xgrammar yet. This is skipped."
+            )
+            return None
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
         matcher = GrammarMatcher(
             ctx,
             max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-
+            vocab_size=self.vocab_size,
         )
         return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
     def reset(self):
-        self.grammar_cache
+        if self.grammar_cache:
+            self.grammar_cache.clear()
sglang/srt/layers/activation.py
CHANGED
@@ -32,12 +32,14 @@ from vllm.distributed import (
 )
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.custom_op_util import register_custom_op
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.utils import set_weight_attrs
 
 logger = logging.getLogger(__name__)
 
 
+@register_custom_op("sglang_silu_and_mul")
 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -51,6 +53,7 @@ class SiluAndMul(CustomOp):
         return out
 
 
+@register_custom_op("sglang_gelu_and_mul")
 class GeluAndMul(CustomOp):
     def __init__(self, approximate="tanh"):
         super().__init__()