sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 # SGL API Components
+
 from sglang.api import (
     Runtime,
     assistant,
@@ -14,48 +15,54 @@ from sglang.api import (
     select,
     set_default_backend,
     system,
+    system_begin,
+    system_end,
     user,
     user_begin,
     user_end,
     video,
 )

-#
-from sglang.global_config import global_config
-
-# SGL Backends
-from sglang.lang.backend.anthropic import Anthropic
-from sglang.lang.backend.litellm import LiteLLM
-from sglang.lang.backend.openai import OpenAI
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.lang.backend.vertexai import VertexAI
-
-from .version import __version__
-
-# public APIs management
+# SGLang DSL APIs
 __all__ = [
-    "global_config",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "RuntimeEndpoint",
-    "VertexAI",
-    "function",
     "Runtime",
-    "
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
     "flush_cache",
-    "
+    "function",
     "gen",
     "gen_int",
     "gen_string",
+    "get_server_args",
     "image",
-    "video",
     "select",
+    "set_default_backend",
     "system",
+    "system_begin",
+    "system_end",
     "user",
-    "assistant",
     "user_begin",
     "user_end",
-    "
-    "assistant_end",
+    "video",
 ]
+
+# Global Configurations
+from sglang.global_config import global_config
+
+__all__ += ["global_config"]
+
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# SGL Backends
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
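The rewritten top-level module routes the optional backends through sglang.utils.LazyImport instead of importing them eagerly. The helper's implementation is not part of this diff, so the snippet below is only a minimal sketch of how such a proxy could work; the importlib-based resolution and caching are assumptions.

# Hypothetical sketch of a LazyImport-style proxy; the real helper lives in
# sglang/utils.py and may differ in detail.
import importlib


class LazyImport:
    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None  # resolved lazily on first use

    def _resolve(self):
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __call__(self, *args, **kwargs):
        # Constructing the proxy constructs the real backend class.
        return self._resolve()(*args, **kwargs)

    def __getattr__(self, name):
        return getattr(self._resolve(), name)


# Mirrors the new __init__.py: the anthropic package is only imported when
# Anthropic(...) is actually instantiated, so `import sglang` stays cheap.
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")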
sglang/api.py
CHANGED
@@ -75,7 +75,7 @@ def gen(
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)
@@ -210,6 +210,14 @@ def assistant(expr: Optional[SglExpr] = None):
     return _role_common("assistant", expr)


+def system_begin():
+    return SglRoleBegin("system")
+
+
+def system_end():
+    return SglRoleEnd("system")
+
+
 def user_begin():
     return SglRoleBegin("user")

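system_begin and system_end mirror the existing user_begin/user_end and assistant_begin/assistant_end helpers, letting a program open the system role, append several pieces, and close it again. A rough usage sketch follows; the program body and its strings are illustrative, not taken from the diff.

import sglang as sgl


@sgl.function
def chat(s, question):
    # Build the system message incrementally between explicit role markers.
    s += sgl.system_begin()
    s += "You are a concise assistant."
    s += sgl.system_end()
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))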
sglang/bench_latency.py
CHANGED
@@ -37,9 +37,9 @@ import torch
 import torch.distributed as dist

 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.
-from sglang.srt.managers.controller.model_runner import ModelRunner
+from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
sglang/bench_serving.py
CHANGED
@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
 """
 Benchmark online serving.

@@ -84,6 +85,9 @@ async def async_request_trt_llm(
         "min_length": request_func_input.output_len,
         "end_id": 1048576,
     }
+    if args.disable_ignore_eos:
+        del payload["min_length"]
+        del payload["end_id"]
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

@@ -149,7 +153,7 @@ async def async_request_openai_completions(
         "best_of": 1,
         "max_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
-        "ignore_eos":
+        "ignore_eos": not args.disable_ignore_eos,
     }
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -969,6 +973,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable streaming mode.",
     )
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )

     set_ulimit()

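The new switch lets the benchmark stop at each model's natural EOS instead of forcing fixed-length generations; a run along the lines of python -m sglang.bench_serving --backend sglang --disable-ignore-eos would exercise it, assuming the script's existing --backend option.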
sglang/check_env.py
CHANGED
sglang/lang/backend/litellm.py
CHANGED
sglang/lang/backend/openai.py
CHANGED
sglang/lang/interpreter.py
CHANGED
@@ -553,6 +553,8 @@ class StreamExecutor:
                 "output_token_logprobs": output_token_logprobs,
             }
         self.variable_event[name].set()
+        if self.stream_var_event:
+            self.stream_var_event[name].set()
         self.text_ += decision

     def _execute_variable(self, expr: SglVariable):
@@ -705,9 +707,9 @@ class ProgramState:

     def _role_common(self, name: str, expr: Optional[SglExpr] = None):
         if expr is not None:
-
-
-
+            role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+            self.stream_executor.submit(role_expr)
+            return role_expr
         else:

             @contextmanager
@@ -778,7 +780,14 @@
                     if self.stream_executor.is_finished:
                         break
             else:
-                event =
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     event.wait()
                     event.clear()
@@ -813,7 +822,14 @@
                     if self.stream_executor.is_finished:
                         break
             else:
-                event =
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     await loop.run_in_executor(None, event.wait)
                     event.clear()
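Together these changes make per-variable streaming usable: the select path now signals stream_var_event, and text_iter/text_async_iter wait for the variable's event to be registered instead of assuming it already exists. A hedged consumer-side sketch follows; the endpoint URL, program body, and variable name are placeholders.

import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def chat(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=128))


state = chat.run(question="What is SGLang?", stream=True)
# Iterate only over the chunks that belong to the "answer" variable.
for chunk in state.text_iter(var_name="answer"):
    print(chunk, end="", flush=True)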
sglang/lang/ir.py
CHANGED
@@ -99,7 +99,6 @@ class SglSamplingParams:
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
-            "top_k": self.top_k,
             "frequency_penalty": self.frequency_penalty,
             "presence_penalty": self.presence_penalty,
         }
@@ -410,7 +409,7 @@ class SglGen(SglExpr):
         dtype: Optional[type] = None,
         regex: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
sglang/srt/constrained/__init__.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import json
 from typing import Dict, Optional, Union

sglang/srt/constrained/{base_cache.py → base_tool_cache.py}
CHANGED
@@ -1,9 +1,24 @@
-"""
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Base tool cache for constrained decoding tools."""

 import time


-class
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
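Only __init__ and reset() are visible in the hunk above, so the following is a guessed outline of the query/init shape such a tool cache typically has; every method name other than __init__ and reset is an assumption, not taken from the diff.

# Guessed outline of a constrained-decoding tool cache; names other than
# __init__/reset are hypothetical.
import time


class ToolCacheSketch:
    def __init__(self, enable=True):
        self.enable = enable
        self.reset()

    def reset(self):
        self.cache = {}
        self.stats = {"queries": 0, "hits": 0, "init_seconds": 0.0}

    def init_value(self, key):
        # Subclasses would compile the expensive object here, e.g. a regex FSM.
        raise NotImplementedError

    def query(self, key):
        self.stats["queries"] += 1
        if self.enable and key in self.cache:
            self.stats["hits"] += 1
            return self.cache[key]
        start = time.monotonic()
        value = self.init_value(key)
        self.stats["init_seconds"] += time.monotonic() - start
        if self.enable:
            self.cache[key] = value
        return value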
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -1,10 +1,25 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Cache for the compressed finite state machine."""

 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.
+from sglang.srt.constrained.base_tool_cache import BaseToolCache


-class FSMCache(
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)

sglang/srt/constrained/jump_forward.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 Faster constrained decoding.
 Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
@@ -15,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.
+from sglang.srt.constrained.base_tool_cache import BaseToolCache

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

@@ -136,7 +151,7 @@ class JumpForwardMap:
         )


-class JumpForwardCache(
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()

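For orientation, the jump-forward idea that JumpForwardMap and JumpForwardCache serve: whenever the regex FSM sits in a state with exactly one outgoing transition, the next character is forced, so it can be appended to the output without a model forward pass. A toy illustration of that idea follows; it is not the sglang implementation.

def jump_forward(state, transitions):
    """Collect the characters that are forced from `state`.

    transitions: dict mapping state -> {char: next_state}.
    """
    forced = []
    while len(transitions.get(state, {})) == 1:
        (char, nxt), = transitions[state].items()
        forced.append(char)
        state = nxt
    return "".join(forced), state


# A tiny FSM for the fixed JSON prefix '{"na' of a schema-constrained regex.
fsm = {0: {"{": 1}, 1: {'"': 2}, 2: {"n": 3}, 3: {"a": 4}}
print(jump_forward(0, fsm))  # -> ('{"na', 4)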
sglang/srt/conversation.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Conversation templates."""

 # Adapted from
@@ -421,3 +436,14 @@ register_conv_template(
         sep2="</s>",
     )
 )
+
+# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
+register_conv_template(
+    Conversation(
+        name="internlm2-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Utilities for Huggingface Transformers."""

 import functools
sglang/srt/layers/context_flashattention_nopad.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
 import torch
sglang/srt/layers/extend_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import torch
 import triton
 import triton.language as tl
sglang/srt/layers/fused_moe.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/layers/fused_moe/fused_moe.py#L1
 """Fused MoE kernel."""
sglang/srt/layers/linear.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/linear.py
 # FIXME: refactor the linear abstraction
 from abc import abstractmethod
sglang/srt/layers/logits_processor.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Logits processing."""

 import dataclasses
@@ -10,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

-from sglang.srt.
+from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata


 @dataclasses.dataclass
@@ -77,33 +92,46 @@ class LogitsProcessor(nn.Module):

     @staticmethod
     def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata):
-        # TODO: vectorize the code below
         if logits_metadata.forward_mode == ForwardMode.DECODE:
             output_top_logprobs = []
-
-
-
-
-
-                output_top_logprobs.append(list(zip(
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+            for i, k in enumerate(logits_metadata.top_logprobs_nums):
+                output_top_logprobs.append(list(zip(values[i][:k], indices[i][:k])))
             return None, output_top_logprobs
         else:
+            # TODO: vectorize the code below
             input_top_logprobs, output_top_logprobs = [], []
             pt = 0
             extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist()
+
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+
             for i, extend_seq_len in enumerate(extend_seq_lens_cpu):
                 if extend_seq_len == 0:
                     input_top_logprobs.append([])
                     output_top_logprobs.append([])
                     continue
                 k = logits_metadata.top_logprobs_nums[i]
-                t = all_logprobs[pt : pt + extend_seq_len].topk(k)
-                vs_cpu = t.values.tolist()
-                ps_cpu = t.indices.tolist()
                 input_top_logprobs.append(
-                    [
+                    [
+                        list(zip(values[pt + j][:k], indices[pt + j][:k]))
+                        for j in range(extend_seq_len - 1)
+                    ]
+                )
+                output_top_logprobs.append(
+                    list(
+                        zip(
+                            values[pt + extend_seq_len - 1][:k],
+                            indices[pt + extend_seq_len - 1][:k],
+                        )
+                    )
                 )
-                output_top_logprobs.append(list(zip(vs_cpu[-1], ps_cpu[-1])))
                 pt += extend_seq_len

             return input_top_logprobs, output_top_logprobs
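The rewrite replaces a per-request topk inside the Python loop with one batched topk using the largest requested k, then slices each row down to its own k. A standalone illustration of the same pattern follows; the data is random and the shapes are chosen arbitrarily.

import torch

# 3 requests over a 10-token vocabulary, with different top-k requests.
logprobs = torch.log_softmax(torch.randn(3, 10), dim=-1)
top_logprobs_nums = [2, 5, 1]

max_k = max(top_logprobs_nums)
ret = logprobs.topk(max_k, dim=1)          # one kernel call for the whole batch
values, indices = ret.values.tolist(), ret.indices.tolist()

output_top_logprobs = [
    list(zip(values[i][:k], indices[i][:k]))  # trim each row to its own k
    for i, k in enumerate(top_logprobs_nums)
]
print([len(x) for x in output_top_logprobs])  # [2, 5, 1]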
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader

sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/quantization/fp8.py
 # FIXME refactor in progress
 from typing import Any, Dict, List, Optional, Union
sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Radix attention."""

 import torch
@@ -7,7 +22,7 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.
+from sglang.srt.model_executor.model_runner import (
     ForwardMode,
     InputMetadata,
     global_server_args_dict,
@@ -88,7 +103,7 @@ class RadixAttention(nn.Module):
         return o

     def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
-        if not input_metadata.
+        if not input_metadata.flashinfer_use_ragged:
             self.store_kv_cache(k, v, input_metadata)

         o = input_metadata.flashinfer_prefill_wrapper_paged.forward(