sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 # SGL API Components
+
 from sglang.api import (
     Runtime,
     assistant,
@@ -14,48 +15,54 @@ from sglang.api import (
     select,
     set_default_backend,
     system,
+    system_begin,
+    system_end,
     user,
     user_begin,
     user_end,
     video,
 )

-#
-from sglang.global_config import global_config
-
-# SGL Backends
-from sglang.lang.backend.anthropic import Anthropic
-from sglang.lang.backend.litellm import LiteLLM
-from sglang.lang.backend.openai import OpenAI
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.lang.backend.vertexai import VertexAI
-
-from .version import __version__
-
-# public APIs management
+# SGLang DSL APIs
 __all__ = [
-    "global_config",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "RuntimeEndpoint",
-    "VertexAI",
-    "function",
     "Runtime",
-    "
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
     "flush_cache",
-    "
+    "function",
     "gen",
     "gen_int",
     "gen_string",
+    "get_server_args",
     "image",
-    "video",
     "select",
+    "set_default_backend",
     "system",
+    "system_begin",
+    "system_end",
     "user",
-    "assistant",
     "user_begin",
     "user_end",
-    "
-    "assistant_end",
+    "video",
 ]
+
+# Global Configurations
+from sglang.global_config import global_config
+
+__all__ += ["global_config"]
+
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# SGL Backends
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
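The rewritten top-level module routes the optional backends through sglang.utils.LazyImport instead of importing them eagerly. The helper's implementation is not part of this diff, so the snippet below is only a minimal sketch of how such a proxy could work; the importlib-based resolution and caching are assumptions.

# Hypothetical sketch of a LazyImport-style proxy; the real helper lives in
# sglang/utils.py and may differ in detail.
import importlib


class LazyImport:
    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None  # resolved lazily on first use

    def _resolve(self):
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __call__(self, *args, **kwargs):
        # Constructing the proxy constructs the real backend class.
        return self._resolve()(*args, **kwargs)

    def __getattr__(self, name):
        return getattr(self._resolve(), name)


# Mirrors the new __init__.py: the anthropic package is only imported when
# Anthropic(...) is actually instantiated, so `import sglang` stays cheap.
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")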
sglang/api.py
CHANGED
@@ -75,7 +75,7 @@ def gen(
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)
@@ -210,6 +210,14 @@ def assistant(expr: Optional[SglExpr] = None):
     return _role_common("assistant", expr)


+def system_begin():
+    return SglRoleBegin("system")
+
+
+def system_end():
+    return SglRoleEnd("system")
+
+
 def user_begin():
     return SglRoleBegin("user")

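system_begin and system_end mirror the existing user_begin/user_end and assistant_begin/assistant_end helpers, letting a program open the system role, append several pieces, and close it again. A rough usage sketch follows; the program body and its strings are illustrative, not taken from the diff.

import sglang as sgl


@sgl.function
def chat(s, question):
    # Build the system message incrementally between explicit role markers.
    s += sgl.system_begin()
    s += "You are a concise assistant."
    s += sgl.system_end()
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))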
sglang/bench_latency.py
CHANGED
@@ -37,9 +37,9 @@ import torch
 import torch.distributed as dist

 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.
-from sglang.srt.managers.controller.model_runner import ModelRunner
+from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
sglang/bench_serving.py
CHANGED
@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
 """
 Benchmark online serving.

@@ -84,6 +85,9 @@ async def async_request_trt_llm(
         "min_length": request_func_input.output_len,
         "end_id": 1048576,
     }
+    if args.disable_ignore_eos:
+        del payload["min_length"]
+        del payload["end_id"]
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

@@ -149,7 +153,7 @@ async def async_request_openai_completions(
         "best_of": 1,
         "max_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
-        "ignore_eos":
+        "ignore_eos": not args.disable_ignore_eos,
     }
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -969,6 +973,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable streaming mode.",
     )
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )

     set_ulimit()

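The new switch lets the benchmark stop at each model's natural EOS instead of forcing fixed-length generations; a run along the lines of python -m sglang.bench_serving --backend sglang --disable-ignore-eos would exercise it, assuming the script's existing --backend option.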
sglang/check_env.py
CHANGED
sglang/lang/backend/litellm.py
CHANGED
sglang/lang/backend/openai.py
CHANGED
sglang/lang/interpreter.py
CHANGED
@@ -553,6 +553,8 @@ class StreamExecutor:
                 "output_token_logprobs": output_token_logprobs,
             }
         self.variable_event[name].set()
+        if self.stream_var_event:
+            self.stream_var_event[name].set()
         self.text_ += decision

     def _execute_variable(self, expr: SglVariable):
@@ -705,9 +707,9 @@ class ProgramState:

     def _role_common(self, name: str, expr: Optional[SglExpr] = None):
         if expr is not None:
-
-
-
+            role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+            self.stream_executor.submit(role_expr)
+            return role_expr
         else:

             @contextmanager
@@ -778,7 +780,14 @@
                     if self.stream_executor.is_finished:
                         break
             else:
-                event =
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     event.wait()
                     event.clear()
@@ -813,7 +822,14 @@
                     if self.stream_executor.is_finished:
                         break
             else:
-                event =
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     await loop.run_in_executor(None, event.wait)
                     event.clear()
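Together these changes make per-variable streaming usable: the select path now signals stream_var_event, and text_iter/text_async_iter wait for the variable's event to be registered instead of assuming it already exists. A hedged consumer-side sketch follows; the endpoint URL, program body, and variable name are placeholders.

import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def chat(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=128))


state = chat.run(question="What is SGLang?", stream=True)
# Iterate only over the chunks that belong to the "answer" variable.
for chunk in state.text_iter(var_name="answer"):
    print(chunk, end="", flush=True)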
sglang/lang/ir.py
CHANGED
@@ -99,7 +99,6 @@ class SglSamplingParams:
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
-            "top_k": self.top_k,
             "frequency_penalty": self.frequency_penalty,
             "presence_penalty": self.presence_penalty,
         }
@@ -410,7 +409,7 @@ class SglGen(SglExpr):
         dtype: Optional[type] = None,
         regex: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
sglang/srt/constrained/__init__.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import json
 from typing import Dict, Optional, Union

sglang/srt/constrained/{base_cache.py → base_tool_cache.py}
CHANGED
@@ -1,9 +1,24 @@
-"""
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Base tool cache for constrained decoding tools."""

 import time


-class
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
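Only __init__ and reset() are visible in the hunk above, so the following is a guessed outline of the query/init shape such a tool cache typically has; every method name other than __init__ and reset is an assumption, not taken from the diff.

# Guessed outline of a constrained-decoding tool cache; names other than
# __init__/reset are hypothetical.
import time


class ToolCacheSketch:
    def __init__(self, enable=True):
        self.enable = enable
        self.reset()

    def reset(self):
        self.cache = {}
        self.stats = {"queries": 0, "hits": 0, "init_seconds": 0.0}

    def init_value(self, key):
        # Subclasses would compile the expensive object here, e.g. a regex FSM.
        raise NotImplementedError

    def query(self, key):
        self.stats["queries"] += 1
        if self.enable and key in self.cache:
            self.stats["hits"] += 1
            return self.cache[key]
        start = time.monotonic()
        value = self.init_value(key)
        self.stats["init_seconds"] += time.monotonic() - start
        if self.enable:
            self.cache[key] = value
        return value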
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -1,10 +1,25 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Cache for the compressed finite state machine."""

 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.
+from sglang.srt.constrained.base_tool_cache import BaseToolCache


-class FSMCache(
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)

sglang/srt/constrained/jump_forward.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 Faster constrained decoding.
 Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
@@ -15,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.
+from sglang.srt.constrained.base_tool_cache import BaseToolCache

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

@@ -136,7 +151,7 @@ class JumpForwardMap:
         )


-class JumpForwardCache(
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()

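For orientation, the jump-forward idea that JumpForwardMap and JumpForwardCache serve: whenever the regex FSM sits in a state with exactly one outgoing transition, the next character is forced, so it can be appended to the output without a model forward pass. A toy illustration of that idea follows; it is not the sglang implementation.

def jump_forward(state, transitions):
    """Collect the characters that are forced from `state`.

    transitions: dict mapping state -> {char: next_state}.
    """
    forced = []
    while len(transitions.get(state, {})) == 1:
        (char, nxt), = transitions[state].items()
        forced.append(char)
        state = nxt
    return "".join(forced), state


# A tiny FSM for the fixed JSON prefix '{"na' of a schema-constrained regex.
fsm = {0: {"{": 1}, 1: {'"': 2}, 2: {"n": 3}, 3: {"a": 4}}
print(jump_forward(0, fsm))  # -> ('{"na', 4)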
sglang/srt/conversation.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Conversation templates."""

 # Adapted from
@@ -421,3 +436,14 @@ register_conv_template(
         sep2="</s>",
     )
 )
+
+# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
+register_conv_template(
+    Conversation(
+        name="internlm2-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Utilities for Huggingface Transformers."""

 import functools
sglang/srt/layers/context_flashattention_nopad.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
 import torch
sglang/srt/layers/extend_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import torch
 import triton
 import triton.language as tl
sglang/srt/layers/fused_moe.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/layers/fused_moe/fused_moe.py#L1
 """Fused MoE kernel."""
sglang/srt/layers/linear.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/linear.py
 # FIXME: refactor the linear abstraction
 from abc import abstractmethod
sglang/srt/layers/logits_processor.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Logits processing."""

 import dataclasses
@@ -10,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

-from sglang.srt.
+from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata


 @dataclasses.dataclass
@@ -77,33 +92,46 @@ class LogitsProcessor(nn.Module):

     @staticmethod
     def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata):
-        # TODO: vectorize the code below
         if logits_metadata.forward_mode == ForwardMode.DECODE:
             output_top_logprobs = []
-
-
-
-
-
-                output_top_logprobs.append(list(zip(
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+            for i, k in enumerate(logits_metadata.top_logprobs_nums):
+                output_top_logprobs.append(list(zip(values[i][:k], indices[i][:k])))
             return None, output_top_logprobs
         else:
+            # TODO: vectorize the code below
             input_top_logprobs, output_top_logprobs = [], []
             pt = 0
             extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist()
+
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+
             for i, extend_seq_len in enumerate(extend_seq_lens_cpu):
                 if extend_seq_len == 0:
                     input_top_logprobs.append([])
                     output_top_logprobs.append([])
                     continue
                 k = logits_metadata.top_logprobs_nums[i]
-                t = all_logprobs[pt : pt + extend_seq_len].topk(k)
-                vs_cpu = t.values.tolist()
-                ps_cpu = t.indices.tolist()
                 input_top_logprobs.append(
-                    [
+                    [
+                        list(zip(values[pt + j][:k], indices[pt + j][:k]))
+                        for j in range(extend_seq_len - 1)
+                    ]
+                )
+                output_top_logprobs.append(
+                    list(
+                        zip(
+                            values[pt + extend_seq_len - 1][:k],
+                            indices[pt + extend_seq_len - 1][:k],
+                        )
+                    )
                 )
-                output_top_logprobs.append(list(zip(vs_cpu[-1], ps_cpu[-1])))
                 pt += extend_seq_len

             return input_top_logprobs, output_top_logprobs
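The rewrite replaces a per-request topk inside the Python loop with one batched topk using the largest requested k, then slices each row down to its own k. A standalone illustration of the same pattern follows; the data is random and the shapes are chosen arbitrarily.

import torch

# 3 requests over a 10-token vocabulary, with different top-k requests.
logprobs = torch.log_softmax(torch.randn(3, 10), dim=-1)
top_logprobs_nums = [2, 5, 1]

max_k = max(top_logprobs_nums)
ret = logprobs.topk(max_k, dim=1)          # one kernel call for the whole batch
values, indices = ret.values.tolist(), ret.indices.tolist()

output_top_logprobs = [
    list(zip(values[i][:k], indices[i][:k]))  # trim each row to its own k
    for i, k in enumerate(top_logprobs_nums)
]
print([len(x) for x in output_top_logprobs])  # [2, 5, 1]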
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader

sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/quantization/fp8.py
 # FIXME refactor in progress
 from typing import Any, Dict, List, Optional, Union
sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Radix attention."""

 import torch
@@ -7,7 +22,7 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.
+from sglang.srt.model_executor.model_runner import (
     ForwardMode,
     InputMetadata,
     global_server_args_dict,
@@ -88,7 +103,7 @@ class RadixAttention(nn.Module):
         return o

     def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
-        if not input_metadata.
+        if not input_metadata.flashinfer_use_ragged:
             self.store_kv_cache(k, v, input_metadata)

         o = input_metadata.flashinfer_prefill_wrapper_paged.forward(