sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/interpreter.py +21 -5
  9. sglang/lang/ir.py +1 -2
  10. sglang/srt/constrained/__init__.py +15 -0
  11. sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
  12. sglang/srt/constrained/fsm_cache.py +17 -2
  13. sglang/srt/constrained/jump_forward.py +17 -2
  14. sglang/srt/conversation.py +26 -0
  15. sglang/srt/hf_transformers_utils.py +15 -0
  16. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  17. sglang/srt/layers/extend_attention.py +15 -0
  18. sglang/srt/layers/fused_moe.py +15 -0
  19. sglang/srt/layers/linear.py +15 -0
  20. sglang/srt/layers/logits_processor.py +41 -13
  21. sglang/srt/layers/quantization/__init__.py +15 -0
  22. sglang/srt/layers/quantization/fp8.py +15 -0
  23. sglang/srt/layers/radix_attention.py +17 -2
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  26. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  27. sglang/srt/managers/detokenizer_manager.py +16 -1
  28. sglang/srt/managers/io_struct.py +36 -3
  29. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  30. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
  31. sglang/srt/managers/tokenizer_manager.py +39 -16
  32. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
  33. sglang/srt/mem_cache/base_cache.py +43 -0
  34. sglang/srt/mem_cache/chunk_cache.py +60 -0
  35. sglang/srt/mem_cache/flush_cache.py +33 -0
  36. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  37. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
  38. sglang/srt/mm_utils.py +15 -0
  39. sglang/srt/model_config.py +15 -0
  40. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
  41. sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
  42. sglang/srt/model_loader/model_loader.py +15 -0
  43. sglang/srt/model_loader/utils.py +16 -1
  44. sglang/srt/models/chatglm.py +16 -1
  45. sglang/srt/models/commandr.py +16 -1
  46. sglang/srt/models/dbrx.py +16 -1
  47. sglang/srt/models/deepseek.py +16 -1
  48. sglang/srt/models/deepseek_v2.py +16 -1
  49. sglang/srt/models/gemma.py +16 -1
  50. sglang/srt/models/gemma2.py +16 -1
  51. sglang/srt/models/gpt_bigcode.py +16 -1
  52. sglang/srt/models/grok.py +16 -1
  53. sglang/srt/models/internlm2.py +16 -1
  54. sglang/srt/models/llama2.py +21 -22
  55. sglang/srt/models/llama_classification.py +16 -1
  56. sglang/srt/models/llava.py +17 -2
  57. sglang/srt/models/llavavid.py +17 -2
  58. sglang/srt/models/minicpm.py +16 -1
  59. sglang/srt/models/mistral.py +15 -0
  60. sglang/srt/models/mixtral.py +16 -1
  61. sglang/srt/models/mixtral_quant.py +16 -1
  62. sglang/srt/models/qwen.py +16 -1
  63. sglang/srt/models/qwen2.py +16 -1
  64. sglang/srt/models/qwen2_moe.py +16 -1
  65. sglang/srt/models/stablelm.py +16 -1
  66. sglang/srt/models/yivl.py +15 -0
  67. sglang/srt/openai_api/adapter.py +569 -131
  68. sglang/srt/openai_api/protocol.py +84 -2
  69. sglang/srt/sampling_params.py +15 -0
  70. sglang/srt/server.py +92 -23
  71. sglang/srt/server_args.py +52 -11
  72. sglang/srt/utils.py +15 -0
  73. sglang/test/test_programs.py +9 -6
  74. sglang/utils.py +22 -0
  75. sglang/version.py +1 -1
  76. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
  77. sglang-0.2.8.dist-info/RECORD +95 -0
  78. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
  79. sglang/srt/flush_cache.py +0 -18
  80. sglang-0.2.6.dist-info/RECORD +0 -93
  81. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
  82. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,5 @@
 # SGL API Components
+
 from sglang.api import (
     Runtime,
     assistant,
@@ -14,48 +15,54 @@ from sglang.api import (
     select,
     set_default_backend,
     system,
+    system_begin,
+    system_end,
     user,
     user_begin,
     user_end,
     video,
 )

-# Global Configurations
-from sglang.global_config import global_config
-
-# SGL Backends
-from sglang.lang.backend.anthropic import Anthropic
-from sglang.lang.backend.litellm import LiteLLM
-from sglang.lang.backend.openai import OpenAI
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.lang.backend.vertexai import VertexAI
-
-from .version import __version__
-
-# public APIs management
+# SGLang DSL APIs
 __all__ = [
-    "global_config",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "RuntimeEndpoint",
-    "VertexAI",
-    "function",
     "Runtime",
-    "set_default_backend",
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
     "flush_cache",
-    "get_server_args",
+    "function",
     "gen",
     "gen_int",
     "gen_string",
+    "get_server_args",
     "image",
-    "video",
     "select",
+    "set_default_backend",
     "system",
+    "system_begin",
+    "system_end",
     "user",
-    "assistant",
     "user_begin",
     "user_end",
-    "assistant_begin",
-    "assistant_end",
+    "video",
 ]
+
+# Global Configurations
+from sglang.global_config import global_config
+
+__all__ += ["global_config"]
+
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# SGL Backends
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
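
Note on the lazy backends: the rewritten module re-exports RuntimeEndpoint eagerly but wraps the optional Anthropic, LiteLLM, OpenAI, and VertexAI backends in sglang.utils.LazyImport, so their dependencies are only imported on first use. The diff does not show LazyImport itself; the following is a minimal sketch of how such a wrapper can work, written for illustration and not taken from sglang/utils.py.

```python
import importlib


class LazyImport:
    """Resolve `class_name` from `module_name` only when it is first used."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None

    def _load(self):
        # The actual import happens here, on first attribute access or call.
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __getattr__(self, name):
        return getattr(self._load(), name)

    def __call__(self, *args, **kwargs):
        # Calling the wrapper constructs the real backend class.
        return self._load()(*args, **kwargs)
```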
sglang/api.py CHANGED
@@ -75,7 +75,7 @@ def gen(
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)
@@ -210,6 +210,14 @@ def assistant(expr: Optional[SglExpr] = None):
     return _role_common("assistant", expr)


+def system_begin():
+    return SglRoleBegin("system")
+
+
+def system_end():
+    return SglRoleEnd("system")
+
+
 def user_begin():
     return SglRoleBegin("user")

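
The new system_begin()/system_end() helpers mirror the existing user_begin/user_end and assistant_begin/assistant_end pairs, so a program can open the system role, append pieces incrementally, and close it explicitly. A short usage sketch follows; the program name and prompt text are invented for illustration, and it assumes a default backend has already been set with set_default_backend.

```python
import sglang as sgl


@sgl.function
def follow_rules(s, rules):
    # Open and close the system role explicitly instead of one system(...) call.
    s += sgl.system_begin()
    s += "Follow these rules: " + rules
    s += sgl.system_end()
    s += sgl.user("Restate the rules in one sentence.")
    s += sgl.assistant(sgl.gen("restatement", max_tokens=64))


# state = follow_rules.run(rules="Answer in French only.")
```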
sglang/bench_latency.py CHANGED
@@ -37,9 +37,9 @@ import torch
 import torch.distributed as dist

 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, Req
-from sglang.srt.managers.controller.model_runner import ModelRunner
+from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
sglang/bench_serving.py CHANGED
@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
 """
 Benchmark online serving.

@@ -84,6 +85,9 @@ async def async_request_trt_llm(
             "min_length": request_func_input.output_len,
             "end_id": 1048576,
         }
+        if args.disable_ignore_eos:
+            del payload["min_length"]
+            del payload["end_id"]
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len

@@ -149,7 +153,7 @@ async def async_request_openai_completions(
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
-            "ignore_eos": True,
+            "ignore_eos": not args.disable_ignore_eos,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -969,6 +973,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable streaming mode.",
     )
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )

     set_ulimit()

sglang/check_env.py CHANGED
@@ -22,7 +22,7 @@ PACKAGE_LIST = [
     "huggingface_hub",
     "interegular",
     "packaging",
-    "pillow",
+    "PIL",
     "psutil",
     "pydantic",
     "uvicorn",
sglang/lang/backend/litellm.py CHANGED
@@ -61,7 +61,7 @@ class LiteLLM(BaseBackend):
             model=self.model_name,
             messages=messages,
             **self.client_params,
-            **sampling_params.to_anthropic_kwargs(),
+            **sampling_params.to_litellm_kwargs(),
         )
         comp = ret.choices[0].message.content

sglang/lang/backend/openai.py CHANGED
@@ -18,7 +18,7 @@ except ImportError as e:
     openai = tiktoken = e


-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)


 def create_logit_bias_int(tokenizer):
sglang/lang/interpreter.py CHANGED
@@ -553,6 +553,8 @@ class StreamExecutor:
             "output_token_logprobs": output_token_logprobs,
         }
         self.variable_event[name].set()
+        if self.stream_var_event:
+            self.stream_var_event[name].set()
         self.text_ += decision

     def _execute_variable(self, expr: SglVariable):
@@ -705,9 +707,9 @@ class ProgramState:

     def _role_common(self, name: str, expr: Optional[SglExpr] = None):
         if expr is not None:
-            self.stream_executor.submit(
-                SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
-            )
+            role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+            self.stream_executor.submit(role_expr)
+            return role_expr
         else:

             @contextmanager
@@ -778,7 +780,14 @@ class ProgramState:
                     if self.stream_executor.is_finished:
                         break
             else:
-                event = self.stream_executor.stream_var_event[var_name]
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     event.wait()
                     event.clear()
@@ -813,7 +822,14 @@ class ProgramState:
                     if self.stream_executor.is_finished:
                         break
             else:
-                event = self.stream_executor.stream_var_event[var_name]
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
                 while True:
                     await loop.run_in_executor(None, event.wait)
                     event.clear()
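
These interpreter changes make streaming of named variables more robust: the executor now also signals stream_var_event when a select decision lands, and the iterators poll until the variable's event is registered (yielding an empty string and returning if the program finishes first) instead of assuming the event already exists. A hedged consumer-side sketch follows; it assumes ProgramState.text_iter is the public entry point for the patched loop, that a runtime backend is already configured, and uses an invented program name.

```python
import sglang as sgl


@sgl.function
def answer(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=128))


# stream=True makes the per-variable events above drive incremental output.
state = answer.run(question="What is SGLang?", stream=True)
for chunk in state.text_iter("answer"):
    print(chunk, end="", flush=True)
```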
sglang/lang/ir.py CHANGED
@@ -99,7 +99,6 @@ class SglSamplingParams:
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
-            "top_k": self.top_k,
             "frequency_penalty": self.frequency_penalty,
             "presence_penalty": self.presence_penalty,
         }
@@ -410,7 +409,7 @@ class SglGen(SglExpr):
         dtype: Optional[type] = None,
         regex: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
sglang/srt/constrained/__init__.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import json
 from typing import Dict, Optional, Union

sglang/srt/constrained/{base_cache.py → base_tool_cache.py} CHANGED
@@ -1,9 +1,24 @@
-"""Base cache class."""
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Base tool cache for constrained decoding tools."""

 import time


-class BaseCache:
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -1,10 +1,25 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Cache for the compressed finite state machine."""

 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache


-class FSMCache(BaseCache):
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)

sglang/srt/constrained/jump_forward.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 Faster constrained decoding.
 Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
@@ -15,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

@@ -136,7 +151,7 @@ class JumpForwardMap:
         )


-class JumpForwardCache(BaseCache):
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()

sglang/srt/conversation.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Conversation templates."""

 # Adapted from
@@ -421,3 +436,14 @@ register_conv_template(
         sep2="</s>",
     )
 )
+
+# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
+register_conv_template(
+    Conversation(
+        name="internlm2-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
sglang/srt/hf_transformers_utils.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Utilities for Huggingface Transformers."""

 import functools
sglang/srt/layers/context_flashattention_nopad.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
 import torch
sglang/srt/layers/extend_attention.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import torch
 import triton
 import triton.language as tl
sglang/srt/layers/fused_moe.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/layers/fused_moe/fused_moe.py#L1
 """Fused MoE kernel."""
sglang/srt/layers/linear.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/linear.py
 # FIXME: refactor the linear abstraction
 from abc import abstractmethod
sglang/srt/layers/logits_processor.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Logits processing."""

 import dataclasses
@@ -10,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

-from sglang.srt.managers.controller.model_runner import ForwardMode, InputMetadata
+from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata


 @dataclasses.dataclass
@@ -77,33 +92,46 @@ class LogitsProcessor(nn.Module):

     @staticmethod
     def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata):
-        # TODO: vectorize the code below
         if logits_metadata.forward_mode == ForwardMode.DECODE:
             output_top_logprobs = []
-            for i in range(all_logprobs.shape[0]):
-                k = logits_metadata.top_logprobs_nums[i]
-                t = all_logprobs[i].topk(k)
-                v_cpu = t.values.tolist()
-                p_cpu = t.indices.tolist()
-                output_top_logprobs.append(list(zip(v_cpu, p_cpu)))
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+            for i, k in enumerate(logits_metadata.top_logprobs_nums):
+                output_top_logprobs.append(list(zip(values[i][:k], indices[i][:k])))
             return None, output_top_logprobs
         else:
+            # TODO: vectorize the code below
             input_top_logprobs, output_top_logprobs = [], []
             pt = 0
             extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist()
+
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+
             for i, extend_seq_len in enumerate(extend_seq_lens_cpu):
                 if extend_seq_len == 0:
                     input_top_logprobs.append([])
                     output_top_logprobs.append([])
                     continue
                 k = logits_metadata.top_logprobs_nums[i]
-                t = all_logprobs[pt : pt + extend_seq_len].topk(k)
-                vs_cpu = t.values.tolist()
-                ps_cpu = t.indices.tolist()
                 input_top_logprobs.append(
-                    [list(zip(vs_cpu[j], ps_cpu[j])) for j in range(len(vs_cpu) - 1)]
+                    [
+                        list(zip(values[pt + j][:k], indices[pt + j][:k]))
+                        for j in range(extend_seq_len - 1)
+                    ]
+                )
+                output_top_logprobs.append(
+                    list(
+                        zip(
+                            values[pt + extend_seq_len - 1][:k],
+                            indices[pt + extend_seq_len - 1][:k],
+                        )
+                    )
                 )
-                output_top_logprobs.append(list(zip(vs_cpu[-1], ps_cpu[-1])))
                 pt += extend_seq_len

             return input_top_logprobs, output_top_logprobs
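
For the decode path, get_top_logprobs now issues a single batched topk over the whole logprob matrix and trims each row to its requested k on the host, instead of calling topk once per row. A self-contained sketch of the same pattern, with toy shapes invented for illustration:

```python
import torch


def batched_top_logprobs(all_logprobs: torch.Tensor, top_logprobs_nums):
    """Per-row (logprob, token_id) pairs using one topk call for the whole batch."""
    max_k = max(top_logprobs_nums)
    ret = all_logprobs.topk(max_k, dim=1)  # one kernel launch instead of one per row
    values = ret.values.tolist()
    indices = ret.indices.tolist()
    return [
        list(zip(values[i][:k], indices[i][:k]))  # trim each row to its own k
        for i, k in enumerate(top_logprobs_nums)
    ]


# Example: a batch of 3 rows over a toy vocabulary of 8 tokens.
logprobs = torch.log_softmax(torch.randn(3, 8), dim=1)
print(batched_top_logprobs(logprobs, [2, 5, 1]))
```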
sglang/srt/layers/quantization/__init__.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader

sglang/srt/layers/quantization/fp8.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/quantization/fp8.py
 # FIXME refactor in progress
 from typing import Any, Dict, List, Optional, Union
sglang/srt/layers/radix_attention.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Radix attention."""

 import torch
@@ -7,7 +22,7 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.managers.controller.model_runner import (
+from sglang.srt.model_executor.model_runner import (
     ForwardMode,
     InputMetadata,
     global_server_args_dict,
@@ -88,7 +103,7 @@ class RadixAttention(nn.Module):
         return o

     def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
-        if not input_metadata.use_ragged:
+        if not input_metadata.flashinfer_use_ragged:
             self.store_kv_cache(k, v, input_metadata)

         o = input_metadata.flashinfer_prefill_wrapper_paged.forward(