sglang 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +4 -4
  9. sglang/lang/interpreter.py +24 -9
  10. sglang/lang/ir.py +1 -1
  11. sglang/srt/constrained/__init__.py +15 -0
  12. sglang/srt/constrained/base_cache.py +15 -0
  13. sglang/srt/constrained/fsm_cache.py +36 -1
  14. sglang/srt/constrained/jump_forward.py +15 -0
  15. sglang/srt/conversation.py +26 -0
  16. sglang/srt/hf_transformers_utils.py +18 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  18. sglang/srt/layers/extend_attention.py +15 -0
  19. sglang/srt/layers/fused_moe.py +15 -0
  20. sglang/srt/layers/linear.py +15 -0
  21. sglang/srt/layers/logits_processor.py +109 -72
  22. sglang/srt/layers/quantization/__init__.py +15 -0
  23. sglang/srt/layers/quantization/fp8.py +15 -0
  24. sglang/srt/layers/radix_attention.py +21 -3
  25. sglang/srt/layers/token_attention.py +16 -1
  26. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  27. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  28. sglang/srt/managers/detokenizer_manager.py +16 -1
  29. sglang/srt/managers/io_struct.py +38 -5
  30. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  31. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
  32. sglang/srt/managers/tokenizer_manager.py +99 -57
  33. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
  34. sglang/srt/mem_cache/flush_cache.py +33 -0
  35. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  36. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  37. sglang/srt/mm_utils.py +15 -0
  38. sglang/srt/model_config.py +20 -0
  39. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
  40. sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
  41. sglang/srt/model_loader/model_loader.py +15 -0
  42. sglang/srt/model_loader/utils.py +16 -1
  43. sglang/srt/models/chatglm.py +16 -1
  44. sglang/srt/models/commandr.py +16 -1
  45. sglang/srt/models/dbrx.py +16 -1
  46. sglang/srt/models/deepseek.py +16 -1
  47. sglang/srt/models/deepseek_v2.py +532 -0
  48. sglang/srt/models/gemma.py +16 -1
  49. sglang/srt/models/gemma2.py +16 -1
  50. sglang/srt/models/gpt_bigcode.py +16 -1
  51. sglang/srt/models/grok.py +16 -1
  52. sglang/srt/models/internlm2.py +16 -1
  53. sglang/srt/models/llama2.py +16 -1
  54. sglang/srt/models/llama_classification.py +19 -4
  55. sglang/srt/models/llava.py +17 -2
  56. sglang/srt/models/llavavid.py +17 -2
  57. sglang/srt/models/minicpm.py +16 -1
  58. sglang/srt/models/mistral.py +15 -0
  59. sglang/srt/models/mixtral.py +16 -1
  60. sglang/srt/models/mixtral_quant.py +16 -1
  61. sglang/srt/models/qwen.py +16 -1
  62. sglang/srt/models/qwen2.py +16 -1
  63. sglang/srt/models/qwen2_moe.py +16 -1
  64. sglang/srt/models/stablelm.py +16 -1
  65. sglang/srt/models/yivl.py +15 -0
  66. sglang/srt/openai_api/adapter.py +545 -160
  67. sglang/srt/openai_api/protocol.py +65 -1
  68. sglang/srt/sampling_params.py +20 -4
  69. sglang/srt/server.py +90 -37
  70. sglang/srt/server_args.py +76 -17
  71. sglang/srt/utils.py +15 -0
  72. sglang/test/test_programs.py +5 -1
  73. sglang/utils.py +22 -0
  74. sglang/version.py +1 -1
  75. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
  76. sglang-0.2.7.dist-info/RECORD +93 -0
  77. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  78. sglang/srt/flush_cache.py +0 -18
  79. sglang-0.2.5.dist-info/RECORD +0 -92
  80. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from
2
17
  # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
3
18
  """Inference-only LLaMA model compatible with HuggingFace weights."""
@@ -21,7 +36,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
21
36
 
22
37
  from sglang.srt.layers.logits_processor import LogitsProcessor
23
38
  from sglang.srt.layers.radix_attention import RadixAttention
24
- from sglang.srt.managers.controller.model_runner import InputMetadata
39
+ from sglang.srt.model_executor.model_runner import InputMetadata
25
40
 
26
41
  MergedColumnParallelLinear = None
27
42
  QKVParallelLinear = None
sglang/srt/models/llama_classification.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  from typing import Iterable, Optional, Tuple
2
17
 
3
18
  import torch
@@ -10,7 +25,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
10
25
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
11
26
 
12
27
  from sglang.srt.layers.logits_processor import LogitProcessorOutput
13
- from sglang.srt.managers.controller.model_runner import InputMetadata
28
+ from sglang.srt.model_executor.model_runner import InputMetadata
14
29
  from sglang.srt.models.llama2 import LlamaModel
15
30
 
16
31
 
@@ -54,9 +69,9 @@ class LlamaForClassification(nn.Module):
54
69
  next_token_logits=scores,
55
70
  next_token_logprobs=scores,
56
71
  normalized_prompt_logprobs=scores,
57
- prefill_token_logprobs=torch.ones_like(input_ids),
58
- prefill_top_logprobs=None,
59
- decode_top_logprobs=None,
72
+ input_token_logprobs=torch.ones_like(input_ids),
73
+ input_top_logprobs=None,
74
+ output_top_logprobs=None,
60
75
  )
61
76
 
62
77
  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
sglang/srt/models/llava.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  """Inference-only LLaVa model compatible with HuggingFace weights."""
2
17
 
3
18
  from typing import Iterable, List, Optional, Tuple
@@ -17,13 +32,13 @@ from vllm.config import CacheConfig
17
32
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
18
33
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
19
34
 
20
- from sglang.srt.managers.controller.infer_batch import ForwardMode
21
- from sglang.srt.managers.controller.model_runner import InputMetadata
35
+ from sglang.srt.managers.schedule_batch import ForwardMode
22
36
  from sglang.srt.mm_utils import (
23
37
  get_anyres_image_grid_shape,
24
38
  unpad_image,
25
39
  unpad_image_shape,
26
40
  )
41
+ from sglang.srt.model_executor.model_runner import InputMetadata
27
42
  from sglang.srt.models.llama2 import LlamaForCausalLM
28
43
  from sglang.srt.models.mistral import MistralForCausalLM
29
44
  from sglang.srt.models.qwen2 import Qwen2ForCausalLM
sglang/srt/models/llavavid.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  """Inference-only LLaVa video model compatible with HuggingFace weights."""
2
17
 
3
18
  from typing import Iterable, List, Optional, Tuple
@@ -11,13 +26,13 @@ from vllm.config import CacheConfig
11
26
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
12
27
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
13
28
 
14
- from sglang.srt.managers.controller.infer_batch import ForwardMode
15
- from sglang.srt.managers.controller.model_runner import InputMetadata
29
+ from sglang.srt.managers.schedule_batch import ForwardMode
16
30
  from sglang.srt.mm_utils import (
17
31
  get_anyres_image_grid_shape,
18
32
  unpad_image,
19
33
  unpad_image_shape,
20
34
  )
35
+ from sglang.srt.model_executor.model_runner import InputMetadata
21
36
  from sglang.srt.models.llama2 import LlamaForCausalLM
22
37
 
23
38
 
sglang/srt/models/minicpm.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  """Inference-only MiniCPM model compatible with HuggingFace weights."""
2
17
 
3
18
  import math
@@ -24,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
24
39
 
25
40
  from sglang.srt.layers.logits_processor import LogitsProcessor
26
41
  from sglang.srt.layers.radix_attention import RadixAttention
27
- from sglang.srt.managers.controller.model_runner import InputMetadata
42
+ from sglang.srt.model_executor.model_runner import InputMetadata
28
43
 
29
44
 
30
45
  class MiniCPMMLP(nn.Module):
sglang/srt/models/mistral.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  """Inference-only Mistral model."""
2
17
 
3
18
  from sglang.srt.models.llama2 import LlamaForCausalLM
sglang/srt/models/mixtral.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from
2
17
  # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
3
18
  """Inference-only Mixtral model."""
@@ -35,7 +50,7 @@ from vllm.utils import print_warning_once
35
50
 
36
51
  from sglang.srt.layers.logits_processor import LogitsProcessor
37
52
  from sglang.srt.layers.radix_attention import RadixAttention
38
- from sglang.srt.managers.controller.model_runner import InputMetadata
53
+ from sglang.srt.model_executor.model_runner import InputMetadata
39
54
 
40
55
 
41
56
  class MixtralMoE(nn.Module):
sglang/srt/models/mixtral_quant.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from
2
17
  # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral_quant.py#L1
3
18
  """Inference-only Mixtral model."""
@@ -30,7 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
30
45
 
31
46
  from sglang.srt.layers.logits_processor import LogitsProcessor
32
47
  from sglang.srt.layers.radix_attention import RadixAttention
33
- from sglang.srt.managers.controller.model_runner import InputMetadata
48
+ from sglang.srt.model_executor.model_runner import InputMetadata
34
49
 
35
50
 
36
51
  class MixtralMLP(nn.Module):
sglang/srt/models/qwen.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from
2
17
  # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1
3
18
  from typing import Any, Dict, Iterable, Optional, Tuple
@@ -24,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
24
39
 
25
40
  from sglang.srt.layers.logits_processor import LogitsProcessor
26
41
  from sglang.srt.layers.radix_attention import RadixAttention
27
- from sglang.srt.managers.controller.model_runner import InputMetadata
42
+ from sglang.srt.model_executor.model_runner import InputMetadata
28
43
 
29
44
 
30
45
  class QWenMLP(nn.Module):
sglang/srt/models/qwen2.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from llama2.py
2
17
  # Modify details for the adaptation of Qwen2 model.
3
18
  """Inference-only Qwen2 model compatible with HuggingFace weights."""
@@ -24,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
24
39
 
25
40
  from sglang.srt.layers.logits_processor import LogitsProcessor
26
41
  from sglang.srt.layers.radix_attention import RadixAttention
27
- from sglang.srt.managers.controller.model_runner import InputMetadata
42
+ from sglang.srt.model_executor.model_runner import InputMetadata
28
43
 
29
44
  Qwen2Config = None
30
45
 
sglang/srt/models/qwen2_moe.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # coding=utf-8
2
17
  # Adapted from
3
18
  # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
@@ -36,7 +51,7 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
36
51
 
37
52
  from sglang.srt.layers.logits_processor import LogitsProcessor
38
53
  from sglang.srt.layers.radix_attention import RadixAttention
39
- from sglang.srt.managers.controller.model_runner import InputMetadata
54
+ from sglang.srt.model_executor.model_runner import InputMetadata
40
55
 
41
56
 
42
57
  class Qwen2MoeMLP(nn.Module):
sglang/srt/models/stablelm.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  # Adapted from:
2
17
  # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
3
18
  """Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
@@ -25,7 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
25
40
 
26
41
  from sglang.srt.layers.logits_processor import LogitsProcessor
27
42
  from sglang.srt.layers.radix_attention import RadixAttention
28
- from sglang.srt.managers.controller.model_runner import InputMetadata
43
+ from sglang.srt.model_executor.model_runner import InputMetadata
29
44
 
30
45
 
31
46
  class StablelmMLP(nn.Module):
sglang/srt/models/yivl.py CHANGED
@@ -1,3 +1,18 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
1
16
  """Inference-only Yi-VL model."""
2
17
 
3
18
  from typing import Iterable, Optional, Tuple