sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/entrypoints/engine.py +44 -22
  9. sglang/srt/function_call_parser.py +97 -0
  10. sglang/srt/hf_transformers_utils.py +2 -0
  11. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  12. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  14. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  15. sglang/srt/layers/dp_attention.py +5 -2
  16. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
  22. sglang/srt/layers/quantization/__init__.py +2 -2
  23. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  24. sglang/srt/layers/utils.py +35 -0
  25. sglang/srt/lora/layers.py +35 -9
  26. sglang/srt/lora/lora_manager.py +84 -35
  27. sglang/srt/managers/data_parallel_controller.py +52 -34
  28. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  29. sglang/srt/managers/schedule_batch.py +25 -15
  30. sglang/srt/managers/scheduler.py +263 -59
  31. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  32. sglang/srt/managers/tp_worker.py +51 -16
  33. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  34. sglang/srt/mem_cache/memory_pool.py +70 -36
  35. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  36. sglang/srt/model_executor/forward_batch_info.py +31 -1
  37. sglang/srt/model_executor/model_runner.py +115 -57
  38. sglang/srt/models/deepseek_nextn.py +1 -257
  39. sglang/srt/models/deepseek_v2.py +78 -18
  40. sglang/srt/models/kimi_vl.py +308 -0
  41. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  42. sglang/srt/models/llama.py +92 -30
  43. sglang/srt/models/llama4.py +2 -1
  44. sglang/srt/models/llama_eagle.py +4 -1
  45. sglang/srt/models/llama_eagle3.py +4 -1
  46. sglang/srt/models/qwen2_moe.py +8 -3
  47. sglang/srt/models/qwen2_vl.py +0 -12
  48. sglang/srt/models/qwen3_moe.py +8 -3
  49. sglang/srt/openai_api/adapter.py +34 -22
  50. sglang/srt/openai_api/protocol.py +11 -1
  51. sglang/srt/server_args.py +67 -22
  52. sglang/srt/speculative/eagle_worker.py +3 -2
  53. sglang/srt/utils.py +88 -9
  54. sglang/test/runners.py +4 -0
  55. sglang/test/test_utils.py +29 -0
  56. sglang/version.py +1 -1
  57. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  58. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
  59. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  61. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,146 @@
1
+ {
2
+ "1": {
3
+ "BLOCK_SIZE_M": 16,
4
+ "BLOCK_SIZE_N": 64,
5
+ "BLOCK_SIZE_K": 64,
6
+ "GROUP_SIZE_M": 1,
7
+ "num_warps": 4,
8
+ "num_stages": 5
9
+ },
10
+ "2": {
11
+ "BLOCK_SIZE_M": 16,
12
+ "BLOCK_SIZE_N": 64,
13
+ "BLOCK_SIZE_K": 64,
14
+ "GROUP_SIZE_M": 1,
15
+ "num_warps": 4,
16
+ "num_stages": 5
17
+ },
18
+ "4": {
19
+ "BLOCK_SIZE_M": 16,
20
+ "BLOCK_SIZE_N": 64,
21
+ "BLOCK_SIZE_K": 128,
22
+ "GROUP_SIZE_M": 1,
23
+ "num_warps": 4,
24
+ "num_stages": 2
25
+ },
26
+ "8": {
27
+ "BLOCK_SIZE_M": 16,
28
+ "BLOCK_SIZE_N": 64,
29
+ "BLOCK_SIZE_K": 128,
30
+ "GROUP_SIZE_M": 64,
31
+ "num_warps": 4,
32
+ "num_stages": 3
33
+ },
34
+ "16": {
35
+ "BLOCK_SIZE_M": 16,
36
+ "BLOCK_SIZE_N": 32,
37
+ "BLOCK_SIZE_K": 128,
38
+ "GROUP_SIZE_M": 64,
39
+ "num_warps": 4,
40
+ "num_stages": 2
41
+ },
42
+ "24": {
43
+ "BLOCK_SIZE_M": 16,
44
+ "BLOCK_SIZE_N": 128,
45
+ "BLOCK_SIZE_K": 128,
46
+ "GROUP_SIZE_M": 16,
47
+ "num_warps": 8,
48
+ "num_stages": 3
49
+ },
50
+ "32": {
51
+ "BLOCK_SIZE_M": 16,
52
+ "BLOCK_SIZE_N": 128,
53
+ "BLOCK_SIZE_K": 128,
54
+ "GROUP_SIZE_M": 16,
55
+ "num_warps": 8,
56
+ "num_stages": 5
57
+ },
58
+ "48": {
59
+ "BLOCK_SIZE_M": 16,
60
+ "BLOCK_SIZE_N": 128,
61
+ "BLOCK_SIZE_K": 128,
62
+ "GROUP_SIZE_M": 16,
63
+ "num_warps": 4,
64
+ "num_stages": 3
65
+ },
66
+ "64": {
67
+ "BLOCK_SIZE_M": 16,
68
+ "BLOCK_SIZE_N": 128,
69
+ "BLOCK_SIZE_K": 128,
70
+ "GROUP_SIZE_M": 64,
71
+ "num_warps": 8,
72
+ "num_stages": 2
73
+ },
74
+ "96": {
75
+ "BLOCK_SIZE_M": 16,
76
+ "BLOCK_SIZE_N": 128,
77
+ "BLOCK_SIZE_K": 128,
78
+ "GROUP_SIZE_M": 16,
79
+ "num_warps": 4,
80
+ "num_stages": 2
81
+ },
82
+ "128": {
83
+ "BLOCK_SIZE_M": 16,
84
+ "BLOCK_SIZE_N": 128,
85
+ "BLOCK_SIZE_K": 128,
86
+ "GROUP_SIZE_M": 16,
87
+ "num_warps": 4,
88
+ "num_stages": 2
89
+ },
90
+ "256": {
91
+ "BLOCK_SIZE_M": 32,
92
+ "BLOCK_SIZE_N": 128,
93
+ "BLOCK_SIZE_K": 128,
94
+ "GROUP_SIZE_M": 32,
95
+ "num_warps": 4,
96
+ "num_stages": 2
97
+ },
98
+ "512": {
99
+ "BLOCK_SIZE_M": 64,
100
+ "BLOCK_SIZE_N": 64,
101
+ "BLOCK_SIZE_K": 64,
102
+ "GROUP_SIZE_M": 16,
103
+ "num_warps": 4,
104
+ "num_stages": 3
105
+ },
106
+ "1024": {
107
+ "BLOCK_SIZE_M": 64,
108
+ "BLOCK_SIZE_N": 128,
109
+ "BLOCK_SIZE_K": 64,
110
+ "GROUP_SIZE_M": 32,
111
+ "num_warps": 4,
112
+ "num_stages": 3
113
+ },
114
+ "1536": {
115
+ "BLOCK_SIZE_M": 128,
116
+ "BLOCK_SIZE_N": 128,
117
+ "BLOCK_SIZE_K": 64,
118
+ "GROUP_SIZE_M": 16,
119
+ "num_warps": 8,
120
+ "num_stages": 3
121
+ },
122
+ "2048": {
123
+ "BLOCK_SIZE_M": 128,
124
+ "BLOCK_SIZE_N": 256,
125
+ "BLOCK_SIZE_K": 64,
126
+ "GROUP_SIZE_M": 16,
127
+ "num_warps": 8,
128
+ "num_stages": 4
129
+ },
130
+ "3072": {
131
+ "BLOCK_SIZE_M": 128,
132
+ "BLOCK_SIZE_N": 256,
133
+ "BLOCK_SIZE_K": 64,
134
+ "GROUP_SIZE_M": 32,
135
+ "num_warps": 8,
136
+ "num_stages": 4
137
+ },
138
+ "4096": {
139
+ "BLOCK_SIZE_M": 128,
140
+ "BLOCK_SIZE_N": 256,
141
+ "BLOCK_SIZE_K": 64,
142
+ "GROUP_SIZE_M": 16,
143
+ "num_warps": 8,
144
+ "num_stages": 4
145
+ }
146
+ }
@@ -0,0 +1,146 @@
1
+ {
2
+ "1": {
3
+ "BLOCK_SIZE_M": 16,
4
+ "BLOCK_SIZE_N": 64,
5
+ "BLOCK_SIZE_K": 64,
6
+ "GROUP_SIZE_M": 16,
7
+ "num_warps": 4,
8
+ "num_stages": 4
9
+ },
10
+ "2": {
11
+ "BLOCK_SIZE_M": 16,
12
+ "BLOCK_SIZE_N": 64,
13
+ "BLOCK_SIZE_K": 64,
14
+ "GROUP_SIZE_M": 1,
15
+ "num_warps": 4,
16
+ "num_stages": 4
17
+ },
18
+ "4": {
19
+ "BLOCK_SIZE_M": 16,
20
+ "BLOCK_SIZE_N": 64,
21
+ "BLOCK_SIZE_K": 64,
22
+ "GROUP_SIZE_M": 16,
23
+ "num_warps": 4,
24
+ "num_stages": 3
25
+ },
26
+ "8": {
27
+ "BLOCK_SIZE_M": 16,
28
+ "BLOCK_SIZE_N": 128,
29
+ "BLOCK_SIZE_K": 64,
30
+ "GROUP_SIZE_M": 1,
31
+ "num_warps": 4,
32
+ "num_stages": 4
33
+ },
34
+ "16": {
35
+ "BLOCK_SIZE_M": 16,
36
+ "BLOCK_SIZE_N": 64,
37
+ "BLOCK_SIZE_K": 128,
38
+ "GROUP_SIZE_M": 1,
39
+ "num_warps": 4,
40
+ "num_stages": 5
41
+ },
42
+ "24": {
43
+ "BLOCK_SIZE_M": 16,
44
+ "BLOCK_SIZE_N": 64,
45
+ "BLOCK_SIZE_K": 128,
46
+ "GROUP_SIZE_M": 32,
47
+ "num_warps": 4,
48
+ "num_stages": 3
49
+ },
50
+ "32": {
51
+ "BLOCK_SIZE_M": 16,
52
+ "BLOCK_SIZE_N": 128,
53
+ "BLOCK_SIZE_K": 128,
54
+ "GROUP_SIZE_M": 32,
55
+ "num_warps": 4,
56
+ "num_stages": 3
57
+ },
58
+ "48": {
59
+ "BLOCK_SIZE_M": 16,
60
+ "BLOCK_SIZE_N": 128,
61
+ "BLOCK_SIZE_K": 128,
62
+ "GROUP_SIZE_M": 16,
63
+ "num_warps": 8,
64
+ "num_stages": 5
65
+ },
66
+ "64": {
67
+ "BLOCK_SIZE_M": 16,
68
+ "BLOCK_SIZE_N": 128,
69
+ "BLOCK_SIZE_K": 128,
70
+ "GROUP_SIZE_M": 64,
71
+ "num_warps": 4,
72
+ "num_stages": 2
73
+ },
74
+ "96": {
75
+ "BLOCK_SIZE_M": 16,
76
+ "BLOCK_SIZE_N": 128,
77
+ "BLOCK_SIZE_K": 128,
78
+ "GROUP_SIZE_M": 16,
79
+ "num_warps": 4,
80
+ "num_stages": 2
81
+ },
82
+ "128": {
83
+ "BLOCK_SIZE_M": 16,
84
+ "BLOCK_SIZE_N": 128,
85
+ "BLOCK_SIZE_K": 128,
86
+ "GROUP_SIZE_M": 16,
87
+ "num_warps": 4,
88
+ "num_stages": 2
89
+ },
90
+ "256": {
91
+ "BLOCK_SIZE_M": 32,
92
+ "BLOCK_SIZE_N": 256,
93
+ "BLOCK_SIZE_K": 128,
94
+ "GROUP_SIZE_M": 1,
95
+ "num_warps": 4,
96
+ "num_stages": 2
97
+ },
98
+ "512": {
99
+ "BLOCK_SIZE_M": 64,
100
+ "BLOCK_SIZE_N": 256,
101
+ "BLOCK_SIZE_K": 128,
102
+ "GROUP_SIZE_M": 1,
103
+ "num_warps": 8,
104
+ "num_stages": 3
105
+ },
106
+ "1024": {
107
+ "BLOCK_SIZE_M": 64,
108
+ "BLOCK_SIZE_N": 256,
109
+ "BLOCK_SIZE_K": 128,
110
+ "GROUP_SIZE_M": 1,
111
+ "num_warps": 8,
112
+ "num_stages": 3
113
+ },
114
+ "1536": {
115
+ "BLOCK_SIZE_M": 128,
116
+ "BLOCK_SIZE_N": 256,
117
+ "BLOCK_SIZE_K": 64,
118
+ "GROUP_SIZE_M": 1,
119
+ "num_warps": 8,
120
+ "num_stages": 4
121
+ },
122
+ "2048": {
123
+ "BLOCK_SIZE_M": 64,
124
+ "BLOCK_SIZE_N": 256,
125
+ "BLOCK_SIZE_K": 64,
126
+ "GROUP_SIZE_M": 1,
127
+ "num_warps": 4,
128
+ "num_stages": 3
129
+ },
130
+ "3072": {
131
+ "BLOCK_SIZE_M": 64,
132
+ "BLOCK_SIZE_N": 256,
133
+ "BLOCK_SIZE_K": 64,
134
+ "GROUP_SIZE_M": 1,
135
+ "num_warps": 4,
136
+ "num_stages": 3
137
+ },
138
+ "4096": {
139
+ "BLOCK_SIZE_M": 128,
140
+ "BLOCK_SIZE_N": 256,
141
+ "BLOCK_SIZE_K": 64,
142
+ "GROUP_SIZE_M": 1,
143
+ "num_warps": 8,
144
+ "num_stages": 3
145
+ }
146
+ }
@@ -0,0 +1,146 @@
1
+ {
2
+ "1": {
3
+ "BLOCK_SIZE_M": 16,
4
+ "BLOCK_SIZE_N": 64,
5
+ "BLOCK_SIZE_K": 128,
6
+ "GROUP_SIZE_M": 1,
7
+ "num_warps": 4,
8
+ "num_stages": 3
9
+ },
10
+ "2": {
11
+ "BLOCK_SIZE_M": 16,
12
+ "BLOCK_SIZE_N": 64,
13
+ "BLOCK_SIZE_K": 64,
14
+ "GROUP_SIZE_M": 1,
15
+ "num_warps": 4,
16
+ "num_stages": 3
17
+ },
18
+ "4": {
19
+ "BLOCK_SIZE_M": 16,
20
+ "BLOCK_SIZE_N": 64,
21
+ "BLOCK_SIZE_K": 64,
22
+ "GROUP_SIZE_M": 64,
23
+ "num_warps": 4,
24
+ "num_stages": 5
25
+ },
26
+ "8": {
27
+ "BLOCK_SIZE_M": 16,
28
+ "BLOCK_SIZE_N": 128,
29
+ "BLOCK_SIZE_K": 128,
30
+ "GROUP_SIZE_M": 1,
31
+ "num_warps": 8,
32
+ "num_stages": 5
33
+ },
34
+ "16": {
35
+ "BLOCK_SIZE_M": 16,
36
+ "BLOCK_SIZE_N": 128,
37
+ "BLOCK_SIZE_K": 128,
38
+ "GROUP_SIZE_M": 1,
39
+ "num_warps": 8,
40
+ "num_stages": 5
41
+ },
42
+ "24": {
43
+ "BLOCK_SIZE_M": 16,
44
+ "BLOCK_SIZE_N": 64,
45
+ "BLOCK_SIZE_K": 256,
46
+ "GROUP_SIZE_M": 16,
47
+ "num_warps": 4,
48
+ "num_stages": 3
49
+ },
50
+ "32": {
51
+ "BLOCK_SIZE_M": 16,
52
+ "BLOCK_SIZE_N": 128,
53
+ "BLOCK_SIZE_K": 256,
54
+ "GROUP_SIZE_M": 1,
55
+ "num_warps": 8,
56
+ "num_stages": 3
57
+ },
58
+ "48": {
59
+ "BLOCK_SIZE_M": 16,
60
+ "BLOCK_SIZE_N": 128,
61
+ "BLOCK_SIZE_K": 256,
62
+ "GROUP_SIZE_M": 1,
63
+ "num_warps": 8,
64
+ "num_stages": 3
65
+ },
66
+ "64": {
67
+ "BLOCK_SIZE_M": 16,
68
+ "BLOCK_SIZE_N": 256,
69
+ "BLOCK_SIZE_K": 128,
70
+ "GROUP_SIZE_M": 1,
71
+ "num_warps": 8,
72
+ "num_stages": 3
73
+ },
74
+ "96": {
75
+ "BLOCK_SIZE_M": 16,
76
+ "BLOCK_SIZE_N": 128,
77
+ "BLOCK_SIZE_K": 128,
78
+ "GROUP_SIZE_M": 1,
79
+ "num_warps": 4,
80
+ "num_stages": 2
81
+ },
82
+ "128": {
83
+ "BLOCK_SIZE_M": 16,
84
+ "BLOCK_SIZE_N": 128,
85
+ "BLOCK_SIZE_K": 128,
86
+ "GROUP_SIZE_M": 16,
87
+ "num_warps": 4,
88
+ "num_stages": 2
89
+ },
90
+ "256": {
91
+ "BLOCK_SIZE_M": 32,
92
+ "BLOCK_SIZE_N": 256,
93
+ "BLOCK_SIZE_K": 128,
94
+ "GROUP_SIZE_M": 1,
95
+ "num_warps": 4,
96
+ "num_stages": 2
97
+ },
98
+ "512": {
99
+ "BLOCK_SIZE_M": 64,
100
+ "BLOCK_SIZE_N": 128,
101
+ "BLOCK_SIZE_K": 128,
102
+ "GROUP_SIZE_M": 1,
103
+ "num_warps": 4,
104
+ "num_stages": 2
105
+ },
106
+ "1024": {
107
+ "BLOCK_SIZE_M": 64,
108
+ "BLOCK_SIZE_N": 128,
109
+ "BLOCK_SIZE_K": 64,
110
+ "GROUP_SIZE_M": 1,
111
+ "num_warps": 4,
112
+ "num_stages": 3
113
+ },
114
+ "1536": {
115
+ "BLOCK_SIZE_M": 128,
116
+ "BLOCK_SIZE_N": 256,
117
+ "BLOCK_SIZE_K": 64,
118
+ "GROUP_SIZE_M": 1,
119
+ "num_warps": 8,
120
+ "num_stages": 4
121
+ },
122
+ "2048": {
123
+ "BLOCK_SIZE_M": 128,
124
+ "BLOCK_SIZE_N": 256,
125
+ "BLOCK_SIZE_K": 64,
126
+ "GROUP_SIZE_M": 1,
127
+ "num_warps": 8,
128
+ "num_stages": 4
129
+ },
130
+ "3072": {
131
+ "BLOCK_SIZE_M": 128,
132
+ "BLOCK_SIZE_N": 256,
133
+ "BLOCK_SIZE_K": 64,
134
+ "GROUP_SIZE_M": 1,
135
+ "num_warps": 8,
136
+ "num_stages": 4
137
+ },
138
+ "4096": {
139
+ "BLOCK_SIZE_M": 128,
140
+ "BLOCK_SIZE_N": 256,
141
+ "BLOCK_SIZE_K": 64,
142
+ "GROUP_SIZE_M": 1,
143
+ "num_warps": 8,
144
+ "num_stages": 4
145
+ }
146
+ }
@@ -940,10 +940,12 @@ def get_moe_configs(
940
940
  )
941
941
  if os.path.exists(config_file_path):
942
942
  with open(config_file_path) as f:
943
- logger.info(
944
- "Using configuration from %s for MoE layer. Please note that due to the large number of configs under fused_moe_triton/configs potentially not being tuned with the corresponding Triton version in your current environment, using the current configs may result in performance degradation. To achieve best performance, you can consider re-tuning the Triton fused MOE kernel in your current environment. For the tuning method, please refer to: https://github.com/sgl-project/sglang/blob/main/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py. ",
945
- config_file_path,
946
- )
943
+ # Please note that although we find the config files, performance might still be suboptimal.
944
+ # This is because the tuning environment might differ from your current environment.
945
+ # For example, updating the Triton version might cause all old configs to become suboptimal.
946
+ # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
947
+ # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
948
+ logger.info("Using MoE kernel config from %s.", config_file_path)
947
949
  # If a configuration has been found, return it
948
950
  return {int(key): val for key, val in json.load(f).items()}
949
951
 
@@ -951,8 +953,8 @@ def get_moe_configs(
951
953
  # configuration
952
954
  logger.warning(
953
955
  (
954
- "Using default MoE config. Performance might be sub-optimal! "
955
- "Config file not found at %s, you can tune the config with https://github.com/sgl-project/sglang/blob/main/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py."
956
+ "Using default MoE kernel config. Performance might be sub-optimal! "
957
+ "Config file not found at %s, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton"
956
958
  ),
957
959
  config_file_path,
958
960
  )
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
109
109
  if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
110
110
  raise ValueError(
111
111
  f"{quantization} quantization requires some operators from vllm. "
112
- "Pleaes install vllm by `pip install vllm==0.7.2`"
112
+ "Pleaes install vllm by `pip install vllm==0.8.4`"
113
113
  )
114
114
 
115
115
  return QUANTIZATION_METHODS[quantization]
@@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
310
310
  if correction_bias is not None:
311
311
  if not has_correction_bias:
312
312
  raise ValueError(
313
- "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
313
+ "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
314
314
  )
315
315
  kwargs["e_score_correction_bias"] = correction_bias
316
316
  return original_apply(**kwargs)
@@ -293,7 +293,7 @@ def _maybe_compile_deep_gemm_one_type_all(
293
293
  logger.info(
294
294
  f"Try DeepGEMM JIT Compiling for "
295
295
  f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
296
- f"{' It only takes a litte time(Typically 1 sec) if you have run `sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
296
+ f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
297
297
  )
298
298
 
299
299
  # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
@@ -0,0 +1,35 @@
1
+ import logging
2
+ import re
3
+
4
+ import torch
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ def get_layer_id(weight_name):
10
+ # example weight name: model.layers.10.self_attn.qkv_proj.weight
11
+ match = re.search(r"layers\.(\d+)\.", weight_name)
12
+ if match:
13
+ return int(match.group(1))
14
+ return None
15
+
16
+
17
+ class PPMissingLayer(torch.nn.Identity):
18
+ # Adapted from
19
+ # https://github.com/vllm-project/vllm/blob/18ed3132d2bfe1df9a74729457b69243955221e8/vllm/model_executor/models/utils.py#L468C1-L486C1
20
+ """
21
+ A placeholder layer for missing layers in a pipeline parallel model.
22
+ """
23
+
24
+ def __init__(self, *args, **kwargs):
25
+ super().__init__()
26
+ self.return_tuple = kwargs.get("return_tuple", False)
27
+
28
+ def forward(self, *args, **kwargs):
29
+ """
30
+ Return the first arg from args or the first value from kwargs.
31
+
32
+ Wraps the input in a tuple if `self.return_tuple` is True.
33
+ """
34
+ input = args[0] if args else next(iter(kwargs.values()))
35
+ return (input,) if self.return_tuple else input
sglang/srt/lora/layers.py CHANGED
@@ -136,11 +136,19 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
136
136
  self.set_lora = True
137
137
  self.A_buffer_gate_up = A_buffer
138
138
  if self.lora_backend.fuse_stacked_lora_b:
139
- # TODO: avoid using contiguous() in GPU.
140
139
  # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
141
- self.B_buffer_gate_up = torch.cat(
142
- (B_buffer[0], B_buffer[1]), dim=-2
143
- ).contiguous()
140
+ if not hasattr(self, "B_buffer_gate_up") or self.B_buffer_gate_up is None:
141
+ self.B_buffer_gate_up = torch.empty(
142
+ (
143
+ B_buffer[0].shape[0],
144
+ 2 * B_buffer[0].shape[1],
145
+ B_buffer[0].shape[2],
146
+ ),
147
+ dtype=B_buffer[0].dtype,
148
+ device=B_buffer[0].device,
149
+ )
150
+ self.B_buffer_gate_up[:, : B_buffer[0].shape[1], :].copy_(B_buffer[0])
151
+ self.B_buffer_gate_up[:, B_buffer[0].shape[1] :, :].copy_(B_buffer[1])
144
152
  else:
145
153
  self.B_buffer_gate_up = (B_buffer[0], B_buffer[1])
146
154
 
@@ -171,7 +179,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
171
179
 
172
180
 
173
181
  class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
174
- def init__(
182
+ def __init__(
175
183
  self,
176
184
  base_layer: QKVParallelLinear,
177
185
  lora_backend: BaseLoRABackend,
@@ -194,12 +202,30 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
194
202
  output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
195
203
 
196
204
  # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
197
- self.B_buffer_qkv = torch.cat(
198
- (B_buffer_q[0], B_buffer_kv[0], B_buffer_kv[1]), dim=-2
199
- ).contiguous()
205
+ if not hasattr(self, "B_buffer_qkv") or self.B_buffer_qkv is None:
206
+ self.B_buffer_qkv = torch.empty(
207
+ (
208
+ B_buffer_q[0].shape[0],
209
+ output_dim_q + 2 * output_dim_kv,
210
+ B_buffer_q[0].shape[2],
211
+ ),
212
+ dtype=B_buffer_q[0].dtype,
213
+ device=B_buffer_q[0].device,
214
+ )
215
+ self.B_buffer_qkv[:, :output_dim_q, :].copy_(B_buffer_q[0])
216
+ self.B_buffer_qkv[:, output_dim_q : output_dim_q + output_dim_kv, :].copy_(
217
+ B_buffer_kv[0]
218
+ )
219
+ self.B_buffer_qkv[:, output_dim_q + output_dim_kv :, :].copy_(
220
+ B_buffer_kv[1]
221
+ )
200
222
 
201
223
  # Offsets of q/k/v in output dimension
202
- self.output_offset = torch.tensor(
224
+ if not hasattr(self, "output_offset") or self.output_offset is None:
225
+ self.output_offset = torch.empty(
226
+ 4, dtype=torch.int32, device=B_buffer_q.device
227
+ )
228
+ self.output_offset[:4] = torch.tensor(
203
229
  [
204
230
  0,
205
231
  output_dim_q,