sglang 0.4.1.post4__py3-none-any.whl → 0.4.1.post6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (61)
  1. sglang/bench_serving.py +18 -1
  2. sglang/lang/interpreter.py +71 -1
  3. sglang/lang/ir.py +2 -0
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/chatglm.py +78 -0
  6. sglang/srt/configs/dbrx.py +279 -0
  7. sglang/srt/configs/model_config.py +16 -7
  8. sglang/srt/hf_transformers_utils.py +9 -14
  9. sglang/srt/layers/attention/__init__.py +8 -1
  10. sglang/srt/layers/attention/flashinfer_backend.py +21 -5
  11. sglang/srt/layers/linear.py +89 -47
  12. sglang/srt/layers/logits_processor.py +6 -6
  13. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +16 -5
  14. sglang/srt/layers/moe/fused_moe_triton/layer.py +39 -12
  15. sglang/srt/layers/moe/topk.py +4 -2
  16. sglang/srt/layers/parameter.py +439 -0
  17. sglang/srt/layers/quantization/__init__.py +5 -2
  18. sglang/srt/layers/quantization/fp8.py +107 -53
  19. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  20. sglang/srt/layers/quantization/int8_kernel.py +54 -0
  21. sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  22. sglang/srt/layers/quantization/w8a8_int8.py +117 -0
  23. sglang/srt/layers/radix_attention.py +2 -0
  24. sglang/srt/layers/vocab_parallel_embedding.py +16 -3
  25. sglang/srt/managers/cache_controller.py +307 -0
  26. sglang/srt/managers/configure_logging.py +43 -0
  27. sglang/srt/managers/data_parallel_controller.py +2 -0
  28. sglang/srt/managers/detokenizer_manager.py +0 -2
  29. sglang/srt/managers/io_struct.py +29 -13
  30. sglang/srt/managers/schedule_batch.py +7 -1
  31. sglang/srt/managers/scheduler.py +58 -15
  32. sglang/srt/managers/session_controller.py +1 -1
  33. sglang/srt/managers/tokenizer_manager.py +109 -45
  34. sglang/srt/mem_cache/memory_pool.py +313 -53
  35. sglang/srt/metrics/collector.py +32 -35
  36. sglang/srt/model_executor/cuda_graph_runner.py +14 -7
  37. sglang/srt/model_executor/forward_batch_info.py +20 -15
  38. sglang/srt/model_executor/model_runner.py +53 -10
  39. sglang/srt/models/chatglm.py +1 -1
  40. sglang/srt/models/dbrx.py +1 -1
  41. sglang/srt/models/grok.py +25 -16
  42. sglang/srt/models/llama.py +46 -4
  43. sglang/srt/models/qwen2.py +11 -0
  44. sglang/srt/models/qwen2_eagle.py +131 -0
  45. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
  46. sglang/srt/sampling/sampling_batch_info.py +15 -5
  47. sglang/srt/sampling/sampling_params.py +1 -1
  48. sglang/srt/server.py +125 -69
  49. sglang/srt/server_args.py +39 -19
  50. sglang/srt/speculative/eagle_utils.py +93 -85
  51. sglang/srt/speculative/eagle_worker.py +48 -33
  52. sglang/srt/torch_memory_saver_adapter.py +59 -0
  53. sglang/srt/utils.py +61 -5
  54. sglang/test/test_programs.py +23 -1
  55. sglang/test/test_utils.py +36 -7
  56. sglang/version.py +1 -1
  57. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/METADATA +16 -15
  58. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/RECORD +61 -51
  59. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/LICENSE +0 -0
  61. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/top_level.txt +0 -0
sglang/srt/layers/parameter.py
@@ -0,0 +1,439 @@
+ """Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py"""
+
+ import logging
+ from fractions import Fraction
+ from typing import Callable, Optional, Union
+
+ import torch
+ from torch.nn import Parameter
+ from vllm.distributed import get_tensor_model_parallel_rank
+
+ __all__ = [
+     "BasevLLMParameter",
+     "PackedvLLMParameter",
+     "PerTensorScaleParameter",
+     "ModelWeightParameter",
+     "ChannelQuantScaleParameter",
+     "GroupQuantScaleParameter",
+     "PackedColumnParameter",
+     "RowvLLMParameter",
+ ]
+
+ logger = logging.getLogger(__name__)
+
+
+ class BasevLLMParameter(Parameter):
+     """
+     Base parameter for vLLM linear layers. Extends the torch.nn.parameter
+     by taking in a linear weight loader. Will copy the loaded weight
+     into the parameter when the provided weight loader is called.
+     """
+
+     def __new__(cls, data: torch.Tensor, **kwargs):
+
+         return super().__new__(cls, data=data, requires_grad=False)
+
+     def __init__(self, data: torch.Tensor, weight_loader: Callable):
+         """
+         Initialize the BasevLLMParameter
+
+         :param data: torch tensor with the parameter data
+         :param weight_loader: weight loader callable
+
+         :returns: a torch.nn.parameter
+         """
+
+         self._weight_loader = weight_loader
+
+     @property
+     def weight_loader(self):
+         return self._weight_loader
+
+     def _assert_and_load(self, loaded_weight: torch.Tensor):
+         assert self.data.shape == loaded_weight.shape
+         self.data.copy_(loaded_weight)
+
+     def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
+         self._assert_and_load(loaded_weight)
+
+     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+         self._assert_and_load(loaded_weight)
+
+     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+         self._assert_and_load(loaded_weight)
+
+     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+         self._assert_and_load(loaded_weight)
+
+
+ class _ColumnvLLMParameter(BasevLLMParameter):
+     """
+     Private class defining weight loading functionality
+     (load_merged_column_weight, load_qkv_weight)
+     for parameters being loaded into linear layers with column
+     parallelism. This includes QKV and MLP layers which are
+     not already fused on disk. Requires an output dimension
+     to be defined. Called within the weight loader of
+     each of the column parallel linear layers.
+     """
+
+     def __init__(self, output_dim: int, **kwargs):
+         self._output_dim = output_dim
+         super().__init__(**kwargs)
+
+     @property
+     def output_dim(self):
+         return self._output_dim
+
+     def load_column_parallel_weight(
+         self,
+         loaded_weight: torch.Tensor,
+         tp_rank: int,
+         use_presharded_weights: bool = False,
+     ):
+         if not use_presharded_weights:
+             shard_size = self.data.shape[self.output_dim]
+             loaded_weight = loaded_weight.narrow(
+                 self.output_dim, tp_rank * shard_size, shard_size
+             )
+         assert self.data.shape == loaded_weight.shape
+         self.data.copy_(loaded_weight)
+
+     def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+
+         shard_offset = kwargs.get("shard_offset")
+         shard_size = kwargs.get("shard_size")
+         use_presharded_weights = kwargs.get("use_presharded_weights")
+         if (
+             isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+             and self.packed_dim == self.output_dim
+         ):
+             shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                 shard_offset=shard_offset, shard_size=shard_size
+             )
+
+         param_data = self.data
+
+         tp_rank = get_tensor_model_parallel_rank()
+         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+         if not use_presharded_weights:
+             loaded_weight = loaded_weight.narrow(
+                 self.output_dim, tp_rank * shard_size, shard_size
+             )
+         assert param_data.shape == loaded_weight.shape
+         param_data.copy_(loaded_weight)
+
+     def load_qkv_weight(self, loaded_weight: torch.Tensor, tp_rank: int, **kwargs):
+
+         shard_offset = kwargs.get("shard_offset")
+         shard_size = kwargs.get("shard_size")
+         shard_id = kwargs.get("shard_id")
+         num_heads = kwargs.get("num_heads")
+
+         if (
+             isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+             and self.output_dim == self.packed_dim
+         ):
+             shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                 shard_offset=shard_offset, shard_size=shard_size
+             )
+
+         param_data = self.data
+         shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
+         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+         loaded_weight = loaded_weight.narrow(
+             self.output_dim, shard_id * shard_size, shard_size
+         )
+
+         assert param_data.shape == loaded_weight.shape
+         param_data.copy_(loaded_weight)
+
+
+ class RowvLLMParameter(BasevLLMParameter):
+     """
+     Parameter class defining weight_loading functionality
+     (load_row_parallel_weight) for parameters being loaded
+     into linear layers with row parallel functionality.
+     Requires an input_dim to be defined.
+     """
+
+     def __init__(self, input_dim: int, **kwargs):
+         self._input_dim = input_dim
+         super().__init__(**kwargs)
+
+     @property
+     def input_dim(self):
+         return self._input_dim
+
+     def load_row_parallel_weight(
+         self,
+         loaded_weight: torch.Tensor,
+         tp_rank: int,
+         use_presharded_weights: bool = False,
+     ):
+         if not use_presharded_weights:
+             shard_size = self.data.shape[self.input_dim]
+             loaded_weight = loaded_weight.narrow(
+                 self.input_dim, tp_rank * shard_size, shard_size
+             )
+
+         if len(loaded_weight.shape) == 0:
+             loaded_weight = loaded_weight.reshape(1)
+
+         assert self.data.shape == loaded_weight.shape
+         self.data.copy_(loaded_weight)
+
+
+ class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter):
+     """
+     Parameter class for linear layer weights. Uses both column and
+     row parallelism.
+     """
+
+     pass
+
+
+ class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+     """
+     Parameter class for weight scales loaded for weights with
+     grouped quantization. Uses both column and row parallelism.
+     """
+
+     pass
+
+
+ class ChannelQuantScaleParameter(_ColumnvLLMParameter):
+     """
+     Parameter class for weight scales loaded for weights with
+     channel-wise quantization. Equivalent to _ColumnvLLMParameter.
+     """
+
+     pass
+
+
+ class PerTensorScaleParameter(BasevLLMParameter):
+     """
+     Parameter class for scales where the number of scales is
+     equivalent to the number of logical matrices in fused linear
+     layers (e.g. for QKV, there are 3 scales loaded from disk).
+     This is relevant to weights with per-tensor quantization.
+     Adds functionality to map the scalers to a shard during
+     weight loading.
+
+     Note: additional parameter manipulation may be handled
+     for each quantization config specifically, within
+     process_weights_after_loading
+     """
+
+     def __init__(self, **kwargs):
+         self.qkv_idxs = {"q": 0, "k": 1, "v": 2}
+         super().__init__(**kwargs)
+
+     def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+         if isinstance(shard_id, int):
+             return shard_id
+
+         # if not int, assume shard_id for qkv
+         # map to int and return
+         assert isinstance(shard_id, str)
+         assert shard_id in self.qkv_idxs
+         return self.qkv_idxs[shard_id]
+
+     # For row parallel layers, no sharding needed
+     # load weight into parameter as is
+     def load_row_parallel_weight(self, *args, **kwargs):
+         kwargs.pop("tp_rank", None)
+         kwargs.pop("use_presharded_weights", None)
+         super().load_row_parallel_weight(*args, **kwargs)
+
+     def load_merged_column_weight(self, *args, **kwargs):
+         self._load_into_shard_id(*args, **kwargs)
+
+     def load_qkv_weight(self, *args, **kwargs):
+         self._load_into_shard_id(*args, **kwargs)
+
+     def load_column_parallel_weight(self, *args, **kwargs):
+         kwargs.pop("tp_rank", None)
+         kwargs.pop("use_presharded_weights", None)
+         super().load_row_parallel_weight(*args, **kwargs)
+
+     def _load_into_shard_id(
+         self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs
+     ):
+         """
+         Slice the parameter data based on the shard id for
+         loading.
+         """
+
+         param_data = self.data
+         shard_id = self._shard_id_as_int(shard_id)
+
+         # AutoFP8 scales do not have a shape
+         # compressed-tensors scales do have a shape
+         if len(loaded_weight.shape) != 0:
+             assert loaded_weight.shape[0] == 1
+             loaded_weight = loaded_weight[0]
+
+         param_data = param_data[shard_id]
+         assert param_data.shape == loaded_weight.shape
+         param_data.copy_(loaded_weight)
+
+
+ class PackedColumnParameter(_ColumnvLLMParameter):
+     """
+     Parameter for model parameters which are packed on disk
+     and support column parallelism only. See PackedvLLMParameter
+     for more details on the packed properties.
+     """
+
+     def __init__(
+         self,
+         packed_factor: Union[int, Fraction],
+         packed_dim: int,
+         marlin_tile_size: Optional[int] = None,
+         **kwargs
+     ):
+         self._packed_factor = packed_factor
+         self._packed_dim = packed_dim
+         self._marlin_tile_size = marlin_tile_size
+         super().__init__(**kwargs)
+
+     @property
+     def packed_dim(self):
+         return self._packed_dim
+
+     @property
+     def packed_factor(self):
+         return self._packed_factor
+
+     @property
+     def marlin_tile_size(self):
+         return self._marlin_tile_size
+
+     def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+         return _adjust_shard_indexes_for_packing(
+             shard_size=shard_size,
+             shard_offset=shard_offset,
+             packed_factor=self.packed_factor,
+             marlin_tile_size=self.marlin_tile_size,
+         )
+
+
+ class PackedvLLMParameter(ModelWeightParameter):
+     """
+     Parameter for model weights which are packed on disk.
+     Example: GPTQ Marlin weights are int4 or int8, packed into int32.
+     Extends the ModelWeightParameter to take in the
+     packed factor, the packed dimension, and optionally, marlin
+     tile size for marlin kernels. Adjusts the shard_size and
+     shard_offset for fused linear layers model weight loading
+     by accounting for packing and optionally, marlin tile size.
+     """
+
+     def __init__(
+         self,
+         packed_factor: Union[int, Fraction],
+         packed_dim: int,
+         marlin_tile_size: Optional[int] = None,
+         **kwargs
+     ):
+         self._packed_factor = packed_factor
+         self._packed_dim = packed_dim
+         self._marlin_tile_size = marlin_tile_size
+         super().__init__(**kwargs)
+
+     @property
+     def packed_dim(self):
+         return self._packed_dim
+
+     @property
+     def packed_factor(self):
+         return self._packed_factor
+
+     @property
+     def marlin_tile_size(self):
+         return self._marlin_tile_size
+
+     def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+         return _adjust_shard_indexes_for_packing(
+             shard_size=shard_size,
+             shard_offset=shard_offset,
+             packed_factor=self.packed_factor,
+             marlin_tile_size=self.marlin_tile_size,
+         )
+
+
+ def permute_param_layout_(
+     param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs
+ ) -> BasevLLMParameter:
+     """
+     Permute a parameter's layout to the specified input and output dimensions,
+     useful for forcing the parameter into a known layout, for example, if I need
+     a packed (quantized) weight matrix to be in the layout
+     {input_dim = 0, output_dim = 1, packed_dim = 0}
+     then I can call:
+     permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+     to ensure x is in the correct layout (permuting it to the correct layout if
+     required, asserting if it cannot get it to the correct layout)
+     """
+
+     curr_input_dim = getattr(param, "input_dim", None)
+     curr_output_dim = getattr(param, "output_dim", None)
+
+     if curr_input_dim is None or curr_output_dim is None:
+         assert param.data.dim() == 2, (
+             "permute_param_layout_ only supports 2D parameters when either "
+             "input_dim or output_dim is not set"
+         )
+
+     # if one of the dimensions is not set, set it to the opposite of the other
+     # we can only do this since we asserted the parameter is 2D above
+     if curr_input_dim is None:
+         assert curr_output_dim is not None, "either input or output dim must be set"
+         curr_input_dim = (curr_output_dim + 1) % 2
+     if curr_output_dim is None:
+         assert curr_input_dim is not None, "either input or output dim must be set"
+         curr_output_dim = (curr_input_dim + 1) % 2
+
+     # create permutation from the current layout to the layout with
+     # self.input_dim at input_dim and self.output_dim at output_dim preserving
+     # other dimensions
+     perm = [
+         i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim]
+     ]
+     perm.insert(input_dim, curr_input_dim)
+     perm.insert(output_dim, curr_output_dim)
+
+     if "packed_dim" in kwargs:
+         assert (
+             hasattr(param, "packed_dim")
+             and param.packed_dim == perm[kwargs["packed_dim"]]
+         ), "permute_param_layout_ currently doesn't support repacking"
+
+     param.data = param.data.permute(*perm)
+     if hasattr(param, "_input_dim"):
+         param._input_dim = input_dim
+     if hasattr(param, "_output_dim"):
+         param._output_dim = output_dim
+     if "packed_dim" in kwargs and hasattr(param, "_packed_dim"):
+         param._packed_dim = kwargs["packed_dim"]
+
+     return param
+
+
+ def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size):
+     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+ def _adjust_shard_indexes_for_packing(
+     shard_size, shard_offset, packed_factor, marlin_tile_size
+ ):
+     shard_size = shard_size // packed_factor
+     shard_offset = shard_offset // packed_factor
+     if marlin_tile_size is not None:
+         return _adjust_shard_indexes_for_marlin(
+             shard_size=shard_size,
+             shard_offset=shard_offset,
+             marlin_tile_size=marlin_tile_size,
+         )
+     return shard_size, shard_offset
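
The new `sglang/srt/layers/parameter.py` module gives linear and quantization layers parameter objects that know how to shard themselves at weight-loading time. Below is a minimal usage sketch; the shapes, the `_copy_loader` helper, and the surrounding setup are hypothetical illustrations, while `ModelWeightParameter` and `load_column_parallel_weight` come from the code above.

```python
# Hypothetical sketch of the API added above; only ModelWeightParameter and
# load_column_parallel_weight are from the diff, the rest is illustrative.
import torch

from sglang.srt.layers.parameter import ModelWeightParameter


def _copy_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None:
    # Simplest possible weight loader: copy an already-sliced tensor into place.
    param.data.copy_(loaded_weight)


# A column-parallel weight of shape (output_size_per_partition, input_size).
weight = ModelWeightParameter(
    data=torch.empty(1024, 4096, dtype=torch.float16),
    input_dim=1,   # row-parallel sharding would slice along dim 1
    output_dim=0,  # column-parallel sharding slices along dim 0
    weight_loader=_copy_loader,
)

# At checkpoint-loading time the layer passes the full, unsharded tensor; the
# parameter narrows it to this tensor-parallel rank's shard before copying.
full_weight = torch.randn(4 * 1024, 4096, dtype=torch.float16)  # tp_size = 4
weight.load_column_parallel_weight(full_weight, tp_rank=2)
assert weight.shape == (1024, 4096)
```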
sglang/srt/layers/quantization/__init__.py
@@ -1,8 +1,7 @@
  # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py

- from typing import Callable, Dict, Optional, Type
+ from typing import Dict, Type

- import torch
  from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
  from vllm.model_executor.layers.quantization.awq import AWQConfig
  from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
@@ -23,6 +22,8 @@ from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.quantization.fp8 import Fp8Config
+ from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config
+ from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config

  QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
      "aqlm": AQLMConfig,
@@ -32,6 +33,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
      "fp8": Fp8Config,
      "fbgemm_fp8": FBGEMMFp8Config,
      "marlin": MarlinConfig,
+     "modelopt": ModelOptFp8Config,
      "gguf": GGUFConfig,
      "gptq_marlin_24": GPTQMarlin24Config,
      "gptq_marlin": GPTQMarlinConfig,
@@ -41,6 +43,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
      "bitsandbytes": BitsAndBytesConfig,
      "qqq": QQQConfig,
      "experts_int8": ExpertsInt8Config,
+     "w8a8_int8": W8A8Int8Config,
  }

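
The registry change above makes the two new config classes resolvable by name alongside the existing methods. A rough illustration follows; the `resolve_quantization_config` helper is not part of the package, just a sketch of how the mapping is consumed.

```python
# Illustrative lookup against the QUANTIZATION_METHODS registry shown above;
# the helper function is a sketch, not part of sglang itself.
from typing import Type

from sglang.srt.layers.quantization import QUANTIZATION_METHODS
from sglang.srt.layers.quantization.base_config import QuantizationConfig


def resolve_quantization_config(method: str) -> Type[QuantizationConfig]:
    if method not in QUANTIZATION_METHODS:
        raise ValueError(
            f"Unknown quantization method: {method!r}. "
            f"Available: {sorted(QUANTIZATION_METHODS)}"
        )
    return QUANTIZATION_METHODS[method]


# New in 0.4.1.post6:
print(resolve_quantization_config("modelopt").__name__)   # ModelOptFp8Config
print(resolve_quantization_config("w8a8_int8").__name__)  # W8A8Int8Config
```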