sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +16 -6
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +27 -12
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +76 -102
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -14
  41. sglang/srt/layers/moe/topk.py +4 -2
  42. sglang/srt/layers/parameter.py +26 -17
  43. sglang/srt/layers/quantization/__init__.py +22 -23
  44. sglang/srt/layers/quantization/fp8.py +112 -55
  45. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  46. sglang/srt/layers/quantization/int8_kernel.py +54 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +2 -3
  48. sglang/srt/layers/quantization/w8a8_int8.py +117 -0
  49. sglang/srt/layers/radix_attention.py +2 -0
  50. sglang/srt/layers/rotary_embedding.py +1179 -31
  51. sglang/srt/layers/sampler.py +39 -1
  52. sglang/srt/layers/vocab_parallel_embedding.py +17 -4
  53. sglang/srt/lora/lora.py +1 -9
  54. sglang/srt/managers/configure_logging.py +46 -0
  55. sglang/srt/managers/data_parallel_controller.py +79 -72
  56. sglang/srt/managers/detokenizer_manager.py +23 -8
  57. sglang/srt/managers/image_processor.py +158 -2
  58. sglang/srt/managers/io_struct.py +54 -15
  59. sglang/srt/managers/schedule_batch.py +49 -22
  60. sglang/srt/managers/schedule_policy.py +26 -12
  61. sglang/srt/managers/scheduler.py +319 -181
  62. sglang/srt/managers/session_controller.py +1 -0
  63. sglang/srt/managers/tokenizer_manager.py +303 -158
  64. sglang/srt/managers/tp_worker.py +6 -4
  65. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  66. sglang/srt/managers/utils.py +44 -0
  67. sglang/srt/mem_cache/memory_pool.py +110 -77
  68. sglang/srt/metrics/collector.py +25 -11
  69. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  70. sglang/srt/model_executor/model_runner.py +80 -21
  71. sglang/srt/model_loader/loader.py +8 -6
  72. sglang/srt/model_loader/weight_utils.py +55 -2
  73. sglang/srt/models/baichuan.py +6 -6
  74. sglang/srt/models/chatglm.py +2 -2
  75. sglang/srt/models/commandr.py +3 -3
  76. sglang/srt/models/dbrx.py +4 -4
  77. sglang/srt/models/deepseek.py +3 -3
  78. sglang/srt/models/deepseek_v2.py +8 -8
  79. sglang/srt/models/exaone.py +2 -2
  80. sglang/srt/models/gemma.py +2 -2
  81. sglang/srt/models/gemma2.py +6 -24
  82. sglang/srt/models/gpt2.py +3 -5
  83. sglang/srt/models/gpt_bigcode.py +1 -1
  84. sglang/srt/models/granite.py +2 -2
  85. sglang/srt/models/grok.py +3 -3
  86. sglang/srt/models/internlm2.py +2 -2
  87. sglang/srt/models/llama.py +41 -4
  88. sglang/srt/models/minicpm.py +2 -2
  89. sglang/srt/models/minicpm3.py +6 -6
  90. sglang/srt/models/minicpmv.py +1238 -0
  91. sglang/srt/models/mixtral.py +3 -3
  92. sglang/srt/models/mixtral_quant.py +3 -3
  93. sglang/srt/models/mllama.py +2 -2
  94. sglang/srt/models/olmo.py +3 -3
  95. sglang/srt/models/olmo2.py +4 -4
  96. sglang/srt/models/olmoe.py +7 -13
  97. sglang/srt/models/phi3_small.py +2 -2
  98. sglang/srt/models/qwen.py +2 -2
  99. sglang/srt/models/qwen2.py +52 -4
  100. sglang/srt/models/qwen2_eagle.py +131 -0
  101. sglang/srt/models/qwen2_moe.py +3 -3
  102. sglang/srt/models/qwen2_vl.py +22 -122
  103. sglang/srt/models/stablelm.py +2 -2
  104. sglang/srt/models/torch_native_llama.py +3 -3
  105. sglang/srt/models/xverse.py +6 -6
  106. sglang/srt/models/xverse_moe.py +6 -6
  107. sglang/srt/openai_api/protocol.py +2 -0
  108. sglang/srt/sampling/custom_logit_processor.py +38 -0
  109. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
  110. sglang/srt/sampling/sampling_batch_info.py +153 -9
  111. sglang/srt/sampling/sampling_params.py +4 -2
  112. sglang/srt/server.py +4 -1037
  113. sglang/srt/server_args.py +84 -32
  114. sglang/srt/speculative/eagle_worker.py +1 -0
  115. sglang/srt/torch_memory_saver_adapter.py +59 -0
  116. sglang/srt/utils.py +130 -63
  117. sglang/test/runners.py +8 -13
  118. sglang/test/test_programs.py +1 -1
  119. sglang/test/test_utils.py +3 -1
  120. sglang/utils.py +12 -2
  121. sglang/version.py +1 -1
  122. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +26 -13
  123. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +126 -117
  124. sglang/launch_server_llavavid.py +0 -25
  125. sglang/srt/constrained/__init__.py +0 -16
  126. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  127. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  128. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  129. {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
@@ -22,9 +22,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GraniteConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,6 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
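The two hunks above show a pattern that repeats across most of the model files in this release: tensor-parallel helpers and get_rope are no longer imported from vLLM but from sglang's own modules. A minimal before/after sketch of the migration; the import paths are taken verbatim from the hunks, while the get_rope call at the end is illustrative and its keyword names are an assumption, not copied from the diff:

# Old (0.4.1.post5): symbols came from vLLM
# from vllm.distributed import get_tensor_model_parallel_world_size
# from vllm.model_executor.layers.rotary_embedding import get_rope

# New (0.4.1.post7): the same symbols live under sglang.srt
from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.rotary_embedding import get_rope

# Illustrative use, unchanged by the migration: build a RoPE module for an
# attention layer (argument names assumed to mirror the vLLM-style API).
rotary_emb = get_rope(head_size=128, rotary_dim=128, max_position=4096, base=10000)
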
sglang/srt/models/grok.py CHANGED
@@ -22,12 +22,11 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,6 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -19,9 +19,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +31,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -22,9 +22,11 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,12 +38,16 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
 from sglang.srt.utils import make_layers
 from sglang.utils import get_exception_traceback
 
@@ -299,6 +305,30 @@ class LlamaModel(nn.Module):
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
 
 class LlamaForCausalLM(nn.Module):
 
@@ -534,9 +564,16 @@ class LlamaForCausalLM(nn.Module):
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
 
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
 
 class Phi3ForCausalLM(LlamaForCausalLM):
     pass
 
 
-EntryClass = [LlamaForCausalLM, Phi3ForCausalLM]
+class InternLM3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM]
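The two llama.py hunks above add the vLLM-style KV-cache scale loading hook: kv_cache_scales_loader yields (layer_idx, scaling_factor) pairs for the current tensor-parallel rank, LlamaModel writes them into each layer's attn.k_scale / attn.v_scale (useful e.g. for FP8 KV cache), and LlamaForCausalLM gains a delegating load_kv_cache_scales method. A hedged sketch of how a caller could use the new hook; the helper function and its variable names are hypothetical, only load_kv_cache_scales itself comes from the diff:

from typing import Iterable, Optional, Tuple

import torch


def build_llama_with_kv_scales(
    config,                                   # a transformers LlamaConfig
    weights: Iterable[Tuple[str, torch.Tensor]],
    quantization_param_path: Optional[str] = None,
):
    # Hypothetical helper: construct the model, load weights, then apply
    # per-layer KV-cache scaling factors via the hook introduced here.
    from sglang.srt.models.llama import LlamaForCausalLM

    model = LlamaForCausalLM(config)
    model.load_weights(weights)
    if quantization_param_path is not None:
        # Delegates to LlamaModel.load_kv_cache_scales, which iterates
        # kv_cache_scales_loader(...) and writes attn.k_scale / attn.v_scale.
        model.load_kv_cache_scales(quantization_param_path)
    return model
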
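The same hunk introduces InternLM3ForCausalLM as a thin alias of LlamaForCausalLM and appends it to EntryClass, the list sglang's model loader consults when matching a checkpoint's architecture name to an implementation class. A small illustrative sketch of what the registration amounts to; the dictionary built here is an assumption about how the registry is consumed, not code from the diff:

from sglang.srt.models.llama import EntryClass

# Resolve an HF config "architectures" entry to one of the registered classes.
arch_to_class = {cls.__name__: cls for cls in EntryClass}

model_cls = arch_to_class["InternLM3ForCausalLM"]
assert issubclass(model_cls, arch_to_class["LlamaForCausalLM"])  # reuses the Llama implementation
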
@@ -18,9 +18,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -31,6 +30,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -19,20 +19,20 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,