sglang 0.4.3.post4__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/hf_transformers_utils.py +16 -1
  14. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  15. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  16. sglang/srt/layers/attention/triton_backend.py +1 -3
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  18. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  19. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  20. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  21. sglang/srt/layers/attention/vision.py +43 -62
  22. sglang/srt/layers/linear.py +1 -1
  23. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  24. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  33. sglang/srt/layers/parameter.py +10 -0
  34. sglang/srt/layers/quantization/__init__.py +90 -68
  35. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  36. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/fp8.py +174 -106
  63. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  64. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  65. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  66. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  67. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  68. sglang/srt/layers/rotary_embedding.py +5 -3
  69. sglang/srt/layers/sampler.py +29 -35
  70. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  71. sglang/srt/lora/backend/__init__.py +9 -12
  72. sglang/srt/managers/cache_controller.py +72 -8
  73. sglang/srt/managers/image_processor.py +37 -631
  74. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  75. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  76. sglang/srt/managers/image_processors/llava.py +152 -0
  77. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  78. sglang/srt/managers/image_processors/mlama.py +60 -0
  79. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  80. sglang/srt/managers/io_struct.py +32 -15
  81. sglang/srt/managers/multi_modality_padding.py +134 -0
  82. sglang/srt/managers/schedule_batch.py +212 -117
  83. sglang/srt/managers/schedule_policy.py +40 -8
  84. sglang/srt/managers/scheduler.py +124 -665
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
  86. sglang/srt/managers/tokenizer_manager.py +6 -6
  87. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  88. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  89. sglang/srt/mem_cache/chunk_cache.py +12 -44
  90. sglang/srt/mem_cache/hiradix_cache.py +63 -34
  91. sglang/srt/mem_cache/memory_pool.py +78 -17
  92. sglang/srt/mem_cache/paged_allocator.py +283 -0
  93. sglang/srt/mem_cache/radix_cache.py +117 -36
  94. sglang/srt/model_executor/cuda_graph_runner.py +9 -4
  95. sglang/srt/model_executor/forward_batch_info.py +12 -8
  96. sglang/srt/model_executor/model_runner.py +63 -63
  97. sglang/srt/model_loader/loader.py +2 -1
  98. sglang/srt/model_loader/weight_utils.py +1 -1
  99. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  100. sglang/srt/models/deepseek_nextn.py +23 -3
  101. sglang/srt/models/deepseek_v2.py +25 -19
  102. sglang/srt/models/minicpmv.py +28 -89
  103. sglang/srt/models/mllama.py +1 -1
  104. sglang/srt/models/qwen2.py +0 -1
  105. sglang/srt/models/qwen2_5_vl.py +25 -50
  106. sglang/srt/models/qwen2_vl.py +33 -49
  107. sglang/srt/openai_api/adapter.py +37 -15
  108. sglang/srt/openai_api/protocol.py +8 -1
  109. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  110. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  111. sglang/srt/server_args.py +19 -11
  112. sglang/srt/speculative/eagle_worker.py +75 -39
  113. sglang/srt/utils.py +104 -9
  114. sglang/test/runners.py +104 -10
  115. sglang/test/test_block_fp8.py +106 -16
  116. sglang/test/test_custom_ops.py +88 -0
  117. sglang/test/test_utils.py +20 -4
  118. sglang/utils.py +0 -4
  119. sglang/version.py +1 -1
  120. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -10
  121. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/RECORD +124 -79
  122. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
  123. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
  124. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -1006,7 +1006,7 @@ async def benchmark(
 
     # Flush cache
     if "sglang" in backend:
-        requests.post(base_url + "/flush_cache")
+        requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
         time.sleep(1.0)
 
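The only change in `bench_serving.py` is that the cache-flush request now sends authentication headers, so the benchmark keeps working against a server launched with an API key. For illustration only, a helper with the shape of `get_auth_headers()` typically looks something like the sketch below; the environment variable name and exact behavior are assumptions, not taken from this diff:

```python
import os


def get_auth_headers() -> dict:
    # Hypothetical sketch: read an API key from the environment (the variable
    # name here is assumed) and return a Bearer token header, or an empty dict
    # when no key is configured so unauthenticated servers still work.
    api_key = os.environ.get("SGLANG_API_KEY", "")
    if api_key:
        return {"Authorization": f"Bearer {api_key}"}
    return {}
```

With this shape, `requests.post(base_url + "/flush_cache", headers=get_auth_headers())` degrades gracefully to an unauthenticated request when no key is set.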
sglang/lang/chat_template.py CHANGED
@@ -230,6 +230,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus-pro",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "User": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -384,6 +407,12 @@ def match_deepseek(model_path: str):
         return get_chat_template("deepseek-v3")
 
 
+@register_chat_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return get_chat_template("janus-pro")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
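Together, the two hunks above register a new `janus-pro` chat template and a matcher that selects it for any model path containing "janus". A quick sanity check of the registered template, assuming sglang 0.4.4 is importable (illustrative only):

```python
from sglang.lang.chat_template import get_chat_template

# Fetch the template registered above by name; the stop string and image
# token below come directly from the hunk.
tmpl = get_chat_template("janus-pro")
assert tmpl.image_token == "<image_placeholder>\n"
assert "<|end▁of▁sentence|>" in tmpl.stop_str

# match_deepseek_janus_pro lower-cases the model path, so e.g.
# "deepseek-ai/Janus-Pro-7B" resolves to this template.
```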
sglang/srt/_custom_ops.py CHANGED
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
@@ -75,42 +77,42 @@ else:
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return sgl_kernel.ops.allreduce.init_custom_ar(
+        return sgl_kernel.allreduce.init_custom_ar(
             meta, rank_data, handles, offsets, rank, full_nvlink
         )
 
     def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.allreduce.all_reduce_reg(fa, inp, out)
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
 
     def all_reduce_unreg(
         fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
     ) -> None:
-        sgl_kernel.ops.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.ops.allreduce.dispose(fa)
+        sgl_kernel.allreduce.dispose(fa)
 
     def meta_size() -> int:
-        return sgl_kernel.ops.allreduce.meta_size()
+        return sgl_kernel.allreduce.meta_size()
 
     def register_buffer(
         fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
     ) -> None:
-        return sgl_kernel.ops.allreduce.register_buffer(fa, t, handles, offsets)
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-        return sgl_kernel.ops.allreduce.get_graph_buffer_ipc_meta(fa)
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[str], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.ops.allreduce.register_graph_buffers(fa, handles, offsets)
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
 
     def allocate_meta_buffer(size: int) -> torch.Tensor:
-        return sgl_kernel.ops.allreduce.allocate_meta_buffer(size)
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-        return sgl_kernel.ops.allreduce.get_meta_buffer_ipc_handle(inp)
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
 
 else:
     # TRTLLM custom allreduce
@@ -123,7 +125,7 @@ else:
         barrier_in: List[int],
         barrier_out: List[int],
     ) -> int:
-        return sgl_kernel.ops.init_custom_reduce(
+        return sgl_kernel.init_custom_reduce(
             rank_id,
             world_size,
             rank_data_base,
@@ -134,15 +136,15 @@
         )
 
     def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.custom_reduce(fa, inp, out)
+        sgl_kernel.custom_reduce(fa, inp, out)
 
     def dispose(fa: int) -> None:
-        sgl_kernel.ops.custom_dispose(fa)
+        sgl_kernel.custom_dispose(fa)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)
+        return sgl_kernel.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-        sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
+        sgl_kernel.register_graph_buffers(fa, handles, offsets)
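The first `_custom_ops.py` hunk fixes a subtle bug: `os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)` returns a string whenever the variable is set, so even `USE_VLLM_CUSTOM_ALLREDUCE=false` evaluated as truthy. Routing through `get_bool_env_var` parses the value instead. A minimal sketch of such a helper, assuming the usual truthy spellings (the real implementation lives in `sglang.srt.utils` and may accept a different set of values):

```python
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Illustrative sketch: normalize the raw string and accept a handful of
    # truthy spellings, so USE_VLLM_CUSTOM_ALLREDUCE=false really disables
    # the vLLM custom all-reduce path.
    value = os.environ.get(name, default).strip().lower()
    return value in ("1", "true", "yes", "on")
```

The other hunks in this file are a mechanical rename: the wrappers now call the `sgl_kernel.allreduce.*` and top-level `sgl_kernel.*` entry points directly instead of going through the old `sgl_kernel.ops` namespace.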
sglang/srt/configs/__init__.py CHANGED
@@ -1,6 +1,7 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.qwen2_5_vl_config import (
     Qwen2_5_VLConfig,
     Qwen2_5_VLVisionConfig,
@@ -12,4 +13,5 @@ __all__ = [
     "DbrxConfig",
     "Qwen2_5_VLConfig",
     "Qwen2_5_VLVisionConfig",
+    "MultiModalityConfig",
 ]
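The `__init__.py` change simply re-exports the new Janus-Pro configuration class added in `sglang/srt/configs/janus_pro.py`. A minimal check of what the export enables (assumes sglang 0.4.4 is installed; the constructor arguments are defined in `janus_pro.py` and not shown here):

```python
import sglang.srt.configs as configs

# The new class is listed in the package's public API...
assert "MultiModalityConfig" in configs.__all__

# ...and can be imported from the package root rather than the submodule.
from sglang.srt.configs import MultiModalityConfig

print(MultiModalityConfig)
```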