sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/function_call_parser.py +33 -2
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/dp_attention.py +30 -2
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/logits_processor.py +1 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +74 -8
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +213 -118
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +176 -683
- sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +71 -34
- sglang/srt/mem_cache/memory_pool.py +81 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +68 -20
- sglang/srt/model_executor/forward_batch_info.py +23 -10
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +200 -191
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +59 -35
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +24 -16
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
sglang/lang/chat_template.py
CHANGED
@@ -230,6 +230,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus-pro",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "User": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -384,6 +407,12 @@ def match_deepseek(model_path: str):
         return get_chat_template("deepseek-v3")
 
 
+@register_chat_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return get_chat_template("janus-pro")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
sglang/srt/_custom_ops.py
CHANGED
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce =
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
@@ -75,42 +77,42 @@ else:
             rank: int,
             full_nvlink: bool,
         ) -> int:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.init_custom_ar(
                 meta, rank_data, handles, offsets, rank, full_nvlink
             )
 
         def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-            sgl_kernel.
+            sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
 
         def all_reduce_unreg(
             fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
         ) -> None:
-            sgl_kernel.
+            sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
 
         def dispose(fa: int) -> None:
-            sgl_kernel.
+            sgl_kernel.allreduce.dispose(fa)
 
         def meta_size() -> int:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.meta_size()
 
         def register_buffer(
            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
         ) -> None:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
 
         def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
 
         def register_graph_buffers(
             fa: int, handles: List[str], offsets: List[List[int]]
         ) -> None:
-            sgl_kernel.
+            sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
 
         def allocate_meta_buffer(size: int) -> torch.Tensor:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.allocate_meta_buffer(size)
 
         def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-            return sgl_kernel.
+            return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
 
     else:
         # TRTLLM custom allreduce
@@ -123,7 +125,7 @@ else:
             barrier_in: List[int],
             barrier_out: List[int],
         ) -> int:
-            return sgl_kernel.
+            return sgl_kernel.init_custom_reduce(
                 rank_id,
                 world_size,
                 rank_data_base,
@@ -134,15 +136,15 @@ else:
             )
 
         def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-            sgl_kernel.
+            sgl_kernel.custom_reduce(fa, inp, out)
 
         def dispose(fa: int) -> None:
-            sgl_kernel.
+            sgl_kernel.custom_dispose(fa)
 
         def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-            return sgl_kernel.
+            return sgl_kernel.get_graph_buffer_ipc_meta(fa)
 
         def register_graph_buffers(
             fa: int, handles: List[List[int]], offsets: List[List[int]]
         ) -> None:
-            sgl_kernel.
+            sgl_kernel.register_graph_buffers(fa, handles, offsets)
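
The choice between the vLLM and sgl-kernel custom all-reduce paths is now driven by the USE_VLLM_CUSTOM_ALLREDUCE environment variable. The get_bool_env_var helper itself is not shown in this diff, so the sketch below is only a guess at its behavior, inferred from the call site above.

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Hypothetical stand-in for sglang.srt.utils.get_bool_env_var:
    # treat "1"/"true"/"yes" (case-insensitive) as True, anything else as False.
    value = os.getenv(name, default)
    return value.strip().lower() in ("1", "true", "yes")

# Call site from the diff: the flag defaults to true (vLLM custom all-reduce),
# and the sgl_kernel implementations are used when it evaluates to false.
use_vllm_custom_allreduce = get_bool_env_var(
    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
)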
sglang/srt/configs/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.qwen2_5_vl_config import (
     Qwen2_5_VLConfig,
     Qwen2_5_VLVisionConfig,
@@ -12,4 +13,5 @@ __all__ = [
     "DbrxConfig",
     "Qwen2_5_VLConfig",
     "Qwen2_5_VLVisionConfig",
+    "MultiModalityConfig",
 ]
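
With this re-export, the new Janus-Pro config class is reachable from the package namespace, e.g.:

from sglang.srt.configs import MultiModalityConfig  # added in 0.4.4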
|