sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/_custom_ops.py
CHANGED
@@ -1,21 +1,19 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
-import contextlib
-import functools
-import importlib
 import logging
 import os
-from typing import
+from typing import List, Tuple

 import torch
 import torch.library

-from sglang.srt.utils import is_hpu
+from sglang.srt.utils import is_hip, is_hpu

 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)

 if not is_hpu():
-
+    # ROCm does not use vllm custom allreduce
+    if use_vllm_custom_allreduce and not is_hip():
         try:
             import vllm._C
         except ImportError as e:
@@ -27,37 +25,8 @@ if not is_hpu():
             logger.warning("Failed to import from custom_ar with %r", e)


-
-
-    @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
-        try:
-            return fn(*args, **kwargs)
-
-        except NotImplementedError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Not implemented or built, mostly likely because the current current device "
-                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
-                "incorrectly while building)"
-            )
-            logger.error(msg, fn.__name__, e)
-            raise NotImplementedError(msg % (fn.__name__, e)) from e
-        except AttributeError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Possibly you have built or installed an obsolete version of vllm.\n"
-                "Please try a clean build and install of vllm,"
-                "or remove old built files such as vllm/*cpython*.so and build/ ."
-            )
-            logger.error(msg, fn.__name__, e)
-            raise e
-
-    return wrapper
-
-
-if use_vllm_custom_allreduce:
-    # custom ar
+if use_vllm_custom_allreduce and not is_hip():
+    # vLLM custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
@@ -95,62 +64,85 @@ if use_vllm_custom_allreduce:
         torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)

 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            buffers,
-            tmp_result_buffers,
-            barrier_in,
-            barrier_out,
-        )
+    if is_hip():
+        # ROCM custom allreduce
+
+        def init_custom_ar(
+            meta: torch.Tensor,
+            rank_data: torch.Tensor,
+            handles: List[str],
+            offsets: List[int],
+            rank: int,
+            full_nvlink: bool,
+        ) -> int:
+            return sgl_kernel.ops.allreduce.init_custom_ar(
+                meta, rank_data, handles, offsets, rank, full_nvlink
+            )

-
-
+        def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_reg(fa, inp, out)

-
-
+        def all_reduce_unreg(
+            fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+        ) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)

-
-
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.allreduce.dispose(fa)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        def meta_size() -> int:
+            return sgl_kernel.ops.allreduce.meta_size()
+
+        def register_buffer(
+            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+        ) -> None:
+            return sgl_kernel.ops.allreduce.register_buffer(fa, t, handles, offsets)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+            return sgl_kernel.ops.allreduce.get_graph_buffer_ipc_meta(fa)
+
+        def register_graph_buffers(
+            fa: int, handles: List[str], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.allreduce.register_graph_buffers(fa, handles, offsets)
+
+        def allocate_meta_buffer(size: int) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.allocate_meta_buffer(size)
+
+        def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.get_meta_buffer_ipc_handle(inp)
+
+    else:
+        # TRTLLM custom allreduce
+        def init_custom_ar(
+            rank_id: int,
+            world_size: int,
+            rank_data_base: torch.Tensor,
+            buffers: List[int],
+            tmp_result_buffers: List[int],
+            barrier_in: List[int],
+            barrier_out: List[int],
+        ) -> int:
+            return sgl_kernel.ops.init_custom_reduce(
+                rank_id,
+                world_size,
+                rank_data_base,
+                buffers,
+                tmp_result_buffers,
+                barrier_in,
+                barrier_out,
+            )
+
+        def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.custom_reduce(fa, inp, out)
+
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.custom_dispose(fa)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+            return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)

-
-
+        def register_graph_buffers(
+            fa: int, handles: List[List[int]], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
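Read together, these hunks replace the old single use_vllm_custom_allreduce split with a three-way dispatch at import time: vLLM's _C_custom_ar ops when custom allreduce is enabled and the platform is not ROCm, sgl_kernel.ops.allreduce.* on ROCm, and the TRT-LLM-style sgl_kernel.ops custom reduce otherwise. A minimal sketch of that selection logic, assuming the same environment variable and platform checks (pick_allreduce_backend is a hypothetical helper, not part of the package):

import os


def pick_allreduce_backend(is_hip: bool, is_hpu: bool) -> str:
    """Sketch of the import-time dispatch added in this diff (illustrative only)."""
    if is_hpu:
        return "no custom allreduce"
    use_vllm = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
    if use_vllm and not is_hip:
        return "vllm._C custom_ar ops"
    if is_hip:
        return "sgl_kernel.ops.allreduce (ROCm)"
    return "sgl_kernel.ops custom_reduce (TRT-LLM style)"


print(pick_allreduce_backend(is_hip=False, is_hpu=False))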
sglang/srt/configs/load_config.py
CHANGED
@@ -21,6 +21,7 @@ class LoadFormat(str, enum.Enum):
     BITSANDBYTES = "bitsandbytes"
     MISTRAL = "mistral"
     LAYERED = "layered"
+    JAX = "jax"


 @dataclass
@@ -42,13 +43,15 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
             Default to "original/**/*" to avoid repeated loading of llama's
             checkpoints.
-
+        decryption_key_file: If set, decrypts the output files with a password read
+            from this file (after PBKDF2).
     """

     load_format: Union[str, LoadFormat] = LoadFormat.AUTO
     download_dir: Optional[str] = None
     model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
     ignore_patterns: Optional[Union[List[str], str]] = None
+    decryption_key_file: Optional[str] = None

     def __post_init__(self):
         model_loader_extra_config = self.model_loader_extra_config or {}
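LoadFormat gains a "jax" member and LoadConfig a decryption_key_file field. A hedged usage sketch of the two additions (the key path is made up):

from sglang.srt.configs.load_config import LoadConfig, LoadFormat

# Illustrative only: exercise the two fields added in this diff.
cfg = LoadConfig(
    load_format=LoadFormat.JAX,                # new "jax" load format
    decryption_key_file="/path/to/model.key",  # hypothetical path; the password is run through PBKDF2
)
print(cfg.load_format, cfg.decryption_key_file)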
sglang/srt/configs/model_config.py
CHANGED
@@ -14,6 +14,7 @@

 import json
 import logging
+import math
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union

@@ -39,10 +40,11 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args: Optional[
+        model_override_args: Optional[str] = None,
         is_embedding: Optional[bool] = None,
         dtype: str = "auto",
         quantization: Optional[str] = None,
+        override_config_file: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
@@ -50,11 +52,16 @@ class ModelConfig:

         # Parse args
         self.model_override_args = json.loads(model_override_args)
+        kwargs = {}
+        if override_config_file and override_config_file.strip():
+            kwargs["_configuration_file"] = override_config_file.strip()
+
         self.hf_config = get_config(
             model_path,
             trust_remote_code=trust_remote_code,
             revision=revision,
             model_override_args=self.model_override_args,
+            **kwargs,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)

@@ -63,6 +70,9 @@ class ModelConfig:
             self.hf_config.architectures, is_embedding
         )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
+        self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
+        self.is_audio_model = is_audio_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

@@ -70,7 +80,9 @@ class ModelConfig:
         derived_context_len = get_context_length(self.hf_text_config)
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var(
+                if get_bool_env_var(
+                    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", default="False"
+                ):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
@@ -103,7 +115,20 @@ class ModelConfig:
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+
+            # Handle rope scaling with yarn
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+
         elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 128
             self.attention_arch = AttentionArch.MLA
@@ -389,6 +414,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "LlamaForSequenceClassification" in model_architectures
         or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
         or "InternLM2ForRewardModel" in model_architectures
+        or "Qwen2ForRewardModel" in model_architectures
     ):
         return False
     else:
@@ -401,6 +427,8 @@ def is_multimodal_model(model_architectures: List[str]):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "Grok1VForCausalLM" in model_architectures
+        or "Grok1AForCausalLM" in model_architectures
         or "MllamaForConditionalGeneration" in model_architectures
         or "Qwen2VLForConditionalGeneration" in model_architectures
         or "Qwen2_5_VLForConditionalGeneration" in model_architectures
@@ -411,5 +439,23 @@
         return False


+def is_multimodal_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_image_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_audio_model(model_architectures: List[str]):
+    return False
+
+
 def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
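The MLA branch above now derives the attention softmax scale from the head dimensions and, when YaRN rope scaling is configured, multiplies it by mscale squared via the new yarn_get_mscale helper. A standalone sketch of that arithmetic, using made-up DeepSeek-style head dims and rope_scaling values (illustrative only):

import math


def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    # Same formula as the helper added at the bottom of model_config.py.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


# Hypothetical values, for illustration only.
qk_nope_head_dim, qk_rope_head_dim = 128, 64
rope_scaling = {"factor": 40.0, "mscale_all_dim": 1.0}

scaling = 1 / math.sqrt(qk_nope_head_dim + qk_rope_head_dim)
mscale = yarn_get_mscale(rope_scaling["factor"], float(rope_scaling["mscale_all_dim"]))
scaling = scaling * mscale * mscale
print(f"softmax scale: {scaling:.4f}")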
sglang/srt/configs/qwen2_5_vl_config.py
CHANGED
@@ -48,13 +48,16 @@ from transformers.image_utils import (
     validate_preprocess_arguments,
 )
 from transformers.modeling_rope_utils import rope_config_validation
-from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


+def is_valid_list_of_images(images: List):
+    return images and all(is_valid_image(image) for image in images)
+
+
 class Qwen2_5_VLVisionConfig(PretrainedConfig):
     model_type = "qwen2_5_vl"
     base_config_key = "vision_config"
@@ -999,5 +1002,5 @@ class Qwen2_5_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)


-AutoImageProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLImageProcessor)
+AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
 AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -13,31 +13,130 @@
 # ==============================================================================
 """The baseclass of a backend for grammar-guided constrained decoding."""

+import logging
+from abc import ABC, abstractmethod
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from threading import Event, Lock
-from typing import
+from typing import Dict, List, Optional, Tuple
+
+import torch

 from sglang.srt.server_args import ServerArgs

+logger = logging.getLogger(__name__)
+
+
+class BaseGrammarObject(ABC):
+    @abstractmethod
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        """
+        Try to jump forward in the grammar.
+
+        Returns:
+            A jump forward helper which may be used in `jump_forward_str_state`.
+            None if the jump forward is not possible.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        """
+        Jump forward for the grammar.
+
+        Returns:
+            A tuple of the jump forward string and the next state of the grammar
+            (which can be used in `jump_and_retokenize` if needed).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ) -> None:
+        """
+        Jump forward occurs, and update the grammar state if needed.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError
+

 @dataclass
 class CacheEntry:
-    value:
+    value: Optional[BaseGrammarObject]
     event: Event


-class
-    pass
-
-
-class BaseGrammarBackend:
+class BaseGrammarBackend(ABC):
     def __init__(self):
         self.executor = ThreadPoolExecutor()
-        self.cache = {}
+        self.cache: Dict[Tuple[str, str], CacheEntry] = {}
         self.cache_lock = Lock()

-    def
+    def _not_supported(self, key_type: str, key_string: str) -> None:
+        logger.warning(f"Skip unsupported {key_type}: {key_type}={key_string}")
+
+    def dispatch_fallback(
+        self, key_type: str, key_string: str
+    ) -> Optional[BaseGrammarObject]:
+        """
+        This function should not be reached in any case.
+        """
+        raise ValueError(f"Invalid key_type: {key_type}={key_string}")
+
+    @abstractmethod
+    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("json", key_string)
+
+    @abstractmethod
+    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("regex", key_string)
+
+    @abstractmethod
+    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("ebnf", key_string)
+
+    @abstractmethod
+    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("structural_tag", key_string)
+
+    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.dispatch_json(key_string)
+        elif key_type == "regex":
+            return self.dispatch_regex(key_string)
+        elif key_type == "ebnf":
+            return self.dispatch_ebnf(key_string)
+        elif key_type == "structural_tag":
+            return self.dispatch_structural_tag(key_string)
+        else:
+            return self.dispatch_fallback(key_type, key_string)
+
+    def _init_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             if key in self.cache:
                 cache_hit = True
@@ -50,13 +149,10 @@ class BaseGrammarBackend:
         if cache_hit:
             entry.event.wait()
         else:
-            entry.value = self.
+            entry.value = self._init_value_dispatch(key)
             entry.event.set()
         return entry.value.copy() if entry.value else None

-    def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
-        raise NotImplementedError()
-
     def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             entry = self.cache.get(key)
@@ -66,7 +162,7 @@ class BaseGrammarBackend:
         return val.copy() if val else None

     def get_future_value(self, key: Tuple[str, str]) -> Future:
-        return self.executor.submit(self.
+        return self.executor.submit(self._init_value, key)

     def reset(self):
         with self.cache_lock:
@@ -80,12 +176,18 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
         grammar_backend = OutlinesGrammarBackend(
             tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
-            allow_jump_forward=not server_args.disable_jump_forward,
         )
     elif server_args.grammar_backend == "xgrammar":
         from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend

         grammar_backend = XGrammarGrammarBackend(tokenizer, vocab_size=vocab_size)
+    elif server_args.grammar_backend == "llguidance":
+        from sglang.srt.constrained.llguidance_backend import GuidanceBackend
+
+        grammar_backend = GuidanceBackend(
+            tokenizer=tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+        )
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")

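With this rewrite, a concrete grammar backend subclasses BaseGrammarBackend and implements one dispatch_* method per constraint type; anything it declines goes through _not_supported and comes back as None. The new llguidance branch in create_grammar_backend plugs GuidanceBackend into the same dispatch machinery. A hedged sketch of a custom subclass (illustrative only; a real backend would return a compiled BaseGrammarObject from dispatch_json):

from typing import Optional

from sglang.srt.constrained.base_grammar_backend import (
    BaseGrammarBackend,
    BaseGrammarObject,
)


class JsonOnlyBackend(BaseGrammarBackend):
    """Illustrative backend that only understands JSON-schema constraints."""

    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
        # A real backend would compile key_string (a JSON schema) into a
        # BaseGrammarObject here; None means "no constraint applied".
        return None

    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("regex", key_string)

    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("ebnf", key_string)

    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("structural_tag", key_string)


# Requests are keyed by (type, payload); compilation runs on the backend's
# thread pool and results are cached per key.
backend = JsonOnlyBackend()
future = backend.get_future_value(("json", '{"type": "object"}'))
print(future.result())  # None in this sketch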