sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +220 -378
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +143 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +681 -259
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +224 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +44 -18
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +94 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +208 -28
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +136 -52
  181. sglang/srt/speculative/build_eagle_tree.py +2 -8
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  183. sglang/srt/speculative/eagle_utils.py +92 -58
  184. sglang/srt/speculative/eagle_worker.py +186 -94
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/_custom_ops.py CHANGED
@@ -1,21 +1,19 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
-import contextlib
-import functools
-import importlib
 import logging
 import os
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import List, Tuple

 import torch
 import torch.library

-from sglang.srt.utils import is_hpu
+from sglang.srt.utils import is_hip, is_hpu

 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)

 if not is_hpu():
-    if use_vllm_custom_allreduce:
+    # ROCm does not use vllm custom allreduce
+    if use_vllm_custom_allreduce and not is_hip():
         try:
             import vllm._C
         except ImportError as e:
@@ -27,37 +25,8 @@ if not is_hpu():
             logger.warning("Failed to import from custom_ar with %r", e)


-def hint_on_error(fn):
-
-    @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
-        try:
-            return fn(*args, **kwargs)
-
-        except NotImplementedError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Not implemented or built, mostly likely because the current current device "
-                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
-                "incorrectly while building)"
-            )
-            logger.error(msg, fn.__name__, e)
-            raise NotImplementedError(msg % (fn.__name__, e)) from e
-        except AttributeError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Possibly you have built or installed an obsolete version of vllm.\n"
-                "Please try a clean build and install of vllm,"
-                "or remove old built files such as vllm/*cpython*.so and build/ ."
-            )
-            logger.error(msg, fn.__name__, e)
-            raise e
-
-    return wrapper
-
-
-if use_vllm_custom_allreduce:
-    # custom ar
+if use_vllm_custom_allreduce and not is_hip():
+    # vLLM custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
@@ -95,62 +64,85 @@ if use_vllm_custom_allreduce:
         torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)

 else:
-    # custom ar
-    def init_custom_ar(
-        rank_id: int,
-        world_size: int,
-        rank_data_base: torch.Tensor,
-        buffers: List[int],
-        tmp_result_buffers: List[int],
-        barrier_in: List[int],
-        barrier_out: List[int],
-    ) -> int:
-        return sgl_kernel.ops.init_custom_reduce(
-            rank_id,
-            world_size,
-            rank_data_base,
-            buffers,
-            tmp_result_buffers,
-            barrier_in,
-            barrier_out,
-        )
+    if is_hip():
+        # ROCM custom allreduce
+
+        def init_custom_ar(
+            meta: torch.Tensor,
+            rank_data: torch.Tensor,
+            handles: List[str],
+            offsets: List[int],
+            rank: int,
+            full_nvlink: bool,
+        ) -> int:
+            return sgl_kernel.ops.allreduce.init_custom_ar(
+                meta, rank_data, handles, offsets, rank, full_nvlink
+            )

-    def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.custom_reduce(fa, inp, out)
+        def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_reg(fa, inp, out)

-    def dispose(fa: int) -> None:
-        sgl_kernel.ops.custom_dispose(fa)
+        def all_reduce_unreg(
+            fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+        ) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)

-    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.allreduce.dispose(fa)

-    def register_graph_buffers(
-        fa: int, handles: List[List[int]], offsets: List[List[int]]
-    ) -> None:
-        sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
-
-
-# temporary fix for https://github.com/vllm-project/vllm/issues/5456
-# TODO: remove this in v0.6.0
-names_and_values = globals()
-names_and_values_to_update = {}
-# prepare variables to avoid dict size change during iteration
-k, v, arg = None, None, None
-fn_type = type(lambda x: x)
-for k, v in names_and_values.items():
-    # find functions that are defined in this file and have torch.Tensor
-    # in their annotations. `arg == "torch.Tensor"` is used to handle
-    # the case when users use `import __annotations__` to turn type
-    # hints into strings.
-    if (
-        isinstance(v, fn_type)
-        and v.__code__.co_filename == __file__
-        and any(
-            arg is torch.Tensor or arg == "torch.Tensor"
-            for arg in v.__annotations__.values()
-        )
-    ):
-        names_and_values_to_update[k] = hint_on_error(v)
+        def meta_size() -> int:
+            return sgl_kernel.ops.allreduce.meta_size()
+
+        def register_buffer(
+            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+        ) -> None:
+            return sgl_kernel.ops.allreduce.register_buffer(fa, t, handles, offsets)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+            return sgl_kernel.ops.allreduce.get_graph_buffer_ipc_meta(fa)
+
+        def register_graph_buffers(
+            fa: int, handles: List[str], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.allreduce.register_graph_buffers(fa, handles, offsets)
+
+        def allocate_meta_buffer(size: int) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.allocate_meta_buffer(size)
+
+        def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.get_meta_buffer_ipc_handle(inp)
+
+    else:
+        # TRTLLM custom allreduce
+        def init_custom_ar(
+            rank_id: int,
+            world_size: int,
+            rank_data_base: torch.Tensor,
+            buffers: List[int],
+            tmp_result_buffers: List[int],
+            barrier_in: List[int],
+            barrier_out: List[int],
+        ) -> int:
+            return sgl_kernel.ops.init_custom_reduce(
+                rank_id,
+                world_size,
+                rank_data_base,
+                buffers,
+                tmp_result_buffers,
+                barrier_in,
+                barrier_out,
+            )
+
+        def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.custom_reduce(fa, inp, out)
+
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.custom_dispose(fa)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+            return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)

-names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
+        def register_graph_buffers(
+            fa: int, handles: List[List[int]], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
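
Note: the module above now picks one of three custom all-reduce bindings at import time. The sketch below restates that dispatch in isolation; it is a simplified illustration of the logic in the hunks, not the actual module code, and the backend label is introduced here only for clarity.

# Simplified restatement of the import-time dispatch in sglang/srt/_custom_ops.py
# (the `backend` variable is hypothetical; the real module defines the op wrappers inline).
import os

from sglang.srt.utils import is_hip, is_hpu

use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)

if not is_hpu():
    if use_vllm_custom_allreduce and not is_hip():
        backend = "vllm"    # torch.ops._C_custom_ar.* kernels
    elif is_hip():
        backend = "rocm"    # sgl_kernel.ops.allreduce.* kernels
    else:
        backend = "trtllm"  # sgl_kernel.ops.custom_reduce / custom_dispose / ...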
sglang/srt/configs/load_config.py CHANGED
@@ -21,6 +21,7 @@ class LoadFormat(str, enum.Enum):
     BITSANDBYTES = "bitsandbytes"
     MISTRAL = "mistral"
     LAYERED = "layered"
+    JAX = "jax"


 @dataclass
@@ -42,13 +43,15 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
            Default to "original/**/*" to avoid repeated loading of llama's
            checkpoints.
-
+        decryption_key_file: If set, decrypts the output files with a password read
+            from this file (after PBKDF2).
     """

     load_format: Union[str, LoadFormat] = LoadFormat.AUTO
     download_dir: Optional[str] = None
     model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
     ignore_patterns: Optional[Union[List[str], str]] = None
+    decryption_key_file: Optional[str] = None

     def __post_init__(self):
         model_loader_extra_config = self.model_loader_extra_config or {}
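
A small construction sketch for the two additions above. The key-file path is hypothetical, and whether a given model loader actually honors LoadFormat.JAX depends on the rest of this release:

from sglang.srt.configs.load_config import LoadConfig, LoadFormat

# LoadFormat.JAX and decryption_key_file are the fields introduced in this hunk.
load_config = LoadConfig(
    load_format=LoadFormat.JAX,
    decryption_key_file="/path/to/key_file",  # hypothetical path, for illustration only
)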
sglang/srt/configs/model_config.py CHANGED
@@ -14,6 +14,7 @@

 import json
 import logging
+import math
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union

@@ -39,10 +40,11 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args: Optional[dict] = None,
+        model_override_args: Optional[str] = None,
         is_embedding: Optional[bool] = None,
         dtype: str = "auto",
         quantization: Optional[str] = None,
+        override_config_file: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
@@ -50,11 +52,16 @@ class ModelConfig:

         # Parse args
         self.model_override_args = json.loads(model_override_args)
+        kwargs = {}
+        if override_config_file and override_config_file.strip():
+            kwargs["_configuration_file"] = override_config_file.strip()
+
         self.hf_config = get_config(
             model_path,
             trust_remote_code=trust_remote_code,
             revision=revision,
             model_override_args=self.model_override_args,
+            **kwargs,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)

@@ -63,6 +70,9 @@ class ModelConfig:
             self.hf_config.architectures, is_embedding
         )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
+        self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
+        self.is_audio_model = is_audio_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

@@ -70,7 +80,9 @@ class ModelConfig:
         derived_context_len = get_context_length(self.hf_text_config)
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
+                if get_bool_env_var(
+                    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", default="False"
+                ):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
@@ -103,7 +115,20 @@ class ModelConfig:
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+
+            # Handle rope scaling with yarn
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+
         elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 128
             self.attention_arch = AttentionArch.MLA
@@ -389,6 +414,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "LlamaForSequenceClassification" in model_architectures
         or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
         or "InternLM2ForRewardModel" in model_architectures
+        or "Qwen2ForRewardModel" in model_architectures
     ):
         return False
     else:
@@ -401,6 +427,8 @@ def is_multimodal_model(model_architectures: List[str]):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "Grok1VForCausalLM" in model_architectures
+        or "Grok1AForCausalLM" in model_architectures
         or "MllamaForConditionalGeneration" in model_architectures
         or "Qwen2VLForConditionalGeneration" in model_architectures
         or "Qwen2_5_VLForConditionalGeneration" in model_architectures
@@ -411,5 +439,23 @@ def is_multimodal_model(model_architectures: List[str]):
         return False


+def is_multimodal_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_image_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_audio_model(model_architectures: List[str]):
+    return False
+
+
 def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
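
To make the "Handle rope scaling with yarn" hunk concrete, here is the same arithmetic applied to example values. The head dims and rope-scaling factor below are illustrative (roughly DeepSeek-V2-like), not read from any config in this diff:

import math

def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    # Same helper as added in the hunk above.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

# Example values (illustrative only)
qk_nope_head_dim, qk_rope_head_dim = 128, 64
rope_scaling = {"factor": 40, "mscale_all_dim": 1.0}

scaling = 1 / math.sqrt(qk_nope_head_dim + qk_rope_head_dim)  # 1/sqrt(192) ~= 0.0722
mscale = yarn_get_mscale(rope_scaling["factor"], float(rope_scaling["mscale_all_dim"]))
scaling = scaling * mscale * mscale  # final attention softmax scale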
sglang/srt/configs/qwen2_5_vl_config.py CHANGED
@@ -48,13 +48,16 @@ from transformers.image_utils import (
     validate_preprocess_arguments,
 )
 from transformers.modeling_rope_utils import rope_config_validation
-from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


+def is_valid_list_of_images(images: List):
+    return images and all(is_valid_image(image) for image in images)
+
+
 class Qwen2_5_VLVisionConfig(PretrainedConfig):
     model_type = "qwen2_5_vl"
     base_config_key = "vision_config"
@@ -999,5 +1002,5 @@ class Qwen2_5_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)


-AutoImageProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLImageProcessor)
+AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
 AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
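
For reference, a tiny standalone sketch of the locally defined helper above (the image list is made up; is_valid_image is the transformers.image_utils predicate the helper relies on):

from typing import List

from PIL import Image
from transformers.image_utils import is_valid_image

def is_valid_list_of_images(images: List):
    # Same one-liner as the hunk above: non-empty list where every entry is a valid image.
    return images and all(is_valid_image(image) for image in images)

assert is_valid_list_of_images([Image.new("RGB", (8, 8))])
assert not is_valid_list_of_images([])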
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -13,31 +13,130 @@
 # ==============================================================================
 """The baseclass of a backend for grammar-guided constrained decoding."""

+import logging
+from abc import ABC, abstractmethod
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from threading import Event, Lock
-from typing import Any, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
+import torch

 from sglang.srt.server_args import ServerArgs

+logger = logging.getLogger(__name__)
+
+
+class BaseGrammarObject(ABC):
+    @abstractmethod
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        """
+        Try to jump forward in the grammar.
+
+        Returns:
+            A jump forward helper which may be used in `jump_forward_str_state`.
+            None if the jump forward is not possible.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        """
+        Jump forward for the grammar.
+
+        Returns:
+            A tuple of the jump forward string and the next state of the grammar
+            (which can be used in `jump_and_retokenize` if needed).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ) -> None:
+        """
+        Jump forward occurs, and update the grammar state if needed.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError
+

 @dataclass
 class CacheEntry:
-    value: Any
+    value: Optional[BaseGrammarObject]
     event: Event


-class BaseGrammarObject:
-    pass
-
-
-class BaseGrammarBackend:
+class BaseGrammarBackend(ABC):
     def __init__(self):
         self.executor = ThreadPoolExecutor()
-        self.cache = {}
+        self.cache: Dict[Tuple[str, str], CacheEntry] = {}
         self.cache_lock = Lock()

-    def init_value(self, key: Tuple[str, str]) -> BaseGrammarObject:
+    def _not_supported(self, key_type: str, key_string: str) -> None:
+        logger.warning(f"Skip unsupported {key_type}: {key_type}={key_string}")
+
+    def dispatch_fallback(
+        self, key_type: str, key_string: str
+    ) -> Optional[BaseGrammarObject]:
+        """
+        This function should not be reached in any case.
+        """
+        raise ValueError(f"Invalid key_type: {key_type}={key_string}")
+
+    @abstractmethod
+    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("json", key_string)
+
+    @abstractmethod
+    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("regex", key_string)
+
+    @abstractmethod
+    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("ebnf", key_string)
+
+    @abstractmethod
+    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("structural_tag", key_string)
+
+    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.dispatch_json(key_string)
+        elif key_type == "regex":
+            return self.dispatch_regex(key_string)
+        elif key_type == "ebnf":
+            return self.dispatch_ebnf(key_string)
+        elif key_type == "structural_tag":
+            return self.dispatch_structural_tag(key_string)
+        else:
+            return self.dispatch_fallback(key_type, key_string)
+
+    def _init_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             if key in self.cache:
                 cache_hit = True
@@ -50,13 +149,10 @@ class BaseGrammarBackend:
         if cache_hit:
             entry.event.wait()
         else:
-            entry.value = self.init_value_impl(key)
+            entry.value = self._init_value_dispatch(key)
             entry.event.set()
         return entry.value.copy() if entry.value else None

-    def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
-        raise NotImplementedError()
-
     def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             entry = self.cache.get(key)
@@ -66,7 +162,7 @@ class BaseGrammarBackend:
         return val.copy() if val else None

     def get_future_value(self, key: Tuple[str, str]) -> Future:
-        return self.executor.submit(self.init_value, key)
+        return self.executor.submit(self._init_value, key)

     def reset(self):
         with self.cache_lock:
@@ -80,12 +176,18 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
         grammar_backend = OutlinesGrammarBackend(
             tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
-            allow_jump_forward=not server_args.disable_jump_forward,
         )
     elif server_args.grammar_backend == "xgrammar":
         from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend

         grammar_backend = XGrammarGrammarBackend(tokenizer, vocab_size=vocab_size)
+    elif server_args.grammar_backend == "llguidance":
+        from sglang.srt.constrained.llguidance_backend import GuidanceBackend
+
+        grammar_backend = GuidanceBackend(
+            tokenizer=tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+        )
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
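
The practical effect of the refactor above is that a grammar backend now subclasses BaseGrammarBackend and overrides the per-key-type dispatch hooks instead of a single init_value_impl. A minimal sketch of such a subclass follows; MyJsonOnlyBackend and its _compile_json_schema step are hypothetical and stand in for a real compilation path (outlines, xgrammar, or llguidance):

from typing import Optional

from sglang.srt.constrained.base_grammar_backend import (
    BaseGrammarBackend,
    BaseGrammarObject,
)


class MyJsonOnlyBackend(BaseGrammarBackend):
    # Only JSON schemas are compiled; the other key types are skipped with a warning,
    # mirroring the _not_supported helper introduced in this release.
    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._compile_json_schema(key_string)

    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("regex", key_string)

    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("ebnf", key_string)

    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("structural_tag", key_string)

    def _compile_json_schema(self, schema: str) -> Optional[BaseGrammarObject]:
        # Backend-specific compilation would go here (hypothetical helper).
        raise NotImplementedError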