sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,576 @@
1
+ from transformers import PretrainedConfig
2
+ from transformers.modeling_rope_utils import rope_config_validation
3
+
4
+
5
class Qwen3VLVisionConfig(PretrainedConfig):
    """Configuration holder for the vision part of a Qwen3-VL model.

    Every named constructor argument is stored as an attribute of the same
    name. Any remaining keyword arguments are forwarded to
    `PretrainedConfig.__init__`, which runs before the attributes are set.

    Args:
        depth (`int`, *optional*, defaults to 27)
        hidden_size (`int`, *optional*, defaults to 1152)
        hidden_act (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`)
        intermediate_size (`int`, *optional*, defaults to 4304)
        num_heads (`int`, *optional*, defaults to 16)
        in_channels (`int`, *optional*, defaults to 3)
        patch_size (`int`, *optional*, defaults to 16)
        spatial_merge_size (`int`, *optional*, defaults to 2)
        temporal_patch_size (`int`, *optional*, defaults to 2)
        out_hidden_size (`int`, *optional*, defaults to 3584)
        num_position_embeddings (`int`, *optional*, defaults to 2304)
        deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[8, 16, 24]`):
            Stored as a fresh list copy (or `None` if `None` is passed).
        initializer_range (`float`, *optional*, defaults to 0.02)
    """

    model_type = "qwen3_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=[8, 16, 24],
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range
        # The signature default is a single list object created at function
        # definition time. Copy it so that mutating one config's indexes can
        # never leak into other instances (or into the default itself).
        self.deepstack_visual_indexes = (
            None
            if deepstack_visual_indexes is None
            else list(deepstack_visual_indexes)
        )
41
+
42
+
43
class Qwen3VLTextConfig(PretrainedConfig):
    r"""
    Configuration class for a [`Qwen3VLTextModel`], i.e. the text backbone of a Qwen3-VL model. It defines the
    model architecture through the arguments below. Instantiating it with the defaults yields a configuration
    similar to that of
    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3VL model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`Qwen3VLModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            Number of key_value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details, check out
            [this paper](https://huggingface.co/papers/2305.13245). If `None` is passed, the value of
            `num_attention_heads` is used.
        head_dim (`int`, *optional*, defaults to 128):
            The dimension of the head. The value is stored as given.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 128000):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 5000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
            type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update
            this value accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn',
                    'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE
                    embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences
                    of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used
                    during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to the value recommended by the implementation,
                    using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the
                    hidden size divided by the number of attention heads divided by 2.
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts. Must be a list
                    of numbers with the same length as the hidden size divided by the number of attention heads
                    divided by 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import Qwen3VLTextModel, Qwen3VLTextConfig

    >>> # Initializing a Qwen3VL style configuration
    >>> configuration = Qwen3VLTextConfig()

    >>> # Initializing a model from that configuration
    >>> model = Qwen3VLTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_vl_text"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=128000,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=5000000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Backward compatibility: a config without a key/value head count
        # falls back to one key/value head per attention head.
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Validate the RoPE settings stored above; the two mrope keys are
        # explicitly excluded from validation.
        rope_config_validation(
            self, ignore_keys={"mrope_section", "mrope_interleaved"}
        )

        # The base initializer runs last and receives the tying flag plus any
        # extra keyword arguments.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
193
+
194
+
195
class Qwen3VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3VLModel`]. It is used to instantiate a
    Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLTextConfig`):
            The config object or dictionary of the text backbone. A dictionary is expanded into a
            `Qwen3VLTextConfig`, `None` creates a default one, and a config object is stored as given.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
            The config object or dictionary of the vision backbone. A dictionary is expanded into a
            `Qwen3VLVisionConfig`, `None` creates a default one, and a config object is stored as given.
        image_token_id (`int`, *optional*, defaults to 151655):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151656):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 151652):
            The start token index to encode the image prompt.
        vision_end_token_id (`int`, *optional*, defaults to 151653):
            The end token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings.

    ```python
    >>> from transformers import Qwen3VLForConditionalGeneration, Qwen3VLConfig

    >>> # Initializing a Qwen3-VL style configuration
    >>> configuration = Qwen3VLConfig()

    >>> # Initializing a model from the Qwen3-VL-4B style configuration
    >>> model = Qwen3VLForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_vl"
    sub_configs = {
        "vision_config": Qwen3VLVisionConfig,
        "text_config": Qwen3VLTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        vision_start_token_id=151652,
        vision_end_token_id=151653,
        tie_word_embeddings=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object (the documented non-dict form).
            # Previously this case fell through and left the attribute unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            # Same as above: keep a caller-supplied config object as is.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
268
+
269
+
270
class Qwen3VLMoeTextConfig(PretrainedConfig):
    r"""
    Configuration class for a [`Qwen3VLMoeTextModel`], i.e. the text backbone of a Qwen3-VL-MOE model. It defines
    the model architecture through the arguments below. Instantiating it with the defaults yields a configuration
    similar to that of
    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling the model.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            Number of key_value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details check out
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, the value of
            `num_attention_heads` is used.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 128000):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 5000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        decoder_sparse_step (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Intermediate size of the routed expert.
        num_experts_per_tok (`int`, *optional*, defaults to 4):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 60):
            Number of routed experts.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the topk probabilities.
        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
            Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock.
            The list contains layer indexes, from 0 to num_layers-1 if we have num_layers layers.
            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
            type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update
            this value accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn',
                    'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE
                    embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences
                    of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used
                    during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to the value recommended by the implementation,
                    using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the
                    hidden size divided by the number of attention heads divided by 2.
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts. Must be a list
                    of numbers with the same length as the hidden size divided by the number of attention heads
                    divided by 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
        head_dim (`int`, *optional*):
            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.

    ```python
    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig

    >>> # Initializing a Qwen3VLMoe style configuration
    >>> configuration = Qwen3VLMoeConfig()

    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_vl_moe_text"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Qwen3VLMoe`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        hidden_act="silu",
        max_position_embeddings=128000,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=5000000.0,
        attention_bias=False,
        attention_dropout=0.0,
        decoder_sparse_step=1,
        moe_intermediate_size=1408,
        num_experts_per_tok=4,
        num_experts=60,
        norm_topk_prob=True,
        mlp_only_layers=None,
        rope_scaling=None,
        head_dim=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Backward compatibility: a config without a key/value head count
        # falls back to one key/value head per attention head.
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling
        # A falsy head_dim (None or 0) is replaced by the derived per-head size.
        self.head_dim = (
            head_dim if head_dim else hidden_size // num_attention_heads
        )

        # Validate the RoPE settings stored above; the two mrope keys are
        # explicitly excluded from validation.
        rope_config_validation(
            self, ignore_keys={"mrope_section", "mrope_interleaved"}
        )

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        # `None` stands for "no MLP-only layers"; a fresh list is created per
        # instance in that case.
        if mlp_only_layers is None:
            mlp_only_layers = []
        self.mlp_only_layers = mlp_only_layers

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
464
+
465
+
466
class Qwen3VLMoeVisionConfig(PretrainedConfig):
    """Configuration holder for the vision part of a Qwen3-VL-MOE model.

    Every named constructor argument is stored as an attribute of the same
    name. Any remaining keyword arguments are forwarded to
    `PretrainedConfig.__init__`, which runs before the attributes are set.

    Args:
        depth (`int`, *optional*, defaults to 27)
        hidden_size (`int`, *optional*, defaults to 1152)
        hidden_act (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`)
        intermediate_size (`int`, *optional*, defaults to 4304)
        num_heads (`int`, *optional*, defaults to 16)
        in_channels (`int`, *optional*, defaults to 3)
        patch_size (`int`, *optional*, defaults to 16)
        spatial_merge_size (`int`, *optional*, defaults to 2)
        temporal_patch_size (`int`, *optional*, defaults to 2)
        out_hidden_size (`int`, *optional*, defaults to 3584)
        num_position_embeddings (`int`, *optional*, defaults to 2304)
        deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[8, 16, 24]`):
            Stored as a fresh list copy (or `None` if `None` is passed).
        initializer_range (`float`, *optional*, defaults to 0.02)
    """

    model_type = "qwen3_vl_moe"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=[8, 16, 24],
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range
        # The signature default is a single list object created at function
        # definition time. Copy it so that mutating one config's indexes can
        # never leak into other instances (or into the default itself).
        self.deepstack_visual_indexes = (
            None
            if deepstack_visual_indexes is None
            else list(deepstack_visual_indexes)
        )
502
+
503
+
504
class Qwen3VLMoeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a
    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`):
            The config object or dictionary of the text backbone. A dictionary is expanded into a
            `Qwen3VLMoeTextConfig`, `None` creates a default one, and a config object is stored as given.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeVisionConfig`):
            The config object or dictionary of the vision backbone. A dictionary is expanded into a
            `Qwen3VLMoeVisionConfig`, `None` creates a default one, and a config object is stored as given.
        image_token_id (`int`, *optional*, defaults to 151655):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151656):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 151652):
            The start token index to encode the image prompt.
        vision_end_token_id (`int`, *optional*, defaults to 151653):
            The end token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings.

    ```python
    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig

    >>> # Initializing a Qwen3-VL-MOE style configuration
    >>> configuration = Qwen3VLMoeConfig()

    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_vl_moe"
    sub_configs = {
        "vision_config": Qwen3VLMoeVisionConfig,
        "text_config": Qwen3VLMoeTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        vision_start_token_id=151652,
        vision_end_token_id=151653,
        tie_word_embeddings=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object (the documented non-dict form).
            # Previously this case fell through and left the attribute unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            # Same as above: keep a caller-supplied config object as is.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
@@ -1,7 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
3
  import logging
4
- from typing import Generator, List, Optional, Tuple
4
+ from typing import Generator, Optional, Tuple
5
5
  from urllib.parse import urlparse
6
6
 
7
7
  import torch
@@ -35,6 +35,7 @@ class GrammarStats:
35
35
  is_cache_hit: bool = False
36
36
  is_grammar_aborted: bool = False
37
37
  tree_traversal_time: List[float] = field(default_factory=list)
38
+ dispatch_type: Optional[str] = None
38
39
 
39
40
 
40
41
  class BaseGrammarObject:
@@ -223,13 +224,17 @@ def create_grammar_backend(
223
224
  eos_list = list(eos_token_ids) if eos_token_ids else None
224
225
 
225
226
  grammar_backend = XGrammarGrammarBackend(
226
- tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
227
+ tokenizer,
228
+ vocab_size=vocab_size,
229
+ model_eos_token_ids=eos_list,
230
+ any_whitespace=not server_args.constrained_json_disable_any_whitespace,
227
231
  )
228
232
  elif name == "llguidance":
229
233
  from sglang.srt.constrained.llguidance_backend import GuidanceBackend
230
234
 
231
235
  grammar_backend = GuidanceBackend(
232
236
  tokenizer=tokenizer,
237
+ any_whitespace=not server_args.constrained_json_disable_any_whitespace,
233
238
  whitespace_pattern=server_args.constrained_json_whitespace_pattern,
234
239
  )
235
240
  elif name == "none":
@@ -32,6 +32,7 @@ from sglang.srt.constrained.base_grammar_backend import (
32
32
  BaseGrammarBackend,
33
33
  BaseGrammarObject,
34
34
  )
35
+ from sglang.srt.constrained.utils import is_legacy_structural_tag
35
36
 
36
37
  logger = logging.getLogger(__name__)
37
38
 
@@ -110,12 +111,14 @@ class GuidanceBackend(BaseGrammarBackend):
110
111
  def __init__(
111
112
  self,
112
113
  tokenizer,
114
+ any_whitespace: bool = True,
113
115
  whitespace_pattern: Optional[str] = None,
114
116
  n_vocab: Optional[int] = None,
115
117
  ):
116
118
  super().__init__()
117
119
 
118
120
  self.tokenizer = tokenizer
121
+ self.any_whitespace = any_whitespace
119
122
  self.whitespace_pattern = whitespace_pattern
120
123
  self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)
121
124
 
@@ -134,6 +137,7 @@ class GuidanceBackend(BaseGrammarBackend):
134
137
  serialized_grammar = LLMatcher.grammar_from_json_schema(
135
138
  key_string,
136
139
  defaults={
140
+ "whitespace_flexible": self.any_whitespace,
137
141
  "whitespace_pattern": self.whitespace_pattern,
138
142
  },
139
143
  )
@@ -157,6 +161,7 @@ class GuidanceBackend(BaseGrammarBackend):
157
161
  def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
158
162
  try:
159
163
  structural_tag = json.loads(key_string)
164
+ assert is_legacy_structural_tag(structural_tag)
160
165
  tags = [
161
166
  StructTag(
162
167
  begin=structure["begin"],
@@ -115,7 +115,7 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
115
115
  def __init__(
116
116
  self,
117
117
  tokenizer,
118
- whitespace_pattern: bool,
118
+ whitespace_pattern: str | None,
119
119
  ):
120
120
  super().__init__()
121
121
 
@@ -37,7 +37,7 @@ except ImportError:
37
37
 
38
38
  IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
39
39
 
40
- # Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
40
+ # Env var was set in sglang.srt.server_args.ServerArgs.__post_init__
41
41
  DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
42
42
 
43
43
  logger = logging.getLogger(__name__)