sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,613 @@
1
+ from transformers import PretrainedConfig
2
+ from transformers.configuration_utils import layer_type_validation
3
+ from transformers.modeling_rope_utils import rope_config_validation
4
+
5
+ from sglang.utils import logger
6
+
7
+
8
class Qwen3OmniMoeAudioEncoderConfig(PretrainedConfig):
    """Configuration holder for the ``qwen3_omni_moe_audio_encoder`` model type.

    Each constructor argument is stored unchanged as an attribute of the same
    name. ``num_hidden_layers`` is additionally set to the value of
    ``encoder_layers``. Any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "qwen3_omni_moe_audio_encoder"

    def __init__(
        self,
        num_mel_bins=128,
        encoder_layers=32,
        encoder_attention_heads=20,
        encoder_ffn_dim=5120,
        d_model=1280,
        dropout=0,
        attention_dropout=0,
        activation_function="gelu",
        activation_dropout=0,
        scale_embedding=False,
        initializer_range=0.02,
        max_source_positions=1500,
        n_window=100,
        output_dim=3584,
        n_window_infer=400,
        conv_chunksize=500,
        downsample_hidden_size=480,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # (attribute name, value) pairs, applied top to bottom.
        stored_fields = (
            ("num_mel_bins", num_mel_bins),
            ("d_model", d_model),
            ("encoder_layers", encoder_layers),
            ("encoder_attention_heads", encoder_attention_heads),
            ("encoder_ffn_dim", encoder_ffn_dim),
            ("dropout", dropout),
            ("attention_dropout", attention_dropout),
            ("activation_function", activation_function),
            ("activation_dropout", activation_dropout),
            # Alias: the layer count is also exposed under the generic name.
            ("num_hidden_layers", encoder_layers),
            ("initializer_range", initializer_range),
            # scale factor will be sqrt(d_model) if True
            ("scale_embedding", scale_embedding),
            ("max_source_positions", max_source_positions),
            ("n_window", n_window),
            ("output_dim", output_dim),
            ("n_window_infer", n_window_infer),
            ("conv_chunksize", conv_chunksize),
            ("downsample_hidden_size", downsample_hidden_size),
        )
        for field_name, field_value in stored_fields:
            setattr(self, field_name, field_value)
54
+
55
+
56
class Qwen3OmniMoeVisionEncoderConfig(PretrainedConfig):
    """Configuration holder for the ``qwen3_omni_moe_vision_encoder`` model type.

    Each constructor argument is stored as an attribute of the same name.
    Any extra keyword arguments are forwarded to ``PretrainedConfig.__init__``.

    Args:
        deepstack_visual_indexes: Sequence of indexes stored on the config.
            When omitted (or ``None``) a fresh ``[8, 16, 24]`` list is created
            for this instance, so mutating one config's list never affects
            another config built with the default.
    """

    model_type = "qwen3_omni_moe_vision_encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # A list literal in the signature would be evaluated once and shared by
        # every default-constructed instance; build a new list per call instead.
        if deepstack_visual_indexes is None:
            deepstack_visual_indexes = [8, 16, 24]

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range
        self.deepstack_visual_indexes = deepstack_visual_indexes
92
+
93
+
94
class Qwen3OmniMoeTextConfig(PretrainedConfig):
    """Configuration holder for the ``qwen3_omni_moe_text`` model type.

    Constructor arguments are stored as attributes of the same name.
    ``tie_word_embeddings`` and any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``. After the rotary-embedding related
    attributes are set, ``rope_config_validation`` is invoked on the instance.
    ``mlp_only_layers`` defaults to a new empty list when not supplied.
    """

    model_type = "qwen3_omni_moe_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3OmniMoeText`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=3584,
        hidden_size=2048,
        intermediate_size=18944,
        num_hidden_layers=28,
        num_attention_heads=28,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        rope_scaling=None,
        attention_bias=False,
        sliding_window=None,
        attention_dropout=0,
        decoder_sparse_step=1,
        moe_intermediate_size=768,
        num_experts_per_tok=8,
        num_experts=128,
        norm_topk_prob=True,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # --- Model dimensions ---
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # --- Attention / normalisation / rotary settings ---
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Backward compatibility: a legacy 'type' entry is mirrored into
        # 'rope_type' (the dict is updated in place) before the rotary
        # position embedding parameters are validated.
        scaling = self.rope_scaling
        if scaling is not None and "type" in scaling:
            scaling["rope_type"] = scaling["type"]
        rope_config_validation(self)

        # --- MoE arguments ---
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        if mlp_only_layers is None:
            mlp_only_layers = []
        self.mlp_only_layers = mlp_only_layers
182
+
183
+
184
class Qwen3OmniMoeThinkerConfig(PretrainedConfig):
    """Configuration holder for the ``qwen3_omni_moe_thinker`` model type.

    Bundles three sub-configurations (``vision_config``, ``audio_config`` and
    ``text_config``). Each may be passed as a ready-made config object, as a
    ``dict`` of constructor keyword arguments, or omitted / ``None`` to get
    that sub-config class built with its defaults. The remaining arguments are
    stored as attributes; extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "qwen3_omni_moe_thinker"
    attribute_map = {
        "image_token_id": "image_token_index",
        "video_token_id": "video_token_index",
        "audio_token_id": "audio_token_index",
    }
    sub_configs = {
        "audio_config": Qwen3OmniMoeAudioEncoderConfig,
        "vision_config": Qwen3OmniMoeVisionEncoderConfig,
        "text_config": Qwen3OmniMoeTextConfig,
    }

    def __init__(
        self,
        audio_config=None,
        vision_config=None,
        text_config=None,
        audio_token_id=151646,
        image_token_id=151655,
        video_token_id=151656,
        position_id_per_seconds=25,
        audio_start_token_id=151647,
        user_token_id=872,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.user_token_id = user_token_id
        self.position_id_per_seconds = position_id_per_seconds
        self.audio_start_token_id = audio_start_token_id
        self.initializer_range = initializer_range

        # Normalise each sub-config: None -> default instance, dict -> instance
        # built from its entries, anything else is stored as given.
        sub_config_specs = (
            ("vision_config", Qwen3OmniMoeVisionEncoderConfig, vision_config),
            ("audio_config", Qwen3OmniMoeAudioEncoderConfig, audio_config),
            ("text_config", Qwen3OmniMoeTextConfig, text_config),
        )
        for attr_name, config_cls, supplied in sub_config_specs:
            if supplied is None:
                supplied = config_cls()
            elif isinstance(supplied, dict):
                supplied = config_cls(**supplied)
            setattr(self, attr_name, supplied)

        self.audio_token_id = audio_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
237
+
238
+
239
class Qwen3OmniMoeTalkerCodePredictorConfig(PretrainedConfig):
    """Configuration of the talker code-predictor model.

    Stores the transformer hyper-parameters passed to ``__init__`` as
    attributes of the same name, validates the rotary-embedding settings and
    derives ``layer_types`` when none is supplied.
    """

    model_type = "qwen3_omni_moe_talker_code_predictor"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerCodePredictor`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=2048,
        hidden_size=1024,
        intermediate_size=3072,
        num_hidden_layers=5,
        num_attention_heads=16,
        num_key_value_heads=8,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=0.000001,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000,
        rope_scaling=None,
        attention_bias=False,
        sliding_window=None,
        layer_types=None,
        attention_dropout=0,
        num_code_groups=32,
        max_window_layers=28,
        **kwargs,
    ):
        """Store the hyper-parameters and derive the per-layer attention types.

        Args:
            num_key_value_heads: When ``None``, falls back to
                ``num_attention_heads``.
            rope_scaling: Optional dict; a legacy ``"type"`` key is copied to
                ``"rope_type"`` in place before validation.
            sliding_window: When not ``None`` and ``layer_types`` is ``None``,
                layers with index ``>= max_window_layers`` are marked
                ``"sliding_attention"``.
            layer_types: Explicit per-layer attention types; derived when
                ``None``.
            max_window_layers: First layer index that uses sliding attention
                when ``layer_types`` is derived. Appended as the last named
                parameter so existing positional calls keep their meaning.
                NOTE(review): the default of 28 is believed to match the
                upstream Qwen3 text config; confirm against the checkpoint.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.

        All other arguments are stored unchanged as attributes of the same
        name.
        """
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        # Previously this attribute only existed if the caller happened to pass
        # it through **kwargs, so deriving layer_types with a sliding window
        # configured raised AttributeError.
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            # Without a sliding window every layer uses full attention.
            self.layer_types = [
                (
                    "sliding_attention"
                    if self.sliding_window is not None and i >= self.max_window_layers
                    else "full_attention"
                )
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)
        self.num_code_groups = num_code_groups
328
+
329
+
330
class Qwen3OmniMoeTalkerTextConfig(PretrainedConfig):
    """Configuration of the talker text model, including its MoE settings.

    Every constructor argument is stored as an attribute of the same name;
    the rotary-embedding settings are validated during construction.
    """

    model_type = "qwen3_omni_moe_talker_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.*.gate_proj": "colwise",
        "layers.*.mlp.experts.*.up_proj": "colwise",
        "layers.*.mlp.experts.*.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=3072,
        hidden_size=1024,
        intermediate_size=2048,
        num_hidden_layers=20,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=0.000001,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000,
        rope_scaling=None,
        attention_bias=False,
        sliding_window=None,
        attention_dropout=0,
        decoder_sparse_step=1,
        moe_intermediate_size=384,
        num_experts_per_tok=8,
        num_experts=128,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        """Store the dense and MoE hyper-parameters on the instance.

        ``tie_word_embeddings`` and ``**kwargs`` go to the parent
        initializer. A legacy ``"type"`` key in ``rope_scaling`` is copied to
        ``"rope_type"`` in place before ``rope_config_validation`` runs, and
        ``mlp_only_layers=None`` is stored as an empty list.
        """
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Backward compatibility: older configs spell the rope kind as "type";
        # mirror it under "rope_type" before validating.
        scaling = self.rope_scaling
        if scaling is not None and "type" in scaling:
            scaling["rope_type"] = scaling["type"]
        rope_config_validation(self)

        # Mixture-of-experts settings.
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        if mlp_only_layers is None:
            mlp_only_layers = []
        self.mlp_only_layers = mlp_only_layers
419
+
420
+
421
class Qwen3OmniMoeTalkerConfig(PretrainedConfig):
    """Composite talker configuration.

    Holds a code-predictor sub-config and a talker text sub-config together
    with the codec and multimodal token ids passed to ``__init__``.
    """

    sub_configs = {
        "code_predictor_config": Qwen3OmniMoeTalkerCodePredictorConfig,
        "text_config": Qwen3OmniMoeTalkerTextConfig,
    }

    def __init__(
        self,
        code_predictor_config=None,
        text_config=None,
        num_code_groups=32,
        thinker_hidden_size=2048,
        codec_eos_token_id=4198,
        accept_hidden_layer=18,
        codec_nothink_id=4203,
        codec_think_bos_id=4204,
        codec_think_eos_id=4205,
        codec_pad_id=4196,
        codec_bos_id=4197,
        audio_token_id=151646,
        image_token_id=151655,
        video_token_id=151656,
        vision_start_token_id=151652,
        position_id_per_seconds=25,
        audio_start_token_id=151669,
        speaker_id=None,
        **kwargs,
    ):
        """Normalize both sub-configs, then store the remaining settings.

        Each sub-config argument may be an instance of its config class
        (stored as given), ``None`` (a default instance is built and an info
        message is logged), or a mapping expanded as keyword arguments of the
        config class. ``**kwargs`` is forwarded to the parent initializer.
        """
        super().__init__(**kwargs)

        # Code-predictor sub-config.
        if isinstance(code_predictor_config, Qwen3OmniMoeTalkerCodePredictorConfig):
            self.code_predictor_config = code_predictor_config
        elif code_predictor_config is None:
            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig()
            logger.info(
                "code_predictor_config is None. Initializing code_predictor_config model with default values"
            )
        else:
            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(
                **code_predictor_config
            )

        # Talker text sub-config.
        if isinstance(text_config, Qwen3OmniMoeTalkerTextConfig):
            self.text_config = text_config
        elif text_config is None:
            self.text_config = Qwen3OmniMoeTalkerTextConfig()
            logger.info(
                "talker text_config is None. Initializing talker text model with default values"
            )
        else:
            self.text_config = Qwen3OmniMoeTalkerTextConfig(**text_config)

        # Codec-related settings.
        self.num_code_groups = num_code_groups
        self.thinker_hidden_size = thinker_hidden_size
        self.codec_eos_token_id = codec_eos_token_id
        self.accept_hidden_layer = accept_hidden_layer
        self.codec_nothink_id = codec_nothink_id
        self.codec_think_bos_id = codec_think_bos_id
        self.codec_think_eos_id = codec_think_eos_id
        self.codec_pad_id = codec_pad_id
        self.codec_bos_id = codec_bos_id

        # Multimodal token ids and timing.
        self.audio_token_id = audio_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.position_id_per_seconds = position_id_per_seconds
        self.audio_start_token_id = audio_start_token_id
        self.vision_start_token_id = vision_start_token_id
        self.speaker_id = speaker_id
490
+
491
+
492
class Qwen3OmniMoeCode2WavConfig(PretrainedConfig):
    """Configuration of the code2wav model.

    All constructor arguments are stored unchanged as attributes of the same
    name; ``layer_types`` is computed on access.
    """

    def __init__(
        self,
        codebook_size=2048,
        hidden_size=1024,
        max_position_embeddings=8000,
        rope_theta=10000,
        num_attention_heads=16,
        num_key_value_heads=16,
        attention_bias=False,
        sliding_window=72,
        intermediate_size=3072,
        hidden_act="silu",
        layer_scale_initial_scale=0.01,
        rms_norm_eps=1e-5,
        num_hidden_layers=8,
        num_quantizers=16,
        upsample_rates=(8, 5, 4, 3),
        upsampling_ratios=(2, 2),
        decoder_dim=1536,
        attention_dropout=0.0,
        **kwargs,
    ):
        """Record the hyper-parameters; ``**kwargs`` goes to the parent."""
        super().__init__(**kwargs)

        # Transformer settings.
        self.codebook_size = codebook_size
        self.hidden_size = hidden_size
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.attention_bias = attention_bias
        self.sliding_window = sliding_window
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.layer_scale_initial_scale = layer_scale_initial_scale
        self.rms_norm_eps = rms_norm_eps
        self.num_hidden_layers = num_hidden_layers

        # Quantizer / upsampling / decoder settings.
        self.num_quantizers = num_quantizers
        self.upsample_rates = upsample_rates
        self.upsampling_ratios = upsampling_ratios
        self.decoder_dim = decoder_dim
        self.attention_dropout = attention_dropout

    @property
    def layer_types(self):
        """Return one ``"sliding_attention"`` entry per hidden layer."""
        return ["sliding_attention" for _ in range(self.num_hidden_layers)]
542
+
543
+
544
class Qwen3OmniMoeConfig(PretrainedConfig):
    """Top-level Qwen3-Omni-MoE configuration.

    Groups the thinker, talker and code2wav sub-configs and stores the
    special token ids passed to ``__init__``.
    """

    model_type = "qwen3_omni_moe"
    sub_configs = {
        "thinker_config": Qwen3OmniMoeThinkerConfig,
        "talker_config": Qwen3OmniMoeTalkerConfig,
        "code2wav_config": Qwen3OmniMoeCode2WavConfig,
    }

    def __init__(
        self,
        thinker_config=None,
        talker_config=None,
        code2wav_config=None,
        enable_audio_output=True,
        im_start_token_id=151644,
        im_end_token_id=151645,
        tts_pad_token_id=151671,
        tts_bos_token_id=151672,
        tts_eos_token_id=151673,
        system_token_id=8948,
        user_token_id=872,
        assistant_token_id=77091,
        **kwargs,
    ):
        """Build the three sub-configs and store the token ids.

        Args:
            thinker_config: ``None`` (defaults are used and an info message
                is logged), a mapping of keyword arguments for
                ``Qwen3OmniMoeThinkerConfig``, or an existing instance of it.
            talker_config: Same convention, for ``Qwen3OmniMoeTalkerConfig``.
            code2wav_config: Same convention, for
                ``Qwen3OmniMoeCode2WavConfig``.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.

        The remaining arguments are stored as attributes of the same name.
        """
        super().__init__(**kwargs)
        if thinker_config is None:
            thinker_config = {}
            logger.info(
                "thinker_config is None. Initializing thinker model with default values"
            )

        if talker_config is None:
            talker_config = {}
            logger.info(
                "talker_config is None. Initializing talker model with default values"
            )

        if code2wav_config is None:
            code2wav_config = {}
            logger.info(
                "code2wav_config is None. Initializing code2wav model with default values"
            )

        # Accept already-built sub-configs as well as mappings, consistent with
        # the thinker and talker configs; `Cls(**instance)` would raise
        # TypeError because a config object is not a mapping.
        self.thinker_config = (
            thinker_config
            if isinstance(thinker_config, Qwen3OmniMoeThinkerConfig)
            else Qwen3OmniMoeThinkerConfig(**thinker_config)
        )
        self.talker_config = (
            talker_config
            if isinstance(talker_config, Qwen3OmniMoeTalkerConfig)
            else Qwen3OmniMoeTalkerConfig(**talker_config)
        )
        self.code2wav_config = (
            code2wav_config
            if isinstance(code2wav_config, Qwen3OmniMoeCode2WavConfig)
            else Qwen3OmniMoeCode2WavConfig(**code2wav_config)
        )
        self.enable_audio_output = enable_audio_output
        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id
        self.tts_pad_token_id = tts_pad_token_id
        self.tts_bos_token_id = tts_bos_token_id
        self.tts_eos_token_id = tts_eos_token_id
        self.system_token_id = system_token_id
        self.user_token_id = user_token_id
        self.assistant_token_id = assistant_token_id

    def get_text_config(self, decoder=False) -> "PretrainedConfig":
        """Return the text config by delegating to the thinker sub-config.

        Args:
            decoder (`Optional[bool]`, *optional*, defaults to `False`):
                Accepted for signature compatibility; this override does not
                forward it to the thinker config.
        """
        # The text settings of this composite model are nested under the
        # thinker config, so the lookup is delegated to it.
        return self.thinker_config.get_text_config()
@@ -1,5 +1,3 @@
1
- from typing import Optional, Union
2
-
3
1
  from transformers import PretrainedConfig
4
2
  from transformers.modeling_rope_utils import rope_config_validation
5
3
 
@@ -576,11 +574,3 @@ class Qwen3VLMoeConfig(PretrainedConfig):
576
574
  self.vision_start_token_id = vision_start_token_id
577
575
  self.vision_end_token_id = vision_end_token_id
578
576
  super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
579
-
580
-
581
- __all__ = [
582
- "Qwen3VLMoeConfig",
583
- "Qwen3VLMoeVisionConfig",
584
- "Qwen3VLConfig",
585
- "Qwen3VLVisionConfig",
586
- ]
@@ -1,7 +1,7 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
 
3
3
  import logging
4
- from typing import Generator, List, Optional, Tuple
4
+ from typing import Generator, Optional, Tuple
5
5
  from urllib.parse import urlparse
6
6
 
7
7
  import torch
@@ -224,13 +224,17 @@ def create_grammar_backend(
224
224
  eos_list = list(eos_token_ids) if eos_token_ids else None
225
225
 
226
226
  grammar_backend = XGrammarGrammarBackend(
227
- tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
227
+ tokenizer,
228
+ vocab_size=vocab_size,
229
+ model_eos_token_ids=eos_list,
230
+ any_whitespace=not server_args.constrained_json_disable_any_whitespace,
228
231
  )
229
232
  elif name == "llguidance":
230
233
  from sglang.srt.constrained.llguidance_backend import GuidanceBackend
231
234
 
232
235
  grammar_backend = GuidanceBackend(
233
236
  tokenizer=tokenizer,
237
+ any_whitespace=not server_args.constrained_json_disable_any_whitespace,
234
238
  whitespace_pattern=server_args.constrained_json_whitespace_pattern,
235
239
  )
236
240
  elif name == "none":
@@ -32,6 +32,7 @@ from sglang.srt.constrained.base_grammar_backend import (
32
32
  BaseGrammarBackend,
33
33
  BaseGrammarObject,
34
34
  )
35
+ from sglang.srt.constrained.utils import is_legacy_structural_tag
35
36
 
36
37
  logger = logging.getLogger(__name__)
37
38
 
@@ -110,12 +111,14 @@ class GuidanceBackend(BaseGrammarBackend):
110
111
  def __init__(
111
112
  self,
112
113
  tokenizer,
114
+ any_whitespace: bool = True,
113
115
  whitespace_pattern: Optional[str] = None,
114
116
  n_vocab: Optional[int] = None,
115
117
  ):
116
118
  super().__init__()
117
119
 
118
120
  self.tokenizer = tokenizer
121
+ self.any_whitespace = any_whitespace
119
122
  self.whitespace_pattern = whitespace_pattern
120
123
  self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)
121
124
 
@@ -134,6 +137,7 @@ class GuidanceBackend(BaseGrammarBackend):
134
137
  serialized_grammar = LLMatcher.grammar_from_json_schema(
135
138
  key_string,
136
139
  defaults={
140
+ "whitespace_flexible": self.any_whitespace,
137
141
  "whitespace_pattern": self.whitespace_pattern,
138
142
  },
139
143
  )
@@ -157,6 +161,7 @@ class GuidanceBackend(BaseGrammarBackend):
157
161
  def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
158
162
  try:
159
163
  structural_tag = json.loads(key_string)
164
+ assert is_legacy_structural_tag(structural_tag)
160
165
  tags = [
161
166
  StructTag(
162
167
  begin=structure["begin"],