sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/qwen3_vl.py (new file)
@@ -0,0 +1,586 @@
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+class Qwen3VLVisionConfig(PretrainedConfig):
+    model_type = "qwen3_vl"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3VLTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a
+    Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3VL model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Qwen3VLModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+            and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen3VLTextModel, Qwen3VLTextConfig
+
+    >>> # Initializing a Qwen3VL style configuration
+    >>> configuration = Qwen3VLTextConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-7B style configuration
+    >>> model = Qwen3VLTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_text"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLModel`]. It is used to instantiate a
+    Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLForConditionalGeneration, Qwen3VLConfig
+
+    >>> # Initializing a Qwen3-VL style configuration
+    >>> configuration = Qwen3VLConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-4B style configuration
+    >>> model = Qwen3VLForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl"
+    sub_configs = {
+        "vision_config": Qwen3VLVisionConfig,
+        "text_config": Qwen3VLTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+class Qwen3VLMoeTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3VLMoe model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`Qwen3VLMoeModel`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `16`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 60):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
+            Indicates which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock.
+            The list contains layer indices, from 0 to num_layers-1 if we have num_layers layers.
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+            and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        head_dim (`int`, *optional*):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3VLMoe style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3VLMoe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=4,
+        num_experts=60,
+        norm_topk_prob=True,
+        mlp_only_layers=None,
+        rope_scaling=None,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.head_dim = head_dim or hidden_size // num_attention_heads
+
+        rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLMoeVisionConfig(PretrainedConfig):
+    model_type = "qwen3_vl_moe"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3VLMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3-VL-MOE style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe"
+    sub_configs = {
+        "vision_config": Qwen3VLMoeVisionConfig,
+        "text_config": Qwen3VLMoeTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+__all__ = [
+    "Qwen3VLMoeConfig",
+    "Qwen3VLMoeVisionConfig",
+    "Qwen3VLConfig",
+    "Qwen3VLVisionConfig",
+]
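
A note on usage: the composite configs above accept sub-configs either as objects or as plain dicts, falling back to defaults when a sub-config is omitted. The following minimal sketch (not part of the diff) exercises those code paths; the class names and defaults come from the new module above, while the override values are purely illustrative and assume sglang 0.5.3rc2 is installed.

```python
# Minimal sketch: instantiating the new Qwen3-VL config classes directly.
from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLMoeConfig

# Sub-configs may be given as plain dicts; anything omitted keeps its default.
cfg = Qwen3VLConfig(
    text_config={"hidden_size": 2048, "num_hidden_layers": 28},
    vision_config={"depth": 24, "out_hidden_size": 2048},
)
assert cfg.text_config.hidden_size == 2048
assert cfg.vision_config.deepstack_visual_indexes == [8, 16, 24]  # default kept

# The MoE variant derives head_dim when it is not given explicitly
# (hidden_size // num_attention_heads, i.e. 2048 // 16 with the defaults).
moe = Qwen3VLMoeConfig(text_config={"num_experts": 128, "num_experts_per_tok": 8})
assert moe.text_config.head_dim == 2048 // 16
```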
sglang/srt/connector/__init__.py
@@ -9,6 +9,7 @@ from sglang.srt.connector.base_connector import (
     BaseKVConnector,
 )
 from sglang.srt.connector.redis import RedisConnector
+from sglang.srt.connector.remote_instance import RemoteInstanceConnector
 from sglang.srt.connector.s3 import S3Connector
 from sglang.srt.utils import parse_connector_type
 
@@ -18,14 +19,17 @@ logger = logging.getLogger(__name__)
 class ConnectorType(str, enum.Enum):
     FS = "filesystem"
     KV = "KV"
+    INSTANCE = "instance"
 
 
-def create_remote_connector(url, **kwargs) -> BaseConnector:
+def create_remote_connector(url, device, **kwargs) -> BaseConnector:
     connector_type = parse_connector_type(url)
     if connector_type == "redis":
         return RedisConnector(url)
     elif connector_type == "s3":
         return S3Connector(url)
+    elif connector_type == "instance":
+        return RemoteInstanceConnector(url, device)
     else:
         raise ValueError(f"Invalid connector type: {url}")
 
@@ -35,6 +39,8 @@ def get_connector_type(client: BaseConnector) -> ConnectorType:
         return ConnectorType.KV
     if isinstance(client, BaseFileConnector):
         return ConnectorType.FS
+    if isinstance(client, RemoteInstanceConnector):
+        return ConnectorType.INSTANCE
 
     raise ValueError(f"Invalid connector type: {client}")
 
@@ -44,6 +50,7 @@ __all__ = [
     "BaseFileConnector",
     "BaseKVConnector",
     "RedisConnector",
+    "RemoteInstanceConnector",
    "S3Connector",
    "ConnectorType",
    "create_remote_connector",