sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py

@@ -88,6 +88,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
+    UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.template_manager import TemplateManager
@@ -342,10 +343,19 @@ async def get_model_info():
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
         "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
     }
     return result


+@app.get("/get_weight_version")
+async def get_weight_version():
+    """Get the current weight version."""
+    return {
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version
+    }
+
+
 @app.get("/get_server_info")
 async def get_server_info():
     # Returns interna states per DP.
@@ -501,6 +511,18 @@ async def stop_profile_async():
     )


+@app.api_route("/freeze_gc", methods=["GET", "POST"])
+async def freeze_gc_async():
+    """
+    See engine.freeze_gc for more details.
+    """
+    await _global_state.tokenizer_manager.freeze_gc()
+    return Response(
+        content="Garbage collection frozen.\n",
+        status_code=200,
+    )
+
+
 @app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
 async def start_expert_distribution_record_async():
     """Start recording the expert distribution. Clear the previous record if any."""
@@ -537,6 +559,12 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     success, message, num_paused_requests = (
         await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {
         "success": success,
         "message": message,
@@ -583,6 +611,12 @@ async def update_weights_from_tensor(
     success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
         obj, request
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     return ORJSONResponse(
         content, status_code=200 if success else HTTPStatus.BAD_REQUEST
@@ -599,6 +633,12 @@ async def update_weights_from_distributed(
             obj, request
         )
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     if success:
         return ORJSONResponse(content, status_code=200)
@@ -606,6 +646,36 @@ async def update_weights_from_distributed(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+@app.post("/update_weight_version")
+async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+    """Update the weight version. This operation requires no active requests."""
+    if obj.abort_all_requests:
+        _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+    # Use a simple approach without the complex lock mechanism for now
+    # since weight_version update is a simple operation that doesn't affect model weights
+    try:
+        # Update the weight version in server args (the single source of truth)
+        _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+        return ORJSONResponse(
+            {
+                "success": True,
+                "message": f"Weight version updated to {obj.new_version}",
+                "new_version": obj.new_version,
+            },
+            status_code=HTTPStatus.OK,
+        )
+    except Exception as e:
+        return ORJSONResponse(
+            {
+                "success": False,
+                "message": f"Failed to update weight version: {str(e)}",
+            },
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
 @app.api_route("/get_weights_by_name", methods=["GET", "POST"])
 async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
     """Get model parameter by name."""
@@ -966,6 +1036,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
     return ORJSONResponse({"predictions": ret})


+def _update_weight_version_if_provided(weight_version: Optional[str]) -> None:
+    """Update weight version if provided."""
+    if weight_version is not None:
+        _global_state.tokenizer_manager.server_args.weight_version = weight_version
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST

sglang/srt/entrypoints/openai/protocol.py

@@ -240,6 +240,7 @@ class CompletionResponse(BaseModel):
     model: str
     choices: List[CompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class CompletionResponseStreamChoice(BaseModel):
@@ -517,6 +518,7 @@ class ChatCompletionResponse(BaseModel):
     model: str
     choices: List[ChatCompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class DeltaMessage(BaseModel):
@@ -735,8 +737,8 @@ class ResponsesRequest(BaseModel):
         else:
             max_tokens = default_max_tokens

-        # Avoid exceed the context length by minus 1 token
-        max_tokens -= 1
+        # Avoid exceed the context length by minus 2 token
+        max_tokens -= 2

         # Get parameters with defaults
         temperature = self.temperature

sglang/srt/entrypoints/openai/serving_chat.py

@@ -81,12 +81,25 @@ class OpenAIServingChat(OpenAIServingBase):
                 f"This model supports at most {server_context_length} completion tokens."
             )

+        if request.response_format and request.response_format.type == "json_schema":
+            schema = getattr(request.response_format.json_schema, "schema_", None)
+            if schema is None:
+                return "schema_ is required for json_schema response format request."
+
         return None

     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
+        reasoning_effort = (
+            request.chat_template_kwargs.pop("reasoning_effort", None)
+            if request.chat_template_kwargs
+            else None
+        )
+        if reasoning_effort is not None:
+            request.reasoning_effort = reasoning_effort
+
         """Convert OpenAI chat completion request to internal format"""
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

@@ -723,6 +736,7 @@ class OpenAIServingChat(OpenAIServingBase):
             model=request.model,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _process_logprobs_tokens(
@@ -858,12 +872,15 @@
         Returns:
             The boolean value of 'enable_thinking' if found, otherwise False.
         """
-        if (
-            hasattr(request, "chat_template_kwargs")
-            and request.chat_template_kwargs
-            and request.chat_template_kwargs.get("enable_thinking") is not None
-        ):
-            return request.chat_template_kwargs.get("enable_thinking")
+        if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
+            # For Qwen3 models, `enable_thinking` is supported.
+            if request.chat_template_kwargs.get("enable_thinking") is not None:
+                return request.chat_template_kwargs.get("enable_thinking")
+            # For DeepSeek-V3.1 models, `thinking` is supported.
+            elif request.chat_template_kwargs.get("thinking") is not None:
+                return request.chat_template_kwargs.get("thinking")
+            else:
+                return False
         return False

     async def _process_tool_call_stream(
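
Client-side, the new response metadata and the extended chat_template_kwargs handling surface through the OpenAI-compatible HTTP API. A hedged sketch, assuming a local server on the default port, a placeholder model name, and a chat template that reads the thinking flag:

import requests

BASE_URL = "http://localhost:30000"  # assumption: default local sglang server

payload = {
    "model": "default",  # placeholder; use the served model name
    "messages": [{"role": "user", "content": "Hello"}],
    # "thinking" is now honored alongside "enable_thinking"; a "reasoning_effort"
    # entry would be popped and forwarded as request.reasoning_effort.
    "chat_template_kwargs": {"thinking": True},
}
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload).json()

# Non-streaming responses now carry a top-level metadata block.
print(resp["metadata"]["weight_version"])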

sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,6 +1,6 @@
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -41,6 +41,14 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _request_id_prefix(self) -> str:
         return "cmpl-"

+    def _validate_request(self, request: CompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        prompt = request.prompt
+        if not prompt or (isinstance(prompt, list) and all(not p for p in prompt)):
+            return "Prompt cannot be empty"
+
+        return None
+
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
@@ -373,6 +381,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             created=created,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _get_echo_text(self, request: CompletionRequest, index: int) -> str:
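
The new _validate_request rejects empty prompts. Restated as a standalone predicate (hypothetical helper name, same logic as above):

from typing import Union

def prompt_is_empty(prompt: Union[str, list]) -> bool:
    # Falsy prompt, or a list whose elements are all falsy, is rejected.
    return not prompt or (isinstance(prompt, list) and all(not p for p in prompt))

assert prompt_is_empty("") and prompt_is_empty([]) and prompt_is_empty(["", ""])
assert not prompt_is_empty("Hi") and not prompt_is_empty([1, 2, 3])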

sglang/srt/entrypoints/openai/serving_responses.py

@@ -944,7 +944,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         type="output_text",
                         text="",
                         annotations=[],
-                        logprobs=[],
+                        logprobs=None,
                     ),
                 )
             )
@@ -992,7 +992,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         type="output_text",
                         text="",
                         annotations=[],
-                        logprobs=[],
+                        logprobs=None,
                     ),
                 )
             )

sglang/srt/eplb/expert_distribution.py

@@ -25,7 +25,6 @@ import torch
 import torch.distributed

 from sglang.srt.eplb.expert_location import ExpertLocationMetadata
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import Withable, get_bool_env_var
@@ -288,14 +287,14 @@ class _SinglePassGatherer(ABC):
         )

         if server_args.expert_distribution_recorder_mode == "stat_approx":
-            if server_args.moe_a2a_backend is not None and (
+            if server_args.moe_a2a_backend != "none" and (
                 server_args.deepep_mode == "normal"
             ):
                 return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
             else:
                 raise NotImplementedError

-        if server_args.moe_a2a_backend is not None:
+        if server_args.moe_a2a_backend != "none":
             if server_args.deepep_mode == "normal":
                 return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":

sglang/srt/function_call/deepseekv3_detector.py

@@ -215,6 +215,6 @@ class DeepSeekV3Detector(BaseFormatDetector):
             sequence_start_token=self.bot_token,
             sequence_end_token=self.eot_token,
             tool_call_separator="",
-            call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n" {arguments_rule} "\\n```<|tool▁call▁end|>"',
+            call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n"{arguments_rule}"\\n```<|tool▁call▁end|>"',
             function_format="json",
         )

sglang/srt/hf_transformers_utils.py

@@ -129,6 +129,25 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+    if (
+        config.architectures is not None
+        and config.architectures[0] == "Phi4MMForCausalLM"
+    ):
+        # Phi4MMForCausalLM uses a hard-coded vision_config. See:
+        # https://github.com/vllm-project/vllm/blob/6071e989df1531b59ef35568f83f7351afb0b51e/vllm/model_executor/models/phi4mm.py#L71
+        # We set it here to support cases where num_attention_heads is not divisible by the TP size.
+        from transformers import SiglipVisionConfig
+
+        vision_config = {
+            "hidden_size": 1152,
+            "image_size": 448,
+            "intermediate_size": 4304,
+            "model_type": "siglip_vision_model",
+            "num_attention_heads": 16,
+            "num_hidden_layers": 26,  # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
+            "patch_size": 14,
+        }
+        config.vision_config = SiglipVisionConfig(**vision_config)
     text_config = get_hf_text_config(config=config)

     if isinstance(model, str) and text_config is not None:
@@ -244,6 +263,11 @@ def get_tokenizer(
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_name.endswith(".json"):
+        from sglang.srt.tokenizer.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(tokenizer_name)
+
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")

sglang/srt/host_shared_memory.py (new file)

@@ -0,0 +1,83 @@
+import logging
+import os
+from dataclasses import dataclass
+from multiprocessing import shared_memory
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed.naive_distributed import get_naive_distributed
+from sglang.srt.utils import check_cuda_result
+
+logger = logging.getLogger(__name__)
+
+
+class HostSharedMemoryManager:
+    def __init__(self, base_name: str):
+        self._base_name = Path(base_name)
+        self._operation_index = 0
+        self._records: List[_Record] = []
+
+    def malloc(self, *, shape, dtype):
+        meta_tensor = torch.empty(size=shape, dtype=dtype, device="meta")
+        raw = self._malloc_raw(num_bytes=meta_tensor.nbytes)
+        return raw.view(dtype).view(*shape)
+
+    def _malloc_raw(self, *, num_bytes: int) -> torch.Tensor:
+        import cuda.bindings.runtime as cuda_rt
+
+        self._operation_index += 1
+        shm_name = f"{self._base_name}_op{self._operation_index}"
+
+        # TODO handle dispose
+        if get_naive_distributed().get_rank() == 0:
+            shm = shared_memory.SharedMemory(name=shm_name, create=True, size=num_bytes)
+
+        get_naive_distributed().barrier()
+
+        if get_naive_distributed().get_rank() != 0:
+            shm = shared_memory.SharedMemory(name=shm_name)
+
+        np_array = np.ndarray((num_bytes,), dtype=np.uint8, buffer=shm.buf)
+        tensor = torch.from_numpy(np_array)
+
+        check_cuda_result(
+            cuda_rt.cudaHostRegister(
+                tensor.data_ptr(), num_bytes, cuda_rt.cudaHostRegisterPortable
+            )
+        )
+
+        get_naive_distributed().barrier()
+
+        self._records.append(
+            _Record(
+                shm=shm,
+                np_array=np_array,
+                tensor=tensor,
+            )
+        )
+        return tensor
+
+
+@dataclass
+class _Record:
+    shm: shared_memory.SharedMemory
+    np_array: np.ndarray
+    tensor: torch.Tensor
+
+
+# Can have multi instances if needed
+_instance: Optional[HostSharedMemoryManager] = None
+
+
+def get_host_shared_memory_manager():
+    assert _instance is not None
+    return _instance
+
+
+def set_host_shared_memory_manager(instance: HostSharedMemoryManager):
+    global _instance
+    assert _instance is None
+    _instance = instance
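
The allocation path above sizes buffers with a meta tensor and reinterprets raw bytes as the requested dtype and shape. A standalone, single-process illustration of that trick (no shared memory or cudaHostRegister involved):

import numpy as np
import torch

shape, dtype = (4, 8), torch.float16
# Size the allocation without materializing data, as malloc() does.
num_bytes = torch.empty(size=shape, dtype=dtype, device="meta").nbytes  # 4 * 8 * 2 = 64

buf = np.zeros(num_bytes, dtype=np.uint8)  # stand-in for shm.buf
raw = torch.from_numpy(buf)                # 1-D uint8 view of the raw buffer
tensor = raw.view(dtype).view(*shape)      # reinterpret bytes, then reshape
assert tensor.shape == torch.Size(shape) and tensor.dtype == dtype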

sglang/srt/layers/attention/ascend_backend.py

@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional

 import torch
 import torch_npu
@@ -27,6 +27,7 @@ class ForwardMetadata:
     # seq len inputs
     extend_seq_lens_cpu_int: Optional[torch.Tensor] = None
     seq_lens_cpu_int: Optional[torch.Tensor] = None
+    seq_lens_cpu_list: Optional[List[int]] = None


 class AscendAttnBackend(AttentionBackend):
@@ -51,7 +52,7 @@ class AscendAttnBackend(AttentionBackend):

     def __init__(self, model_runner: ModelRunner):
         super().__init__()
-        self.forward_metadata = ForwardMetadata()
+        self.forward_metadata = None
         self.device = model_runner.device
         self.gen_attention_mask(128, model_runner.dtype)
         self.page_size = model_runner.page_size
@@ -60,9 +61,15 @@ class AscendAttnBackend(AttentionBackend):
         self.kv_lora_rank = model_runner.model_config.kv_lora_rank
         self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
         self.native_attn = TorchNativeAttnBackend(model_runner)
+        self.graph_metadata = {}
+        self.max_context_len = model_runner.model_config.context_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.graph_mode = False

     def init_forward_metadata(self, forward_batch: ForwardBatch):
         """Init the metadata for a forward pass."""
+        self.forward_metadata = ForwardMetadata()
+
         self.forward_metadata.block_tables = (
             forward_batch.req_to_token_pool.req_to_token[
                 forward_batch.req_pool_indices, : forward_batch.seq_lens.max()
@@ -75,6 +82,63 @@ class AscendAttnBackend(AttentionBackend):
         )
         self.forward_metadata.seq_lens_cpu_int = forward_batch.seq_lens_cpu.int()

+        self.graph_mode = False
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.graph_metadata = {
+            "block_tables": torch.empty(
+                (max_bs, self.max_context_len // self.page_size),
+                dtype=torch.int32,
+                device=self.device,
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        metadata = ForwardMetadata()
+
+        metadata.block_tables = self.graph_metadata["block_tables"][:bs, :]
+        metadata.seq_lens_cpu_list = seq_lens.cpu().int().tolist()
+
+        self.graph_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        metadata = self.graph_metadata[bs]
+        max_len = seq_lens_cpu[:bs].max().item()
+        max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+
+        metadata.block_tables[:bs, :max_seq_pages].copy_(
+            self.req_to_token[req_pool_indices[:bs], :max_len][:, :: self.page_size]
+            // self.page_size
+        )
+        metadata.block_tables[:bs, max_seq_pages:].fill_(0)
+        metadata.block_tables[bs:, :].fill_(0)
+
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
     def get_cuda_graph_seq_len_fill_value(self):
         return 1

@@ -167,28 +231,74 @@
                 layer, forward_batch.out_cache_loc, k, v
             )
         if not self.use_mla:
-            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
-            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+            if self.graph_mode:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
+                query = q.view(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                workspace = (
+                    torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        query,
+                        k_cache,
+                        v_cache,
+                        block_table=self.forward_metadata.block_tables,
+                        block_size=self.page_size,
+                        num_heads=layer.tp_q_head_num,
+                        num_key_value_heads=layer.tp_k_head_num,
+                        input_layout="BSH",
+                        scale=layer.scaling,
+                        actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    )
+                )
+                output = torch.empty(
+                    (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+                torch_npu.npu_fused_infer_attention_score.out(
+                    query,
+                    k_cache,
+                    v_cache,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSH",
+                    scale=layer.scaling,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    workspace=workspace,
+                    out=[output, softmax_lse],
+                )
+            else:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                )

-            query = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
-            num_tokens = query.shape[0]
-            output = torch.empty(
-                (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
-                dtype=query.dtype,
-                device=query.device,
-            )
+                query = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                output = torch.empty(
+                    (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                    dtype=query.dtype,
+                    device=query.device,
+                )

-            torch_npu._npu_paged_attention(
-                query=query,
-                key_cache=k_cache,
-                value_cache=v_cache,
-                num_heads=layer.tp_q_head_num,
-                num_kv_heads=layer.tp_k_head_num,
-                scale_value=layer.scaling,
-                block_table=self.forward_metadata.block_tables,
-                context_lens=self.forward_metadata.seq_lens_cpu_int,
-                out=output,
-            )
+                torch_npu._npu_paged_attention(
+                    query=query,
+                    key_cache=k_cache,
+                    value_cache=v_cache,
+                    num_heads=layer.tp_q_head_num,
+                    num_kv_heads=layer.tp_k_head_num,
+                    scale_value=layer.scaling,
+                    block_table=self.forward_metadata.block_tables,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    out=output,
+                )
             return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
         else:
             query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
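
For reference, the replay-time block-table update above strides each request's token slots by the page size and divides by the page size to obtain page IDs. A small CPU-only illustration with made-up numbers:

import torch

page_size = 4
# One request whose 10 tokens occupy contiguous KV slots 100..109.
req_to_token = torch.arange(100, 110).unsqueeze(0)
max_len = 10
max_seq_pages = (max_len + page_size - 1) // page_size  # 3 pages

block_table = req_to_token[:, :max_len][:, ::page_size] // page_size
print(block_table)  # tensor([[25, 26, 27]])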