sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. sglang/bench_offline_throughput.py +1 -0
  2. sglang/srt/configs/model_config.py +11 -2
  3. sglang/srt/layers/attention/__init__.py +0 -1
  4. sglang/srt/layers/attention/flashinfer_backend.py +54 -41
  5. sglang/srt/layers/logits_processor.py +30 -2
  6. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  7. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  8. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  9. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  11. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +178 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  71. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +175 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
  77. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  78. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  91. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  93. sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  95. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  96. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  97. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  98. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  100. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  103. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  104. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  105. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  106. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  107. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  108. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  109. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  112. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  113. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  114. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  115. sglang/srt/layers/quantization/fp8.py +42 -2
  116. sglang/srt/layers/quantization/fp8_kernel.py +77 -18
  117. sglang/srt/layers/quantization/fp8_utils.py +8 -2
  118. sglang/srt/managers/detokenizer_manager.py +2 -0
  119. sglang/srt/managers/io_struct.py +40 -9
  120. sglang/srt/managers/schedule_batch.py +22 -15
  121. sglang/srt/managers/scheduler.py +69 -21
  122. sglang/srt/managers/session_controller.py +102 -27
  123. sglang/srt/managers/tokenizer_manager.py +48 -10
  124. sglang/srt/managers/tp_worker.py +7 -0
  125. sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
  126. sglang/srt/model_executor/forward_batch_info.py +42 -3
  127. sglang/srt/model_executor/model_runner.py +4 -0
  128. sglang/srt/models/llama.py +11 -0
  129. sglang/srt/models/llama_eagle.py +132 -0
  130. sglang/srt/openai_api/adapter.py +60 -2
  131. sglang/srt/openai_api/protocol.py +48 -0
  132. sglang/srt/server.py +26 -3
  133. sglang/srt/server_args.py +24 -30
  134. sglang/srt/speculative/spec_info.py +19 -0
  135. sglang/srt/utils.py +62 -0
  136. sglang/version.py +1 -1
  137. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/METADATA +3 -3
  138. sglang-0.4.1.post3.dist-info/RECORD +305 -0
  139. sglang-0.4.1.post1.dist-info/RECORD +0 -195
  140. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/LICENSE +0 -0
  141. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py CHANGED
@@ -45,6 +45,7 @@ if TYPE_CHECKING:
     from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
     from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+    from sglang.srt.speculative.spec_info import SpecInfo, SpeculativeAlgorithm


 class ForwardMode(IntEnum):
@@ -59,6 +60,11 @@ class ForwardMode(IntEnum):
     # No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
     IDLE = auto()

+    # Used in speculative decoding: verify a batch in the target model.
+    TARGET_VERIFY = auto()
+    # Used in speculative decoding: extend a batch in the draft model.
+    DRAFT_EXTEND = auto()
+
     # A dummy first batch to start the pipeline for overlap scheduler.
     # It is now used for triggering the sampling_info_done event for the first prefill batch.
     DUMMY_FIRST = auto()
@@ -67,7 +73,12 @@ class ForwardMode(IntEnum):
         return self == ForwardMode.PREFILL

     def is_extend(self):
-        return self == ForwardMode.EXTEND or self == ForwardMode.MIXED
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.MIXED
+            or self == ForwardMode.DRAFT_EXTEND
+            or self == self.TARGET_VERIFY
+        )

     def is_decode(self):
         return self == ForwardMode.DECODE
@@ -78,6 +89,15 @@ class ForwardMode(IntEnum):
     def is_idle(self):
         return self == ForwardMode.IDLE

+    def is_target_verify(self):
+        return self == ForwardMode.TARGET_VERIFY
+
+    def is_draft_extend(self):
+        return self == ForwardMode.DRAFT_EXTEND
+
+    def is_cuda_graph(self):
+        return self in (ForwardMode.DECODE, ForwardMode.TARGET_VERIFY)
+
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST

@@ -141,14 +161,18 @@ class ForwardBatch:
     token_to_kv_pool: BaseTokenToKVPool = None
     attn_backend: AttentionBackend = None

-    # For Qwen2-VL
-    mrope_positions: torch.Tensor = None
+    # Speculative decoding
+    spec_info: SpecInfo = None
+    spec_algorithm: SpeculativeAlgorithm = None

     # For DP attention
     global_num_tokens: Optional[List[int]] = None
     gathered_buffer: Optional[torch.Tensor] = None
     can_run_dp_cuda_graph: bool = False

+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
     def compute_mrope_positions(
         self, model_runner: ModelRunner, batch: ModelWorkerBatch
     ):
@@ -351,3 +375,18 @@ def compute_position_torch(
     extend_start_loc = torch.zeros_like(extend_seq_lens)
     extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
     return positions.to(torch.int64), extend_start_loc
+
+
+class CaptureHiddenMode(IntEnum):
+    NULL = auto()
+    FULL = auto()
+    LAST = auto()
+
+    def need_capture(self):
+        return self != CaptureHiddenMode.NULL
+
+    def is_full(self):
+        return self == CaptureHiddenMode.FULL
+
+    def is_last(self):
+        return self == CaptureHiddenMode.LAST
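
The new TARGET_VERIFY and DRAFT_EXTEND modes slot into the existing predicates: both count as extend-style batches, while TARGET_VERIFY (like DECODE) is also eligible for CUDA-graph replay. A minimal sketch of how dispatch code might branch on them, assuming sglang 0.4.1.post3 is installed; pick_kernel is a hypothetical helper, not part of sglang:

from sglang.srt.model_executor.forward_batch_info import ForwardMode


def pick_kernel(mode: ForwardMode) -> str:
    # TARGET_VERIFY satisfies both is_cuda_graph() and is_extend();
    # DRAFT_EXTEND only is_extend(). Check the CUDA-graph path first.
    if mode.is_cuda_graph():
        return "cuda-graph replay"
    if mode.is_extend():
        return "extend/prefill kernel"
    return "other"


print(pick_kernel(ForwardMode.TARGET_VERIFY))  # cuda-graph replay
print(pick_kernel(ForwardMode.DRAFT_EXTEND))   # extend/prefill kernel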
sglang/srt/model_executor/model_runner.py CHANGED
@@ -429,6 +429,10 @@ class ModelRunner:
            logger.error(error_msg)
            return False, error_msg

+    def update_weights_from_tensor(self, name, tensor: torch.Tensor):
+        self.model.load_weights([(name, tensor)])
+        return True, "Success"  # TODO error handling
+
     def get_weights_by_name(
         self, name: str, truncate_size: int = 100
     ) -> Optional[torch.Tensor]:
sglang/srt/models/llama.py CHANGED
@@ -516,6 +516,17 @@ class LlamaForCausalLM(nn.Module):
            )
            return None

+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+

 class Phi3ForCausalLM(LlamaForCausalLM):
     pass
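
get_embed_and_head / set_embed_and_head exist so a draft model (such as the EAGLE head added below) can reuse the target model's embedding and LM-head weights instead of holding its own copies. A toy, self-contained illustration of the delete-then-rebind pattern, with plain torch modules standing in for the real parallel layers:

import torch
from torch import nn

target_embed, target_head = nn.Embedding(128, 16), nn.Linear(16, 128, bias=False)
draft_embed, draft_head = nn.Embedding(128, 16), nn.Linear(16, 128, bias=False)

# Mirror of set_embed_and_head: drop the draft's own parameters, then re-register
# the target's tensors so both modules share the same storage.
del draft_embed.weight
del draft_head.weight
draft_embed.weight = target_embed.weight
draft_head.weight = target_head.weight

assert draft_embed.weight is target_embed.weight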
sglang/srt/models/llama_eagle.py ADDED
@@ -0,0 +1,132 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class LlamaForCausalLMEagle(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(config, quant_config=quant_config)
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+            super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [LlamaForCausalLMEagle]
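
The distinctive step in the draft model above is the fc projection: token embeddings are concatenated with the target model's hidden states (carried in forward_batch.spec_info.hidden_states) and projected back down to hidden_size before entering the decoder layers. A toy shape check with made-up dimensions:

import torch

hidden_size, seq_len = 16, 4
fc = torch.nn.Linear(hidden_size * 2, hidden_size)

token_embeds = torch.randn(seq_len, hidden_size)   # stands in for embed_tokens(input_ids)
target_hidden = torch.randn(seq_len, hidden_size)  # stands in for spec_info.hidden_states

fused = fc(torch.cat((token_embeds, target_hidden), dim=-1))
print(fused.shape)  # torch.Size([4, 16]) -- back to hidden_size for the decoder layers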
sglang/srt/openai_api/adapter.py CHANGED
@@ -65,10 +65,13 @@ from sglang.srt.openai_api.protocol import (
     FileDeleteResponse,
     FileRequest,
     FileResponse,
+    FunctionResponse,
     LogProbs,
+    ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.utils import TOOLS_TAG_LIST, parse_tool_response
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -879,6 +882,21 @@ def v1_chat_generate_request(
         # None skips any image processing in GenerateReqInput.
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
+            tools = None
+            if request.tools and request.tool_choice != "none":
+                request.skip_special_tokens = False
+                if request.stream:
+                    logger.warning("Streaming is not supported with tools.")
+                    request.stream = False
+                if not isinstance(request.tool_choice, str):
+                    tools = [
+                        item.function.model_dump()
+                        for item in request.tools
+                        if item.function.name == request.tool_choice.function.name
+                    ]
+                else:
+                    tools = [item.function.model_dump() for item in request.tools]
+
             if chat_template_name is None:
                 openai_compatible_messages = []
                 for message in request.messages:
@@ -902,6 +920,7 @@
                     openai_compatible_messages,
                     tokenize=True,
                     add_generation_prompt=True,
+                    tools=tools,
                 )
                 if assistant_prefix:
                     prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
@@ -1041,11 +1060,46 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

         finish_reason = ret_item["meta_info"]["finish_reason"]

+        tool_calls = None
+        text = ret_item["text"]
+
+        if isinstance(request, list):
+            tool_choice = request[idx].tool_choice
+            tools = request[idx].tools
+        else:
+            tool_choice = request.tool_choice
+            tools = request.tools
+
+        if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
+            if finish_reason == "stop":
+                finish_reason = "tool_calls"
+            try:
+                text, call_info_list = parse_tool_response(text, tools)  # noqa
+                tool_calls = [
+                    ToolCall(
+                        id=str(call_info[0]),
+                        function=FunctionResponse(
+                            name=call_info[1], arguments=call_info[2]
+                        ),
+                    )
+                    for call_info in call_info_list
+                ]
+            except Exception as e:
+                logger.error(f"Exception: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse fc related info to json format!",
+                )
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
-                "message": {"role": "assistant", "content": ret_item["text"]},
+                "message": {
+                    "role": "assistant",
+                    "content": ret_item["text"] if tool_calls is None else None,
+                    "tool_calls": tool_calls,
+                },
                 "logprobs": choice_logprobs,
                 "finish_reason": (finish_reason["type"] if finish_reason else ""),
                 "matched_stop": (
@@ -1057,7 +1111,11 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
-                message=ChatMessage(role="assistant", content=ret_item["text"]),
+                message=ChatMessage(
+                    role="assistant",
+                    content=ret_item["text"] if tool_calls is None else None,
+                    tool_calls=tool_calls,
+                ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
                 matched_stop=(
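
With the changes above, tool definitions flow through the OpenAI-compatible /v1/chat/completions endpoint. A hypothetical client-side request using the official openai SDK, assuming a server is already running on the default port with a model whose chat template understands tools; the model name is a placeholder:

from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",  # note: the server forces streaming off when tools are present
)
print(resp.choices[0].finish_reason)       # "tool_calls" when a tool tag was parsed
print(resp.choices[0].message.tool_calls)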
sglang/srt/openai_api/protocol.py CHANGED
@@ -257,6 +257,34 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None


+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: str
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
 class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -277,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 1.0
     user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -292,9 +324,25 @@ class ChatCompletionRequest(BaseModel):
     ebnf: Optional[str] = None


+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: str
+    arguments: str
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: str
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


 class ChatCompletionResponseChoice(BaseModel):
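
The new request-side models can also be exercised directly; a quick sketch (assuming sglang 0.4.1.post3 is importable) of the shapes the server now accepts for tools and tool_choice:

from sglang.srt.openai_api.protocol import Function, Tool, ToolChoice, ToolChoiceFuncName

tool = Tool(
    function=Function(
        name="get_weather",
        description="Look up the current weather for a city.",
        parameters={"type": "object", "properties": {"city": {"type": "string"}}},
    )
)
choice = ToolChoice(function=ToolChoiceFuncName(name="get_weather"))
print(tool.model_dump())
print(choice.model_dump())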
sglang/srt/server.py CHANGED
@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
 tokenizer_manager: TokenizerManager = None
 scheduler_info: Dict = None

+
 ##### Native API endpoints #####


@@ -257,6 +259,10 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
     try:
         session_id = await tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
         return session_id
     except Exception as e:
         return _create_error_response(e)
@@ -484,7 +490,16 @@ def launch_engine(
     # Wait for model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
-        data = scheduler_pipe_readers[i].recv()
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError as e:
+            logger.exception(e)
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise

         if data["status"] != "ready":
             raise RuntimeError(
@@ -492,7 +507,7 @@
             )
         scheduler_infos.append(data)

-    # Assume all schedulers have same max_total_num_tokens
+    # Assume all schedulers have same scheduler_info
     scheduler_info = scheduler_infos[0]


@@ -857,6 +872,14 @@ class Engine:
             tokenizer_manager.update_weights_from_distributed(obj, None)
         )

+    def update_weights_from_tensor(self, name, tensor):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
+
     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -871,7 +894,7 @@ class Runtime:
     using the commond line interface.

     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class above if you want to do normal offline processing.
     """

     def __init__(
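
Engine.update_weights_from_tensor gives in-process callers (for example an RL training loop) a way to push a single named tensor into the running model without going through disk. A hypothetical, minimal sketch; the model path, parameter name, and shape are illustrative and must match the loaded model:

import torch
from sglang.srt.server import Engine

engine = Engine(model_path="meta-llama/Llama-3.2-1B-Instruct")  # illustrative model

name = "model.layers.0.mlp.gate_proj.weight"             # illustrative parameter name
tensor = torch.zeros(8192, 2048, dtype=torch.bfloat16)   # must match the param's shape/dtype

result = engine.update_weights_from_tensor(name, tensor)
print(result)
engine.shutdown()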
sglang/srt/server_args.py CHANGED
@@ -54,8 +54,9 @@ class ServerArgs:
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
+    return_token_ids: bool = False

-    # Port
+    # Port for the HTTP server
     host: str = "127.0.0.1"
     port: int = 30000

@@ -68,6 +69,7 @@
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    prefill_only_one_req: bool = False

     # Other runtime options
     tp_size: int = 1
@@ -94,6 +96,7 @@
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+
     # Expert parallelism
     ep_size: int = 1

@@ -217,6 +220,13 @@
             )
             self.disable_cuda_graph = True

+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
@@ -229,12 +239,6 @@
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )

         # GGUF
         if (
@@ -277,6 +281,12 @@
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--return-token-ids",
+            action="store_true",
+            default=ServerArgs.return_token_ids,
+            help="Whether to return token IDs in the output, this may introduce additional overhead.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -430,13 +440,18 @@
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-
         parser.add_argument(
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
+        parser.add_argument(
+            "--prefill-only-one-req",
+            type=bool,
+            help="If true, we only prefill one request at one prefill batch",
+            default=ServerArgs.prefill_only_one_req,
+        )

         # Other runtime options
         parser.add_argument(
@@ -555,6 +570,7 @@
                 "shortest_queue",
             ],
         )
+
         # Expert parallelism
         parser.add_argument(
             "--expert-parallel-size",
@@ -777,28 +793,6 @@
             help="Delete the model checkpoint after loading the model.",
         )

-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-overlap-schedule",
-            action=DeprecatedAction,
-            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
-        )
-        parser.add_argument(
-            "--disable-disk-cache",
-            action=DeprecatedAction,
-            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
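
The two new flags land in ServerArgs like any other option. A small sketch of parsing them through the normal CLI path (model path is a placeholder); note that --prefill-only-one-req uses type=bool, so any non-empty value, including "0", parses as True:

import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args([
    "--model-path", "meta-llama/Llama-3.2-1B-Instruct",  # placeholder
    "--return-token-ids",
    "--prefill-only-one-req", "1",
])
server_args = ServerArgs.from_cli_args(args)
print(server_args.return_token_ids, server_args.prefill_only_one_req)  # True True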
sglang/srt/speculative/spec_info.py ADDED
@@ -0,0 +1,19 @@
+from enum import IntEnum, auto
+
+
+class SpeculativeAlgorithm(IntEnum):
+    EAGLE = auto()
+
+    def is_eagle(self):
+        return self == SpeculativeAlgorithm.EAGLE
+
+    @staticmethod
+    def from_string(name: str):
+        name_map = {
+            "EAGLE": SpeculativeAlgorithm.EAGLE,
+        }
+        return name_map[name]
+
+
+class SpecInfo:
+    pass
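
SpecInfo is only a placeholder base class at this point; SpeculativeAlgorithm is the piece other modules consume, for example to map a configuration string to the enum. A tiny usage sketch:

from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

algo = SpeculativeAlgorithm.from_string("EAGLE")
assert algo.is_eagle()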