sglang 0.4.2.post1__tar.gz → 0.4.2.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (401)
  1. {sglang-0.4.2.post1/sglang.egg-info → sglang-0.4.2.post2}/PKG-INFO +15 -6
  2. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/README.md +5 -1
  3. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/pyproject.toml +10 -12
  4. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/constrained/outlines_backend.py +9 -1
  5. sglang-0.4.2.post2/sglang/srt/custom_op.py +40 -0
  6. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/entrypoints/engine.py +2 -2
  7. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/activation.py +10 -5
  8. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  9. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/triton_backend.py +71 -7
  10. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  11. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/layernorm.py +1 -5
  12. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  13. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  14. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  15. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  18. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  19. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  20. sglang-0.4.2.post2/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  21. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  22. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  23. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/topk.py +4 -0
  24. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  31. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  33. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  35. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  37. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  41. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  45. sglang-0.4.2.post2/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  46. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/rotary_embedding.py +1 -3
  48. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/sampler.py +4 -4
  49. sglang-0.4.2.post2/sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang-0.4.2.post2/sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang-0.4.2.post2/sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang-0.4.2.post2/sglang/srt/lora/backend/triton_backend.py +61 -0
  53. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/lora/lora.py +127 -112
  54. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/lora/lora_manager.py +50 -18
  55. sglang-0.4.2.post2/sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang-0.4.2.post2/sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang-0.4.2.post2/sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang-0.4.2.post2/sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  60. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_executor/forward_batch_info.py +58 -59
  61. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_executor/model_runner.py +2 -2
  62. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/qwen2_vl.py +1 -1
  63. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/server_args.py +13 -2
  64. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/speculative/build_eagle_tree.py +4 -2
  65. sglang-0.4.2.post2/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  66. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/speculative/eagle_utils.py +361 -372
  67. sglang-0.4.2.post2/sglang/srt/speculative/eagle_worker.py +315 -0
  68. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/utils.py +7 -0
  69. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/runners.py +2 -0
  70. sglang-0.4.2.post2/sglang/version.py +1 -0
  71. {sglang-0.4.2.post1 → sglang-0.4.2.post2/sglang.egg-info}/PKG-INFO +15 -6
  72. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang.egg-info/SOURCES.txt +40 -1
  73. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang.egg-info/requires.txt +9 -4
  74. sglang-0.4.2.post1/sglang/srt/layers/custom_op_util.py +0 -25
  75. sglang-0.4.2.post1/sglang/srt/speculative/eagle_worker.py +0 -183
  76. sglang-0.4.2.post1/sglang/version.py +0 -1
  77. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/LICENSE +0 -0
  78. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/setup.cfg +0 -0
  79. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/__init__.py +0 -0
  80. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/api.py +0 -0
  81. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/bench_latency.py +0 -0
  82. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/bench_offline_throughput.py +0 -0
  83. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/bench_one_batch.py +0 -0
  84. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/bench_one_batch_server.py +0 -0
  85. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/bench_serving.py +0 -0
  86. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/check_env.py +0 -0
  87. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/global_config.py +0 -0
  88. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/__init__.py +0 -0
  89. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/__init__.py +0 -0
  90. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/anthropic.py +0 -0
  91. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/base_backend.py +0 -0
  92. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/litellm.py +0 -0
  93. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/openai.py +0 -0
  94. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
  95. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/backend/vertexai.py +0 -0
  96. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/chat_template.py +0 -0
  97. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/choices.py +0 -0
  98. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/compiler.py +0 -0
  99. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/interpreter.py +0 -0
  100. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/ir.py +0 -0
  101. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/lang/tracer.py +0 -0
  102. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/launch_server.py +0 -0
  103. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/llama3_eval.py +0 -0
  104. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/_custom_ops.py +0 -0
  105. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/aio_rwlock.py +0 -0
  106. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/__init__.py +0 -0
  107. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/chatglm.py +0 -0
  108. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/dbrx.py +0 -0
  109. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/device_config.py +0 -0
  110. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/exaone.py +0 -0
  111. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/load_config.py +0 -0
  112. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/model_config.py +0 -0
  113. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/configs/qwen2vl.py +0 -0
  114. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  115. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  116. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  117. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/conversation.py +0 -0
  118. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/__init__.py +0 -0
  119. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/communication_op.py +0 -0
  120. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  121. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  122. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  123. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  124. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  125. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  126. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  127. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  128. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  129. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/distributed/utils.py +0 -0
  130. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/entrypoints/http_server.py +0 -0
  131. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/function_call_parser.py +0 -0
  132. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/hf_transformers_utils.py +0 -0
  133. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/__init__.py +0 -0
  134. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  135. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  136. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  137. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  138. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  139. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/attention/vision.py +0 -0
  140. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/dp_attention.py +0 -0
  141. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/linear.py +0 -0
  142. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/logits_processor.py +0 -0
  143. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
  144. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
  145. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
  146. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
  147. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  148. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  149. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  150. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  151. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  152. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  153. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  154. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  155. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  156. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  157. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  158. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  159. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  160. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  161. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  162. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  163. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  164. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  165. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  166. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  167. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  168. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  169. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  170. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  171. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  172. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  173. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
  174. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  175. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
  176. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  177. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  178. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  179. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  180. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  181. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  182. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  183. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  184. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  185. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
  186. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  187. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  188. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
  189. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  190. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  191. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  192. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
  193. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  194. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
  195. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  196. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  197. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  198. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  199. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
  200. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
  201. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  202. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  203. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
  204. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
  205. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  206. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  207. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  208. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  209. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
  210. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  211. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  212. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  213. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  214. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
  215. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
  216. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
  217. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  218. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
  219. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  220. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  221. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  222. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
  223. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
  224. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  225. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  226. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  227. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  228. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  229. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
  230. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
  231. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
  232. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  233. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
  234. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  235. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
  236. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
  237. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
  238. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
  239. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/parameter.py +0 -0
  240. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/pooler.py +0 -0
  241. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
  242. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  243. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  244. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  245. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  246. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  247. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  248. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  249. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  250. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  251. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  252. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  253. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  254. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  255. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  256. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  257. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  258. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  259. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  260. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  261. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  262. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  263. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  264. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  265. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  266. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  267. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  268. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  269. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  270. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  271. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  272. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  273. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  274. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  275. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  276. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  277. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  278. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  279. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  280. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  281. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  282. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  283. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  284. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  285. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  286. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  287. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  288. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  289. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
  290. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/fp8.py +0 -0
  291. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
  292. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
  293. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
  294. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
  295. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/radix_attention.py +0 -0
  296. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/torchao_utils.py +0 -0
  297. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  298. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/lora/lora_config.py +0 -0
  299. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/cache_controller.py +0 -0
  300. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/configure_logging.py +0 -0
  301. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  302. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
  303. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/image_processor.py +0 -0
  304. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/io_struct.py +0 -0
  305. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/schedule_batch.py +0 -0
  306. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/schedule_policy.py +0 -0
  307. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/scheduler.py +0 -0
  308. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/session_controller.py +0 -0
  309. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
  310. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/tp_worker.py +0 -0
  311. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  312. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/managers/utils.py +0 -0
  313. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  314. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  315. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  316. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
  317. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
  318. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/metrics/collector.py +0 -0
  319. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/metrics/func_timer.py +0 -0
  320. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/mm_utils.py +0 -0
  321. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_loader/__init__.py +0 -0
  322. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_loader/loader.py +0 -0
  323. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_loader/utils.py +0 -0
  324. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  325. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/model_parallel.py +0 -0
  326. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/baichuan.py +0 -0
  327. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/chatglm.py +0 -0
  328. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/commandr.py +0 -0
  329. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/dbrx.py +0 -0
  330. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/deepseek.py +0 -0
  331. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/deepseek_v2.py +0 -0
  332. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/exaone.py +0 -0
  333. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/gemma.py +0 -0
  334. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/gemma2.py +0 -0
  335. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/gemma2_reward.py +0 -0
  336. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/gpt2.py +0 -0
  337. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  338. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/granite.py +0 -0
  339. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/grok.py +0 -0
  340. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/internlm2.py +0 -0
  341. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  342. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llama.py +0 -0
  343. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llama_classification.py +0 -0
  344. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llama_eagle.py +0 -0
  345. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llama_embedding.py +0 -0
  346. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llama_reward.py +0 -0
  347. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llava.py +0 -0
  348. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/llavavid.py +0 -0
  349. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/minicpm.py +0 -0
  350. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/minicpm3.py +0 -0
  351. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/minicpmv.py +0 -0
  352. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/mistral.py +0 -0
  353. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/mixtral.py +0 -0
  354. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  355. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/mllama.py +0 -0
  356. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/olmo.py +0 -0
  357. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/olmo2.py +0 -0
  358. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/olmoe.py +0 -0
  359. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/phi3_small.py +0 -0
  360. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/qwen.py +0 -0
  361. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/qwen2.py +0 -0
  362. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
  363. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/qwen2_moe.py +0 -0
  364. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/registry.py +0 -0
  365. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/stablelm.py +0 -0
  366. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/torch_native_llama.py +0 -0
  367. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/xverse.py +0 -0
  368. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/xverse_moe.py +0 -0
  369. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/models/yivl.py +0 -0
  370. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/openai_api/adapter.py +0 -0
  371. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/openai_api/protocol.py +0 -0
  372. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
  373. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  374. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  375. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  376. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  377. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  378. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  379. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  380. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  381. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/server.py +0 -0
  382. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/speculative/spec_info.py +0 -0
  383. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
  384. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  385. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  386. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/run_eval.py +0 -0
  387. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_common.py +0 -0
  388. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  389. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  390. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_math.py +0 -0
  391. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  392. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  393. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  394. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/test_activation.py +0 -0
  395. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/test_block_fp8.py +0 -0
  396. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/test_layernorm.py +0 -0
  397. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/test_programs.py +0 -0
  398. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/test/test_utils.py +0 -0
  399. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang/utils.py +0 -0
  400. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang.egg-info/dependency_links.txt +0 -0
  401. {sglang-0.4.2.post1 → sglang-0.4.2.post2}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.4.2.post1/sglang.egg-info → sglang-0.4.2.post2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post1
+Version: 0.4.2.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -225,7 +225,6 @@ Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
@@ -240,21 +239,27 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
-Requires-Dist: flashinfer==0.1.6; extra == "srt"
+Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
+Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
+Requires-Dist: outlines==0.1.11; extra == "srt-hip"
+Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -372,7 +377,11 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS CORP, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+
+## Contact Us
+
+For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.

 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.2.post1 → sglang-0.4.2.post2}/README.md

@@ -58,7 +58,11 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS CORP, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+
+## Contact Us
+
+For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.

 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.2.post1 → sglang-0.4.2.post2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.4.2.post1"
+version = "0.4.2.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -19,31 +19,29 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
 runtime_common = [
     "aiohttp", "decord", "fastapi",
     "hf_transfer", "huggingface_hub", "interegular", "modelscope",
-    "orjson", "outlines>=0.0.44,<0.1.0",
-    "packaging", "pillow", "prometheus-client>=0.20.0",
-    "psutil", "pydantic", "python-multipart",
-    "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
-    "xgrammar>=0.1.10"
+    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart", "pyzmq>=25.1.2",
+    "torchao>=0.7.0", "uvicorn", "uvloop", "xgrammar>=0.1.10"
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
-    "sgl-kernel>=0.0.3", "torch", "vllm==0.6.4.post1",
-    "flashinfer==0.1.6"
+    "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.4.post1",
+    "flashinfer_python>=0.2.0.post2", "outlines>=0.0.44,<0.1.0"
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post2.dev1"]
+srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11", "sgl-kernel>=0.0.3.post1"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt_xpu = ["sglang[runtime_common]"]
+srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
 #For Intel Gaudi(device : hpu) follow the installation guide
 #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]"]
+srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<0.1.0"]
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "torch"]
+srt_cpu = ["sglang[runtime_common]", "torch", "outlines>=0.0.44,<0.1.0"]

 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -20,7 +20,6 @@ from typing import Dict, List, Optional, Tuple, Union
  import interegular
  import torch
  from outlines.fsm.guide import RegexGuide
- from outlines.fsm.json_schema import build_regex_from_schema
  from outlines.models.transformers import TransformerTokenizer
  from pydantic import BaseModel

@@ -29,6 +28,15 @@ from sglang.srt.constrained.base_grammar_backend import (
      BaseGrammarObject,
  )
  from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
+ from sglang.srt.utils import is_hip
+
+ is_hip_ = is_hip()
+
+ if is_hip_:
+     from outlines_core.fsm.json_schema import build_regex_from_schema
+ else:
+     from outlines.fsm.json_schema import build_regex_from_schema
+

  logger = logging.getLogger(__name__)

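The import hunk above pairs with the new per-platform `outlines` pins: on ROCm, `outlines==0.1.11` provides `build_regex_from_schema` from `outlines_core`, while the pre-0.1 releases used on other targets keep it under `outlines.fsm`. A standalone sketch of the same idea, branching on `ImportError` instead of `is_hip()` (illustration only; sglang itself branches on the platform):

```python
# Sketch: resolve build_regex_from_schema across outlines versions by
# falling back on ImportError rather than probing the platform.
try:
    from outlines_core.fsm.json_schema import build_regex_from_schema  # outlines >= 0.1
except ImportError:
    from outlines.fsm.json_schema import build_regex_from_schema  # outlines < 0.1

schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'
print(build_regex_from_schema(schema))  # regex accepting JSON that matches the schema
```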
@@ -0,0 +1,40 @@
+ import torch
+ from torch import nn
+
+ _is_cuda = torch.cuda.is_available() and torch.version.cuda
+ _is_rocm = torch.cuda.is_available() and torch.version.hip
+
+
+ class CustomOp(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self._forward_method = self.dispatch_forward()
+
+     def forward(self, *args, **kwargs):
+         return self._forward_method(*args, **kwargs)
+
+     def forward_native(self, *args, **kwargs):
+         raise NotImplementedError
+
+     def forward_cuda(self, *args, **kwargs):
+         raise NotImplementedError
+
+     def forward_hip(self, *args, **kwargs):
+         return self.forward_cuda(*args, **kwargs)
+
+     def forward_xpu(self, *args, **kwargs):
+         return self.forward_native(*args, **kwargs)
+
+     def forward_hpu(self, *args, **kwargs):
+         return self.forward_native(*args, **kwargs)
+
+     def forward_cpu(self, *args, **kwargs):
+         return self.forward_native(*args, **kwargs)
+
+     def dispatch_forward(self):
+         if _is_cuda:
+             return self.forward_cuda
+         elif _is_rocm:
+             return self.forward_hip
+         else:
+             return self.forward_native
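This new in-tree `CustomOp` removes the dependency on vLLM's class of the same name. The key design point is that `dispatch_forward` resolves the platform branch once in `__init__`, so each `forward` call is a single stored-method call with no per-call platform check. A hypothetical subclass (illustration only, not in the package):

```python
# Hypothetical CustomOp subclass: the CUDA/HIP/native decision happens once
# at construction time, not on every call.
import torch
from sglang.srt.custom_op import CustomOp


class ReLUSquared(CustomOp):
    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        # Portable eager path, used on CPU/XPU/HPU.
        return torch.relu(x) ** 2

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # A real op would call a fused sgl-kernel here; we reuse the eager path.
        return self.forward_native(x)


op = ReLUSquared()
print(op(torch.tensor([-1.0, 2.0])))  # tensor([0., 4.])
```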
@@ -316,8 +316,8 @@ def _set_envs_and_config(server_args: ServerArgs):
      # Check flashinfer version
      if server_args.attention_backend == "flashinfer":
          assert_pkg_version(
-             "flashinfer",
-             "0.1.6",
+             "flashinfer_python",
+             "0.2.0.post2",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
@@ -25,21 +25,18 @@ from sglang.srt.utils import is_cuda_available
  if is_cuda_available():
      from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul

- from vllm.model_executor.custom_op import CustomOp
-
+ from sglang.srt.custom_op import CustomOp
  from sglang.srt.distributed import (
      divide,
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
  )
- from sglang.srt.layers.custom_op_util import register_custom_op
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.utils import set_weight_attrs

  logger = logging.getLogger(__name__)


- @register_custom_op("sglang_silu_and_mul")
  class SiluAndMul(CustomOp):
      def forward_native(self, x: torch.Tensor) -> torch.Tensor:
          d = x.shape[-1] // 2
@@ -53,7 +50,6 @@ class SiluAndMul(CustomOp):
          return out


- @register_custom_op("sglang_gelu_and_mul")
  class GeluAndMul(CustomOp):
      def __init__(self, approximate="tanh"):
          super().__init__()
@@ -76,6 +72,15 @@ class GeluAndMul(CustomOp):
          return out


+ class QuickGELU(CustomOp):
+     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+         return x * torch.sigmoid(1.702 * x)
+
+     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+         # TODO(zhyncs): Implement the CUDA kernel for QuickGELU in sgl-kernel
+         return self.forward_native(x)
+
+
  class ScaledActivation(nn.Module):
      """An activation function with post-scale parameters.

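`QuickGELU` is the sigmoid-based GELU approximation `x * sigmoid(1.702 * x)` popularized by CLIP-style vision encoders; as the TODO notes, both dispatch paths currently use the eager formula. For intuition, a purely illustrative comparison against the exact (erf-based) GELU:

```python
# Illustration: QuickGELU tracks exact GELU to within roughly 2e-2 on [-3, 3].
import torch

x = torch.linspace(-3, 3, 7)
quick = x * torch.sigmoid(1.702 * x)
exact = torch.nn.functional.gelu(x)  # erf-based GELU
print((quick - exact).abs().max())   # about 0.02 at the worst point on this grid
```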
@@ -10,6 +10,7 @@ Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
  import os
  from dataclasses import dataclass
  from enum import Enum, auto
+ from functools import partial
  from typing import TYPE_CHECKING, List, Optional, Union

  import torch
@@ -34,6 +35,7 @@ if is_flashinfer_available():
          BatchPrefillWithRaggedKVCacheWrapper,
      )
      from flashinfer.cascade import merge_state
+     from flashinfer.decode import PosEncodingMode


  class WrapperDispatch(Enum):
@@ -53,10 +55,19 @@ class PrefillMetadata:
      extend_no_prefix: bool


+ # Reuse this workspace buffer across all flashinfer wrappers
+ global_workspace_buffer = None
+
+
  class FlashInferAttnBackend(AttentionBackend):
      """Flashinfer attention kernels."""

-     def __init__(self, model_runner: ModelRunner):
+     def __init__(
+         self,
+         model_runner: ModelRunner,
+         skip_prefill: bool = False,
+         kv_indptr_buf: Optional[torch.Tensor] = None,
+     ):
          super().__init__()

          # Parse constants
@@ -69,6 +80,7 @@ class FlashInferAttnBackend(AttentionBackend):
              ),
          )
          self.max_context_len = model_runner.model_config.context_len
+         self.skip_prefill = skip_prefill

          assert not (
              model_runner.sliding_window_size is not None
@@ -90,16 +102,26 @@ class FlashInferAttnBackend(AttentionBackend):
              global_config.flashinfer_workspace_size = 512 * 1024 * 1024

          # Allocate buffers
-         self.workspace_buffer = torch.empty(
-             global_config.flashinfer_workspace_size,
-             dtype=torch.uint8,
-             device=model_runner.device,
-         )
+         global global_workspace_buffer
+         if global_workspace_buffer is None:
+             global_workspace_buffer = torch.empty(
+                 global_config.flashinfer_workspace_size,
+                 dtype=torch.uint8,
+                 device=model_runner.device,
+             )
+         self.workspace_buffer = global_workspace_buffer
          max_bs = model_runner.req_to_token_pool.size
-         self.kv_indptr = [
-             torch.zeros((max_bs + 1,), dtype=torch.int32, device=model_runner.device)
-             for _ in range(self.num_wrappers)
-         ]
+         if kv_indptr_buf is None:
+             self.kv_indptr = [
+                 torch.zeros(
+                     (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                 )
+                 for _ in range(self.num_wrappers)
+             ]
+         else:
+             assert self.num_wrappers == 1
+             self.kv_indptr = [kv_indptr_buf]
+
          self.kv_last_page_len = torch.ones(
              (max_bs,), dtype=torch.int32, device=model_runner.device
          )
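Making the workspace a module-level singleton is what keeps the multi-step draft backend (introduced later in this file) affordable: it constructs one `FlashInferAttnBackend` per speculative step, and without sharing, each instance would allocate its own 512 MB workspace. The pattern in isolation, as a minimal sketch under that assumption (the `get_workspace` helper is hypothetical):

```python
# Minimal sketch of the shared-workspace pattern used above.
import torch

_global_workspace = None  # shared by every backend instance in the process


def get_workspace(device: str = "cuda") -> torch.Tensor:
    global _global_workspace
    if _global_workspace is None:  # allocate once, on first use
        _global_workspace = torch.empty(
            512 * 1024 * 1024, dtype=torch.uint8, device=device
        )
    return _global_workspace
```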
@@ -122,12 +144,17 @@ class FlashInferAttnBackend(AttentionBackend):
          self.prefill_wrappers_verify = []
          self.decode_wrappers = []
          for _ in range(self.num_wrappers):
-             self.prefill_wrappers_paged.append(
-                 BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD")
-             )
-             self.prefill_wrappers_verify.append(
-                 BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD")
-             )
+             if not skip_prefill:
+                 self.prefill_wrappers_paged.append(
+                     BatchPrefillWithPagedKVCacheWrapper(
+                         self.workspace_buffer,
+                         "NHD",
+                         backend="fa2",
+                     )
+                 )
+                 self.prefill_wrappers_verify.append(
+                     BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD")
+                 )
              self.decode_wrappers.append(
                  BatchDecodeWithPagedKVCacheWrapper(
                      self.workspace_buffer,
@@ -137,10 +164,11 @@ class FlashInferAttnBackend(AttentionBackend):
              )

          # Create indices updater
+         if not skip_prefill:
+             self.indices_updater_prefill = FlashInferIndicesUpdaterPrefill(
+                 model_runner, self
+             )
          self.indices_updater_decode = FlashInferIndicesUpdaterDecode(model_runner, self)
-         self.indices_updater_prefill = FlashInferIndicesUpdaterPrefill(
-             model_runner, self
-         )

          # Other metadata
          self.forward_metadata: Union[PrefillMetadata, DecodeMetadata] = None
@@ -211,23 +239,30 @@ class FlashInferAttnBackend(AttentionBackend):
              self.prefill_wrappers_paged, use_ragged, extend_no_prefix
          )

-     def init_cuda_graph_state(self, max_bs: int):
-         cuda_graph_kv_indices = torch.zeros(
-             (max_bs * self.max_context_len,),
-             dtype=torch.int32,
-             device="cuda",
-         )
+     def init_cuda_graph_state(
+         self, max_bs: int, kv_indices_buf: Optional[torch.Tensor] = None
+     ):
+         if kv_indices_buf is None:
+             cuda_graph_kv_indices = torch.zeros(
+                 (max_bs * self.max_context_len,),
+                 dtype=torch.int32,
+                 device="cuda",
+             )
+         else:
+             cuda_graph_kv_indices = kv_indices_buf
+
          self.cuda_graph_kv_indices = [cuda_graph_kv_indices] + [
              cuda_graph_kv_indices.clone() for _ in range(self.num_wrappers - 1)
          ]

-         self.cuda_graph_custom_mask = torch.zeros(
-             (max_bs * self.max_context_len),
-             dtype=torch.uint8,
-             device="cuda",
-         )
-         self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr]
-         self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr]
+         if not self.skip_prefill:
+             self.cuda_graph_custom_mask = torch.zeros(
+                 (max_bs * self.max_context_len),
+                 dtype=torch.uint8,
+                 device="cuda",
+             )
+             self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr]
+             self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr]

      def init_forward_metadata_capture_cuda_graph(
          self,
@@ -279,7 +314,7 @@ class FlashInferAttnBackend(AttentionBackend):
                      paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
                      paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
                      custom_mask_buf=self.cuda_graph_custom_mask,
-                     qk_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
+                     mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
                  )
              )
              seq_lens_sum = seq_lens.sum().item()
@@ -602,11 +637,8 @@ class FlashInferIndicesUpdaterDecode:
                  self.req_to_token.shape[1],
              )
          else:
-             bs, kv_indices, kv_indptr = spec_info.generate_attn_arg_decode(
-                 req_pool_indices,
-                 paged_kernel_lens,
-                 self.req_to_token,
-             )
+             kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+             bs = kv_indptr.shape[0] - 1

          wrapper.end_forward()
          wrapper.begin_forward(
@@ -800,7 +832,9 @@ class FlashInferIndicesUpdaterPrefill:
              kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
              kv_indptr = kv_indptr[: bs + 1]
              kv_indices = torch.empty(
-                 paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+                 paged_kernel_lens_sum + 256,
+                 dtype=torch.int32,
+                 device=req_pool_indices.device,
              )
              create_flashinfer_kv_indices_triton[(bs,)](
                  self.req_to_token,
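The metadata here is a CSR-style pair: `kv_indptr[i]` and `kv_indptr[i + 1]` delimit request `i`'s slice of `kv_indices`. The change overallocates `kv_indices` by 256 entries and places it on the same device as `req_pool_indices` rather than a hard-coded `"cuda"`. The layout itself, in plain PyTorch (illustrative values):

```python
# CSR-style paged-KV metadata: indptr offsets delimit each request's indices.
import torch

paged_kernel_lens = torch.tensor([3, 1, 2], dtype=torch.int32)  # tokens per request
bs = paged_kernel_lens.numel()

kv_indptr = torch.zeros(bs + 1, dtype=torch.int32)
kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)

# Over-allocate as in the hunk above; only the first kv_indptr[-1] entries matter.
kv_indices = torch.empty(int(kv_indptr[-1]) + 256, dtype=torch.int32)
print(kv_indptr.tolist())  # [0, 3, 4, 6]
```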
@@ -852,6 +886,132 @@
      )


+ class FlashInferMultiStepDraftBackend:
+     """
+     Wrap multiple flashinfer attention backends as one for multiple consecutive
+     draft decoding steps.
+     """
+
+     def __init__(
+         self,
+         model_runner: ModelRunner,
+         topk: int,
+         speculative_num_steps: int,
+     ):
+         from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
+
+         self.topk = topk
+         self.speculative_num_steps = speculative_num_steps
+         self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+         max_bs = model_runner.req_to_token_pool.size
+         self.kv_indptr = torch.zeros(
+             (
+                 self.speculative_num_steps,
+                 max_bs + 1,
+             ),
+             dtype=torch.int32,
+             device=model_runner.device,
+         )
+         self.attn_backends = []
+         for i in range(self.speculative_num_steps):
+             self.attn_backends.append(
+                 FlashInferAttnBackend(
+                     model_runner,
+                     skip_prefill=True,
+                     kv_indptr_buf=self.kv_indptr[i],
+                 )
+             )
+         self.max_context_len = self.attn_backends[0].max_context_len
+         # Cached variables for generate_draft_decode_kv_indices
+         self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+         self.kv_indptr_stride = self.kv_indptr.shape[1]
+
+     def common_template(self, forward_batch: ForwardBatch, call_fn):
+         num_seqs = forward_batch.batch_size
+         bs = self.topk * num_seqs
+         seq_lens_sum = forward_batch.seq_lens_sum
+         self.generate_draft_decode_kv_indices[
+             (self.speculative_num_steps, num_seqs, self.topk)
+         ](
+             forward_batch.req_pool_indices,
+             forward_batch.req_to_token_pool.req_to_token,
+             forward_batch.seq_lens,
+             self.cuda_graph_kv_indices,
+             self.kv_indptr,
+             forward_batch.positions,
+             num_seqs,
+             self.topk,
+             self.pool_len,
+             self.kv_indptr_stride,
+             self.kv_indptr.shape[1],
+             triton.next_power_of_2(num_seqs),
+             triton.next_power_of_2(self.speculative_num_steps),
+             triton.next_power_of_2(bs),
+         )
+         for i in range(self.speculative_num_steps):
+             forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+             forward_batch.spec_info.kv_indices = self.cuda_graph_kv_indices[i][
+                 : seq_lens_sum * self.topk + bs * (i + 1)
+             ]
+             call_fn(i, forward_batch)
+
+     def init_forward_metadata(self, forward_batch: ForwardBatch):
+         def call_fn(i, forward_batch):
+             forward_batch.spec_info.kv_indptr = (
+                 forward_batch.spec_info.kv_indptr.clone()
+             )
+             forward_batch.spec_info.kv_indices = (
+                 forward_batch.spec_info.kv_indices.clone()
+             )
+             self.attn_backends[i].init_forward_metadata(forward_batch)
+
+         self.common_template(forward_batch, call_fn)
+
+     def init_cuda_graph_state(self, max_bs: int):
+         self.cuda_graph_kv_indices = torch.zeros(
+             (self.speculative_num_steps, max_bs * self.max_context_len),
+             dtype=torch.int32,
+             device="cuda",
+         )
+         self.kv_indptr_stride = self.cuda_graph_kv_indices.shape[1]
+         for i in range(self.speculative_num_steps):
+             self.attn_backends[i].init_cuda_graph_state(
+                 max_bs, kv_indices_buf=self.cuda_graph_kv_indices[i]
+             )
+
+     def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+         def call_fn(i, forward_batch):
+             self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                 forward_batch.batch_size,
+                 forward_batch.batch_size * self.topk,
+                 forward_batch.req_pool_indices,
+                 forward_batch.seq_lens,
+                 encoder_lens=None,
+                 forward_mode=ForwardMode.DECODE,
+                 spec_info=forward_batch.spec_info,
+             )
+             decode_wrapper = self.attn_backends[i].decode_cuda_graph_metadata[
+                 forward_batch.batch_size
+             ][0]
+             decode_wrapper.begin_forward = partial(fast_decode_plan, decode_wrapper)
+
+         self.common_template(forward_batch, call_fn)
+
+     def init_forward_metadata_replay_cuda_graph(self, forward_batch):
+         def call_fn(i, forward_batch):
+             self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                 forward_batch.batch_size,
+                 forward_batch.req_pool_indices,
+                 forward_batch.seq_lens,
+                 seq_lens_sum=-1,
+                 encoder_lens=None,
+                 forward_mode=ForwardMode.DECODE,
+                 spec_info=forward_batch.spec_info,
+             )
+
+         self.common_template(forward_batch, call_fn)
+
+
  @triton.jit
  def create_flashinfer_kv_indices_triton(
      req_to_token_ptr,  # [max_batch, max_context_len]
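`FlashInferMultiStepDraftBackend` fans a batch out across `speculative_num_steps` decode-only backends: one Triton launch fills every step's row of `kv_indptr`/`kv_indices`, then `common_template` walks the steps, pointing `forward_batch.spec_info` at step `i`'s slice before invoking the per-step callback. Reduced to its control flow, it looks like the hypothetical skeleton below (which omits the Triton fill):

```python
# Hypothetical control-flow skeleton of the per-step dispatch above.
from typing import Callable, List


class MultiStepDispatcher:
    def __init__(self, backends: List[object]):
        self.backends = backends  # one single-step backend per draft step

    def common_template(self, batch, call_fn: Callable[[int, object], None]) -> None:
        # The real code first launches one Triton kernel that fills the
        # per-step KV metadata, then applies the callback step by step.
        for i in range(len(self.backends)):
            # ...point batch.spec_info at step i's metadata slice here...
            call_fn(i, batch)
```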
@@ -935,3 +1095,88 @@ def should_use_tensor_core(
          return gqa_group_size > 4
      else:
          return False
+
+
+ def fast_decode_plan(
+     self,
+     indptr: torch.Tensor,
+     indices: torch.Tensor,
+     last_page_len: torch.Tensor,
+     num_qo_heads: int,
+     num_kv_heads: int,
+     head_dim: int,
+     page_size: int,
+     pos_encoding_mode: str = "NONE",
+     window_left: int = -1,
+     logits_soft_cap: Optional[float] = None,
+     data_type: Union[str, torch.dtype] = "float16",
+     q_data_type: Optional[Union[str, torch.dtype]] = None,
+     sm_scale: Optional[float] = None,
+     rope_scale: Optional[float] = None,
+     rope_theta: Optional[float] = None,
+ ) -> None:
+     """A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend."""
+     batch_size = len(last_page_len)
+     if logits_soft_cap is None:
+         logits_soft_cap = 0.0
+     if self.is_cuda_graph_enabled:
+         if batch_size != self._fixed_batch_size:
+             raise ValueError(
+                 "The batch size should be fixed in cudagraph mode, the runtime batch size {} "
+                 "mismatches the batch size set during initialization {}".format(
+                     batch_size, self._fixed_batch_size
+                 )
+             )
+         if len(indices) > len(self._paged_kv_indices_buf):
+             raise ValueError(
+                 "The size of indices should be less than or equal to the allocated buffer"
+             )
+     else:
+         self._paged_kv_indptr_buf = indptr
+         self._paged_kv_indices_buf = indices
+         self._paged_kv_last_page_len_buf = last_page_len
+     # NOTE(Zihao): the following tensors act as placeholders to pass dtype info
+     if not q_data_type:
+         q_data_type = data_type
+     if not hasattr(self, "empty_q_data"):
+         self.empty_q_data = torch.empty(
+             0,
+             dtype=(
+                 getattr(torch, q_data_type)
+                 if isinstance(q_data_type, str)
+                 else q_data_type
+             ),
+         )
+         self.empty_kv_cache = torch.empty(
+             0,
+             dtype=(
+                 getattr(torch, data_type) if isinstance(data_type, str) else data_type
+             ),
+         )
+         self.last_page_len = torch.ones(32768, dtype=torch.int32)
+     empty_q_data = self.empty_q_data
+     empty_kv_cache = self.empty_kv_cache
+     stream = torch.cuda.current_stream()
+     self._cached_module.plan(
+         self._float_workspace_buffer,
+         self._int_workspace_buffer,
+         self._pin_memory_int_workspace_buffer,
+         indptr.to("cpu"),
+         batch_size,
+         num_qo_heads,
+         num_kv_heads,
+         page_size,
+         self.is_cuda_graph_enabled,
+         window_left,
+         logits_soft_cap,
+         head_dim,
+         empty_q_data,
+         empty_kv_cache,
+         stream.cuda_stream,
+     )
+     self._pos_encoding_mode = pos_encoding_mode
+     self._window_left = window_left
+     self._logits_soft_cap = logits_soft_cap
+     self._sm_scale = sm_scale
+     self._rope_scale = rope_scale
+     self._rope_theta = rope_theta
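During CUDA-graph capture, the draft backend swaps each decode wrapper's `begin_forward` for `fast_decode_plan` via `functools.partial(fast_decode_plan, decode_wrapper)`, pre-binding the wrapper as `self` so the cheaper plan runs on every captured step. The binding trick in isolation (hypothetical `Wrapper`/`fast_plan` names, illustration only):

```python
# How partial() turns a free function into an instance-level method override.
from functools import partial


class Wrapper:
    def plan(self):
        print("stock plan")


def fast_plan(self):
    print("fast plan for", type(self).__name__)


w = Wrapper()
w.plan = partial(fast_plan, w)  # self is pre-bound; attribute shadows the method
w.plan()  # -> fast plan for Wrapper
```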